@reconcrap/people-network-memory 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +476 -0
- package/docs/mcp_tools.md +138 -0
- package/harness_adapters/openclaw/mcp.managed.unix.template.json +25 -0
- package/harness_adapters/openclaw/mcp.managed.windows.template.json +26 -0
- package/harness_adapters/openclaw/mcp.template.json +14 -0
- package/harness_adapters/openclaw/ppl/SKILL.md +114 -0
- package/package.json +30 -0
- package/pyproject.toml +26 -0
- package/scripts/install_windows.ps1 +92 -0
- package/scripts/npm/people-memory.js +276 -0
- package/scripts/people_memory_bootstrap.py +247 -0
- package/scripts/run_graphiti_live_from_liepin.ps1 +87 -0
- package/scripts/run_tests_with_artifacts.ps1 +307 -0
- package/src/people_network_memory/__init__.py +6 -0
- package/src/people_network_memory/application/__init__.py +16 -0
- package/src/people_network_memory/application/normalization.py +1441 -0
- package/src/people_network_memory/application/services.py +921 -0
- package/src/people_network_memory/cli.py +1212 -0
- package/src/people_network_memory/config.py +268 -0
- package/src/people_network_memory/domain/__init__.py +55 -0
- package/src/people_network_memory/domain/identity.py +77 -0
- package/src/people_network_memory/domain/models.py +355 -0
- package/src/people_network_memory/fixtures/__init__.py +6 -0
- package/src/people_network_memory/fixtures/eval.py +398 -0
- package/src/people_network_memory/fixtures/extractor_eval.py +364 -0
- package/src/people_network_memory/fixtures/generator.py +290 -0
- package/src/people_network_memory/fixtures/report.py +252 -0
- package/src/people_network_memory/graphiti_adapter/__init__.py +9 -0
- package/src/people_network_memory/graphiti_adapter/episode_formatter.py +70 -0
- package/src/people_network_memory/graphiti_adapter/graphiti_store.py +655 -0
- package/src/people_network_memory/graphiti_adapter/indexer.py +194 -0
- package/src/people_network_memory/graphiti_adapter/ontology.py +68 -0
- package/src/people_network_memory/harness_adapters/__init__.py +2 -0
- package/src/people_network_memory/harness_adapters/openclaw/__init__.py +9 -0
- package/src/people_network_memory/harness_adapters/openclaw/installer.py +577 -0
- package/src/people_network_memory/harness_adapters/openclaw/integration_eval.py +508 -0
- package/src/people_network_memory/harness_adapters/openclaw/smoke.py +292 -0
- package/src/people_network_memory/infrastructure/__init__.py +2 -0
- package/src/people_network_memory/infrastructure/archive_backup.py +171 -0
- package/src/people_network_memory/infrastructure/diagnostics.py +171 -0
- package/src/people_network_memory/infrastructure/embeddings.py +155 -0
- package/src/people_network_memory/infrastructure/file_store.py +129 -0
- package/src/people_network_memory/infrastructure/graphiti_promotion.py +212 -0
- package/src/people_network_memory/infrastructure/id_generator.py +40 -0
- package/src/people_network_memory/infrastructure/in_memory_store.py +1008 -0
- package/src/people_network_memory/infrastructure/llm_extractor.py +476 -0
- package/src/people_network_memory/infrastructure/llm_identity_advisor.py +200 -0
- package/src/people_network_memory/infrastructure/llm_judge.py +162 -0
- package/src/people_network_memory/infrastructure/redaction.py +21 -0
- package/src/people_network_memory/infrastructure/release_check.py +186 -0
- package/src/people_network_memory/infrastructure/retrieval_intent.py +98 -0
- package/src/people_network_memory/infrastructure/semantic_index.py +262 -0
- package/src/people_network_memory/mcp_server/__init__.py +2 -0
- package/src/people_network_memory/mcp_server/contracts.py +85 -0
- package/src/people_network_memory/mcp_server/runtime.py +133 -0
- package/src/people_network_memory/mcp_server/tools.py +588 -0
- package/src/people_network_memory/ports/__init__.py +2 -0
- package/src/people_network_memory/ports/errors.py +25 -0
- package/src/people_network_memory/ports/interfaces.py +103 -0
- package/src/people_network_memory/projection/__init__.py +6 -0
- package/src/people_network_memory/projection/builders.py +46 -0
|
@@ -0,0 +1,1441 @@
|
|
|
1
|
+
"""Conservative source-text normalization for low-friction capture."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from collections.abc import Iterable
|
|
7
|
+
from datetime import date, timedelta
|
|
8
|
+
|
|
9
|
+
from people_network_memory.domain.models import (
|
|
10
|
+
AttributedClaim,
|
|
11
|
+
DirectFact,
|
|
12
|
+
FollowUpTask,
|
|
13
|
+
MentionedPerson,
|
|
14
|
+
Participant,
|
|
15
|
+
PersonRef,
|
|
16
|
+
RelationshipAssertion,
|
|
17
|
+
SocialInteraction,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# ---------------------------------------------------------------------------
# Name fragments. PERSON is case-sensitive on purpose: Latin names must be
# capitalized, which keeps ordinary lowercase words from matching as people.
# ---------------------------------------------------------------------------
PLACEHOLDER_NAME = r"[Pp]erson\s+[A-Z]"
LATIN_NAME = r"[A-Z][A-Za-z'-]+(?:\s+[A-Z][A-Za-z'-]+){0,2}"
CJK_NAME = r"[\u4e00-\u9fff]{2,8}"
CJK_ALIAS = r"[\u4e00-\u9fffA-Za-z0-9_-]{1,12}"
PERSON = rf"(?:{PLACEHOLDER_NAME}|{LATIN_NAME}|{CJK_NAME})"

# Shared English fragments so every "met ..." pattern accepts the same verbs.
_EN_MEET_VERBS = r"(?i:met|spoke with|had coffee with|had dinner with|had lunch with|called)"
_EN_NEW_PERSON = r"(?:(?i:another|a\s+different|different|new)\s+)"
_EN_WHEN = r"(?:\s+(?i:today|yesterday)|\s+(?i:on)\s+\d{4}-\d{1,2}-\d{1,2})?"
_EN_AT_PLACE = r"\s+(?i:at)\s+(?P<place>[^.;,()]+)"

# NOTE: fullwidth CJK punctuation is written as \uXXXX escapes below
# (\uff0c = fullwidth comma, \uff1b = fullwidth semicolon, \uff08/\uff09 =
# fullwidth parentheses) so that Unicode normalization of this file cannot
# silently fold them into their ASCII look-alikes.

# --- English: participants, places, topics ---------------------------------
EN_MET_RE = re.compile(
    rf"\b{_EN_MEET_VERBS}\s+"
    rf"{_EN_NEW_PERSON}?"
    rf"(?P<person>{PERSON})(?=\s+(?i:at|today|yesterday|on|who|that)\b|[.,;]|$)",
)
EN_REMEMBER_RE = re.compile(rf"\b(?i:remember|add|save)\s+(?P<person>{PERSON})\b")
EN_PLACE_RE = re.compile(
    rf"\b{_EN_MEET_VERBS}\s+{PERSON}"
    rf"{_EN_WHEN}"
    rf"{_EN_AT_PLACE}",
)
EN_ANOTHER_PLACE_RE = re.compile(
    rf"\b{_EN_MEET_VERBS}\s+"
    rf"{_EN_NEW_PERSON}{PERSON}"
    rf"{_EN_WHEN}"
    rf"{_EN_AT_PLACE}",
)
EN_TOPIC_RE = re.compile(r"\b(?i:discussed|talked about|chatted about)\s+(?P<topic>[^.;]+)")

# --- English: mentions and relationships -----------------------------------
EN_MENTION_RE = re.compile(
    rf"\b(?P<speaker>{PERSON})\s+(?i:mentioned|said|told me)\s+(?P<target>{PERSON})\b",
)
# Uses the shared verb list, so "had lunch with X who knows Y" is recognized
# just like the other meeting verbs.
EN_MET_KNOWS_RE = re.compile(
    rf"\b{_EN_MEET_VERBS}\s+"
    rf"(?P<source>{PERSON})\s+(?i:who|that)\s+(?i:also\s+)?(?i:knows|knew)\s+(?P<target>{PERSON})\b",
)
EN_KNOWS_RE = re.compile(
    rf"\b(?P<source>{PERSON})\s+(?i:also\s+)?(?i:knows|knew)\s+(?P<target>{PERSON})\b",
)
EN_FRIENDS_WITH_RE = re.compile(
    rf"\b(?P<source>{PERSON})\s+(?i:is\s+)?(?i:friends?\s+with)\s+(?P<target>{PERSON})\b",
)
EN_WORKS_WITH_RE = re.compile(
    rf"\b(?P<source>{PERSON})\s+(?i:works|worked|is\s+working)\s+(?i:with)\s+(?P<target>{PERSON})\b",
)
EN_INTRODUCED_ME_RE = re.compile(
    rf"\b(?P<source>{PERSON})\s+(?i:introduced\s+me\s+to)\s+(?P<target>{PERSON})\b",
)

# --- Follow-up cues ---------------------------------------------------------
EN_PROMISE_RE = re.compile(r"\b(?i:(?:i\s+)?promised to)\s+(?P<task>[^.;]+)")
EN_ASKED_ME_RE = re.compile(
    r"\b(?i:(?:he|she|they|[A-Z][A-Za-z'-]+)\s+asked\s+me\s+to)\s+(?P<task>[^.;]+)"
)
ZH_PROMISE_RE = re.compile(r"(?:我)?(?:答应|承诺|说好|约好)(?P<task>[^。;\uff1b]+)")

# --- English: direct facts --------------------------------------------------
EN_WORK_RE = re.compile(
    rf"\b(?P<person>{PERSON})\s+(?i:currently\s+)?(?i:works|is working|worked)\s+(?i:at|for)\s+"
    r"(?P<organization>[^.;,]+?)(?:\s+(?i:as)\s+(?P<role>[^.;,]+))?(?=[.;,]|$)"
)
EN_STUDIED_RE = re.compile(
    rf"\b(?P<person>{PERSON})\s+(?i:studied at|graduated from|went to)\s+(?P<school>[^.;,]+)"
)
EN_INTEREST_RE = re.compile(
    rf"\b(?P<person>{PERSON})\s+(?i:likes|enjoys|is interested in|is into)\s+(?P<value>[^.;,]+)"
)
# Subject-less variants; the caller decides which person they apply to.
DEFAULT_WORK_RE = re.compile(
    r"\b(?i:works|is working|worked)\s+(?i:at|for)\s+(?P<organization>[^.;,]+)"
)
DEFAULT_STUDIED_RE = re.compile(
    r"\b(?i:studied at|graduated from|went to)\s+(?P<school>[^.;,]+)"
)

# --- Contact details and dates ---------------------------------------------
EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
PHONE_RE = re.compile(r"(?<!\d)(?:\+?\d[\d\s().-]{6,}\d)(?!\d)")
ISO_DATE_RE = re.compile(r"\b(?P<year>20\d{2})[-/](?P<month>\d{1,2})[-/](?P<day>\d{1,2})\b")
MONTH_DAY_RE = re.compile(
    r"\b(?P<month>jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|"
    r"jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)"
    r"\s+(?P<day>\d{1,2})\b",
    flags=re.IGNORECASE,
)
ZH_NEXT_WEEKDAY_RE = re.compile(
    r"下(?:周|星期|礼拜)(?P<weekday>[一二三四五六日天])"
)

# --- Chinese: participants, places, topics ---------------------------------
ZH_PARTICIPANT_RE = re.compile(
    rf"(?:见了|认识了|遇到|碰到)(?P<person>{PERSON})"
    r"(?=\s*[(\uff08,\uff0c。.]|聊了|说|提到|也|和|他|她|其|在|喜欢|正在|可能|已经|$)"
)
ZH_CHAT_WITH_RE = re.compile(rf"和(?P<person>{PERSON})(?:聊了|聊天|见面)")
ZH_ARRANGED_WITH_RE = re.compile(rf"和(?P<person>{PERSON})(?:约好|说好)")
ZH_COMPLETED_SEND_TO_RE = re.compile(
    rf"(?:已经|已|刚刚|刚)?(?:给|把)(?P<person>{PERSON})"
    r"(?:发了|发送了|发过去了|分享了|转发了)"
)
ZH_REMEMBER_RE = re.compile(rf"(?:记住|记录|保存)(?P<person>{PERSON})")
ZH_PLACE_RE = re.compile(r"在(?P<place>[^\uff0c。,.]+?)(?:又)?见了")
ZH_TOPIC_RE = re.compile(r"聊了(?P<topic>[^\uff0c。,.]+)")
ZH_FOCUS_TOPIC_RE = re.compile(
    r"(?:重点看|重点关注|正在关注|也在关注|关注)(?P<topic>[^\uff0c。,.;\uff1b]+)"
)
ZH_ORG_FOCUS_CLAIM_RE = re.compile(
    r"(?P<subject>[\u4e00-\u9fffA-Za-z0-9_-]{2,30}"
    r"(?:公司|集团|科技|智能|机器人|资本|基金|大学|学院|实验室))"
    r"(?P<time>上半年|下半年|今年|明年|后续|接下来|未来)"
    r"(?:会|将|要)?"
    r"(?P<focus>重点看|重点关注|关注)"
    r"(?P<topic>[^\uff0c。,.;\uff1b]+)"
)
ZH_MENTION_RE = re.compile(rf"(?P<speaker>{PERSON})提到(?P<target>{PERSON})")

# --- Chinese: direct facts and contact preferences -------------------------
ZH_DEFAULT_WORK_RE = re.compile(
    r"(?:他|她|他们|她们|这个人|这人|此人)?在"
    r"(?P<organization>[^\uff0c。,.;\uff1b]{2,30}?)(?:做|负责|从事|搞)"
    r"(?P<role>[^\uff0c。,.;\uff1b]+)"
)
ZH_CONTACT_PREF_RE = re.compile(
    r"(?:喜欢|偏好|倾向于|最好)(?:用|通过)?"
    r"(?P<method>微信|短信|电话|邮件|邮箱|WhatsApp|飞书|钉钉)"
    r"(?:联系|沟通|跟进)"
)
CONTACT_PREFERENCE_ONLY_RE = re.compile(
    r"(?:之后|以后|后续|下次|以后有事)?\s*"
    r"(?:最好|偏好|喜欢|倾向于)?\s*(?:用|通过)?\s*"
    r"(?:微信|短信|电话|邮件|邮箱|whatsapp|飞书|钉钉)\s*"
    r"(?:联系|沟通|跟进)",
    flags=re.IGNORECASE,
)
CONCRETE_FOLLOW_UP_CUE_RE = re.compile(
    r"(?:"
    r"明天|后天|下周|周[一二三四五六日天]|星期[一二三四五六日天]|礼拜[一二三四五六日天]|"
    r"\d{1,2}[月/.-]\d{1,2}|20\d{2}[-/]\d{1,2}[-/]\d{1,2}|"
    r"答应|承诺|说好|约好|提醒我|需要我|让我|"
    r"发|发送|推荐|介绍|引荐|安排|确认|准备|"
    r"tomorrow|next\s+week|next\s+\w+day|by\s+\w+day|"
    r"promised|asked\s+me\s+to|need\s+to|send|introduce|share|confirm|schedule"
    r")",
    flags=re.IGNORECASE,
)

# --- Chinese: explicit aliases ("X(Y)", "X又叫Y") ---------------------------
ZH_ALIAS_CONTEXT = r"(?:[\uff0c。,.;\uff1b]|见了|认识了|遇到|碰到|记住|记录|保存|和)\s*"
ZH_ALIAS_PAREN_RE = re.compile(
    rf"{ZH_ALIAS_CONTEXT}(?P<canonical>{PERSON})\s*[(\uff08](?P<alias>{CJK_ALIAS})[)\uff09]"
)
ZH_ALIAS_START_PAREN_RE = re.compile(
    rf"^(?P<canonical>{PERSON})\s*[(\uff08](?P<alias>{CJK_ALIAS})[)\uff09]"
)
ZH_ALIAS_PHRASE_RE = re.compile(
    rf"{ZH_ALIAS_CONTEXT}(?P<canonical>{PERSON})"
    rf"(?:又叫|也叫|昵称(?:是|叫)?|绰号(?:是|叫)?|外号(?:是|叫)?|花名(?:是|叫)?)"
    rf"(?P<alias>{PERSON})"
)
ZH_ALIAS_START_PHRASE_RE = re.compile(
    rf"^(?P<canonical>{PERSON})"
    rf"(?:又叫|也叫|昵称(?:是|叫)?|绰号(?:是|叫)?|外号(?:是|叫)?|花名(?:是|叫)?)"
    rf"(?P<alias>{PERSON})"
)
|
|
162
|
+
|
|
163
|
+
# Parenthesised labels that describe a role or relation rather than naming an
# alias (exact-match set).
CJK_ALIAS_DESCRIPTORS = {
    "前同事", "同事", "朋友",
    "老同学", "同学", "校友",
    "客户", "投资人", "创始人", "合伙人",
    "产品经理", "工程师",
}
# Suffixes that mark a label as a role/relation descriptor.
CJK_ALIAS_DESCRIPTOR_SUFFIXES = (
    "经理", "工程师", "负责人", "总监",
    "同事", "朋友", "客户",
    "投资人", "创始人", "合伙人",
    "校友", "同学",
)
# Lower-cased pronouns that must never be stored as a person label.
BAD_PERSON_LABELS = {
    "i", "me", "my", "you",
    "he", "him", "his",
    "she", "her",
    "they", "them",
}
# Suffixes that mark a label as an organization.
ORGANIZATION_LABEL_SUFFIXES = (
    "公司", "集团", "科技", "智能", "机器人",
    "资本", "基金", "大学", "学院", "实验室",
)
PHONE_PREDICATES = {"phone", "mobile", "telephone", "tel"}
EMAIL_PREDICATES = {"email", "mail"}
# English month name (short and long form) -> month number.
MONTHS = {
    name: number
    for number, names in enumerate(
        [
            ("jan", "january"),
            ("feb", "february"),
            ("mar", "march"),
            ("apr", "april"),
            ("may",),
            ("jun", "june"),
            ("jul", "july"),
            ("aug", "august"),
            ("sep", "september"),
            ("oct", "october"),
            ("nov", "november"),
            ("dec", "december"),
        ],
        start=1,
    )
    for name in names
}
# Chinese weekday character -> zero-based weekday number (一 is 0).
# Both 日 and 天 mean Sunday and map to 6.
ZH_WEEKDAYS = {
    **{character: index for index, character in enumerate("一二三四五六日")},
    "天": 6,
}
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def normalize_interaction(interaction: SocialInteraction) -> SocialInteraction:
    """Fill obvious structure from source_text without inventing uncertain facts.

    Participants, place, mentioned people, attributed claims, relationships
    and follow-ups are only derived from ``source_text`` when the caller left
    the corresponding field empty. Topics, direct facts and organization-focus
    claims are always extended with what the text yields. The collected
    fields are then passed through ``_sanitize_interaction_fields`` and
    ``_canonicalize_shorthand_refs``.

    Args:
        interaction: The captured interaction. It is not mutated.

    Returns:
        A copy of ``interaction`` (via ``model_copy``) carrying the
        normalized fields.
    """
    source = interaction.source_text
    explicit_aliases = _explicit_aliases(source)
    participants = list(interaction.participants)
    mentioned = list(interaction.mentioned_people)
    direct_facts = list(interaction.direct_facts)
    attributed_claims = list(interaction.attributed_claims)
    follow_ups = list(interaction.follow_ups)
    relationships = list(interaction.relationships)
    place = interaction.place

    if not participants:
        for label in _participant_labels(source):
            _append_participant(participants, label)
        for subject in _fact_subject_labels(source):
            _append_participant(participants, subject)
        for label in explicit_aliases:
            _append_participant(participants, label)

    if not place:
        place = _first_match(source, [EN_ANOTHER_PLACE_RE, EN_PLACE_RE, ZH_PLACE_RE], "place")

    topics = _normalize_topics(source, list(interaction.topics))

    # Decide up front which caller-empty fields need filling so the mention
    # and relationship scans each run at most once over the source text.
    fill_mentioned = not mentioned
    fill_claims = not attributed_claims
    fill_relationships = not relationships
    mention_pairs = _mention_pairs(source) if (fill_mentioned or fill_claims) else []
    relationship_triples = (
        _relationship_triples(source) if (fill_mentioned or fill_relationships) else []
    )

    if fill_mentioned:
        for speaker, target in mention_pairs:
            # The speaker was part of the interaction; the target was only
            # talked about.
            _append_participant(participants, speaker)
            _append_mentioned(mentioned, target, mentioned_by=speaker)
        for relationship_source, relationship_target, _ in relationship_triples:
            _append_mentioned(mentioned, relationship_target, mentioned_by=relationship_source)
    if fill_claims:
        for speaker, target in mention_pairs:
            attributed_claims.append(
                AttributedClaim(
                    speaker=PersonRef(label=speaker),
                    subject=PersonRef(label=target),
                    claim_text=_claim_sentence(source, speaker, target),
                )
            )
    if fill_relationships:
        for relationship_source, relationship_target, relationship_type in relationship_triples:
            relationships.append(
                RelationshipAssertion(
                    source=PersonRef(label=relationship_source),
                    target=PersonRef(label=relationship_target),
                    relationship_type=relationship_type,
                )
            )

    direct_facts.extend(
        _direct_facts(
            source,
            participants,
            fallback_subject=_single_explicit_alias_ref(explicit_aliases),
        )
    )
    # Organization-focus claims are added even when the caller supplied
    # claims, unless a claim with the same text is already present.
    for claim in _source_focus_claims(source, participants):
        if not any(existing.claim_text == claim.claim_text for existing in attributed_claims):
            attributed_claims.append(claim)

    if not follow_ups:
        for task in _follow_up_labels(source):
            related = [
                participant.person
                for participant in participants
                if _person_label_referenced_in_text(participant.person.label, task)
            ]
            follow_ups.append(FollowUpTask(description=f"Promised to {task}", related_people=related))

    (
        participants,
        mentioned,
        direct_facts,
        attributed_claims,
        follow_ups,
        relationships,
    ) = _sanitize_interaction_fields(
        source=source,
        occurred_at=interaction.occurred_at.date() if interaction.occurred_at else None,
        place=place,
        participants=participants,
        mentioned=mentioned,
        direct_facts=direct_facts,
        attributed_claims=attributed_claims,
        follow_ups=follow_ups,
        relationships=relationships,
    )

    (
        participants,
        mentioned,
        direct_facts,
        attributed_claims,
        follow_ups,
        relationships,
    ) = _canonicalize_shorthand_refs(
        participants=participants,
        mentioned=mentioned,
        direct_facts=direct_facts,
        attributed_claims=attributed_claims,
        follow_ups=follow_ups,
        relationships=relationships,
        explicit_aliases=explicit_aliases,
    )

    return interaction.model_copy(
        update={
            "place": place,
            "participants": participants,
            "mentioned_people": mentioned,
            "topics": topics,
            "direct_facts": direct_facts,
            "attributed_claims": attributed_claims,
            "follow_ups": follow_ups,
            "relationships": relationships,
        }
    )
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def _participant_labels(text: str) -> list[str]:
    """Return de-duplicated person labels captured by the participant patterns.

    The English patterns are scanned first, then the Chinese ones; labels keep
    that scan order when handed to ``_unique_labels``.
    """
    participant_patterns = (
        EN_MET_RE,
        EN_REMEMBER_RE,
        ZH_PARTICIPANT_RE,
        ZH_CHAT_WITH_RE,
        ZH_ARRANGED_WITH_RE,
        ZH_COMPLETED_SEND_TO_RE,
        ZH_REMEMBER_RE,
    )
    found: list[str] = []
    for pattern in participant_patterns:
        for hit in pattern.finditer(text):
            found.append(hit.group("person").strip())
    return _unique_labels(found)
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def _fact_subject_labels(text: str) -> list[str]:
    """Return de-duplicated subjects of English work/study/interest statements.

    Only labels accepted by ``_is_probable_person`` are kept.
    """
    subjects: list[str] = []
    for fact_pattern in (EN_WORK_RE, EN_STUDIED_RE, EN_INTEREST_RE):
        for hit in fact_pattern.finditer(text):
            subjects.append(hit.group("person").strip())
    return _unique_labels(name for name in subjects if _is_probable_person(name))
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def _topic_labels(text: str) -> list[str]:
    """Return de-duplicated raw topic strings captured by the topic patterns."""
    raw_topics: list[str] = []
    for topic_pattern in (EN_TOPIC_RE, ZH_TOPIC_RE, ZH_FOCUS_TOPIC_RE):
        for hit in topic_pattern.finditer(text):
            raw_topics.append(hit.group("topic").strip())
    return _unique_labels(raw_topics)
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _normalize_topics(source: str, topics: list[str]) -> list[str]:
    """Merge caller-supplied topics with topics extracted from ``source``.

    Caller topics come first, followed by extracted ones. Each candidate is
    cleaned with ``_clean_topic_label``; empty values, noise topics and
    duplicates of an already accepted topic are dropped.
    """
    accepted: list[str] = []
    for candidate in [*topics, *_topic_labels(source)]:
        tidy = _clean_topic_label(candidate)
        if not tidy or _is_noise_topic(tidy):
            continue
        if tidy in accepted:
            continue
        accepted.append(tidy)
    return accepted
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def _clean_topic_label(value: str) -> str:
|
|
404
|
+
value = value.strip(" \t\r\n,.;:,。!?!?\"'")
|
|
405
|
+
value = re.sub(r"^(?:他|她|他们|她们|这个人|这人|现在|也|在|会|将|要)\s*", "", value)
|
|
406
|
+
return value.strip(" \t\r\n,.;:,。!?!?\"'")
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def _is_noise_topic(value: str) -> bool:
|
|
410
|
+
value = value.strip()
|
|
411
|
+
if not value:
|
|
412
|
+
return True
|
|
413
|
+
if value in {"今天", "昨天", "刚刚", "早些时候", "上午", "下午", "晚上"}:
|
|
414
|
+
return True
|
|
415
|
+
return bool(
|
|
416
|
+
re.fullmatch(
|
|
417
|
+
r"(?:半|一|二|三|四|五|六|七|八|九|十|两|\d+)+"
|
|
418
|
+
r"(?:分钟|小时|天|周|个月|年)",
|
|
419
|
+
value,
|
|
420
|
+
)
|
|
421
|
+
)
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def _source_focus_claims(
    source: str, participants: list[Participant]
) -> list[AttributedClaim]:
    """Build ``business_focus`` claims from organization-focus statements.

    Every ``ZH_ORG_FOCUS_CLAIM_RE`` match becomes a claim spoken by the
    default subject of ``participants``. Without a default subject nothing is
    returned. Matches whose cleaned topic is empty or noise are skipped.
    """
    attributed_to = _default_subject(participants)
    if not attributed_to:
        return []

    # The subject group can swallow a leading reporting verb; drop it from
    # the stored claim text.
    reporting_prefixes = ("他说", "她说", "他们说", "她们说", "表示", "提到")
    results: list[AttributedClaim] = []
    for hit in ZH_ORG_FOCUS_CLAIM_RE.finditer(source):
        organization = _clean_focus_subject(hit.group("subject"))
        focus_topic = _clean_topic_label(hit.group("topic"))
        if not focus_topic or _is_noise_topic(focus_topic):
            continue
        statement = hit.group(0).strip()
        leading = next(
            (prefix for prefix in reporting_prefixes if statement.startswith(prefix)),
            None,
        )
        if leading is not None:
            statement = statement[len(leading) :].strip()
        results.append(
            AttributedClaim(
                speaker=attributed_to,
                subject=PersonRef(label=organization),
                claim_text=statement,
                claim_type="business_focus",
            )
        )
    return results
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def _clean_focus_subject(value: str) -> str:
|
|
453
|
+
value = value.strip()
|
|
454
|
+
for prefix in ("他说", "她说", "他们说", "她们说", "表示", "提到"):
|
|
455
|
+
if value.startswith(prefix):
|
|
456
|
+
return value[len(prefix) :].strip()
|
|
457
|
+
return value
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def _mention_pairs(text: str) -> list[tuple[str, str]]:
    """Return unique ``(speaker, target)`` pairs for "X mentioned Y" statements.

    English matches come before Chinese ones. A pair is kept only when both
    names pass ``_is_probable_person``.
    """
    candidates: list[tuple[str, str]] = []
    for mention_pattern in (EN_MENTION_RE, ZH_MENTION_RE):
        for hit in mention_pattern.finditer(text):
            candidates.append((hit.group("speaker").strip(), hit.group("target").strip()))

    kept = []
    for pair in _unique_pairs(candidates):
        if _is_probable_person(pair[0]) and _is_probable_person(pair[1]):
            kept.append(pair)
    return kept
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def _relationship_triples(text: str) -> list[tuple[str, str, str]]:
    """Return unique ``(source, target, relationship_type)`` triples from English text.

    A triple is kept only when both people pass ``_is_probable_person``.
    """
    typed_patterns = (
        (EN_MET_KNOWS_RE, "knows"),
        (EN_KNOWS_RE, "knows"),
        (EN_FRIENDS_WITH_RE, "friends_with"),
        (EN_WORKS_WITH_RE, "works_with"),
        (EN_INTRODUCED_ME_RE, "introduced_by"),
    )
    candidates: list[tuple[str, str, str]] = []
    for pattern, kind in typed_patterns:
        for hit in pattern.finditer(text):
            candidates.append(
                (hit.group("source").strip(), hit.group("target").strip(), kind)
            )

    kept = []
    for triple in _unique_triples(candidates):
        if _is_probable_person(triple[0]) and _is_probable_person(triple[1]):
            kept.append(triple)
    return kept
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def _direct_facts(
    text: str,
    participants: list[Participant],
    *,
    fallback_subject: PersonRef | None = None,
) -> list[DirectFact]:
    """Extract direct facts about people from ``text``.

    Statements that name their subject (English work/study/interest
    sentences) are emitted first. Subject-less statements follow: work,
    study, Chinese work-with-role, contact preference, e-mail addresses and
    phone numbers. These are attributed to the default subject of
    ``participants`` or, failing that, to ``fallback_subject``. They are
    skipped entirely when neither is available.

    Args:
        text: Source text to scan.
        participants: Participants used to pick the default subject.
        fallback_subject: Subject to use when no default subject exists.

    Returns:
        The extracted facts in scan order. Nothing is de-duplicated here.
    """
    collected: list[DirectFact] = [
        DirectFact(
            subject=PersonRef(label=hit.group("person").strip()),
            predicate="works_at",
            value=hit.group("organization").strip(),
            metadata=_optional_metadata(role=hit.group("role")),
        )
        for hit in EN_WORK_RE.finditer(text)
    ]
    for pattern, predicate, value_group in (
        (EN_STUDIED_RE, "studied_at", "school"),
        (EN_INTEREST_RE, "interest", "value"),
    ):
        for hit in pattern.finditer(text):
            collected.append(
                DirectFact(
                    subject=PersonRef(label=hit.group("person").strip()),
                    predicate=predicate,
                    value=hit.group(value_group).strip(),
                )
            )

    owner = _default_subject(participants) or fallback_subject
    if not owner:
        # Nobody to attribute subject-less statements to.
        return collected

    for pattern, predicate, value_group in (
        (DEFAULT_WORK_RE, "works_at", "organization"),
        (DEFAULT_STUDIED_RE, "studied_at", "school"),
    ):
        for hit in pattern.finditer(text):
            collected.append(
                DirectFact(
                    subject=owner,
                    predicate=predicate,
                    value=hit.group(value_group).strip(),
                )
            )
    for hit in ZH_DEFAULT_WORK_RE.finditer(text):
        employer = hit.group("organization").strip()
        position = hit.group("role").strip()
        collected.append(
            DirectFact(
                subject=owner,
                predicate="works_at",
                value=employer,
                metadata=_optional_metadata(role=position),
            )
        )
    for hit in ZH_CONTACT_PREF_RE.finditer(text):
        collected.append(
            DirectFact(
                subject=owner,
                predicate="preference",
                value=f"{hit.group('method').strip()}联系",
            )
        )
    for address in EMAIL_RE.findall(text):
        collected.append(DirectFact(subject=owner, predicate="email", value=address))
    for number in PHONE_RE.findall(text):
        collected.append(DirectFact(subject=owner, predicate="phone", value=number.strip()))
    return collected
|
|
577
|
+
|
|
578
|
+
|
|
579
|
+
def _follow_up_labels(text: str) -> list[str]:
    """Return de-duplicated follow-up task phrases found in ``text``.

    Tasks come from English "promised to ..." and "... asked me to ..."
    phrasing and from the Chinese promise verbs, in that order. Surrounding
    whitespace, quotes and ASCII/CJK punctuation are trimmed from each task
    before de-duplication by ``_unique_labels``.
    """
    # Fullwidth comma/exclamation/question marks are spelled as escapes so
    # they survive Unicode normalization of this file; a Chinese task can
    # end in them because ZH_PROMISE_RE only stops at a full stop or semicolon.
    edge_chars = " \t\r\n,.;:\uff0c。\uff01\uff1f!?\"'"
    labels = [match.group("task").strip() for match in EN_PROMISE_RE.finditer(text)]
    labels.extend(match.group("task").strip() for match in EN_ASKED_ME_RE.finditer(text))
    labels.extend(match.group("task").strip() for match in ZH_PROMISE_RE.finditer(text))
    return _unique_labels(label.strip(edge_chars) for label in labels)
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
def _append_participant(participants: list[Participant], label: str) -> None:
    """Add ``label`` to ``participants`` in place unless that person is already listed.

    The label is cleaned first and ignored when it does not look like a
    person. A case-insensitive exact match is a no-op. When
    ``_same_person_label`` says an existing entry is the same person, nothing
    is appended; the existing label is replaced only if the new one is a full
    Latin name and the old one is not.
    """
    candidate = _clean_person_label(label)
    if not _is_probable_person(candidate):
        return

    for position, entry in enumerate(participants):
        known = entry.person.label
        if known.lower() == candidate.lower():
            return
        if not _same_person_label(known, candidate):
            continue
        if _is_full_latin_name(candidate) and not _is_full_latin_name(known):
            participants[position] = entry.model_copy(
                update={"person": entry.person.model_copy(update={"label": candidate})}
            )
        return

    participants.append(Participant(person=PersonRef(label=candidate)))
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
def _append_mentioned(
    mentioned: list[MentionedPerson], label: str, *, mentioned_by: str | None = None
) -> None:
    """Add ``label`` to ``mentioned`` in place unless it is already listed.

    Both labels are cleaned with ``_clean_person_label``. The entry is skipped
    when ``label`` does not look like a person or when an entry with the same
    label (ignoring case) already exists.

    Args:
        mentioned: List to extend in place.
        label: Name of the person who was talked about.
        mentioned_by: Name of the person who brought them up, if known.
    """
    label = _clean_person_label(label)
    mentioned_by = _clean_person_label(mentioned_by) if mentioned_by else None
    if not _is_probable_person(label):
        return
    # Compare case-insensitively, as _append_participant does, so that
    # "person B" and "Person B" are not recorded as two different people.
    folded = label.lower()
    if any(item.person.label.lower() == folded for item in mentioned):
        return
    mentioned.append(
        MentionedPerson(
            person=PersonRef(label=label),
            mentioned_by=PersonRef(label=mentioned_by) if mentioned_by else None,
        )
    )
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
def _first_match(text: str, patterns: list[re.Pattern[str]], group: str) -> str | None:
|
|
621
|
+
for pattern in patterns:
|
|
622
|
+
match = pattern.search(text)
|
|
623
|
+
if match:
|
|
624
|
+
value = match.group(group).strip()
|
|
625
|
+
if group == "place":
|
|
626
|
+
value = _clean_place_label(value)
|
|
627
|
+
return value or None
|
|
628
|
+
return None
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
def _clean_place_label(value: str) -> str:
|
|
632
|
+
return re.sub(
|
|
633
|
+
r"\s+\b(?:this afternoon|this morning|this evening|today|yesterday|tonight)\b.*$",
|
|
634
|
+
"",
|
|
635
|
+
value.strip(),
|
|
636
|
+
flags=re.IGNORECASE,
|
|
637
|
+
).strip()
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
def _claim_sentence(text: str, speaker: str, target: str) -> str:
|
|
641
|
+
for sentence in re.split(r"(?<=[.;。])\s*", text):
|
|
642
|
+
if speaker in sentence and target in sentence:
|
|
643
|
+
return sentence.strip()
|
|
644
|
+
return f"{speaker} mentioned {target}."
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
def _sanitize_interaction_fields(
    *,
    source: str,
    occurred_at: date | None,
    place: str | None,
    participants: list[Participant],
    mentioned: list[MentionedPerson],
    direct_facts: list[DirectFact],
    attributed_claims: list[AttributedClaim],
    follow_ups: list[FollowUpTask],
    relationships: list[RelationshipAssertion],
) -> tuple[
    list[Participant],
    list[MentionedPerson],
    list[DirectFact],
    list[AttributedClaim],
    list[FollowUpTask],
    list[RelationshipAssertion],
]:
    """Clean every person reference in an interaction and drop invalid items.

    Each reference goes through ``_clean_ref`` (with ``place`` so that a label
    equal to the place is rejected). When exactly one participant survives, it
    becomes the default subject used to repair fact/claim references whose label
    is a known bad label, and to fill follow-ups left without related people.

    Returns the cleaned lists in the same order as the parameters:
    participants, mentioned, direct facts, attributed claims, follow-ups,
    relationships. Input lists are not mutated; items are copied via
    ``model_copy``.
    """

    def scrub(ref: PersonRef | None) -> PersonRef | None:
        return _clean_ref(ref, place=place)

    # --- participants -----------------------------------------------------
    kept_participants: list[Participant] = []
    for participant in participants:
        cleaned_person = scrub(participant.person)
        if cleaned_person:
            kept_participants.append(participant.model_copy(update={"person": cleaned_person}))
    kept_participants = _dedupe_participants(kept_participants)
    default_ref = _default_subject(kept_participants)

    def scrub_or_default(ref: PersonRef | None) -> PersonRef | None:
        # Missing refs stay missing; a ref rejected only because its label is a
        # known bad label falls back to the default subject (which may be None).
        if not ref:
            return None
        cleaned = scrub(ref)
        if cleaned is None and _is_bad_person_label(ref.label):
            return default_ref
        return cleaned

    # --- mentioned people -------------------------------------------------
    kept_mentioned: list[MentionedPerson] = []
    for entry in mentioned:
        cleaned_person = scrub(entry.person)
        if not cleaned_person:
            continue
        cleaned_source = scrub(entry.mentioned_by) if entry.mentioned_by else None
        kept_mentioned.append(
            entry.model_copy(update={"person": cleaned_person, "mentioned_by": cleaned_source})
        )
    kept_mentioned = _dedupe_mentioned(kept_mentioned)

    # --- direct facts -----------------------------------------------------
    kept_facts: list[DirectFact] = []
    for fact in direct_facts:
        subject = scrub(fact.subject)
        if subject is None:
            if not (default_ref and _is_bad_person_label(fact.subject.label)):
                continue
            subject = default_ref
        candidate = fact.model_copy(update={"subject": subject})
        if _valid_direct_fact(candidate, source=source):
            kept_facts.append(candidate)
    kept_facts = _dedupe_model_items(kept_facts)

    # --- attributed claims ------------------------------------------------
    kept_claims: list[AttributedClaim] = []
    for claim in attributed_claims:
        speaker = scrub_or_default(claim.speaker)
        subject = scrub_or_default(claim.subject)
        if not speaker and not subject:
            continue
        # A self-referential claim phrased like a follow-up request is dropped.
        if (
            speaker
            and subject
            and _same_person_label(speaker.label, subject.label)
            and _looks_like_follow_up_claim(claim.claim_text)
        ):
            continue
        kept_claims.append(claim.model_copy(update={"speaker": speaker, "subject": subject}))
    kept_claims = _dedupe_model_items(kept_claims)

    # --- follow-ups -------------------------------------------------------
    kept_follow_ups: list[FollowUpTask] = []
    for follow_up in follow_ups:
        if _looks_like_contact_preference_only_follow_up(follow_up.description):
            continue
        related: list[PersonRef] = []
        for raw_ref in follow_up.related_people:
            cleaned_ref = scrub(raw_ref)
            if cleaned_ref is not None:
                related.append(cleaned_ref)
        if not related and default_ref:
            related = [default_ref]
        due_at = follow_up.due_at
        if not due_at:
            due_at = _date_hint_from_text(
                follow_up.description,
                source,
                reference_date=occurred_at,
            )
        kept_follow_ups.append(
            follow_up.model_copy(update={"related_people": related, "due_at": due_at})
        )
    kept_follow_ups = _dedupe_follow_ups(kept_follow_ups)

    # --- relationships ----------------------------------------------------
    kept_relationships: list[RelationshipAssertion] = []
    for relationship in relationships:
        source_ref = scrub(relationship.source)
        target_ref = scrub(relationship.target)
        if source_ref and target_ref:
            kept_relationships.append(
                relationship.model_copy(update={"source": source_ref, "target": target_ref})
            )
    kept_relationships = _dedupe_model_items(kept_relationships)

    return (
        kept_participants,
        kept_mentioned,
        kept_facts,
        kept_claims,
        kept_follow_ups,
        kept_relationships,
    )
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
def _clean_ref(ref: PersonRef | None, *, place: str | None) -> PersonRef | None:
    """Return a copy of ``ref`` with cleaned label/aliases, or ``None`` if invalid.

    Aliases that clean to an empty string, equal the cleaned label, or look like
    an ISO date are dropped; the rest are de-duplicated. The result is rejected
    when ``_valid_person_ref`` fails for it.
    """
    if ref is None:
        return None
    label = _clean_person_label(ref.label)
    kept_aliases: list[str] = []
    for raw_alias in ref.aliases:
        alias = _clean_person_label(raw_alias)
        if not alias or alias == label or _is_date_like(alias):
            continue
        kept_aliases.append(alias)
    cleaned = ref.model_copy(update={"label": label, "aliases": _unique_labels(kept_aliases)})
    return cleaned if _valid_person_ref(cleaned, place=place) else None
|
|
770
|
+
|
|
771
|
+
|
|
772
|
+
def _clean_person_label(label: str | None) -> str:
|
|
773
|
+
label = (label or "").strip(" \t\r\n,.;:,。!?!?\"'")
|
|
774
|
+
if not label:
|
|
775
|
+
return ""
|
|
776
|
+
label = re.sub(
|
|
777
|
+
r"^(?:另一个|另外一个|另一位|另位|不同的|新的)\s*",
|
|
778
|
+
"",
|
|
779
|
+
label,
|
|
780
|
+
).strip()
|
|
781
|
+
label = re.sub(
|
|
782
|
+
r"^(?:this|that|another|a\s+different|different|new)\s+",
|
|
783
|
+
"",
|
|
784
|
+
label,
|
|
785
|
+
flags=re.IGNORECASE,
|
|
786
|
+
).strip()
|
|
787
|
+
if re.fullmatch(r"[\u4e00-\u9fff]+", label):
|
|
788
|
+
for marker in ["也", "正在", "可能", "已经"]:
|
|
789
|
+
if marker in label:
|
|
790
|
+
prefix = label.split(marker, 1)[0]
|
|
791
|
+
if 2 <= len(prefix) <= 4:
|
|
792
|
+
return prefix
|
|
793
|
+
return label
|
|
794
|
+
|
|
795
|
+
|
|
796
|
+
def _valid_person_ref(ref: PersonRef, *, place: str | None) -> bool:
    """Decide whether ``ref.label`` can plausibly name a person.

    Rejects known bad labels, organization-looking labels, ISO-date-like labels
    and a label equal (case-insensitively) to ``place``; anything else is
    accepted only if ``_is_probable_person`` agrees.
    """
    label = ref.label
    if (
        _is_bad_person_label(label)
        or _looks_like_organization_label(label)
        or _is_date_like(label)
    ):
        return False
    if place and label.strip().casefold() == place.strip().casefold():
        return False
    return _is_probable_person(label)
|
|
806
|
+
|
|
807
|
+
|
|
808
|
+
def _is_bad_person_label(label: str) -> bool:
    """Return True when the stripped, case-folded label is in ``BAD_PERSON_LABELS``."""
    normalized = label.strip().casefold()
    return normalized in BAD_PERSON_LABELS
|
|
810
|
+
|
|
811
|
+
|
|
812
|
+
def _looks_like_organization_label(label: str) -> bool:
    """Return True when the label ends with one of ``ORGANIZATION_LABEL_SUFFIXES``.

    Labels starting with "测试" are never classified as organizations.
    """
    name = label.strip()
    if name.startswith("测试"):
        return False
    for suffix in ORGANIZATION_LABEL_SUFFIXES:
        if name.endswith(suffix):
            return True
    return False
|
|
817
|
+
|
|
818
|
+
|
|
819
|
+
def _is_date_like(value: str) -> bool:
    """Return True when the whole stripped value matches ``ISO_DATE_RE``."""
    candidate = value.strip()
    return ISO_DATE_RE.fullmatch(candidate) is not None
|
|
821
|
+
|
|
822
|
+
|
|
823
|
+
def _valid_direct_fact(fact: DirectFact, *, source: str = "") -> bool:
    """Validate a direct fact according to its predicate.

    * phone predicates: value must fully match ``PHONE_RE`` and not look like a date;
    * email predicates: value must fully match ``EMAIL_RE``;
    * work predicates: rejected when the source only names the employer to
      contrast this person with a namesake;
    * everything else (and accepted work facts): value must be non-empty.
    """
    kind = fact.predicate.strip().casefold()
    text = fact.value.strip()
    if kind in PHONE_PREDICATES:
        if PHONE_RE.fullmatch(text) is None:
            return False
        return not _is_date_like(text)
    if kind in EMAIL_PREDICATES:
        return EMAIL_RE.fullmatch(text) is not None
    work_predicates = {"works_at", "worked_at", "work", "current_job"}
    if kind in work_predicates and _looks_like_contrast_only_work_fact(
        source, fact.subject.label, text
    ):
        return False
    return bool(text)
|
|
834
|
+
|
|
835
|
+
|
|
836
|
+
def _looks_like_contrast_only_work_fact(source: str, person_label: str, organization: str) -> bool:
    """Detect "<org>'s <person> ... is a different person" style mentions.

    Returns True when ``source`` mentions ``organization`` + ``person_label``
    (optionally joined by 的/那个/那位) in the same clause as a phrase saying
    the two are not the same person, in either order. Such a mention names the
    employer only to disambiguate, so it should not become a work fact.

    A clause ends at ``。`` or at an ASCII or full-width semicolon/comma, or an
    ASCII period. The full-width characters are written as escapes so they
    cannot be folded to their ASCII look-alikes.

    Returns False when ``source`` is empty or when either label is empty after
    stripping (an empty label would otherwise match almost anything).
    """
    person_text = person_label.strip() if person_label else ""
    org_text = organization.strip() if organization else ""
    if not source or not person_text or not org_text:
        return False
    subject = f"{re.escape(org_text)}(?:的|那个|那位)?{re.escape(person_text)}"
    contrast = "(?:不是同一个人|不是一个人|不同的人)"
    same_clause = r"[^\u3002\uff1b;\uff0c,.]*"
    # The original "和<org><person>..." variant is covered by the first pattern
    # because ``re.search`` is unanchored.
    patterns = (
        f"{subject}{same_clause}{contrast}",
        f"{contrast}{same_clause}{subject}",
    )
    return any(re.search(pattern, source) for pattern in patterns)
|
|
847
|
+
|
|
848
|
+
|
|
849
|
+
def _looks_like_follow_up_claim(text: str) -> bool:
    """Return True when the claim text contains a request/promise phrase."""
    cues = (
        "asked me to",
        "asked us to",
        "reminded me to",
        "promised to",
        "答应",
        "提醒我",
    )
    folded = text.casefold()
    for cue in cues:
        if cue in folded:
            return True
    return False
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
def _looks_like_contact_preference_only_follow_up(text: str) -> bool:
    """Return True when a follow-up only states a contact preference.

    The text is case-folded and whitespace-collapsed. It qualifies when
    ``CONTACT_PREFERENCE_ONLY_RE`` matches and, after removing that phrase and
    trimming surrounding punctuation (ASCII and full-width comma/semicolon,
    and ``。``), no ``CONCRETE_FOLLOW_UP_CUE_RE`` cue remains.
    """
    normalized = " ".join(text.casefold().split())
    if not CONTACT_PREFERENCE_ONLY_RE.search(normalized):
        return False
    remainder = CONTACT_PREFERENCE_ONLY_RE.sub("", normalized).strip()
    # Full-width punctuation is written as escapes so it cannot be folded to ASCII.
    edge_punctuation = r"[,\uff0c\u3002;\uff1b\s]+"
    remainder = re.sub(rf"^{edge_punctuation}|{edge_punctuation}$", "", remainder)
    return not CONCRETE_FOLLOW_UP_CUE_RE.search(remainder)
|
|
873
|
+
|
|
874
|
+
|
|
875
|
+
def _dedupe_mentioned(items: list[MentionedPerson]) -> list[MentionedPerson]:
    """Keep one entry per case-folded person label, preferring higher quality.

    On a quality tie the earlier entry wins. Output order follows the first
    appearance of each label.
    """
    best: dict[str, MentionedPerson] = {}
    for entry in items:
        key = entry.person.label.casefold()
        current = best.get(key)
        if current is not None and _mentioned_quality(entry) <= _mentioned_quality(current):
            continue
        best[key] = entry
    return list(best.values())
|
|
883
|
+
|
|
884
|
+
|
|
885
|
+
def _mentioned_quality(item: MentionedPerson) -> int:
    """Score an entry 0-2: one point for a known mentioner, one for non-empty context."""
    score = 0
    if item.mentioned_by is not None:
        score += 1
    if item.context:
        score += 1
    return score
|
|
887
|
+
|
|
888
|
+
|
|
889
|
+
def _dedupe_follow_ups(items: list[FollowUpTask]) -> list[FollowUpTask]:
    """Collapse follow-ups with similar descriptions, keeping the best of each group.

    Each item is compared against the already kept items in order; it replaces
    the first similar one only when its quality score is strictly higher.
    """
    kept: list[FollowUpTask] = []
    for candidate in items:
        for position, current in enumerate(kept):
            if _similar_follow_up(current.description, candidate.description):
                if _follow_up_quality(candidate) > _follow_up_quality(current):
                    kept[position] = candidate
                break
        else:
            # No similar follow-up seen yet.
            kept.append(candidate)
    return kept
|
|
907
|
+
|
|
908
|
+
|
|
909
|
+
def _similar_follow_up(left: str, right: str) -> bool:
    """Return True when the normalized descriptions are equal or one contains the other."""
    first = _follow_up_key(left)
    second = _follow_up_key(right)
    # Equality is a special case of containment, so one check covers both.
    return first in second or second in first
|
|
915
|
+
|
|
916
|
+
|
|
917
|
+
def _follow_up_quality(item: FollowUpTask) -> int:
    """Score a follow-up: description length, +25 for a due date, +5 per related person."""
    due_bonus = 25 if item.due_at else 0
    people_bonus = 5 * len(item.related_people)
    return len(item.description) + due_bonus + people_bonus
|
|
919
|
+
|
|
920
|
+
|
|
921
|
+
def _follow_up_key(value: str) -> str:
    """Build the comparison key for a follow-up description.

    Case-folds, removes every "promised to " occurrence, collapses whitespace
    runs to a single space and strips the ends.
    """
    folded = value.casefold()
    without_promise = folded.replace("promised to ", "")
    collapsed = re.sub(r"\s+", " ", without_promise)
    return collapsed.strip()
|
|
923
|
+
|
|
924
|
+
|
|
925
|
+
def _dedupe_model_items(items: list):
    """Drop items whose ``model_dump_json()`` output was already seen, keeping the first."""
    first_by_json: dict[str, object] = {}
    for item in items:
        # setdefault keeps the earliest item for each serialized form.
        first_by_json.setdefault(item.model_dump_json(), item)
    return list(first_by_json.values())
|
|
935
|
+
|
|
936
|
+
|
|
937
|
+
def _dedupe_direct_facts(items: list[DirectFact]) -> list[DirectFact]:
    """De-duplicate facts by case-folded (subject label, predicate, value).

    The first occurrence keeps its position; a later duplicate replaces it only
    when the kept fact has no metadata and the duplicate does.
    """
    unique: list[DirectFact] = []
    position_by_key: dict[tuple[str, str, str], int] = {}
    for fact in items:
        key = (
            fact.subject.label.casefold(),
            fact.predicate.casefold(),
            fact.value.casefold(),
        )
        position = position_by_key.get(key)
        if position is None:
            position_by_key[key] = len(unique)
            unique.append(fact)
        elif fact.metadata and not unique[position].metadata:
            unique[position] = fact
    return unique
|
|
955
|
+
|
|
956
|
+
|
|
957
|
+
def _date_hint_from_text(
    text: str, source: str, *, reference_date: date | None = None
) -> date | None:
    """Derive a date from ``text``, resolving relative hints against an anchor date.

    Checks, in order:
    1. an ISO date in ``text`` (returned via ``_safe_date``; an impossible date
       yields ``None`` without trying the other hints);
    2. a ``MONTH_DAY_RE`` match, placed in the anchor's year;
    3. a ``ZH_NEXT_WEEKDAY_RE`` match, resolved to that weekday in the week
       after the anchor's week;
    4. the word "tomorrow" (case-insensitive).

    The anchor is the first ISO date in ``source``, else ``reference_date``,
    else today's date. Returns ``None`` when no hint is found.
    """
    iso = ISO_DATE_RE.search(text)
    if iso is not None:
        return _safe_date(*(int(part) for part in iso.group("year", "month", "day")))

    anchor = _first_source_date(source) or reference_date or date.today()

    named_month = MONTH_DAY_RE.search(text)
    if named_month is not None:
        month_number = MONTHS[named_month.group("month").casefold()]
        return _safe_date(anchor.year, month_number, int(named_month.group("day")))

    next_weekday = ZH_NEXT_WEEKDAY_RE.search(text)
    if next_weekday is not None:
        weekday_index = ZH_WEEKDAYS[next_weekday.group("weekday")]
        # Days to the start of next week, plus the offset of the target weekday.
        offset = 7 - anchor.weekday() + weekday_index
        return anchor + timedelta(days=offset)

    if "tomorrow" in text.casefold():
        return anchor + timedelta(days=1)
    return None
|
|
980
|
+
|
|
981
|
+
|
|
982
|
+
def _first_source_date(source: str) -> date | None:
    """Return the first ISO date found in ``source``, or ``None`` if absent or invalid."""
    found = ISO_DATE_RE.search(source)
    if found is None:
        return None
    year, month, day = (int(part) for part in found.group("year", "month", "day"))
    return _safe_date(year, month, day)
|
|
991
|
+
|
|
992
|
+
|
|
993
|
+
def _safe_date(year: int, month: int, day: int) -> date | None:
|
|
994
|
+
try:
|
|
995
|
+
return date(year, month, day)
|
|
996
|
+
except ValueError:
|
|
997
|
+
return None
|
|
998
|
+
|
|
999
|
+
|
|
1000
|
+
def _canonicalize_shorthand_refs(
    *,
    participants: list[Participant],
    mentioned: list[MentionedPerson],
    direct_facts: list[DirectFact],
    attributed_claims: list[AttributedClaim],
    follow_ups: list[FollowUpTask],
    relationships: list[RelationshipAssertion],
    explicit_aliases: dict[str, list[str]] | None = None,
) -> tuple[
    list[Participant],
    list[MentionedPerson],
    list[DirectFact],
    list[AttributedClaim],
    list[FollowUpTask],
    list[RelationshipAssertion],
]:
    """Rewrite shorthand person references to their canonical labels.

    The alias map combines shorthand derived from the participants with the
    explicit alias map (explicit entries override on key collisions). When both
    the map and ``explicit_aliases`` are empty, the inputs are returned
    unchanged. Otherwise every person reference in every list is passed through
    ``_canonicalize_ref``; participants and direct facts are additionally
    de-duplicated afterwards.
    """
    explicit_aliases = explicit_aliases or {}
    alias_map = _participant_shorthand_map(participants)
    alias_map.update(_explicit_alias_map(explicit_aliases, participants))
    if not alias_map and not explicit_aliases:
        return participants, mentioned, direct_facts, attributed_claims, follow_ups, relationships

    def canon(ref: PersonRef) -> PersonRef:
        return _canonicalize_ref(ref, alias_map, explicit_aliases)

    def canon_optional(ref: PersonRef | None) -> PersonRef | None:
        return canon(ref) if ref else None

    new_participants = _dedupe_participants(
        [
            participant.model_copy(update={"person": canon(participant.person)})
            for participant in participants
        ]
    )

    new_mentioned = []
    for entry in mentioned:
        changes = {
            "person": canon(entry.person),
            "mentioned_by": canon_optional(entry.mentioned_by),
        }
        new_mentioned.append(entry.model_copy(update=changes))

    new_facts = _dedupe_direct_facts(
        [fact.model_copy(update={"subject": canon(fact.subject)}) for fact in direct_facts]
    )

    new_claims = []
    for claim in attributed_claims:
        changes = {
            "speaker": canon_optional(claim.speaker),
            "subject": canon_optional(claim.subject),
        }
        new_claims.append(claim.model_copy(update=changes))

    new_follow_ups = []
    for follow_up in follow_ups:
        people = [canon(ref) for ref in follow_up.related_people]
        new_follow_ups.append(follow_up.model_copy(update={"related_people": people}))

    new_relationships = []
    for relationship in relationships:
        changes = {
            "source": canon(relationship.source),
            "target": canon(relationship.target),
        }
        new_relationships.append(relationship.model_copy(update=changes))

    return (
        new_participants,
        new_mentioned,
        new_facts,
        new_claims,
        new_follow_ups,
        new_relationships,
    )
|
|
1122
|
+
|
|
1123
|
+
|
|
1124
|
+
def _explicit_aliases(text: str) -> dict[str, list[str]]:
    """Collect canonical-name -> aliases pairs stated explicitly in ``text``.

    The parenthesised patterns are scanned before the phrase patterns; each
    candidate pair is validated by ``_add_explicit_alias``.
    """
    collected: dict[str, list[str]] = {}
    ordered_patterns = (
        ZH_ALIAS_PAREN_RE,
        ZH_ALIAS_START_PAREN_RE,
        ZH_ALIAS_PHRASE_RE,
        ZH_ALIAS_START_PHRASE_RE,
    )
    for pattern in ordered_patterns:
        for found in pattern.finditer(text):
            _add_explicit_alias(
                collected,
                found.group("canonical").strip(),
                found.group("alias").strip(),
            )
    return collected
|
|
1137
|
+
|
|
1138
|
+
|
|
1139
|
+
def _single_explicit_alias_ref(explicit_aliases: dict[str, list[str]]) -> PersonRef | None:
    """Return a ``PersonRef`` for the sole explicit alias entry, else ``None``."""
    if len(explicit_aliases) != 1:
        return None
    ((canonical, aliases),) = explicit_aliases.items()
    return PersonRef(label=canonical, aliases=aliases)
|
|
1144
|
+
|
|
1145
|
+
|
|
1146
|
+
def _add_explicit_alias(aliases: dict[str, list[str]], canonical: str, alias: str) -> None:
    """Record ``alias`` for ``canonical`` in ``aliases`` if the pair is plausible.

    The pair is ignored when the canonical name looks like a context fragment or
    is not a probable person, when the alias is not a probable alias or is
    date-like, or when both are the same ignoring case. Duplicates are not added.
    """
    rejected = (
        _looks_like_cjk_context_fragment(canonical)
        or not _is_probable_person(canonical)
        or not _is_probable_alias(alias)
        or _is_date_like(alias)
        or canonical.casefold() == alias.casefold()
    )
    if rejected:
        return
    known = aliases.setdefault(canonical, [])
    if alias not in known:
        known.append(alias)
|
|
1160
|
+
|
|
1161
|
+
|
|
1162
|
+
def _looks_like_cjk_context_fragment(label: str) -> bool:
    """Return True when a ``CJK_NAME``-shaped label is really a sentence fragment.

    That is the case when it starts with a time word (今天, 昨天, ...) or contains
    a verb such as 见了/认识了/遇到/碰到/记住/记录/保存.
    """
    if re.fullmatch(CJK_NAME, label) is None:
        return False
    time_prefixes = ("今天", "昨天", "前天", "今晚", "上午", "下午", "晚上")
    if label.startswith(time_prefixes):
        return True
    verb_markers = ("见了", "认识了", "遇到", "碰到", "记住", "记录", "保存")
    for marker in verb_markers:
        if marker in label:
            return True
    return False
|
|
1168
|
+
|
|
1169
|
+
|
|
1170
|
+
def _is_probable_alias(alias: str) -> bool:
    """Return True when the stripped alias fully matches ``CJK_ALIAS``.

    Empty aliases, aliases listed in ``CJK_ALIAS_DESCRIPTORS`` and aliases ending
    with one of ``CJK_ALIAS_DESCRIPTOR_SUFFIXES`` are rejected first.
    """
    candidate = alias.strip()
    if not candidate:
        return False
    if candidate in CJK_ALIAS_DESCRIPTORS:
        return False
    if candidate.endswith(CJK_ALIAS_DESCRIPTOR_SUFFIXES):
        return False
    return re.fullmatch(CJK_ALIAS, candidate) is not None
|
|
1177
|
+
|
|
1178
|
+
|
|
1179
|
+
def _explicit_alias_map(
    explicit_aliases: dict[str, list[str]],
    participants: list[Participant],
) -> dict[str, str]:
    """Build a case-folded alias -> canonical label map from explicit aliases.

    Every explicit alias maps to its canonical name. Participants without any
    identifying field (person id, email, phone, company hint) are then linked:
    when a participant resolves to exactly one explicit canonical name that
    differs from its label, its label and aliases map to that name; and when one
    of its aliases already maps somewhere else, its label maps there too.

    Aliases claimed by more than one canonical name are dropped as ambiguous, as
    are aliases for which ``_has_distinct_alias_ref`` reports a distinct
    participant.
    """
    alias_targets: dict[str, str | None] = {}
    lookup: dict[str, str | None] = {}
    for canonical, alias_list in explicit_aliases.items():
        _remember_alias_candidate(lookup, canonical, canonical)
        for alias in alias_list:
            _remember_alias_candidate(alias_targets, alias, canonical)
            _remember_alias_candidate(lookup, alias, canonical)

    for participant in participants:
        person = participant.person
        has_identity = person.person_id or person.email or person.phone or person.company_hint
        if has_identity or not person.label.strip().casefold():
            continue
        resolved = _explicit_canonical_for_ref(person, lookup, explicit_aliases)
        if resolved and resolved != person.label:
            # Label first, then aliases, so conflicts are detected in the same order.
            for name in (person.label, *person.aliases):
                _remember_alias_candidate(alias_targets, name, resolved)
        for alias in person.aliases:
            target = alias_targets.get(alias.strip().casefold())
            if target and target != person.label:
                _remember_alias_candidate(alias_targets, person.label, target)

    resolved_map: dict[str, str] = {}
    for key, target in alias_targets.items():
        if target is None:
            continue
        if _has_distinct_alias_ref(participants, key, target):
            continue
        resolved_map[key] = target
    return resolved_map
|
|
1212
|
+
|
|
1213
|
+
|
|
1214
|
+
def _remember_alias_candidate(
|
|
1215
|
+
mapping: dict[str, str | None], alias: str, canonical: str
|
|
1216
|
+
) -> None:
|
|
1217
|
+
alias_key = alias.strip().casefold()
|
|
1218
|
+
if not alias_key:
|
|
1219
|
+
return
|
|
1220
|
+
if alias_key not in mapping:
|
|
1221
|
+
mapping[alias_key] = canonical
|
|
1222
|
+
elif mapping[alias_key] != canonical:
|
|
1223
|
+
mapping[alias_key] = None
|
|
1224
|
+
|
|
1225
|
+
|
|
1226
|
+
def _explicit_canonical_for_ref(
    ref: PersonRef,
    explicit_lookup: dict[str, str | None],
    explicit_aliases: dict[str, list[str]],
) -> str | None:
    """Return the single explicit canonical name that ``ref`` resolves to.

    Candidates come from looking up the label and each alias in
    ``explicit_lookup``, plus any canonical name that equals "测试" + label.
    ``None`` is returned unless exactly one distinct candidate is found.
    """
    candidates: set[str] = set()
    for name in [ref.label, *ref.aliases]:
        hit = explicit_lookup.get(name.strip().casefold())
        if hit is not None:
            candidates.add(hit)
    for canonical in explicit_aliases:
        if _same_test_prefixed_cjk_name(canonical, ref.label):
            candidates.add(canonical)
    if len(candidates) != 1:
        return None
    (only,) = candidates
    return only
|
|
1242
|
+
|
|
1243
|
+
|
|
1244
|
+
def _same_test_prefixed_cjk_name(canonical: str, label: str) -> bool:
    """Return True when ``canonical`` is "测试" followed by exactly ``label``.

    The part after the prefix must be non-empty and fully match ``CJK_NAME``.
    """
    prefix = "测试"
    full = canonical.strip()
    if not full.startswith(prefix):
        return False
    bare = full[len(prefix):]
    if not bare or label.strip() != bare:
        return False
    return re.fullmatch(CJK_NAME, bare) is not None
|
|
1251
|
+
|
|
1252
|
+
|
|
1253
|
+
def _participant_shorthand_map(participants: list[Participant]) -> dict[str, str]:
    """Map shorthand names to participant labels.

    Shorthand comes from each participant's declared aliases (case-folded) and,
    for full Latin names, from ``_latin_name_aliases``. A shorthand claimed by
    two different labels is ambiguous and dropped, as is one for which
    ``_has_distinct_alias_ref`` reports a distinct participant.
    """
    owners: dict[str, str | None] = {}

    def claim(key: str, owner: str) -> None:
        # First claimant wins; a second, different claimant makes the key ambiguous.
        if key not in owners:
            owners[key] = owner
        elif owners[key] != owner:
            owners[key] = None

    for participant in participants:
        full = participant.person.label.strip()
        for raw_alias in participant.person.aliases:
            key = raw_alias.strip().casefold()
            if key and key != full.casefold():
                claim(key, full)
        if _is_full_latin_name(full):
            for derived in _latin_name_aliases(full):
                claim(derived, full)

    return {
        key: owner
        for key, owner in owners.items()
        if owner is not None and not _has_distinct_alias_ref(participants, key, owner)
    }
|
|
1277
|
+
|
|
1278
|
+
|
|
1279
|
+
def _has_distinct_alias_ref(
    participants: list[Participant], alias: str, full_name: str
) -> bool:
    """Return True when a participant labelled ``alias`` must stay a separate person.

    A participant whose label equals ``alias`` blocks the alias -> ``full_name``
    rewrite when it carries an identifying field (person id, email, phone or
    company hint), or when the alias is itself the full name.

    Comparison uses ``str.casefold`` on stripped text, matching how alias keys
    are produced elsewhere in this module; ``str.lower`` would miss labels whose
    case-folded form differs from their lower-cased form (e.g. "ß" -> "ss").
    """
    alias_key = alias.strip().casefold()
    full_key = full_name.strip().casefold()
    for participant in participants:
        ref = participant.person
        if ref.label.strip().casefold() != alias_key:
            continue
        if ref.person_id or ref.email or ref.phone or ref.company_hint:
            return True
        if alias_key == full_key:
            return True
    return False
|
|
1292
|
+
|
|
1293
|
+
|
|
1294
|
+
def _canonicalize_ref(
    ref: PersonRef,
    alias_map: dict[str, str],
    explicit_aliases: dict[str, list[str]],
) -> PersonRef:
    """Return ``ref`` rewritten to its canonical label, with aliases kept in sync.

    A reference carrying a person id, email or phone keeps its own label.
    Otherwise the canonical label is, in priority order: an explicit canonical
    name equal to the label, the alias-map entry for the label, the alias-map
    entry for the first alias that has one, or the label itself.

    The previous label is kept as an alias when it changes, and the explicit
    aliases of the canonical name are appended. The original object is returned
    when nothing changes.
    """
    label = ref.label.strip()
    has_identity = bool(ref.person_id or ref.email or ref.phone)

    if has_identity:
        canonical = label
    else:
        canonical = _explicit_canonical_label(label, explicit_aliases) or alias_map.get(
            label.casefold()
        )
        if not canonical:
            alias_hits = (alias_map.get(alias.strip().casefold()) for alias in ref.aliases)
            canonical = next((hit for hit in alias_hits if hit), None)
    canonical = canonical or label

    merged_aliases = [alias for alias in ref.aliases if alias != canonical]
    extra_names = []
    if label != canonical:
        extra_names.append(label)
    extra_names.extend(explicit_aliases.get(canonical, []))
    for name in extra_names:
        if name != canonical and name not in merged_aliases:
            merged_aliases.append(name)

    if canonical == ref.label and merged_aliases == ref.aliases:
        return ref
    return ref.model_copy(update={"label": canonical, "aliases": merged_aliases})
|
|
1322
|
+
|
|
1323
|
+
|
|
1324
|
+
def _explicit_canonical_label(
|
|
1325
|
+
label: str, explicit_aliases: dict[str, list[str]]
|
|
1326
|
+
) -> str | None:
|
|
1327
|
+
label_key = label.strip().casefold()
|
|
1328
|
+
for canonical in explicit_aliases:
|
|
1329
|
+
if canonical.strip().casefold() == label_key:
|
|
1330
|
+
return canonical
|
|
1331
|
+
return None
|
|
1332
|
+
|
|
1333
|
+
|
|
1334
|
+
def _dedupe_participants(participants: list[Participant]) -> list[Participant]:
    """Merge participants that share a label (case-insensitively).

    The first participant with a given label is kept; later ones only
    contribute their aliases, which are merged without duplicates.
    """
    position_by_label: dict[str, int] = {}
    merged: list[Participant] = []
    for participant in participants:
        key = participant.person.label.lower()
        position = position_by_label.get(key)
        if position is None:
            position_by_label[key] = len(merged)
            merged.append(participant)
            continue
        kept = merged[position]
        combined = _unique_labels([*kept.person.aliases, *participant.person.aliases])
        updated_person = kept.person.model_copy(update={"aliases": combined})
        merged[position] = kept.model_copy(update={"person": updated_person})
    return merged
|
|
1350
|
+
|
|
1351
|
+
|
|
1352
|
+
def _person_label_referenced_in_text(label: str, text: str) -> bool:
    """Return True when ``text`` mentions the label, or the first name of a full Latin name.

    The first name must appear as a whole word.
    """
    if label in text:
        return True
    if not _is_full_latin_name(label):
        return False
    given_name = label.split()[0]
    return re.search(rf"\b{re.escape(given_name)}\b", text) is not None
|
|
1359
|
+
|
|
1360
|
+
|
|
1361
|
+
def _same_person_label(existing: str, candidate: str) -> bool:
    """Return True when two labels plausibly denote the same person.

    Labels match when they are equal after stripping and lower-casing, or when
    one is a shorthand (per ``_latin_name_aliases``) of the other. Both checks
    use the same stripped keys, so labels differing only in surrounding
    whitespace count as equal.
    """
    existing_key = existing.strip().lower()
    candidate_key = candidate.strip().lower()
    if existing_key == candidate_key:
        return True
    return (
        candidate_key in _latin_name_aliases(existing)
        or existing_key in _latin_name_aliases(candidate)
    )
|
|
1371
|
+
|
|
1372
|
+
|
|
1373
|
+
def _is_full_latin_name(label: str) -> bool:
    """Return True for two or more capitalized ASCII words (letters, ``'`` and ``-``)."""
    word = r"[A-Z][A-Za-z'-]+"
    return re.fullmatch(rf"{word}(?:\s+{word})+", label.strip()) is not None
|
|
1375
|
+
|
|
1376
|
+
|
|
1377
|
+
def _latin_name_aliases(label: str) -> set[str]:
    """Return the lower-cased shorthand forms of a full Latin name.

    These are every proper leading run of name tokens (first name, first two
    tokens, ...). When the first token is "Test" and at least three tokens are
    present, the same runs starting at the second token are included as well.
    Anything that is not a full Latin name yields an empty set.
    """
    if not _is_full_latin_name(label):
        return set()
    parts = [token.lower() for token in label.strip().split()]
    count = len(parts)
    shorthand = {" ".join(parts[:end]) for end in range(1, count)}
    if parts[0] == "test" and count >= 3:
        shorthand |= {" ".join(parts[1:end]) for end in range(2, count)}
    return shorthand
|
|
1387
|
+
|
|
1388
|
+
|
|
1389
|
+
def _default_subject(participants: list[Participant]) -> PersonRef | None:
    """Return the person of the sole participant, or ``None`` if there is not exactly one."""
    if len(participants) != 1:
        return None
    (only,) = participants
    return only.person
|
|
1393
|
+
|
|
1394
|
+
|
|
1395
|
+
def _optional_metadata(**values: object) -> dict[str, object]:
    """Keep only string values that are non-blank, stripped of surrounding whitespace."""
    metadata: dict[str, object] = {}
    for key, value in values.items():
        if not isinstance(value, str):
            continue
        text = value.strip()
        if text:
            metadata[key] = text
    return metadata
|
|
1397
|
+
|
|
1398
|
+
|
|
1399
|
+
def _unique_labels(labels: Iterable[object]) -> list[str]:
    """Return stripped, non-empty labels without case-insensitive duplicates.

    Each item is converted with ``str``; the first spelling of a label wins and
    input order is preserved.
    """
    first_spelling: dict[str, str] = {}
    for raw in labels:
        text = str(raw).strip()
        if text:
            first_spelling.setdefault(text.lower(), text)
    return list(first_spelling.values())
|
|
1409
|
+
|
|
1410
|
+
|
|
1411
|
+
def _unique_pairs(pairs: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Drop pairs that repeat an earlier pair ignoring case, keeping the first spelling."""
    first_seen: dict[tuple[str, str], tuple[str, str]] = {}
    for source, target in pairs:
        first_seen.setdefault((source.lower(), target.lower()), (source, target))
    return list(first_seen.values())
|
|
1420
|
+
|
|
1421
|
+
|
|
1422
|
+
def _unique_triples(triples: list[tuple[str, str, str]]) -> list[tuple[str, str, str]]:
    """Drop repeated (source, target, relationship type) triples, keeping the first.

    Source and target are compared case-insensitively; the relationship type is
    compared exactly.
    """
    first_seen: dict[tuple[str, str, str], tuple[str, str, str]] = {}
    for source, target, relationship_type in triples:
        key = (source.lower(), target.lower(), relationship_type)
        first_seen.setdefault(key, (source, target, relationship_type))
    return list(first_seen.values())
|
|
1431
|
+
|
|
1432
|
+
|
|
1433
|
+
def _is_probable_person(label: str) -> bool:
    """Return True when ``label`` has the shape of a person name.

    Known bad labels are rejected. A label is accepted when it fully matches
    ``PLACEHOLDER_NAME``, or ``LATIN_NAME`` (except a few hard-coded place and
    venue names), or ``CJK_NAME``.
    """
    if _is_bad_person_label(label):
        return False
    if re.fullmatch(PLACEHOLDER_NAME, label) is not None:
        return True
    if re.fullmatch(LATIN_NAME, label) is not None:
        known_places = ("Blue Bottle", "Blue Bottle Coffee", "People Square", "West Lake")
        return label not in known_places
    return re.fullmatch(CJK_NAME, label) is not None
|