@reconcrap/people-network-memory 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. package/README.md +476 -0
  2. package/docs/mcp_tools.md +138 -0
  3. package/harness_adapters/openclaw/mcp.managed.unix.template.json +25 -0
  4. package/harness_adapters/openclaw/mcp.managed.windows.template.json +26 -0
  5. package/harness_adapters/openclaw/mcp.template.json +14 -0
  6. package/harness_adapters/openclaw/ppl/SKILL.md +114 -0
  7. package/package.json +30 -0
  8. package/pyproject.toml +26 -0
  9. package/scripts/install_windows.ps1 +92 -0
  10. package/scripts/npm/people-memory.js +276 -0
  11. package/scripts/people_memory_bootstrap.py +247 -0
  12. package/scripts/run_graphiti_live_from_liepin.ps1 +87 -0
  13. package/scripts/run_tests_with_artifacts.ps1 +307 -0
  14. package/src/people_network_memory/__init__.py +6 -0
  15. package/src/people_network_memory/application/__init__.py +16 -0
  16. package/src/people_network_memory/application/normalization.py +1441 -0
  17. package/src/people_network_memory/application/services.py +921 -0
  18. package/src/people_network_memory/cli.py +1212 -0
  19. package/src/people_network_memory/config.py +268 -0
  20. package/src/people_network_memory/domain/__init__.py +55 -0
  21. package/src/people_network_memory/domain/identity.py +77 -0
  22. package/src/people_network_memory/domain/models.py +355 -0
  23. package/src/people_network_memory/fixtures/__init__.py +6 -0
  24. package/src/people_network_memory/fixtures/eval.py +398 -0
  25. package/src/people_network_memory/fixtures/extractor_eval.py +364 -0
  26. package/src/people_network_memory/fixtures/generator.py +290 -0
  27. package/src/people_network_memory/fixtures/report.py +252 -0
  28. package/src/people_network_memory/graphiti_adapter/__init__.py +9 -0
  29. package/src/people_network_memory/graphiti_adapter/episode_formatter.py +70 -0
  30. package/src/people_network_memory/graphiti_adapter/graphiti_store.py +655 -0
  31. package/src/people_network_memory/graphiti_adapter/indexer.py +194 -0
  32. package/src/people_network_memory/graphiti_adapter/ontology.py +68 -0
  33. package/src/people_network_memory/harness_adapters/__init__.py +2 -0
  34. package/src/people_network_memory/harness_adapters/openclaw/__init__.py +9 -0
  35. package/src/people_network_memory/harness_adapters/openclaw/installer.py +577 -0
  36. package/src/people_network_memory/harness_adapters/openclaw/integration_eval.py +508 -0
  37. package/src/people_network_memory/harness_adapters/openclaw/smoke.py +292 -0
  38. package/src/people_network_memory/infrastructure/__init__.py +2 -0
  39. package/src/people_network_memory/infrastructure/archive_backup.py +171 -0
  40. package/src/people_network_memory/infrastructure/diagnostics.py +171 -0
  41. package/src/people_network_memory/infrastructure/embeddings.py +155 -0
  42. package/src/people_network_memory/infrastructure/file_store.py +129 -0
  43. package/src/people_network_memory/infrastructure/graphiti_promotion.py +212 -0
  44. package/src/people_network_memory/infrastructure/id_generator.py +40 -0
  45. package/src/people_network_memory/infrastructure/in_memory_store.py +1008 -0
  46. package/src/people_network_memory/infrastructure/llm_extractor.py +476 -0
  47. package/src/people_network_memory/infrastructure/llm_identity_advisor.py +200 -0
  48. package/src/people_network_memory/infrastructure/llm_judge.py +162 -0
  49. package/src/people_network_memory/infrastructure/redaction.py +21 -0
  50. package/src/people_network_memory/infrastructure/release_check.py +186 -0
  51. package/src/people_network_memory/infrastructure/retrieval_intent.py +98 -0
  52. package/src/people_network_memory/infrastructure/semantic_index.py +262 -0
  53. package/src/people_network_memory/mcp_server/__init__.py +2 -0
  54. package/src/people_network_memory/mcp_server/contracts.py +85 -0
  55. package/src/people_network_memory/mcp_server/runtime.py +133 -0
  56. package/src/people_network_memory/mcp_server/tools.py +588 -0
  57. package/src/people_network_memory/ports/__init__.py +2 -0
  58. package/src/people_network_memory/ports/errors.py +25 -0
  59. package/src/people_network_memory/ports/interfaces.py +103 -0
  60. package/src/people_network_memory/projection/__init__.py +6 -0
  61. package/src/people_network_memory/projection/builders.py +46 -0
@@ -0,0 +1,1441 @@
1
+ """Conservative source-text normalization for low-friction capture."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from collections.abc import Iterable
7
+ from datetime import date, timedelta
8
+
9
+ from people_network_memory.domain.models import (
10
+ AttributedClaim,
11
+ DirectFact,
12
+ FollowUpTask,
13
+ MentionedPerson,
14
+ Participant,
15
+ PersonRef,
16
+ RelationshipAssertion,
17
+ SocialInteraction,
18
+ )
19
+
20
+
21
# Regex fragments describing the shapes a person label may take.
# PLACEHOLDER_NAME: "Person A" / "person B" style placeholders.
PLACEHOLDER_NAME = r"[Pp]erson\s+[A-Z]"
# LATIN_NAME: one to three capitalised words (letters, apostrophes, hyphens).
LATIN_NAME = r"[A-Z][A-Za-z'-]+(?:\s+[A-Z][A-Za-z'-]+){0,2}"
# CJK_NAME: two to eight CJK unified ideographs.
CJK_NAME = r"[\u4e00-\u9fff]{2,8}"
# CJK_ALIAS: one to twelve CJK ideographs, ASCII letters, digits, "_" or "-".
CJK_ALIAS = r"[\u4e00-\u9fffA-Za-z0-9_-]{1,12}"
PERSON = rf"(?:{PLACEHOLDER_NAME}|{LATIN_NAME}|{CJK_NAME})"

# Verbs that introduce someone the user interacted with.  Kept in one place so
# every "met ..." pattern accepts the same verbs; EN_MET_KNOWS_RE previously
# carried its own copy that had drifted and lacked "had lunch with".
_EN_CONTACT_VERBS = r"(?i:met|spoke with|had coffee with|had dinner with|had lunch with|called)"

# "<verb> [another|new|...] <person>" followed by a time/place cue,
# punctuation, or the end of the text.
EN_MET_RE = re.compile(
    rf"\b{_EN_CONTACT_VERBS}\s+"
    rf"(?:(?i:another|a\s+different|different|new)\s+)?"
    rf"(?P<person>{PERSON})(?=\s+(?i:at|today|yesterday|on|who|that)\b|[.,;]|$)",
)
# "remember|add|save <person>".
EN_REMEMBER_RE = re.compile(rf"\b(?i:remember|add|save)\s+(?P<person>{PERSON})\b")
# "<verb> <person> [today|yesterday|on YYYY-M-D] at <place>".
EN_PLACE_RE = re.compile(
    rf"\b{_EN_CONTACT_VERBS}\s+{PERSON}"
    rf"(?:\s+(?i:today|yesterday)|\s+(?i:on)\s+\d{{4}}-\d{{1,2}}-\d{{1,2}})?"
    rf"\s+(?i:at)\s+(?P<place>[^.;,()]+)",
)
# Same as EN_PLACE_RE but with a mandatory "another/new/..." qualifier.
EN_ANOTHER_PLACE_RE = re.compile(
    rf"\b{_EN_CONTACT_VERBS}\s+"
    rf"(?:(?i:another|a\s+different|different|new)\s+){PERSON}"
    rf"(?:\s+(?i:today|yesterday)|\s+(?i:on)\s+\d{{4}}-\d{{1,2}}-\d{{1,2}})?"
    rf"\s+(?i:at)\s+(?P<place>[^.;,()]+)",
)
# "discussed|talked about|chatted about <topic>".
EN_TOPIC_RE = re.compile(r"\b(?i:discussed|talked about|chatted about)\s+(?P<topic>[^.;]+)")
# "<speaker> mentioned|said|told me <target>".
EN_MENTION_RE = re.compile(
    rf"\b(?P<speaker>{PERSON})\s+(?i:mentioned|said|told me)\s+(?P<target>{PERSON})\b",
)
# "<verb> <source> who|that [also] knows|knew <target>".
EN_MET_KNOWS_RE = re.compile(
    rf"\b{_EN_CONTACT_VERBS}\s+"
    rf"(?P<source>{PERSON})\s+(?i:who|that)\s+(?i:also\s+)?(?i:knows|knew)\s+(?P<target>{PERSON})\b",
)
# "<source> [also] knows|knew <target>".
EN_KNOWS_RE = re.compile(
    rf"\b(?P<source>{PERSON})\s+(?i:also\s+)?(?i:knows|knew)\s+(?P<target>{PERSON})\b",
)
# "<source> [is] friend(s) with <target>".
EN_FRIENDS_WITH_RE = re.compile(
    rf"\b(?P<source>{PERSON})\s+(?i:is\s+)?(?i:friends?\s+with)\s+(?P<target>{PERSON})\b",
)
# "<source> works|worked|is working with <target>".
EN_WORKS_WITH_RE = re.compile(
    rf"\b(?P<source>{PERSON})\s+(?i:works|worked|is\s+working)\s+(?i:with)\s+(?P<target>{PERSON})\b",
)
# "<source> introduced me to <target>".
EN_INTRODUCED_ME_RE = re.compile(
    rf"\b(?P<source>{PERSON})\s+(?i:introduced\s+me\s+to)\s+(?P<target>{PERSON})\b",
)
64
# "[I] promised to <task>"; the task runs up to the next "." or ";".
EN_PROMISE_RE = re.compile(r"\b(?i:(?:i\s+)?promised to)\s+(?P<task>[^.;]+)")
# "he|she|they|<Name> asked me to <task>".
EN_ASKED_ME_RE = re.compile(r"\b(?i:(?:he|she|they|[A-Z][A-Za-z'-]+)\s+asked\s+me\s+to)\s+(?P<task>[^.;]+)")
# Chinese promise cues (optionally preceded by "我") followed by the task text.
ZH_PROMISE_RE = re.compile(r"(?:我)?(?:答应|承诺|说好|约好)(?P<task>[^。;;]+)")
# "<person> [currently] works|is working|worked at|for <organization> [as <role>]".
# The organization is matched lazily so an optional "as <role>" tail can be split off.
EN_WORK_RE = re.compile(
    rf"\b(?P<person>{PERSON})\s+(?i:currently\s+)?(?i:works|is working|worked)\s+(?i:at|for)\s+"
    r"(?P<organization>[^.;,]+?)(?:\s+(?i:as)\s+(?P<role>[^.;,]+))?(?=[.;,]|$)"
)
# "<person> studied at|graduated from|went to <school>".
EN_STUDIED_RE = re.compile(
    rf"\b(?P<person>{PERSON})\s+(?i:studied at|graduated from|went to)\s+(?P<school>[^.;,]+)"
)
# "<person> likes|enjoys|is interested in|is into <value>".
EN_INTEREST_RE = re.compile(
    rf"\b(?P<person>{PERSON})\s+(?i:likes|enjoys|is interested in|is into)\s+(?P<value>[^.;,]+)"
)
# Subject-less variants of the work/study patterns; _direct_facts attributes
# their matches to the default subject.
DEFAULT_WORK_RE = re.compile(r"\b(?i:works|is working|worked)\s+(?i:at|for)\s+(?P<organization>[^.;,]+)")
DEFAULT_STUDIED_RE = re.compile(r"\b(?i:studied at|graduated from|went to)\s+(?P<school>[^.;,]+)")
# Simple e-mail address shape: local part, "@", domain, and a TLD of 2+ letters.
EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
# Phone-like run: optional "+", starts and ends with a digit, at least eight
# characters in total, allowing spaces, dots, dashes and parentheses inside.
# This shape also fits ISO dates, which _valid_direct_fact rejects separately.
PHONE_RE = re.compile(r"(?<!\d)(?:\+?\d[\d\s().-]{6,}\d)(?!\d)")
# Dates written as 20YY-M-D or 20YY/M/D.
ISO_DATE_RE = re.compile(r"\b(?P<year>20\d{2})[-/](?P<month>\d{1,2})[-/](?P<day>\d{1,2})\b")
# English month name (full or three-letter abbreviation) followed by a day number.
MONTH_DAY_RE = re.compile(
    r"\b(?P<month>jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|"
    r"jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)"
    r"\s+(?P<day>\d{1,2})\b",
    flags=re.IGNORECASE,
)
# "下周X" / "下星期X" / "下礼拜X": a weekday of next week.
ZH_NEXT_WEEKDAY_RE = re.compile(
    r"下(?:周|星期|礼拜)(?P<weekday>[一二三四五六日天])"
)
91
+
92
# NOTE(review): several character classes below contain the same ASCII
# punctuation twice (e.g. "[((,,。.]", "[^。;;]").  That looks like full-width
# punctuation that was normalised to ASCII somewhere upstream of this view —
# confirm against the published source before relying on these classes.

# A "met / got to know / ran into" verb followed by a person, when the person
# is followed by punctuation, one of the listed continuation words, or the end.
ZH_PARTICIPANT_RE = re.compile(
    rf"(?:见了|认识了|遇到|碰到)(?P<person>{PERSON})"
    r"(?=\s*[((,,。.]|聊了|说|提到|也|和|他|她|其|在|喜欢|正在|可能|已经|$)"
)
# "和<person>聊了/聊天/见面": chatted or met with someone.
ZH_CHAT_WITH_RE = re.compile(rf"和(?P<person>{PERSON})(?:聊了|聊天|见面)")
# "和<person>约好/说好": arranged something with someone.
ZH_ARRANGED_WITH_RE = re.compile(rf"和(?P<person>{PERSON})(?:约好|说好)")
# "[已经|已|刚刚|刚]给/把<person>发了…": something was already sent/shared.
ZH_COMPLETED_SEND_TO_RE = re.compile(
    rf"(?:已经|已|刚刚|刚)?(?:给|把)(?P<person>{PERSON})"
    r"(?:发了|发送了|发过去了|分享了|转发了)"
)
# "记住/记录/保存<person>": explicit request to remember someone.
ZH_REMEMBER_RE = re.compile(rf"(?:记住|记录|保存)(?P<person>{PERSON})")
# "在<place>[又]见了": the place where a meeting happened (lazy match).
ZH_PLACE_RE = re.compile(r"在(?P<place>[^,。,.]+?)(?:又)?见了")
# "聊了<topic>": what was talked about.
ZH_TOPIC_RE = re.compile(r"聊了(?P<topic>[^,。,.]+)")
# A "focusing on" cue followed by the topic being followed.
ZH_FOCUS_TOPIC_RE = re.compile(
    r"(?:重点看|重点关注|正在关注|也在关注|关注)(?P<topic>[^,。,.;;]+)"
)
# "<organization><time phrase>[会|将|要]<focus verb><topic>": a statement about
# what an organization will focus on.  The subject must end in one of the
# listed organization suffixes.
ZH_ORG_FOCUS_CLAIM_RE = re.compile(
    r"(?P<subject>[\u4e00-\u9fffA-Za-z0-9_-]{2,30}"
    r"(?:公司|集团|科技|智能|机器人|资本|基金|大学|学院|实验室))"
    r"(?P<time>上半年|下半年|今年|明年|后续|接下来|未来)"
    r"(?:会|将|要)?"
    r"(?P<focus>重点看|重点关注|关注)"
    r"(?P<topic>[^,。,.;;]+)"
)
# "<speaker>提到<target>": one person mentioned another.
ZH_MENTION_RE = re.compile(rf"(?P<speaker>{PERSON})提到(?P<target>{PERSON})")
# "[pronoun]在<organization>做/负责/从事/搞<role>": subject-less work statement.
ZH_DEFAULT_WORK_RE = re.compile(
    r"(?:他|她|他们|她们|这个人|这人|此人)?在"
    r"(?P<organization>[^,。,.;;]{2,30}?)(?:做|负责|从事|搞)"
    r"(?P<role>[^,。,.;;]+)"
)
# "喜欢/偏好/倾向于/最好 [用|通过] <method> 联系/沟通/跟进": preferred contact channel.
ZH_CONTACT_PREF_RE = re.compile(
    r"(?:喜欢|偏好|倾向于|最好)(?:用|通过)?"
    r"(?P<method>微信|短信|电话|邮件|邮箱|WhatsApp|飞书|钉钉)"
    r"(?:联系|沟通|跟进)"
)
# Looser, case-insensitive form of the contact-preference phrase with optional
# time lead-in and optional whitespace between parts.
CONTACT_PREFERENCE_ONLY_RE = re.compile(
    r"(?:之后|以后|后续|下次|以后有事)?\s*"
    r"(?:最好|偏好|喜欢|倾向于)?\s*(?:用|通过)?\s*"
    r"(?:微信|短信|电话|邮件|邮箱|whatsapp|飞书|钉钉)\s*"
    r"(?:联系|沟通|跟进)",
    flags=re.IGNORECASE,
)
# Any date/time cue, promise cue, or action verb (Chinese or English).
# NOTE(review): the name suggests this marks a follow-up as concrete; its
# caller is not visible in this chunk.
CONCRETE_FOLLOW_UP_CUE_RE = re.compile(
    r"(?:"
    r"明天|后天|下周|周[一二三四五六日天]|星期[一二三四五六日天]|礼拜[一二三四五六日天]|"
    r"\d{1,2}[月/.-]\d{1,2}|20\d{2}[-/]\d{1,2}[-/]\d{1,2}|"
    r"答应|承诺|说好|约好|提醒我|需要我|让我|"
    r"发|发送|推荐|介绍|引荐|安排|确认|准备|"
    r"tomorrow|next\s+week|next\s+\w+day|by\s+\w+day|"
    r"promised|asked\s+me\s+to|need\s+to|send|introduce|share|confirm|schedule"
    r")",
    flags=re.IGNORECASE,
)
# Text that may directly precede a canonical name in the non-anchored alias
# patterns: punctuation or one of the introduction verbs, then optional spaces.
ZH_ALIAS_CONTEXT = r"(?:[,。,.;;]|见了|认识了|遇到|碰到|记住|记录|保存|和)\s*"
# "<context><canonical>(<alias>)": alias given in parentheses.
ZH_ALIAS_PAREN_RE = re.compile(
    rf"{ZH_ALIAS_CONTEXT}(?P<canonical>{PERSON})\s*[((](?P<alias>{CJK_ALIAS})[))]"
)
# Same as ZH_ALIAS_PAREN_RE, but anchored at the start of the text.
ZH_ALIAS_START_PAREN_RE = re.compile(
    rf"^(?P<canonical>{PERSON})\s*[((](?P<alias>{CJK_ALIAS})[))]"
)
# "<context><canonical>又叫/也叫/昵称…/绰号…/外号…/花名…<alias>": alias given in words.
ZH_ALIAS_PHRASE_RE = re.compile(
    rf"{ZH_ALIAS_CONTEXT}(?P<canonical>{PERSON})"
    rf"(?:又叫|也叫|昵称(?:是|叫)?|绰号(?:是|叫)?|外号(?:是|叫)?|花名(?:是|叫)?)"
    rf"(?P<alias>{PERSON})"
)
# Same as ZH_ALIAS_PHRASE_RE, but anchored at the start of the text.
ZH_ALIAS_START_PHRASE_RE = re.compile(
    rf"^(?P<canonical>{PERSON})"
    rf"(?:又叫|也叫|昵称(?:是|叫)?|绰号(?:是|叫)?|外号(?:是|叫)?|花名(?:是|叫)?)"
    rf"(?P<alias>{PERSON})"
)
162
+
163
# Chinese relationship/role words (colleague, friend, client, engineer, ...).
# NOTE(review): presumably used to reject a parenthesised "alias" that is really
# a descriptor — the consumer is not visible in this chunk; confirm.
CJK_ALIAS_DESCRIPTORS = {
    "前同事",
    "同事",
    "朋友",
    "老同学",
    "同学",
    "校友",
    "客户",
    "投资人",
    "创始人",
    "合伙人",
    "产品经理",
    "工程师",
}
# Role/relationship suffixes (manager, engineer, director, ...).
# NOTE(review): presumably the suffix-based counterpart of the set above — confirm.
CJK_ALIAS_DESCRIPTOR_SUFFIXES = (
    "经理",
    "工程师",
    "负责人",
    "总监",
    "同事",
    "朋友",
    "客户",
    "投资人",
    "创始人",
    "合伙人",
    "校友",
    "同学",
)
# Pronouns that must never be stored as a person label; compared after
# strip() + casefold() in _is_bad_person_label.
BAD_PERSON_LABELS = {"i", "me", "my", "you", "he", "him", "his", "she", "her", "they", "them"}
# Label endings that mark an organization rather than a person; checked with
# str.endswith in _looks_like_organization_label.
ORGANIZATION_LABEL_SUFFIXES = (
    "公司",
    "集团",
    "科技",
    "智能",
    "机器人",
    "资本",
    "基金",
    "大学",
    "学院",
    "实验室",
)
# Predicate names that _valid_direct_fact validates as phone numbers / e-mails.
PHONE_PREDICATES = {"phone", "mobile", "telephone", "tel"}
EMAIL_PREDICATES = {"email", "mail"}
# Lower-case English month names and three-letter abbreviations -> month number.
MONTHS = {
    "jan": 1,
    "january": 1,
    "feb": 2,
    "february": 2,
    "mar": 3,
    "march": 3,
    "apr": 4,
    "april": 4,
    "may": 5,
    "jun": 6,
    "june": 6,
    "jul": 7,
    "july": 7,
    "aug": 8,
    "august": 8,
    "sep": 9,
    "september": 9,
    "oct": 10,
    "october": 10,
    "nov": 11,
    "november": 11,
    "dec": 12,
    "december": 12,
}
# Chinese weekday characters -> index with Monday = 0 and Sunday ("日"/"天") = 6.
# NOTE(review): this matches the numbering of date.weekday(); confirm at the use site.
ZH_WEEKDAYS = {
    "一": 0,
    "二": 1,
    "三": 2,
    "四": 3,
    "五": 4,
    "六": 5,
    "日": 6,
    "天": 6,
}
241
+
242
+
243
def normalize_interaction(interaction: SocialInteraction) -> SocialInteraction:
    """Fill obvious structure from source_text without inventing uncertain facts.

    Fields supplied by the caller are kept.  Participants, place, mentioned
    people, mention-based claims, relationships and follow-ups are inferred
    from ``source_text`` only when the corresponding field is empty; topics,
    direct facts and organization-focus claims are always augmented from the
    text.  Every list is then sanitized and shorthand references are
    canonicalized.

    Args:
        interaction: The captured interaction.  It is not modified; its lists
            are copied before anything is appended.

    Returns:
        A copy of ``interaction`` (via ``model_copy``) carrying the enriched
        fields.
    """

    source = interaction.source_text
    explicit_aliases = _explicit_aliases(source)
    participants = list(interaction.participants)
    mentioned = list(interaction.mentioned_people)
    topics = list(interaction.topics)
    direct_facts = list(interaction.direct_facts)
    attributed_claims = list(interaction.attributed_claims)
    follow_ups = list(interaction.follow_ups)
    relationships = list(interaction.relationships)
    place = interaction.place

    if not participants:
        for label in _participant_labels(source):
            _append_participant(participants, label)
        for subject in _fact_subject_labels(source):
            _append_participant(participants, subject)
        for label in explicit_aliases:
            _append_participant(participants, label)

    if not place:
        place = _first_match(source, [EN_ANOTHER_PLACE_RE, EN_PLACE_RE, ZH_PLACE_RE], "place")

    topics = _normalize_topics(source, topics)

    # Decide up front which inferences are needed so the mention and
    # relationship scans over the source text each run at most once.  None of
    # the three lists is modified before its own check, so evaluating the
    # conditions here is equivalent to evaluating them in place.
    infer_mentioned = not mentioned
    infer_claims = not attributed_claims
    infer_relationships = not relationships
    mention_pairs = _mention_pairs(source) if infer_mentioned or infer_claims else []
    relationship_triples = (
        _relationship_triples(source) if infer_mentioned or infer_relationships else []
    )

    if infer_mentioned:
        for speaker, target in mention_pairs:
            # The speaker was part of the interaction; the target was only talked about.
            _append_participant(participants, speaker)
            _append_mentioned(mentioned, target, mentioned_by=speaker)
        for relationship_source, relationship_target, _ in relationship_triples:
            _append_mentioned(mentioned, relationship_target, mentioned_by=relationship_source)
    if infer_claims:
        for speaker, target in mention_pairs:
            attributed_claims.append(
                AttributedClaim(
                    speaker=PersonRef(label=speaker),
                    subject=PersonRef(label=target),
                    claim_text=_claim_sentence(source, speaker, target),
                )
            )
    if infer_relationships:
        for relationship_source, relationship_target, relationship_type in relationship_triples:
            relationships.append(
                RelationshipAssertion(
                    source=PersonRef(label=relationship_source),
                    target=PersonRef(label=relationship_target),
                    relationship_type=relationship_type,
                )
            )

    direct_facts.extend(
        _direct_facts(
            source,
            participants,
            fallback_subject=_single_explicit_alias_ref(explicit_aliases),
        )
    )
    for claim in _source_focus_claims(source, participants):
        # Skip focus claims whose text is already recorded.
        if not any(existing.claim_text == claim.claim_text for existing in attributed_claims):
            attributed_claims.append(claim)

    if not follow_ups:
        for task in _follow_up_labels(source):
            related = [
                participant.person
                for participant in participants
                if _person_label_referenced_in_text(participant.person.label, task)
            ]
            follow_ups.append(FollowUpTask(description=f"Promised to {task}", related_people=related))

    (
        participants,
        mentioned,
        direct_facts,
        attributed_claims,
        follow_ups,
        relationships,
    ) = _sanitize_interaction_fields(
        source=source,
        occurred_at=interaction.occurred_at.date() if interaction.occurred_at else None,
        place=place,
        participants=participants,
        mentioned=mentioned,
        direct_facts=direct_facts,
        attributed_claims=attributed_claims,
        follow_ups=follow_ups,
        relationships=relationships,
    )

    (
        participants,
        mentioned,
        direct_facts,
        attributed_claims,
        follow_ups,
        relationships,
    ) = _canonicalize_shorthand_refs(
        participants=participants,
        mentioned=mentioned,
        direct_facts=direct_facts,
        attributed_claims=attributed_claims,
        follow_ups=follow_ups,
        relationships=relationships,
        explicit_aliases=explicit_aliases,
    )

    return interaction.model_copy(
        update={
            "place": place,
            "participants": participants,
            "mentioned_people": mentioned,
            "topics": topics,
            "direct_facts": direct_facts,
            "attributed_claims": attributed_claims,
            "follow_ups": follow_ups,
            "relationships": relationships,
        }
    )
363
+
364
+
365
def _participant_labels(text: str) -> list[str]:
    """Return the people named by any participant pattern in ``text``.

    Patterns are applied one after another (English first, then Chinese); the
    stripped ``person`` captures are handed to ``_unique_labels``.
    """
    participant_patterns = (
        EN_MET_RE,
        EN_REMEMBER_RE,
        ZH_PARTICIPANT_RE,
        ZH_CHAT_WITH_RE,
        ZH_ARRANGED_WITH_RE,
        ZH_COMPLETED_SEND_TO_RE,
        ZH_REMEMBER_RE,
    )
    found = [
        hit.group("person").strip()
        for pattern in participant_patterns
        for hit in pattern.finditer(text)
    ]
    return _unique_labels(found)
374
+
375
+
376
def _fact_subject_labels(text: str) -> list[str]:
    """Return the subjects of explicit work/study/interest statements in ``text``.

    Only captures accepted by ``_is_probable_person`` are passed on to
    ``_unique_labels``.
    """
    subjects = [
        hit.group("person").strip()
        for pattern in (EN_WORK_RE, EN_STUDIED_RE, EN_INTEREST_RE)
        for hit in pattern.finditer(text)
    ]
    return _unique_labels(name for name in subjects if _is_probable_person(name))
381
+
382
+
383
def _topic_labels(text: str) -> list[str]:
    """Return the raw topic captures found in ``text``.

    The English topic pattern is applied first, then the two Chinese ones;
    the stripped ``topic`` captures are handed to ``_unique_labels``.
    """
    found: list[str] = []
    for pattern in (EN_TOPIC_RE, ZH_TOPIC_RE, ZH_FOCUS_TOPIC_RE):
        for hit in pattern.finditer(text):
            found.append(hit.group("topic").strip())
    return _unique_labels(found)
388
+
389
+
390
def _normalize_topics(source: str, topics: list[str]) -> list[str]:
    """Merge caller-supplied topics with topics found in ``source``.

    Supplied topics come first, followed by those extracted from the text.
    Each candidate is cleaned; empty, noise, and already-seen values are
    dropped, so the result keeps first-seen order.
    """
    merged: list[str] = []
    for raw in [*topics, *_topic_labels(source)]:
        candidate = _clean_topic_label(raw)
        if not candidate or _is_noise_topic(candidate) or candidate in merged:
            continue
        merged.append(candidate)
    return merged
401
+
402
+
403
+ def _clean_topic_label(value: str) -> str:
404
+ value = value.strip(" \t\r\n,.;:,。!?!?\"'")
405
+ value = re.sub(r"^(?:他|她|他们|她们|这个人|这人|现在|也|在|会|将|要)\s*", "", value)
406
+ return value.strip(" \t\r\n,.;:,。!?!?\"'")
407
+
408
+
409
+ def _is_noise_topic(value: str) -> bool:
410
+ value = value.strip()
411
+ if not value:
412
+ return True
413
+ if value in {"今天", "昨天", "刚刚", "早些时候", "上午", "下午", "晚上"}:
414
+ return True
415
+ return bool(
416
+ re.fullmatch(
417
+ r"(?:半|一|二|三|四|五|六|七|八|九|十|两|\d+)+"
418
+ r"(?:分钟|小时|天|周|个月|年)",
419
+ value,
420
+ )
421
+ )
422
+
423
+
424
def _source_focus_claims(
    source: str, participants: list[Participant]
) -> list[AttributedClaim]:
    """Build "business_focus" claims from organization-focus statements.

    Each match of ``ZH_ORG_FOCUS_CLAIM_RE`` in ``source`` becomes a claim
    spoken by the default subject of ``participants`` about the matched
    organization.  Matches whose cleaned topic is empty or noise are skipped.
    Returns an empty list when there is no default subject.
    """
    speaker = _default_subject(participants)
    if not speaker:
        return []

    reporting_prefixes = ("他说", "她说", "他们说", "她们说", "表示", "提到")
    results: list[AttributedClaim] = []
    for hit in ZH_ORG_FOCUS_CLAIM_RE.finditer(source):
        topic = _clean_topic_label(hit.group("topic"))
        if not topic or _is_noise_topic(topic):
            continue
        statement = hit.group(0).strip()
        # Drop at most one leading reporting verb ("he said", "mentioned", ...).
        lead_in = next(
            (prefix for prefix in reporting_prefixes if statement.startswith(prefix)),
            None,
        )
        if lead_in is not None:
            statement = statement[len(lead_in) :].strip()
        organization = _clean_focus_subject(hit.group("subject"))
        results.append(
            AttributedClaim(
                speaker=speaker,
                subject=PersonRef(label=organization),
                claim_text=statement,
                claim_type="business_focus",
            )
        )
    return results
450
+
451
+
452
+ def _clean_focus_subject(value: str) -> str:
453
+ value = value.strip()
454
+ for prefix in ("他说", "她说", "他们说", "她们说", "表示", "提到"):
455
+ if value.startswith(prefix):
456
+ return value[len(prefix) :].strip()
457
+ return value
458
+
459
+
460
def _mention_pairs(text: str) -> list[tuple[str, str]]:
    """Return ``(speaker, target)`` pairs for "X mentioned Y" statements.

    English matches come before Chinese ones.  Pairs go through
    ``_unique_pairs`` and are kept only when both names pass
    ``_is_probable_person``.
    """
    candidates: list[tuple[str, str]] = []
    for pattern in (EN_MENTION_RE, ZH_MENTION_RE):
        for hit in pattern.finditer(text):
            candidates.append((hit.group("speaker").strip(), hit.group("target").strip()))

    kept = []
    for pair in _unique_pairs(candidates):
        if _is_probable_person(pair[0]) and _is_probable_person(pair[1]):
            kept.append(pair)
    return kept
474
+
475
+
476
def _relationship_triples(text: str) -> list[tuple[str, str, str]]:
    """Return ``(source, target, relationship_type)`` triples found in ``text``.

    Each relationship pattern contributes its matches in the order listed
    below.  Triples go through ``_unique_triples`` and are kept only when both
    endpoints pass ``_is_probable_person``.
    """
    typed_patterns = (
        (EN_MET_KNOWS_RE, "knows"),
        (EN_KNOWS_RE, "knows"),
        (EN_FRIENDS_WITH_RE, "friends_with"),
        (EN_WORKS_WITH_RE, "works_with"),
        (EN_INTRODUCED_ME_RE, "introduced_by"),
    )
    candidates: list[tuple[str, str, str]] = [
        (hit.group("source").strip(), hit.group("target").strip(), relationship_type)
        for pattern, relationship_type in typed_patterns
        for hit in pattern.finditer(text)
    ]

    kept = []
    for triple in _unique_triples(candidates):
        if _is_probable_person(triple[0]) and _is_probable_person(triple[1]):
            kept.append(triple)
    return kept
499
+
500
+
501
def _direct_facts(
    text: str,
    participants: list[Participant],
    *,
    fallback_subject: PersonRef | None = None,
) -> list[DirectFact]:
    """Extract direct facts (work, study, interests, contact details) from ``text``.

    Statements that name their own subject are collected first.  Subject-less
    statements, contact preferences, e-mail addresses and phone numbers are
    then attributed to the default subject of ``participants``, or to
    ``fallback_subject`` when there is none; without either they are skipped.

    Returns:
        The facts in extraction order, not yet validated or de-duplicated.
    """
    # NOTE(review): the diff rendering this was recovered from had no
    # indentation.  Every default-subject loop is assumed to sit under the
    # subject guard because each one uses the subject — confirm upstream.
    collected: list[DirectFact] = []

    # Statements that name their own subject.
    for hit in EN_WORK_RE.finditer(text):
        collected.append(
            DirectFact(
                subject=PersonRef(label=hit.group("person").strip()),
                predicate="works_at",
                value=hit.group("organization").strip(),
                metadata=_optional_metadata(role=hit.group("role")),
            )
        )
    named_subject_patterns = (
        (EN_STUDIED_RE, "studied_at", "school"),
        (EN_INTEREST_RE, "interest", "value"),
    )
    for pattern, predicate, value_group in named_subject_patterns:
        for hit in pattern.finditer(text):
            collected.append(
                DirectFact(
                    subject=PersonRef(label=hit.group("person").strip()),
                    predicate=predicate,
                    value=hit.group(value_group).strip(),
                )
            )

    owner = _default_subject(participants) or fallback_subject
    if not owner:
        return collected

    # Subject-less statements, attributed to the default subject.
    default_subject_patterns = (
        (DEFAULT_WORK_RE, "works_at", "organization"),
        (DEFAULT_STUDIED_RE, "studied_at", "school"),
    )
    for pattern, predicate, value_group in default_subject_patterns:
        for hit in pattern.finditer(text):
            collected.append(
                DirectFact(
                    subject=owner,
                    predicate=predicate,
                    value=hit.group(value_group).strip(),
                )
            )
    for hit in ZH_DEFAULT_WORK_RE.finditer(text):
        collected.append(
            DirectFact(
                subject=owner,
                predicate="works_at",
                value=hit.group("organization").strip(),
                metadata=_optional_metadata(role=hit.group("role").strip()),
            )
        )
    for hit in ZH_CONTACT_PREF_RE.finditer(text):
        collected.append(
            DirectFact(
                subject=owner,
                predicate="preference",
                value=f"{hit.group('method').strip()}联系",
            )
        )
    collected.extend(
        DirectFact(subject=owner, predicate="email", value=address)
        for address in EMAIL_RE.findall(text)
    )
    collected.extend(
        DirectFact(subject=owner, predicate="phone", value=number.strip())
        for number in PHONE_RE.findall(text)
    )
    return collected
577
+
578
+
579
def _follow_up_labels(text: str) -> list[str]:
    """Return the task phrases of promise / "asked me to" statements in ``text``.

    English promises come first, then "asked me to" requests, then Chinese
    promises.  Each task is trimmed of surrounding whitespace, punctuation
    and quotes before being handed to ``_unique_labels``.
    """
    tasks: list[str] = []
    for pattern in (EN_PROMISE_RE, EN_ASKED_ME_RE, ZH_PROMISE_RE):
        tasks.extend(hit.group("task").strip() for hit in pattern.finditer(text))
    edge_chars = " \t\r\n,.;:,。!?!?\"'"
    return _unique_labels(task.strip(edge_chars) for task in tasks)
584
+
585
+
586
def _append_participant(participants: list[Participant], label: str) -> None:
    """Add ``label`` to ``participants`` in place unless it is already there.

    The label is cleaned first and ignored when it does not look like a
    person.  A case-insensitive duplicate is skipped.  When the label refers
    to the same person as an existing entry, nothing is appended; the
    existing label is replaced only if the new one is a full Latin name and
    the existing one is not.
    """
    # NOTE(review): nesting was inferred from a de-indented source; the early
    # return after the same-person check is assumed to apply whether or not
    # the label was upgraded — confirm upstream.
    label = _clean_person_label(label)
    if not _is_probable_person(label):
        return

    lowered = label.lower()
    for position, entry in enumerate(participants):
        known = entry.person.label
        if known.lower() == lowered:
            return
        if not _same_person_label(known, label):
            continue
        if _is_full_latin_name(label) and not _is_full_latin_name(known):
            upgraded_person = entry.person.model_copy(update={"label": label})
            participants[position] = entry.model_copy(update={"person": upgraded_person})
        return

    participants.append(Participant(person=PersonRef(label=label)))
601
+
602
+
603
def _append_mentioned(
    mentioned: list[MentionedPerson], label: str, *, mentioned_by: str | None = None
) -> None:
    """Add ``label`` to ``mentioned`` in place unless it is already listed.

    Args:
        mentioned: List to extend.
        label: Raw name of the mentioned person; cleaned before use and
            ignored when it does not look like a person.
        mentioned_by: Optional raw name of whoever mentioned them; cleaned
            the same way.
    """
    label = _clean_person_label(label)
    mentioned_by = _clean_person_label(mentioned_by) if mentioned_by else None
    if not _is_probable_person(label):
        return
    # Compare case-insensitively, as _append_participant does, so that
    # "person A" and "Person A" (both accepted by the placeholder pattern)
    # are not recorded as two different people.
    folded = label.casefold()
    if any(item.person.label.casefold() == folded for item in mentioned):
        return
    mentioned.append(
        MentionedPerson(
            person=PersonRef(label=label),
            mentioned_by=PersonRef(label=mentioned_by) if mentioned_by else None,
        )
    )
618
+
619
+
620
+ def _first_match(text: str, patterns: list[re.Pattern[str]], group: str) -> str | None:
621
+ for pattern in patterns:
622
+ match = pattern.search(text)
623
+ if match:
624
+ value = match.group(group).strip()
625
+ if group == "place":
626
+ value = _clean_place_label(value)
627
+ return value or None
628
+ return None
629
+
630
+
631
+ def _clean_place_label(value: str) -> str:
632
+ return re.sub(
633
+ r"\s+\b(?:this afternoon|this morning|this evening|today|yesterday|tonight)\b.*$",
634
+ "",
635
+ value.strip(),
636
+ flags=re.IGNORECASE,
637
+ ).strip()
638
+
639
+
640
+ def _claim_sentence(text: str, speaker: str, target: str) -> str:
641
+ for sentence in re.split(r"(?<=[.;。])\s*", text):
642
+ if speaker in sentence and target in sentence:
643
+ return sentence.strip()
644
+ return f"{speaker} mentioned {target}."
645
+
646
+
647
def _sanitize_interaction_fields(
    *,
    source: str,
    occurred_at: date | None,
    place: str | None,
    participants: list[Participant],
    mentioned: list[MentionedPerson],
    direct_facts: list[DirectFact],
    attributed_claims: list[AttributedClaim],
    follow_ups: list[FollowUpTask],
    relationships: list[RelationshipAssertion],
) -> tuple[
    list[Participant],
    list[MentionedPerson],
    list[DirectFact],
    list[AttributedClaim],
    list[FollowUpTask],
    list[RelationshipAssertion],
]:
    """Clean the person references in every field, drop invalid entries, and de-duplicate.

    Args:
        source: Original interaction text; used for fact validation and date hints.
        occurred_at: Date of the interaction, passed on as the reference date
            when looking for follow-up due dates.
        place: Resolved place; passed to ``_clean_ref`` with every reference.
        participants, mentioned, direct_facts, attributed_claims, follow_ups,
            relationships: The lists to sanitize.  New lists are built; the
            inputs are not modified.

    Returns:
        The cleaned lists, in the same order as the parameters.
    """
    # NOTE(review): nesting was inferred from a de-indented source; confirm
    # against the published file.

    # Participants: keep those whose reference survives cleaning.
    cleaned_participants: list[Participant] = []
    for participant in participants:
        person = _clean_ref(participant.person, place=place)
        if person:
            cleaned_participants.append(participant.model_copy(update={"person": person}))
    cleaned_participants = _dedupe_participants(cleaned_participants)
    # Stand-in for pronoun subjects/speakers and for follow-ups without related people.
    default_ref = _default_subject(cleaned_participants)

    # Mentioned people: the person must be valid; an invalid "mentioned_by" becomes None.
    cleaned_mentioned: list[MentionedPerson] = []
    for item in mentioned:
        person = _clean_ref(item.person, place=place)
        if not person:
            continue
        mentioned_by = _clean_ref(item.mentioned_by, place=place) if item.mentioned_by else None
        cleaned_mentioned.append(
            item.model_copy(update={"person": person, "mentioned_by": mentioned_by})
        )
    cleaned_mentioned = _dedupe_mentioned(cleaned_mentioned)

    # Direct facts: a pronoun subject is replaced by the default reference;
    # any other invalid subject drops the fact.
    cleaned_facts: list[DirectFact] = []
    for fact in direct_facts:
        subject = _clean_ref(fact.subject, place=place)
        if subject is None and _is_bad_person_label(fact.subject.label) and default_ref:
            subject = default_ref
        if subject is None:
            continue
        normalized = fact.model_copy(update={"subject": subject})
        if _valid_direct_fact(normalized, source=source):
            cleaned_facts.append(normalized)
    cleaned_facts = _dedupe_model_items(cleaned_facts)

    # Attributed claims: pronoun speaker/subject fall back to the default
    # reference; a claim with neither a speaker nor a subject is dropped.
    cleaned_claims: list[AttributedClaim] = []
    for claim in attributed_claims:
        speaker = _clean_ref(claim.speaker, place=place) if claim.speaker else None
        subject = _clean_ref(claim.subject, place=place) if claim.subject else None
        if speaker is None and claim.speaker and _is_bad_person_label(claim.speaker.label):
            speaker = default_ref
        if subject is None and claim.subject and _is_bad_person_label(claim.subject.label):
            subject = default_ref
        if not speaker and not subject:
            continue
        # A person's claim about themselves that reads like a follow-up is dropped.
        if speaker and subject and _same_person_label(speaker.label, subject.label):
            if _looks_like_follow_up_claim(claim.claim_text):
                continue
        cleaned_claims.append(claim.model_copy(update={"speaker": speaker, "subject": subject}))
    cleaned_claims = _dedupe_model_items(cleaned_claims)

    # Follow-ups: drop contact-preference-only entries, clean related people
    # (defaulting to the default reference), and fill a missing due date.
    cleaned_follow_ups: list[FollowUpTask] = []
    for follow_up in follow_ups:
        if _looks_like_contact_preference_only_follow_up(follow_up.description):
            continue
        related = [
            ref
            for ref in (
                _clean_ref(ref, place=place) for ref in follow_up.related_people
            )
            if ref is not None
        ]
        if not related and default_ref:
            related = [default_ref]
        # An explicit due date wins; otherwise look for a date hint.
        due_at = follow_up.due_at or _date_hint_from_text(
            follow_up.description,
            source,
            reference_date=occurred_at,
        )
        cleaned_follow_ups.append(
            follow_up.model_copy(update={"related_people": related, "due_at": due_at})
        )
    cleaned_follow_ups = _dedupe_follow_ups(cleaned_follow_ups)

    # Relationships: both endpoints must survive cleaning.
    cleaned_relationships: list[RelationshipAssertion] = []
    for relationship in relationships:
        source_ref = _clean_ref(relationship.source, place=place)
        target_ref = _clean_ref(relationship.target, place=place)
        if not source_ref or not target_ref:
            continue
        cleaned_relationships.append(
            relationship.model_copy(update={"source": source_ref, "target": target_ref})
        )
    cleaned_relationships = _dedupe_model_items(cleaned_relationships)

    return (
        cleaned_participants,
        cleaned_mentioned,
        cleaned_facts,
        cleaned_claims,
        cleaned_follow_ups,
        cleaned_relationships,
    )
755
+
756
+
757
def _clean_ref(ref: PersonRef | None, *, place: str | None) -> PersonRef | None:
    """Return a cleaned copy of ``ref``, or ``None`` when it is not a valid person.

    The label and every alias are cleaned.  Aliases that end up empty, equal
    to the label, or date-like are dropped, and the rest go through
    ``_unique_labels``.  The result must pass ``_valid_person_ref``.
    """
    if ref is None:
        return None

    label = _clean_person_label(ref.label)
    kept_aliases = []
    for raw_alias in ref.aliases:
        alias = _clean_person_label(raw_alias)
        if not alias or alias == label or _is_date_like(alias):
            continue
        kept_aliases.append(alias)

    candidate = ref.model_copy(update={"label": label, "aliases": _unique_labels(kept_aliases)})
    return candidate if _valid_person_ref(candidate, place=place) else None
770
+
771
+
772
+ def _clean_person_label(label: str | None) -> str:
773
+ label = (label or "").strip(" \t\r\n,.;:,。!?!?\"'")
774
+ if not label:
775
+ return ""
776
+ label = re.sub(
777
+ r"^(?:另一个|另外一个|另一位|另位|不同的|新的)\s*",
778
+ "",
779
+ label,
780
+ ).strip()
781
+ label = re.sub(
782
+ r"^(?:this|that|another|a\s+different|different|new)\s+",
783
+ "",
784
+ label,
785
+ flags=re.IGNORECASE,
786
+ ).strip()
787
+ if re.fullmatch(r"[\u4e00-\u9fff]+", label):
788
+ for marker in ["也", "正在", "可能", "已经"]:
789
+ if marker in label:
790
+ prefix = label.split(marker, 1)[0]
791
+ if 2 <= len(prefix) <= 4:
792
+ return prefix
793
+ return label
794
+
795
+
796
def _valid_person_ref(ref: PersonRef, *, place: str | None) -> bool:
    """Return True when ``ref.label`` plausibly names a person.

    A label is rejected when it is a known bad label, looks like an
    organisation, looks like a date, or equals ``place`` (compared stripped
    and case-folded). Anything that survives must still pass
    ``_is_probable_person``.
    """
    label = ref.label
    if (
        _is_bad_person_label(label)
        or _looks_like_organization_label(label)
        or _is_date_like(label)
    ):
        return False
    same_as_place = bool(place) and label.strip().casefold() == place.strip().casefold()
    if same_as_place:
        return False
    return _is_probable_person(label)
806
+
807
+
808
def _is_bad_person_label(label: str) -> bool:
    """Return True when the stripped, case-folded label is in ``BAD_PERSON_LABELS``."""
    normalized = label.strip().casefold()
    return normalized in BAD_PERSON_LABELS
810
+
811
+
812
def _looks_like_organization_label(label: str) -> bool:
    """Return True when the label ends with a known organisation suffix.

    Labels starting with "测试" ("test") are never treated as organisations,
    whatever their suffix.
    """
    candidate = label.strip()
    if candidate.startswith("测试"):
        return False
    for suffix in ORGANIZATION_LABEL_SUFFIXES:
        if candidate.endswith(suffix):
            return True
    return False
817
+
818
+
819
def _is_date_like(value: str) -> bool:
    """Return True when the whole stripped value matches ``ISO_DATE_RE``."""
    return ISO_DATE_RE.fullmatch(value.strip()) is not None
821
+
822
+
823
def _valid_direct_fact(fact: DirectFact, *, source: str = "") -> bool:
    """Decide whether a direct fact is well-formed enough to keep.

    - Phone predicates: the value must fully match ``PHONE_RE`` and must not
      look like a date.
    - Email predicates: the value must fully match ``EMAIL_RE``.
    - Work predicates: rejected when ``source`` only mentions the
      organisation to contrast the subject with a different person.
    - Everything else (and surviving work facts): kept when the stripped
      value is non-empty.
    """
    kind = fact.predicate.strip().casefold()
    text = fact.value.strip()

    if kind in PHONE_PREDICATES:
        if PHONE_RE.fullmatch(text) is None:
            return False
        return not _is_date_like(text)

    if kind in EMAIL_PREDICATES:
        return EMAIL_RE.fullmatch(text) is not None

    work_predicates = {"works_at", "worked_at", "work", "current_job"}
    if kind in work_predicates and _looks_like_contrast_only_work_fact(
        source, fact.subject.label, text
    ):
        return False
    return bool(text)
834
+
835
+
836
+ def _looks_like_contrast_only_work_fact(source: str, person_label: str, organization: str) -> bool:
837
+ if not source or not person_label or not organization:
838
+ return False
839
+ person = re.escape(person_label.strip())
840
+ org = re.escape(organization.strip())
841
+ patterns = [
842
+ rf"和{org}(?:的|那个|那位)?{person}[^。;;,.,]*(?:不是同一个人|不是一个人|不同的人)",
843
+ rf"{org}(?:的|那个|那位)?{person}[^。;;,.,]*(?:不是同一个人|不是一个人|不同的人)",
844
+ rf"(?:不是同一个人|不是一个人|不同的人)[^。;;,.,]*{org}(?:的|那个|那位)?{person}",
845
+ ]
846
+ return any(re.search(pattern, source) for pattern in patterns)
847
+
848
+
849
+ def _looks_like_follow_up_claim(text: str) -> bool:
850
+ lowered = text.casefold()
851
+ return any(
852
+ phrase in lowered
853
+ for phrase in [
854
+ "asked me to",
855
+ "asked us to",
856
+ "reminded me to",
857
+ "promised to",
858
+ "答应",
859
+ "提醒我",
860
+ ]
861
+ )
862
+
863
+
864
def _looks_like_contact_preference_only_follow_up(text: str) -> bool:
    """Return True when a follow-up only states a contact preference.

    The text is case-folded and whitespace-collapsed. It must match
    ``CONTACT_PREFERENCE_ONLY_RE``; once those matches and any leading or
    trailing separators are removed, the remainder must not contain a
    concrete action cue (``CONCRETE_FOLLOW_UP_CUE_RE``).
    """
    collapsed = " ".join(text.casefold().split())
    if CONTACT_PREFERENCE_ONLY_RE.search(collapsed) is None:
        return False
    remainder = CONTACT_PREFERENCE_ONLY_RE.sub("", collapsed).strip()
    remainder = re.sub(r"^[,,。;;\s]+|[,,。;;\s]+$", "", remainder)
    return CONCRETE_FOLLOW_UP_CUE_RE.search(remainder) is None
873
+
874
+
875
def _dedupe_mentioned(items: list[MentionedPerson]) -> list[MentionedPerson]:
    """Keep one mention per person label (case-folded), preferring richer ones.

    When two mentions share a label, the one with the strictly higher
    ``_mentioned_quality`` wins; on a tie the earlier one is kept. Output
    order follows the first appearance of each label.
    """
    best: dict[str, MentionedPerson] = {}
    for candidate in items:
        label_key = candidate.person.label.casefold()
        current = best.get(label_key)
        if current is not None and _mentioned_quality(candidate) <= _mentioned_quality(current):
            continue
        best[label_key] = candidate
    return list(best.values())
883
+
884
+
885
+ def _mentioned_quality(item: MentionedPerson) -> int:
886
+ return int(item.mentioned_by is not None) + int(bool(item.context))
887
+
888
+
889
def _dedupe_follow_ups(items: list[FollowUpTask]) -> list[FollowUpTask]:
    """Collapse follow-ups whose descriptions are similar.

    Each item is compared against the already-kept items in order. The first
    similar one (per ``_similar_follow_up``) is replaced when the new item has
    a strictly higher ``_follow_up_quality``; otherwise the new item is
    dropped. Items with no similar predecessor are appended.
    """
    kept: list[FollowUpTask] = []
    for candidate in items:
        for position, current in enumerate(kept):
            if not _similar_follow_up(current.description, candidate.description):
                continue
            if _follow_up_quality(candidate) > _follow_up_quality(current):
                kept[position] = candidate
            break
        else:
            # No similar follow-up was found among the kept ones.
            kept.append(candidate)
    return kept
907
+
908
+
909
def _similar_follow_up(left: str, right: str) -> bool:
    """Return True when one normalised description contains the other.

    Both descriptions are normalised with ``_follow_up_key`` first. Equal keys
    are covered by the containment test, since a string contains itself.
    """
    first, second = _follow_up_key(left), _follow_up_key(right)
    return first in second or second in first
915
+
916
+
917
+ def _follow_up_quality(item: FollowUpTask) -> int:
918
+ return len(item.description) + (25 if item.due_at else 0) + (5 * len(item.related_people))
919
+
920
+
921
+ def _follow_up_key(value: str) -> str:
922
+ return re.sub(r"\s+", " ", value.casefold().replace("promised to ", "")).strip()
923
+
924
+
925
+ def _dedupe_model_items(items: list):
926
+ result: list = []
927
+ seen: set[str] = set()
928
+ for item in items:
929
+ key = item.model_dump_json()
930
+ if key in seen:
931
+ continue
932
+ seen.add(key)
933
+ result.append(item)
934
+ return result
935
+
936
+
937
+ def _dedupe_direct_facts(items: list[DirectFact]) -> list[DirectFact]:
938
+ result: list[DirectFact] = []
939
+ seen: dict[tuple[str, str, str], int] = {}
940
+ for item in items:
941
+ key = (
942
+ item.subject.label.casefold(),
943
+ item.predicate.casefold(),
944
+ item.value.casefold(),
945
+ )
946
+ if key in seen:
947
+ existing_index = seen[key]
948
+ existing = result[existing_index]
949
+ if not existing.metadata and item.metadata:
950
+ result[existing_index] = item
951
+ continue
952
+ seen[key] = len(result)
953
+ result.append(item)
954
+ return result
955
+
956
+
957
def _date_hint_from_text(
    text: str, source: str, *, reference_date: date | None = None
) -> date | None:
    """Extract a due-date hint from ``text``.

    Resolution order:
    1. An ISO date in ``text`` is returned directly (``None`` if it is not a
       real calendar date).
    2. Otherwise an anchor date is chosen: the first ISO date in ``source``,
       else ``reference_date``, else today.
    3. A month/day mention is resolved in the anchor's year.
    4. A Chinese "next <weekday>" mention is resolved to
       ``anchor + (7 - anchor.weekday() + ZH_WEEKDAYS[weekday])`` days
       (presumably ZH_WEEKDAYS is Monday=0 based, matching
       ``date.weekday()`` - confirm against its definition).
    5. "tomorrow" resolves to the day after the anchor.

    Returns ``None`` when no hint is found.
    """
    iso = ISO_DATE_RE.search(text)
    if iso:
        year = int(iso.group("year"))
        month = int(iso.group("month"))
        day = int(iso.group("day"))
        return _safe_date(year, month, day)

    # The anchor is always a real date because of the date.today() fallback.
    anchor = _first_source_date(source) or reference_date or date.today()

    named_month = MONTH_DAY_RE.search(text)
    if named_month:
        month_number = MONTHS[named_month.group("month").casefold()]
        return _safe_date(anchor.year, month_number, int(named_month.group("day")))

    next_weekday = ZH_NEXT_WEEKDAY_RE.search(text)
    if next_weekday:
        weekday_index = ZH_WEEKDAYS[next_weekday.group("weekday")]
        return anchor + timedelta(days=7 - anchor.weekday() + weekday_index)

    if "tomorrow" in text.casefold():
        return anchor + timedelta(days=1)
    return None
980
+
981
+
982
def _first_source_date(source: str) -> date | None:
    """Return the first ISO date found in ``source``.

    Returns ``None`` when nothing matches ``ISO_DATE_RE`` or when the matched
    numbers do not form a real calendar date.
    """
    found = ISO_DATE_RE.search(source)
    if found is None:
        return None
    year, month, day = (int(found.group(part)) for part in ("year", "month", "day"))
    return _safe_date(year, month, day)
991
+
992
+
993
+ def _safe_date(year: int, month: int, day: int) -> date | None:
994
+ try:
995
+ return date(year, month, day)
996
+ except ValueError:
997
+ return None
998
+
999
+
1000
def _canonicalize_shorthand_refs(
    *,
    participants: list[Participant],
    mentioned: list[MentionedPerson],
    direct_facts: list[DirectFact],
    attributed_claims: list[AttributedClaim],
    follow_ups: list[FollowUpTask],
    relationships: list[RelationshipAssertion],
    explicit_aliases: dict[str, list[str]] | None = None,
) -> tuple[
    list[Participant],
    list[MentionedPerson],
    list[DirectFact],
    list[AttributedClaim],
    list[FollowUpTask],
    list[RelationshipAssertion],
]:
    """Rewrite shorthand person references to their canonical form everywhere.

    An alias map is built from the participants' own aliases and name
    shorthands, then overlaid with the explicitly declared aliases. Every
    person reference in the six collections is passed through
    ``_canonicalize_ref`` with that map. Participants and direct facts are
    de-duplicated afterwards, because canonicalising can make entries
    collide.

    When there is neither an alias map nor any explicit alias, the inputs are
    returned untouched.
    """
    explicit_aliases = explicit_aliases or {}
    alias_map = _participant_shorthand_map(participants)
    alias_map.update(_explicit_alias_map(explicit_aliases, participants))
    if not alias_map and not explicit_aliases:
        return participants, mentioned, direct_facts, attributed_claims, follow_ups, relationships

    def canonical(ref: PersonRef) -> PersonRef:
        return _canonicalize_ref(ref, alias_map, explicit_aliases)

    def canonical_or_none(ref: PersonRef | None) -> PersonRef | None:
        # Optional references stay None when absent.
        return canonical(ref) if ref else None

    new_participants = _dedupe_participants(
        [
            participant.model_copy(update={"person": canonical(participant.person)})
            for participant in participants
        ]
    )
    new_mentioned = [
        item.model_copy(
            update={
                "person": canonical(item.person),
                "mentioned_by": canonical_or_none(item.mentioned_by),
            }
        )
        for item in mentioned
    ]
    new_facts = _dedupe_direct_facts(
        [fact.model_copy(update={"subject": canonical(fact.subject)}) for fact in direct_facts]
    )
    new_claims = [
        claim.model_copy(
            update={
                "speaker": canonical_or_none(claim.speaker),
                "subject": canonical_or_none(claim.subject),
            }
        )
        for claim in attributed_claims
    ]
    new_follow_ups = [
        follow_up.model_copy(
            update={"related_people": [canonical(ref) for ref in follow_up.related_people]}
        )
        for follow_up in follow_ups
    ]
    new_relationships = [
        relationship.model_copy(
            update={
                "source": canonical(relationship.source),
                "target": canonical(relationship.target),
            }
        )
        for relationship in relationships
    ]
    return (
        new_participants,
        new_mentioned,
        new_facts,
        new_claims,
        new_follow_ups,
        new_relationships,
    )
1122
+
1123
+
1124
def _explicit_aliases(text: str) -> dict[str, list[str]]:
    """Collect canonical-name -> aliases declared explicitly in ``text``.

    The parenthesised alias patterns are scanned first, then the phrase
    patterns. Each match is validated and stored by ``_add_explicit_alias``.
    """
    found: dict[str, list[str]] = {}
    patterns = (
        ZH_ALIAS_PAREN_RE,
        ZH_ALIAS_START_PAREN_RE,
        ZH_ALIAS_PHRASE_RE,
        ZH_ALIAS_START_PHRASE_RE,
    )
    for pattern in patterns:
        for hit in pattern.finditer(text):
            _add_explicit_alias(
                found,
                hit.group("canonical").strip(),
                hit.group("alias").strip(),
            )
    return found
1137
+
1138
+
1139
def _single_explicit_alias_ref(explicit_aliases: dict[str, list[str]]) -> PersonRef | None:
    """Return a ``PersonRef`` for the only declared canonical name, if exactly one exists."""
    if len(explicit_aliases) != 1:
        return None
    ((canonical, aliases),) = explicit_aliases.items()
    return PersonRef(label=canonical, aliases=aliases)
1144
+
1145
+
1146
def _add_explicit_alias(aliases: dict[str, list[str]], canonical: str, alias: str) -> None:
    """Record ``alias`` under ``canonical`` in ``aliases`` when the pair is plausible.

    The pair is ignored when the canonical name looks like a sentence
    fragment or is not a probable person, when the alias is not a probable
    alias or looks like a date, or when both are equal ignoring case.
    Duplicate aliases are not appended twice. ``aliases`` is mutated in place.
    """
    rejected = (
        _looks_like_cjk_context_fragment(canonical)
        or not _is_probable_person(canonical)
        or not _is_probable_alias(alias)
        or _is_date_like(alias)
        or canonical.casefold() == alias.casefold()
    )
    if rejected:
        return
    known = aliases.setdefault(canonical, [])
    if alias not in known:
        known.append(alias)
1160
+
1161
+
1162
def _looks_like_cjk_context_fragment(label: str) -> bool:
    """Return True when a CJK "name" is really a fragment of the sentence.

    Only labels fully matching ``CJK_NAME`` are considered. They count as
    fragments when they start with a time word (today, yesterday, this
    evening, morning, ...) or contain a verb such as met / got to know /
    ran into / remember / record / save.
    """
    if re.fullmatch(CJK_NAME, label) is None:
        return False
    time_prefixes = ("今天", "昨天", "前天", "今晚", "上午", "下午", "晚上")
    if label.startswith(time_prefixes):
        return True
    for marker in ("见了", "认识了", "遇到", "碰到", "记住", "记录", "保存"):
        if marker in label:
            return True
    return False
1168
+
1169
+
1170
def _is_probable_alias(alias: str) -> bool:
    """Return True when the stripped alias fully matches ``CJK_ALIAS``.

    Empty aliases, aliases listed in ``CJK_ALIAS_DESCRIPTORS`` and aliases
    ending with one of ``CJK_ALIAS_DESCRIPTOR_SUFFIXES`` are rejected first.
    """
    candidate = alias.strip()
    if not candidate:
        return False
    if candidate in CJK_ALIAS_DESCRIPTORS or candidate.endswith(CJK_ALIAS_DESCRIPTOR_SUFFIXES):
        return False
    return re.fullmatch(CJK_ALIAS, candidate) is not None
1177
+
1178
+
1179
def _explicit_alias_map(
    explicit_aliases: dict[str, list[str]],
    participants: list[Participant],
) -> dict[str, str]:
    """Build a case-folded alias -> canonical-label map from explicit aliases.

    Every declared alias maps to its canonical name. Participants without an
    identifier (no id, email, phone or company hint) can extend the map: when
    such a participant resolves to exactly one declared canonical name, its
    label and aliases map to that name; and when one of its aliases is
    already mapped, its label maps to the same target.

    Aliases claimed by two different canonical names are ambiguous and
    dropped, as are aliases for which ``_has_distinct_alias_ref`` reports a
    distinct participant.
    """
    candidates: dict[str, str | None] = {}
    lookup: dict[str, str | None] = {}
    for canonical, aliases in explicit_aliases.items():
        _remember_alias_candidate(lookup, canonical, canonical)
        for alias in aliases:
            _remember_alias_candidate(candidates, alias, canonical)
            _remember_alias_candidate(lookup, alias, canonical)

    for participant in participants:
        ref = participant.person
        # Participants carrying an identifier are never folded into another name.
        if ref.person_id or ref.email or ref.phone or ref.company_hint:
            continue
        if not ref.label.strip().casefold():
            continue
        declared = _explicit_canonical_for_ref(ref, lookup, explicit_aliases)
        if declared and declared != ref.label:
            for name in (ref.label, *ref.aliases):
                _remember_alias_candidate(candidates, name, declared)
        for alias in ref.aliases:
            resolved = candidates.get(alias.strip().casefold())
            if resolved and resolved != ref.label:
                _remember_alias_candidate(candidates, ref.label, resolved)

    resolved_map: dict[str, str] = {}
    for alias_key, canonical in candidates.items():
        if canonical is None or _has_distinct_alias_ref(participants, alias_key, canonical):
            continue
        resolved_map[alias_key] = canonical
    return resolved_map
1212
+
1213
+
1214
+ def _remember_alias_candidate(
1215
+ mapping: dict[str, str | None], alias: str, canonical: str
1216
+ ) -> None:
1217
+ alias_key = alias.strip().casefold()
1218
+ if not alias_key:
1219
+ return
1220
+ if alias_key not in mapping:
1221
+ mapping[alias_key] = canonical
1222
+ elif mapping[alias_key] != canonical:
1223
+ mapping[alias_key] = None
1224
+
1225
+
1226
def _explicit_canonical_for_ref(
    ref: PersonRef,
    explicit_lookup: dict[str, str | None],
    explicit_aliases: dict[str, list[str]],
) -> str | None:
    """Return the single declared canonical name that ``ref`` resolves to.

    Candidates come from looking up the label and each alias in
    ``explicit_lookup``, plus any canonical name that is the label prefixed
    with "测试". The result is ``None`` unless exactly one distinct candidate
    is found.
    """
    candidates: set[str] = set()
    for name in (ref.label, *ref.aliases):
        resolved = explicit_lookup.get(name.strip().casefold())
        if resolved is not None:
            candidates.add(resolved)
    candidates.update(
        canonical
        for canonical in explicit_aliases
        if _same_test_prefixed_cjk_name(canonical, ref.label)
    )
    if len(candidates) != 1:
        return None
    (only,) = candidates
    return only
1242
+
1243
+
1244
def _same_test_prefixed_cjk_name(canonical: str, label: str) -> bool:
    """Return True when ``canonical`` is "测试" + ``label`` and ``label`` is a CJK name.

    Both inputs are stripped before comparison.
    """
    full = canonical.strip()
    if not full.startswith("测试"):
        return False
    bare = full.removeprefix("测试")
    if not bare or label.strip() != bare:
        return False
    return re.fullmatch(CJK_NAME, bare) is not None
1251
+
1252
+
1253
def _participant_shorthand_map(participants: list[Participant]) -> dict[str, str]:
    """Map participant shorthands (case-folded) to their full labels.

    Candidates are each participant's own aliases, plus the shorthands that
    ``_latin_name_aliases`` derives from a full Latin name. A shorthand
    claimed by two different labels is ambiguous and dropped, as is one for
    which ``_has_distinct_alias_ref`` reports a distinct participant.
    """
    candidates: dict[str, str | None] = {}
    for participant in participants:
        full = participant.person.label.strip()
        full_key = full.casefold()
        for alias in participant.person.aliases:
            # An alias that merely repeats the label carries no information.
            if alias.strip().casefold() == full_key:
                continue
            _remember_alias_candidate(candidates, alias, full)
        # _latin_name_aliases yields nothing unless `full` is a full Latin name.
        for shorthand in _latin_name_aliases(full):
            _remember_alias_candidate(candidates, shorthand, full)

    shorthand_map: dict[str, str] = {}
    for alias_key, full in candidates.items():
        if full is None or _has_distinct_alias_ref(participants, alias_key, full):
            continue
        shorthand_map[alias_key] = full
    return shorthand_map
1277
+
1278
+
1279
+ def _has_distinct_alias_ref(
1280
+ participants: list[Participant], alias: str, full_name: str
1281
+ ) -> bool:
1282
+ for participant in participants:
1283
+ ref = participant.person
1284
+ label = ref.label.strip()
1285
+ if label.lower() != alias:
1286
+ continue
1287
+ if ref.person_id or ref.email or ref.phone or ref.company_hint:
1288
+ return True
1289
+ if label.lower() == full_name.lower():
1290
+ return True
1291
+ return False
1292
+
1293
+
1294
def _canonicalize_ref(
    ref: PersonRef,
    alias_map: dict[str, str],
    explicit_aliases: dict[str, list[str]],
) -> PersonRef:
    """Return ``ref`` relabelled to its canonical name, with aliases merged.

    References carrying an id, email or phone keep their (stripped) label.
    Otherwise the canonical label is, in order of preference: a declared
    canonical name equal to the label, the alias-map entry for the label, or
    the alias-map entry for the first alias that has one. If none applies the
    label is kept.

    The resulting aliases are the original ones minus the canonical label,
    plus the original label (if it changed) and the declared aliases of the
    canonical name. The input object is returned unchanged when nothing
    differs.
    """
    label = ref.label.strip()
    if ref.person_id or ref.email or ref.phone:
        canonical = label
    else:
        canonical = _explicit_canonical_label(label, explicit_aliases) or alias_map.get(
            label.casefold()
        )
        if not canonical:
            for alias in ref.aliases:
                canonical = alias_map.get(alias.strip().casefold())
                if canonical:
                    break
    canonical = canonical or label

    merged = [alias for alias in ref.aliases if alias != canonical]
    if label != canonical and label not in merged:
        # Keep the shorthand the text actually used as an alias.
        merged.append(label)
    for declared in explicit_aliases.get(canonical, []):
        if declared != canonical and declared not in merged:
            merged.append(declared)

    if canonical == ref.label and merged == ref.aliases:
        return ref
    return ref.model_copy(update={"label": canonical, "aliases": merged})
1322
+
1323
+
1324
+ def _explicit_canonical_label(
1325
+ label: str, explicit_aliases: dict[str, list[str]]
1326
+ ) -> str | None:
1327
+ label_key = label.strip().casefold()
1328
+ for canonical in explicit_aliases:
1329
+ if canonical.strip().casefold() == label_key:
1330
+ return canonical
1331
+ return None
1332
+
1333
+
1334
def _dedupe_participants(participants: list[Participant]) -> list[Participant]:
    """Merge participants that share a label, ignoring case.

    The first participant with a given label is kept in place. The aliases of
    later duplicates are folded into it through ``_unique_labels``; nothing
    else is taken from the duplicates.
    """
    position_by_label: dict[str, int] = {}
    merged: list[Participant] = []
    for candidate in participants:
        label_key = candidate.person.label.lower()
        position = position_by_label.get(label_key)
        if position is None:
            position_by_label[label_key] = len(merged)
            merged.append(candidate)
            continue
        kept = merged[position]
        combined = _unique_labels([*kept.person.aliases, *candidate.person.aliases])
        merged[position] = kept.model_copy(
            update={"person": kept.person.model_copy(update={"aliases": combined})}
        )
    return merged
1350
+
1351
+
1352
def _person_label_referenced_in_text(label: str, text: str) -> bool:
    """Return True when ``text`` mentions the person.

    A mention is either the label as a substring or, for a full Latin name,
    its first token as a whole word.
    """
    if label in text:
        return True
    if not _is_full_latin_name(label):
        return False
    first = label.split()[0]
    return re.search(rf"\b{re.escape(first)}\b", text) is not None
1359
+
1360
+
1361
+ def _same_person_label(existing: str, candidate: str) -> bool:
1362
+ if existing.lower() == candidate.lower():
1363
+ return True
1364
+ existing_key = existing.strip().lower()
1365
+ candidate_key = candidate.strip().lower()
1366
+ if candidate_key in _latin_name_aliases(existing):
1367
+ return True
1368
+ if existing_key in _latin_name_aliases(candidate):
1369
+ return True
1370
+ return False
1371
+
1372
+
1373
+ def _is_full_latin_name(label: str) -> bool:
1374
+ return bool(re.fullmatch(r"[A-Z][A-Za-z'-]+(?:\s+[A-Z][A-Za-z'-]+)+", label.strip()))
1375
+
1376
+
1377
def _latin_name_aliases(label: str) -> set[str]:
    """Return lower-cased shorthands for a full Latin name.

    The shorthands are the first token plus every leading run of two or more
    tokens that stops before the last token. When the first token is "Test"
    and there are at least three tokens, the same shorthands are also
    produced with "Test" left out. Labels that are not full Latin names yield
    an empty set.
    """
    if not _is_full_latin_name(label):
        return set()
    tokens = label.strip().split()
    shorthands = {tokens[0].lower()}
    for end in range(2, len(tokens)):
        shorthands.add(" ".join(tokens[:end]).lower())
    if len(tokens) >= 3 and tokens[0].casefold() == "test":
        shorthands.add(tokens[1].lower())
        for end in range(3, len(tokens)):
            shorthands.add(" ".join(tokens[1:end]).lower())
    return shorthands
1387
+
1388
+
1389
+ def _default_subject(participants: list[Participant]) -> PersonRef | None:
1390
+ if len(participants) == 1:
1391
+ return participants[0].person
1392
+ return None
1393
+
1394
+
1395
+ def _optional_metadata(**values: object) -> dict[str, object]:
1396
+ return {key: value.strip() for key, value in values.items() if isinstance(value, str) and value.strip()}
1397
+
1398
+
1399
+ def _unique_labels(labels: Iterable[object]) -> list[str]:
1400
+ seen: set[str] = set()
1401
+ result: list[str] = []
1402
+ for label in labels:
1403
+ normalized = str(label).strip()
1404
+ key = normalized.lower()
1405
+ if normalized and key not in seen:
1406
+ seen.add(key)
1407
+ result.append(normalized)
1408
+ return result
1409
+
1410
+
1411
+ def _unique_pairs(pairs: list[tuple[str, str]]) -> list[tuple[str, str]]:
1412
+ seen: set[tuple[str, str]] = set()
1413
+ result: list[tuple[str, str]] = []
1414
+ for source, target in pairs:
1415
+ key = (source.lower(), target.lower())
1416
+ if key not in seen:
1417
+ seen.add(key)
1418
+ result.append((source, target))
1419
+ return result
1420
+
1421
+
1422
+ def _unique_triples(triples: list[tuple[str, str, str]]) -> list[tuple[str, str, str]]:
1423
+ seen: set[tuple[str, str, str]] = set()
1424
+ result: list[tuple[str, str, str]] = []
1425
+ for source, target, relationship_type in triples:
1426
+ key = (source.lower(), target.lower(), relationship_type)
1427
+ if key not in seen:
1428
+ seen.add(key)
1429
+ result.append((source, target, relationship_type))
1430
+ return result
1431
+
1432
+
1433
def _is_probable_person(label: str) -> bool:
    """Return True when ``label`` has the shape of a person name.

    Known bad labels are rejected. A label is accepted when it fully matches
    ``PLACEHOLDER_NAME``, ``LATIN_NAME`` (unless it is one of a few
    hard-coded venue/place names), or ``CJK_NAME``.
    """
    if _is_bad_person_label(label):
        return False
    if re.fullmatch(PLACEHOLDER_NAME, label) is not None:
        return True
    if re.fullmatch(LATIN_NAME, label) is None:
        return re.fullmatch(CJK_NAME, label) is not None
    # Capitalised multi-word venue/place names that otherwise look like Latin names.
    known_places = ("Blue Bottle", "Blue Bottle Coffee", "People Square", "West Lake")
    return label not in known_places