rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/extraction/entity_linker.py
@@ -0,0 +1,825 @@
+"""
+RNSR Entity Linker
+
+Cross-document entity linking with fuzzy matching and LLM disambiguation.
+Links entities that represent the same real-world entity across documents.
+
+Features adaptive learning for normalization patterns - learns new titles
+and suffixes from the user's document workload.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import re
+from collections import defaultdict
+from datetime import datetime
+from difflib import SequenceMatcher
+from pathlib import Path
+from threading import Lock
+from typing import TYPE_CHECKING, Any
+
+import structlog
+
+from rnsr.extraction.models import Entity, EntityLink, EntityType
+from rnsr.llm import get_llm
+
+if TYPE_CHECKING:
+    from rnsr.indexing.knowledge_graph import KnowledgeGraph
+
+logger = structlog.get_logger(__name__)
+
+
+# =============================================================================
+# Learned Normalization Patterns
+# =============================================================================
+
+DEFAULT_NORMALIZATION_PATH = Path.home() / ".rnsr" / "learned_normalization.json"
+
+
+class LearnedNormalizationPatterns:
+    """
+    Registry for learning domain-specific normalization patterns.
+
+    Learns:
+    - Titles/prefixes (Mr., Dr., Esq., Hon., M.D., etc.)
+    - Suffixes (Inc., LLC, GmbH, Pty Ltd, etc.)
+    - Domain-specific patterns (legal, medical, regional)
+    """
+
+    # Base patterns (always included)
+    BASE_TITLES = [
+        "mr.", "mrs.", "ms.", "dr.", "prof.",
+        "mr", "mrs", "ms", "dr", "prof",
+        "the", "hon.", "hon", "sir", "dame",
+    ]
+
+    BASE_SUFFIXES = [
+        ", inc.", ", inc", ", llc", ", llc.",
+        ", corp.", ", corp", ", ltd.", ", ltd",
+        "inc.", "inc", "llc", "corp.", "corp", "ltd.", "ltd",
+        ", esq.", ", esq", "esq.", "esq",
+        ", jr.", ", jr", ", sr.", ", sr",
+        "jr.", "jr", "sr.", "sr",
+    ]
+
+    def __init__(
+        self,
+        storage_path: Path | str | None = None,
+        auto_save: bool = True,
+    ):
+        """
+        Initialize the normalization patterns registry.
+
+        Args:
+            storage_path: Path to JSON file for persistence.
+            auto_save: Whether to save after each new pattern.
+        """
+        self.storage_path = Path(storage_path) if storage_path else DEFAULT_NORMALIZATION_PATH
+        self.auto_save = auto_save
+
+        self._lock = Lock()
+        self._titles: dict[str, dict[str, Any]] = {}
+        self._suffixes: dict[str, dict[str, Any]] = {}
+        self._dirty = False
+
+        self._load()
+
+    def _load(self) -> None:
+        """Load learned patterns from storage."""
+        if not self.storage_path.exists():
+            return
+
+        try:
+            with open(self.storage_path, "r") as f:
+                data = json.load(f)
+
+            self._titles = data.get("titles", {})
+            self._suffixes = data.get("suffixes", {})
+
+            logger.info(
+                "normalization_patterns_loaded",
+                titles=len(self._titles),
+                suffixes=len(self._suffixes),
+            )
+
+        except Exception as e:
+            logger.warning("failed_to_load_normalization_patterns", error=str(e))
+
+    def _save(self) -> None:
+        """Save patterns to storage."""
+        if not self._dirty:
+            return
+
+        try:
+            self.storage_path.parent.mkdir(parents=True, exist_ok=True)
+
+            data = {
+                "version": "1.0",
+                "updated_at": datetime.utcnow().isoformat(),
+                "titles": self._titles,
+                "suffixes": self._suffixes,
+            }
+
+            with open(self.storage_path, "w") as f:
+                json.dump(data, f, indent=2)
+
+            self._dirty = False
+
+        except Exception as e:
+            logger.warning("failed_to_save_normalization_patterns", error=str(e))
+
+    def record_title(
+        self,
+        title: str,
+        domain: str = "general",
+        entity_example: str = "",
+    ) -> None:
+        """
+        Record a learned title/prefix.
+
+        Args:
+            title: The title pattern (e.g., "Atty.", "M.D.").
+            domain: Domain category (legal, medical, regional, etc.).
+            entity_example: Example entity with this title.
+        """
+        title = title.lower().strip()
+
+        if not title or title in self.BASE_TITLES:
+            return
+
+        with self._lock:
+            now = datetime.utcnow().isoformat()
+
+            if title not in self._titles:
+                self._titles[title] = {
+                    "count": 0,
+                    "domain": domain,
+                    "first_seen": now,
+                    "last_seen": now,
+                    "examples": [],
+                }
+                logger.info("new_title_pattern_learned", title=title, domain=domain)
+
+            self._titles[title]["count"] += 1
+            self._titles[title]["last_seen"] = now
+
+            if entity_example and len(self._titles[title]["examples"]) < 3:
+                self._titles[title]["examples"].append(entity_example)
+
+            self._dirty = True
+
+            if self.auto_save:
+                self._save()
+
+    def record_suffix(
+        self,
+        suffix: str,
+        domain: str = "general",
+        entity_example: str = "",
+    ) -> None:
+        """
+        Record a learned suffix.
+
+        Args:
+            suffix: The suffix pattern (e.g., "GmbH", "Pty Ltd").
+            domain: Domain category (legal, corporate, regional, etc.).
+            entity_example: Example entity with this suffix.
+        """
+        suffix = suffix.lower().strip()
+
+        if not suffix or suffix in self.BASE_SUFFIXES:
+            return
+
+        with self._lock:
+            now = datetime.utcnow().isoformat()
+
+            if suffix not in self._suffixes:
+                self._suffixes[suffix] = {
+                    "count": 0,
+                    "domain": domain,
+                    "first_seen": now,
+                    "last_seen": now,
+                    "examples": [],
+                }
+                logger.info("new_suffix_pattern_learned", suffix=suffix, domain=domain)
+
+            self._suffixes[suffix]["count"] += 1
+            self._suffixes[suffix]["last_seen"] = now
+
+            if entity_example and len(self._suffixes[suffix]["examples"]) < 3:
+                self._suffixes[suffix]["examples"].append(entity_example)
+
+            self._dirty = True
+
+            if self.auto_save:
+                self._save()
+
+    def get_all_titles(self, min_count: int = 1) -> list[str]:
+        """Get all titles (base + learned)."""
+        learned = [
+            title for title, data in self._titles.items()
+            if data["count"] >= min_count
+        ]
+        return list(set(self.BASE_TITLES + learned))
+
+    def get_all_suffixes(self, min_count: int = 1) -> list[str]:
+        """Get all suffixes (base + learned)."""
+        learned = [
+            suffix for suffix, data in self._suffixes.items()
+            if data["count"] >= min_count
+        ]
+        return list(set(self.BASE_SUFFIXES + learned))
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get statistics about learned patterns."""
+        return {
+            "learned_titles": len(self._titles),
+            "learned_suffixes": len(self._suffixes),
+            "total_titles": len(self.get_all_titles()),
+            "total_suffixes": len(self.get_all_suffixes()),
+        }
+
+
+# Global normalization patterns instance
+_global_normalization_patterns: LearnedNormalizationPatterns | None = None
+
+
+def get_learned_normalization_patterns() -> LearnedNormalizationPatterns:
+    """Get the global normalization patterns registry."""
+    global _global_normalization_patterns
+
+    if _global_normalization_patterns is None:
+        custom_path = os.getenv("RNSR_NORMALIZATION_PATH")
+        _global_normalization_patterns = LearnedNormalizationPatterns(
+            storage_path=custom_path if custom_path else None
+        )
+
+    return _global_normalization_patterns
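+
+
+# The registry location can be overridden via the RNSR_NORMALIZATION_PATH
+# environment variable, e.g. (hypothetical path):
+#   RNSR_NORMALIZATION_PATH=/srv/rnsr/normalization.json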
+
+
+# LLM disambiguation prompt
+DISAMBIGUATION_PROMPT = """You are an expert at entity resolution. Determine if these two entities refer to the same real-world entity.
+
+Entity 1:
+- Name: {name1}
+- Type: {type1}
+- Aliases: {aliases1}
+- Context: {context1}
+- Document: {doc1}
+
+Entity 2:
+- Name: {name2}
+- Type: {type2}
+- Aliases: {aliases2}
+- Context: {context2}
+- Document: {doc2}
+
+Consider:
+1. Name similarity (accounting for variations, titles, abbreviations)
+2. Context similarity (same role, same events, same relationships)
+3. Document context (are these documents related?)
+
+Respond with JSON:
+```json
+{{
+  "same_entity": true/false,
+  "confidence": 0.0-1.0,
+  "reasoning": "Brief explanation of your decision"
+}}
+```
+"""
+
+
+class EntityLinker:
+    """
+    Links entities across documents using multiple strategies:
+
+    1. Exact match on canonical_name
+    2. Fuzzy string matching (difflib.SequenceMatcher-based)
+    3. Alias matching
+    4. LLM disambiguation for ambiguous cases
+    """
+
+    def __init__(
+        self,
+        knowledge_graph: KnowledgeGraph,
+        llm: Any | None = None,
+        exact_match_threshold: float = 1.0,
+        fuzzy_match_threshold: float = 0.85,
+        llm_confidence_threshold: float = 0.75,
+        use_llm_disambiguation: bool = True,
+        enable_pattern_learning: bool = True,
+    ):
+        """
+        Initialize the entity linker.
+
+        Args:
+            knowledge_graph: Knowledge graph for storing/querying entities.
+            llm: LLM instance for disambiguation. If None, uses get_llm().
+            exact_match_threshold: Confidence for exact matches.
+            fuzzy_match_threshold: Minimum similarity for fuzzy matching.
+            llm_confidence_threshold: Minimum LLM confidence to accept.
+            use_llm_disambiguation: Whether to use LLM for ambiguous cases.
+            enable_pattern_learning: Learn new normalization patterns.
+        """
+        self.kg = knowledge_graph
+        self.llm = llm
+        self.exact_match_threshold = exact_match_threshold
+        self.fuzzy_match_threshold = fuzzy_match_threshold
+        self.llm_confidence_threshold = llm_confidence_threshold
+        self.use_llm_disambiguation = use_llm_disambiguation
+        self.enable_pattern_learning = enable_pattern_learning
+
+        # Lazy LLM initialization
+        self._llm_initialized = False
+
+        # Normalization patterns registry
+        self._normalization_patterns = (
+            get_learned_normalization_patterns() if enable_pattern_learning else None
+        )
+
+    def _get_llm(self) -> Any:
+        """Get or initialize LLM."""
+        if self.llm is None and not self._llm_initialized:
+            self.llm = get_llm()
+            self._llm_initialized = True
+        return self.llm
+
+    def link_entities(
+        self,
+        new_entities: list[Entity],
+        target_doc_id: str | None = None,
+    ) -> list[EntityLink]:
+        """
+        Link new entities to existing entities in the knowledge graph.
+
+        Args:
+            new_entities: Newly extracted entities to link.
+            target_doc_id: If provided, only link to entities in this document.
+
+        Returns:
+            List of EntityLink objects for confirmed links.
+        """
+        links = []
+
+        for entity in new_entities:
+            # Find candidates from knowledge graph
+            candidates = self._find_candidates(entity, target_doc_id)
+
+            if not candidates:
+                continue
+
+            # Score each candidate
+            scored_candidates = []
+            for candidate in candidates:
+                score, method = self._score_match(entity, candidate)
+                if score >= self.fuzzy_match_threshold:
+                    scored_candidates.append((candidate, score, method))
+
+            # Sort by score
+            scored_candidates.sort(key=lambda x: -x[1])
+
+            if not scored_candidates:
+                continue
+
+            # Handle disambiguation
+            best_candidate, best_score, best_method = scored_candidates[0]
+
+            if best_score >= self.exact_match_threshold:
+                # Exact match - link directly
+                link = EntityLink(
+                    entity_id_1=entity.id,
+                    entity_id_2=best_candidate.id,
+                    confidence=best_score,
+                    link_method=best_method,
+                    evidence=f"Matched on: {entity.canonical_name} = {best_candidate.canonical_name}",
+                )
+                links.append(link)
+
+                # Store in knowledge graph
+                self.kg.link_entities(
+                    entity.id,
+                    best_candidate.id,
+                    confidence=best_score,
+                    link_method=best_method,
+                    evidence=link.evidence,
+                )
+
+            elif len(scored_candidates) == 1 and best_score >= self.fuzzy_match_threshold:
+                # Single fuzzy match - link with lower confidence
+                link = EntityLink(
+                    entity_id_1=entity.id,
+                    entity_id_2=best_candidate.id,
+                    confidence=best_score,
+                    link_method=best_method,
+                    evidence=f"Fuzzy match: {entity.canonical_name} ~ {best_candidate.canonical_name}",
+                )
+                links.append(link)
+
+                self.kg.link_entities(
+                    entity.id,
+                    best_candidate.id,
+                    confidence=best_score,
+                    link_method=best_method,
+                    evidence=link.evidence,
+                )
+
+            elif self.use_llm_disambiguation and len(scored_candidates) >= 1:
+                # Ambiguous - use LLM
+                llm_link = self._disambiguate_with_llm(entity, scored_candidates)
+                if llm_link:
+                    links.append(llm_link)
+
+                    self.kg.link_entities(
+                        llm_link.entity_id_1,
+                        llm_link.entity_id_2,
+                        confidence=llm_link.confidence,
+                        link_method=llm_link.link_method,
+                        evidence=llm_link.evidence,
+                    )
+
+        logger.info(
+            "entity_linking_complete",
+            new_entities=len(new_entities),
+            links_created=len(links),
+        )
+
+        return links
+
+    def _find_candidates(
+        self,
+        entity: Entity,
+        target_doc_id: str | None = None,
+    ) -> list[Entity]:
+        """
+        Find candidate entities that might match the given entity.
+
+        Args:
+            entity: Entity to find matches for.
+            target_doc_id: Optional document ID filter.
+
+        Returns:
+            List of candidate entities.
+        """
+        candidates = []
+
+        # Search by exact name
+        exact_matches = self.kg.find_entities_by_name(
+            entity.canonical_name,
+            entity_type=entity.type,
+            fuzzy=False,
+        )
+        candidates.extend(exact_matches)
+
+        # Search by fuzzy name
+        fuzzy_matches = self.kg.find_entities_by_name(
+            entity.canonical_name,
+            entity_type=entity.type,
+            fuzzy=True,
+        )
+        for match in fuzzy_matches:
+            if match.id not in {c.id for c in candidates}:
+                candidates.append(match)
+
+        # Search by aliases
+        for alias in entity.aliases:
+            alias_matches = self.kg.find_entities_by_name(
+                alias,
+                entity_type=entity.type,
+                fuzzy=True,
+            )
+            for match in alias_matches:
+                if match.id not in {c.id for c in candidates}:
+                    candidates.append(match)
+
+        # Filter out the entity itself
+        candidates = [c for c in candidates if c.id != entity.id]
+
+        # Filter by target document if specified
+        if target_doc_id:
+            candidates = [
+                c for c in candidates
+                if target_doc_id in c.document_ids
+            ]
+
+        return candidates
+
+    def _score_match(
+        self,
+        entity1: Entity,
+        entity2: Entity,
+    ) -> tuple[float, str]:
+        """
+        Score the similarity between two entities.
+
+        Args:
+            entity1: First entity.
+            entity2: Second entity.
+
+        Returns:
+            Tuple of (score, method) where score is 0.0-1.0.
+        """
+        # Type mismatch is a strong negative signal
+        if entity1.type != entity2.type:
+            return 0.0, "type_mismatch"
+
+        # Exact canonical name match
+        if entity1.canonical_name.lower().strip() == entity2.canonical_name.lower().strip():
+            return 1.0, "exact"
+
+        # Check alias matches
+        all_names1 = {n.lower().strip() for n in entity1.all_names}
+        all_names2 = {n.lower().strip() for n in entity2.all_names}
+
+        if all_names1 & all_names2:
+            return 0.95, "alias"
+
+        # Fuzzy string matching on all name combinations
+        best_similarity = 0.0
+        for name1 in entity1.all_names:
+            for name2 in entity2.all_names:
+                similarity = self._string_similarity(name1, name2)
+                best_similarity = max(best_similarity, similarity)
+
+        if best_similarity >= self.fuzzy_match_threshold:
+            return best_similarity, "fuzzy"
+
+        # Name containment (one name contains the other)
+        name1_lower = entity1.canonical_name.lower()
+        name2_lower = entity2.canonical_name.lower()
+
+        if name1_lower in name2_lower or name2_lower in name1_lower:
+            # Score based on length ratio
+            shorter = min(len(name1_lower), len(name2_lower))
+            longer = max(len(name1_lower), len(name2_lower))
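+            # Containment lands in [0.7, 0.9): e.g. "apple" inside "apple inc"
+            # scores 0.7 + 0.2 * (5 / 9) ≈ 0.81.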
+            return 0.7 + 0.2 * (shorter / longer), "containment"
+
+        return best_similarity, "low_similarity"
+
+    def _string_similarity(self, s1: str, s2: str) -> float:
+        """
+        Calculate string similarity using SequenceMatcher.
+
+        Args:
+            s1: First string.
+            s2: Second string.
+
+        Returns:
+            Similarity score 0.0-1.0.
+        """
+        # Normalize strings
+        s1 = self._normalize_name(s1)
+        s2 = self._normalize_name(s2)
+
+        # Use SequenceMatcher for similarity
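+        # ratio() = 2 * matched chars / total chars, e.g. "jon smith" vs.
+        # "john smith" -> 2 * 9 / 19 ≈ 0.95.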
+        return SequenceMatcher(None, s1, s2).ratio()
+
+    def _normalize_name(self, name: str, learn_patterns: bool = True) -> str:
+        """
+        Normalize a name for comparison.
+
+        Uses both base patterns and learned patterns from the registry.
+
+        Args:
+            name: Name to normalize.
+            learn_patterns: Whether to record new patterns found.
+
+        Returns:
+            Normalized name.
+        """
+        original_name = name
+
+        # Lowercase
+        name = name.lower()
+
+        # Get titles and suffixes (base + learned)
+        if self._normalization_patterns:
+            titles = self._normalization_patterns.get_all_titles()
+            suffixes = self._normalization_patterns.get_all_suffixes()
+        else:
+            titles = LearnedNormalizationPatterns.BASE_TITLES
+            suffixes = LearnedNormalizationPatterns.BASE_SUFFIXES
+
+        # Remove titles/prefixes
+        removed_title = None
+        for title in sorted(titles, key=len, reverse=True):  # Longest first
+            if name.startswith(title + " "):
+                removed_title = title
+                name = name[len(title) + 1:]
+                break
+
+        # Remove suffixes
+        removed_suffix = None
+        for suffix in sorted(suffixes, key=len, reverse=True):  # Longest first
+            if name.endswith(suffix):
+                removed_suffix = suffix
+                name = name[:-len(suffix)]
+                break
+
+        # Learn new patterns if enabled
+        if learn_patterns and self._normalization_patterns and self.enable_pattern_learning:
+            # Detect potential new titles (patterns like "X. Name" or "X Name")
+            title_match = re.match(r'^([A-Za-z]{1,4}\.?)\s+[A-Z]', original_name)
+            if title_match and not removed_title:
+                potential_title = title_match.group(1).lower()
+                if potential_title not in titles and len(potential_title) >= 2:
+                    self._normalization_patterns.record_title(
+                        potential_title,
+                        domain="detected",
+                        entity_example=original_name,
+                    )
+
+            # Detect potential new suffixes (patterns at end of company names)
+            suffix_patterns = [
+                r',?\s*(gmbh|ag|s\.a\.|pty\s*ltd|plc|bv|nv|asa|ab|as|oy|a/s|k\.k\.|co\.,?\s*ltd\.?)$',
+                r',?\s*([A-Z]{2,4}\.?)$',  # Acronym suffixes
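+                # (re.IGNORECASE in the search below means this also matches
+                # lowercase 2-4 letter trailing words, not just acronyms)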
+            ]
+            for pattern in suffix_patterns:
+                suffix_match = re.search(pattern, original_name, re.IGNORECASE)
+                if suffix_match and not removed_suffix:
+                    potential_suffix = suffix_match.group(1).lower()
+                    if potential_suffix not in suffixes:
+                        self._normalization_patterns.record_suffix(
+                            potential_suffix,
+                            domain="detected",
+                            entity_example=original_name,
+                        )
+
+        # Normalize whitespace
+        name = " ".join(name.split())
+
+        return name.strip()
+
+    def _disambiguate_with_llm(
+        self,
+        entity: Entity,
+        candidates: list[tuple[Entity, float, str]],
+    ) -> EntityLink | None:
+        """
+        Use LLM to disambiguate between multiple candidates.
+
+        Args:
+            entity: Entity to match.
+            candidates: List of (candidate, score, method) tuples.
+
+        Returns:
+            EntityLink if a match is found, None otherwise.
+        """
+        llm = self._get_llm()
+        if llm is None:
+            return None
+
+        # Get context from first mention
+        entity_context = ""
+        if entity.mentions:
+            entity_context = entity.mentions[0].context
+
+        # Try each candidate
+        for candidate, score, method in candidates:
+            candidate_context = ""
+            if candidate.mentions:
+                candidate_context = candidate.mentions[0].context
+
+            prompt = DISAMBIGUATION_PROMPT.format(
+                name1=entity.canonical_name,
+                type1=entity.type.value,
+                aliases1=", ".join(entity.aliases) or "None",
+                context1=entity_context or "No context available",
+                doc1=entity.source_doc_id or "Unknown",
+                name2=candidate.canonical_name,
+                type2=candidate.type.value,
+                aliases2=", ".join(candidate.aliases) or "None",
+                context2=candidate_context or "No context available",
+                doc2=candidate.source_doc_id or "Unknown",
+            )
+
+            try:
+                response = llm.complete(prompt)
+                response_text = str(response) if not isinstance(response, str) else response
+
+                result = self._parse_disambiguation_response(response_text)
+
+                if result and result.get("same_entity") and result.get("confidence", 0) >= self.llm_confidence_threshold:
+                    return EntityLink(
+                        entity_id_1=entity.id,
+                        entity_id_2=candidate.id,
+                        confidence=result["confidence"],
+                        link_method="llm",
+                        evidence=result.get("reasoning", "LLM disambiguation"),
+                    )
+
+            except Exception as e:
+                logger.warning(
+                    "llm_disambiguation_failed",
+                    entity=entity.canonical_name,
+                    candidate=candidate.canonical_name,
+                    error=str(e),
+                )
+
+        return None
+
+    def _parse_disambiguation_response(
+        self,
+        response_text: str,
+    ) -> dict[str, Any] | None:
+        """
+        Parse LLM disambiguation response.
+
+        Args:
+            response_text: Raw LLM response.
+
+        Returns:
+            Parsed result dict or None.
+        """
+        # Extract JSON from response
+        json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', response_text)
+        if json_match:
+            json_str = json_match.group(1)
+        else:
+            json_match = re.search(r'\{[\s\S]*\}', response_text)
+            if json_match:
+                json_str = json_match.group(0)
+            else:
+                return None
+
+        try:
+            return json.loads(json_str)
+        except json.JSONDecodeError:
+            return None
+
+    def link_all_entities_in_document(
+        self,
+        doc_id: str,
+    ) -> list[EntityLink]:
+        """
+        Link all entities within a document to each other.
+
+        Useful for finding co-references within a single document.
+
+        Args:
+            doc_id: Document ID.
+
+        Returns:
+            List of EntityLink objects.
+        """
+        entities = self.kg.find_entities_in_document(doc_id)
+
+        if len(entities) < 2:
+            return []
+
+        links = []
+
+        # Group entities by type
+        by_type: dict[EntityType, list[Entity]] = defaultdict(list)
+        for entity in entities:
+            by_type[entity.type].append(entity)
+
+        # Link within each type group
+        for entity_type, type_entities in by_type.items():
+            for i, entity1 in enumerate(type_entities):
+                for entity2 in type_entities[i + 1:]:
+                    score, method = self._score_match(entity1, entity2)
+
+                    if score >= self.fuzzy_match_threshold:
+                        link = EntityLink(
+                            entity_id_1=entity1.id,
+                            entity_id_2=entity2.id,
+                            confidence=score,
+                            link_method=method,
+                            evidence=f"Same document co-reference: {entity1.canonical_name} = {entity2.canonical_name}",
+                        )
+                        links.append(link)
+
+                        self.kg.link_entities(
+                            entity1.id,
+                            entity2.id,
+                            confidence=score,
+                            link_method=method,
+                            evidence=link.evidence,
+                        )
+
+        return links
+
+    def link_across_documents(
+        self,
+        doc_id_1: str,
+        doc_id_2: str,
+    ) -> list[EntityLink]:
+        """
+        Link entities between two specific documents.
+
+        Args:
+            doc_id_1: First document ID.
+            doc_id_2: Second document ID.
+
+        Returns:
+            List of EntityLink objects.
+        """
+        entities_1 = self.kg.find_entities_in_document(doc_id_1)
+        entities_2 = self.kg.find_entities_in_document(doc_id_2)
+
+        if not entities_1 or not entities_2:
+            return []
+
+        # Use the main linking method
+        return self.link_entities(entities_1, target_doc_id=doc_id_2)
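
A minimal usage sketch of the linker above (illustrative only: "kg" stands in
for a populated rnsr.indexing.knowledge_graph.KnowledgeGraph, "entities" for
freshly extracted Entity objects, and the document IDs are placeholders):

    from rnsr.extraction.entity_linker import EntityLinker

    linker = EntityLinker(knowledge_graph=kg, fuzzy_match_threshold=0.85)

    # Link newly extracted entities against the graph.
    links = linker.link_entities(entities)

    # Co-reference linking within one document, then across two.
    intra_links = linker.link_all_entities_in_document("doc-1")
    cross_links = linker.link_across_documents("doc-1", "doc-2")

    for link in links:
        print(link.entity_id_1, link.entity_id_2, link.confidence, link.link_method)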