rnsr-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,825 @@
"""
RNSR Entity Linker

Cross-document entity linking with fuzzy matching and LLM disambiguation.
Links entities that represent the same real-world entity across documents.

Features adaptive learning for normalization patterns - learns new titles
and suffixes from user's document workload.
"""

from __future__ import annotations

import json
import os
import re
from collections import defaultdict
from datetime import datetime
from difflib import SequenceMatcher
from pathlib import Path
from threading import Lock
from typing import TYPE_CHECKING, Any

import structlog

from rnsr.extraction.models import Entity, EntityLink, EntityType
from rnsr.llm import get_llm

if TYPE_CHECKING:
    from rnsr.indexing.knowledge_graph import KnowledgeGraph

logger = structlog.get_logger(__name__)


# =============================================================================
# Learned Normalization Patterns
# =============================================================================

DEFAULT_NORMALIZATION_PATH = Path.home() / ".rnsr" / "learned_normalization.json"


class LearnedNormalizationPatterns:
    """
    Registry for learning domain-specific normalization patterns.

    Learns:
    - Titles/prefixes (Mr., Dr., Esq., Hon., M.D., etc.)
    - Suffixes (Inc., LLC, GmbH, Pty Ltd, etc.)
    - Domain-specific patterns (legal, medical, regional)
    """

    # Base patterns (always included)
    BASE_TITLES = [
        "mr.", "mrs.", "ms.", "dr.", "prof.",
        "mr", "mrs", "ms", "dr", "prof",
        "the", "hon.", "hon", "sir", "dame",
    ]

    BASE_SUFFIXES = [
        ", inc.", ", inc", ", llc", ", llc.",
        ", corp.", ", corp", ", ltd.", ", ltd",
        "inc.", "inc", "llc", "corp.", "corp", "ltd.", "ltd",
        ", esq.", ", esq", "esq.", "esq",
        ", jr.", ", jr", ", sr.", ", sr",
        "jr.", "jr", "sr.", "sr",
    ]

    def __init__(
        self,
        storage_path: Path | str | None = None,
        auto_save: bool = True,
    ):
        """
        Initialize the normalization patterns registry.

        Args:
            storage_path: Path to JSON file for persistence.
            auto_save: Whether to save after each new pattern.
        """
        self.storage_path = Path(storage_path) if storage_path else DEFAULT_NORMALIZATION_PATH
        self.auto_save = auto_save

        self._lock = Lock()
        self._titles: dict[str, dict[str, Any]] = {}
        self._suffixes: dict[str, dict[str, Any]] = {}
        self._dirty = False

        self._load()

    def _load(self) -> None:
        """Load learned patterns from storage."""
        if not self.storage_path.exists():
            return

        try:
            with open(self.storage_path, "r") as f:
                data = json.load(f)

            self._titles = data.get("titles", {})
            self._suffixes = data.get("suffixes", {})

            logger.info(
                "normalization_patterns_loaded",
                titles=len(self._titles),
                suffixes=len(self._suffixes),
            )

        except Exception as e:
            logger.warning("failed_to_load_normalization_patterns", error=str(e))

    def _save(self) -> None:
        """Save patterns to storage."""
        if not self._dirty:
            return

        try:
            self.storage_path.parent.mkdir(parents=True, exist_ok=True)

            data = {
                "version": "1.0",
                "updated_at": datetime.utcnow().isoformat(),
                "titles": self._titles,
                "suffixes": self._suffixes,
            }

            with open(self.storage_path, "w") as f:
                json.dump(data, f, indent=2)

            self._dirty = False

        except Exception as e:
            logger.warning("failed_to_save_normalization_patterns", error=str(e))

    def record_title(
        self,
        title: str,
        domain: str = "general",
        entity_example: str = "",
    ) -> None:
        """
        Record a learned title/prefix.

        Args:
            title: The title pattern (e.g., "Atty.", "M.D.").
            domain: Domain category (legal, medical, regional, etc.).
            entity_example: Example entity with this title.
        """
        title = title.lower().strip()

        if not title or title in self.BASE_TITLES:
            return

        with self._lock:
            now = datetime.utcnow().isoformat()

            if title not in self._titles:
                self._titles[title] = {
                    "count": 0,
                    "domain": domain,
                    "first_seen": now,
                    "last_seen": now,
                    "examples": [],
                }
                logger.info("new_title_pattern_learned", title=title, domain=domain)

            self._titles[title]["count"] += 1
            self._titles[title]["last_seen"] = now

            if entity_example and len(self._titles[title]["examples"]) < 3:
                self._titles[title]["examples"].append(entity_example)

            self._dirty = True

            if self.auto_save:
                self._save()

    def record_suffix(
        self,
        suffix: str,
        domain: str = "general",
        entity_example: str = "",
    ) -> None:
        """
        Record a learned suffix.

        Args:
            suffix: The suffix pattern (e.g., "GmbH", "Pty Ltd").
            domain: Domain category (legal, corporate, regional, etc.).
            entity_example: Example entity with this suffix.
        """
        suffix = suffix.lower().strip()

        if not suffix or suffix in self.BASE_SUFFIXES:
            return

        with self._lock:
            now = datetime.utcnow().isoformat()

            if suffix not in self._suffixes:
                self._suffixes[suffix] = {
                    "count": 0,
                    "domain": domain,
                    "first_seen": now,
                    "last_seen": now,
                    "examples": [],
                }
                logger.info("new_suffix_pattern_learned", suffix=suffix, domain=domain)

            self._suffixes[suffix]["count"] += 1
            self._suffixes[suffix]["last_seen"] = now

            if entity_example and len(self._suffixes[suffix]["examples"]) < 3:
                self._suffixes[suffix]["examples"].append(entity_example)

            self._dirty = True

            if self.auto_save:
                self._save()

    def get_all_titles(self, min_count: int = 1) -> list[str]:
        """Get all titles (base + learned)."""
        learned = [
            title for title, data in self._titles.items()
            if data["count"] >= min_count
        ]
        return list(set(self.BASE_TITLES + learned))

    def get_all_suffixes(self, min_count: int = 1) -> list[str]:
        """Get all suffixes (base + learned)."""
        learned = [
            suffix for suffix, data in self._suffixes.items()
            if data["count"] >= min_count
        ]
        return list(set(self.BASE_SUFFIXES + learned))

    def get_stats(self) -> dict[str, Any]:
        """Get statistics about learned patterns."""
        return {
            "learned_titles": len(self._titles),
            "learned_suffixes": len(self._suffixes),
            "total_titles": len(self.get_all_titles()),
            "total_suffixes": len(self.get_all_suffixes()),
        }


# Global normalization patterns instance
_global_normalization_patterns: LearnedNormalizationPatterns | None = None


def get_learned_normalization_patterns() -> LearnedNormalizationPatterns:
    """Get the global normalization patterns registry."""
    global _global_normalization_patterns

    if _global_normalization_patterns is None:
        custom_path = os.getenv("RNSR_NORMALIZATION_PATH")
        _global_normalization_patterns = LearnedNormalizationPatterns(
            storage_path=custom_path if custom_path else None
        )

    return _global_normalization_patterns
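
As a quick sanity check of the registry API above, a minimal illustrative sketch (the temp path and example values are invented for the demo, not part of the package):

    from rnsr.extraction.entity_linker import LearnedNormalizationPatterns

    # Hypothetical throwaway path so the demo does not touch ~/.rnsr.
    patterns = LearnedNormalizationPatterns(storage_path="/tmp/normalization_demo.json")

    # Record patterns observed in a document workload; repeat sightings bump counts.
    patterns.record_title("atty.", domain="legal", entity_example="Atty. Jane Doe")
    patterns.record_suffix("gmbh", domain="corporate", entity_example="Acme GmbH")

    # Learned patterns are merged with the built-in BASE_* lists.
    assert "atty." in patterns.get_all_titles()
    assert "gmbh" in patterns.get_all_suffixes()
    print(patterns.get_stats())
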
# LLM disambiguation prompt
DISAMBIGUATION_PROMPT = """You are an expert at entity resolution. Determine if these two entities refer to the same real-world entity.

Entity 1:
- Name: {name1}
- Type: {type1}
- Aliases: {aliases1}
- Context: {context1}
- Document: {doc1}

Entity 2:
- Name: {name2}
- Type: {type2}
- Aliases: {aliases2}
- Context: {context2}
- Document: {doc2}

Consider:
1. Name similarity (accounting for variations, titles, abbreviations)
2. Context similarity (same role, same events, same relationships)
3. Document context (are these documents related?)

Respond with JSON:
```json
{{
    "same_entity": true/false,
    "confidence": 0.0-1.0,
    "reasoning": "Brief explanation of your decision"
}}
```
"""

class EntityLinker:
    """
    Links entities across documents using multiple strategies:

    1. Exact match on canonical_name
    2. Fuzzy string matching (Levenshtein-based)
    3. Alias matching
    4. LLM disambiguation for ambiguous cases
    """

    def __init__(
        self,
        knowledge_graph: KnowledgeGraph,
        llm: Any | None = None,
        exact_match_threshold: float = 1.0,
        fuzzy_match_threshold: float = 0.85,
        llm_confidence_threshold: float = 0.75,
        use_llm_disambiguation: bool = True,
        enable_pattern_learning: bool = True,
    ):
        """
        Initialize the entity linker.

        Args:
            knowledge_graph: Knowledge graph for storing/querying entities.
            llm: LLM instance for disambiguation. If None, uses get_llm().
            exact_match_threshold: Confidence for exact matches.
            fuzzy_match_threshold: Minimum similarity for fuzzy matching.
            llm_confidence_threshold: Minimum LLM confidence to accept.
            use_llm_disambiguation: Whether to use LLM for ambiguous cases.
            enable_pattern_learning: Learn new normalization patterns.
        """
        self.kg = knowledge_graph
        self.llm = llm
        self.exact_match_threshold = exact_match_threshold
        self.fuzzy_match_threshold = fuzzy_match_threshold
        self.llm_confidence_threshold = llm_confidence_threshold
        self.use_llm_disambiguation = use_llm_disambiguation
        self.enable_pattern_learning = enable_pattern_learning

        # Lazy LLM initialization
        self._llm_initialized = False

        # Normalization patterns registry
        self._normalization_patterns = get_learned_normalization_patterns() if enable_pattern_learning else None

    def _get_llm(self) -> Any:
        """Get or initialize LLM."""
        if self.llm is None and not self._llm_initialized:
            self.llm = get_llm()
            self._llm_initialized = True
        return self.llm

    def link_entities(
        self,
        new_entities: list[Entity],
        target_doc_id: str | None = None,
    ) -> list[EntityLink]:
        """
        Link new entities to existing entities in the knowledge graph.

        Args:
            new_entities: Newly extracted entities to link.
            target_doc_id: If provided, only link to entities in this document.

        Returns:
            List of EntityLink objects for confirmed links.
        """
        links = []

        for entity in new_entities:
            # Find candidates from knowledge graph
            candidates = self._find_candidates(entity, target_doc_id)

            if not candidates:
                continue

            # Score each candidate
            scored_candidates = []
            for candidate in candidates:
                score, method = self._score_match(entity, candidate)
                if score >= self.fuzzy_match_threshold:
                    scored_candidates.append((candidate, score, method))

            # Sort by score
            scored_candidates.sort(key=lambda x: -x[1])

            if not scored_candidates:
                continue

            # Handle disambiguation
            best_candidate, best_score, best_method = scored_candidates[0]

            if best_score >= self.exact_match_threshold:
                # Exact match - link directly
                link = EntityLink(
                    entity_id_1=entity.id,
                    entity_id_2=best_candidate.id,
                    confidence=best_score,
                    link_method=best_method,
                    evidence=f"Matched on: {entity.canonical_name} = {best_candidate.canonical_name}",
                )
                links.append(link)

                # Store in knowledge graph
                self.kg.link_entities(
                    entity.id,
                    best_candidate.id,
                    confidence=best_score,
                    link_method=best_method,
                    evidence=link.evidence,
                )

            elif len(scored_candidates) == 1 and best_score >= self.fuzzy_match_threshold:
                # Single fuzzy match - link with lower confidence
                link = EntityLink(
                    entity_id_1=entity.id,
                    entity_id_2=best_candidate.id,
                    confidence=best_score,
                    link_method=best_method,
                    evidence=f"Fuzzy match: {entity.canonical_name} ~ {best_candidate.canonical_name}",
                )
                links.append(link)

                self.kg.link_entities(
                    entity.id,
                    best_candidate.id,
                    confidence=best_score,
                    link_method=best_method,
                    evidence=link.evidence,
                )

            elif self.use_llm_disambiguation and len(scored_candidates) >= 1:
                # Ambiguous - use LLM
                llm_link = self._disambiguate_with_llm(entity, scored_candidates)
                if llm_link:
                    links.append(llm_link)

                    self.kg.link_entities(
                        llm_link.entity_id_1,
                        llm_link.entity_id_2,
                        confidence=llm_link.confidence,
                        link_method=llm_link.link_method,
                        evidence=llm_link.evidence,
                    )

        logger.info(
            "entity_linking_complete",
            new_entities=len(new_entities),
            links_created=len(links),
        )

        return links

    def _find_candidates(
        self,
        entity: Entity,
        target_doc_id: str | None = None,
    ) -> list[Entity]:
        """
        Find candidate entities that might match the given entity.

        Args:
            entity: Entity to find matches for.
            target_doc_id: Optional document ID filter.

        Returns:
            List of candidate entities.
        """
        candidates = []

        # Search by exact name
        exact_matches = self.kg.find_entities_by_name(
            entity.canonical_name,
            entity_type=entity.type,
            fuzzy=False,
        )
        candidates.extend(exact_matches)

        # Search by fuzzy name
        fuzzy_matches = self.kg.find_entities_by_name(
            entity.canonical_name,
            entity_type=entity.type,
            fuzzy=True,
        )
        for match in fuzzy_matches:
            if match.id not in {c.id for c in candidates}:
                candidates.append(match)

        # Search by aliases
        for alias in entity.aliases:
            alias_matches = self.kg.find_entities_by_name(
                alias,
                entity_type=entity.type,
                fuzzy=True,
            )
            for match in alias_matches:
                if match.id not in {c.id for c in candidates}:
                    candidates.append(match)

        # Filter out the entity itself
        candidates = [c for c in candidates if c.id != entity.id]

        # Filter by target document if specified
        if target_doc_id:
            candidates = [
                c for c in candidates
                if target_doc_id in c.document_ids
            ]

        return candidates

    def _score_match(
        self,
        entity1: Entity,
        entity2: Entity,
    ) -> tuple[float, str]:
        """
        Score the similarity between two entities.

        Args:
            entity1: First entity.
            entity2: Second entity.

        Returns:
            Tuple of (score, method) where score is 0.0-1.0.
        """
        # Type mismatch is a strong negative signal
        if entity1.type != entity2.type:
            return 0.0, "type_mismatch"

        # Exact canonical name match
        if entity1.canonical_name.lower().strip() == entity2.canonical_name.lower().strip():
            return 1.0, "exact"

        # Check alias matches
        all_names1 = {n.lower().strip() for n in entity1.all_names}
        all_names2 = {n.lower().strip() for n in entity2.all_names}

        if all_names1 & all_names2:
            return 0.95, "alias"

        # Fuzzy string matching on all name combinations
        best_similarity = 0.0
        for name1 in entity1.all_names:
            for name2 in entity2.all_names:
                similarity = self._string_similarity(name1, name2)
                best_similarity = max(best_similarity, similarity)

        if best_similarity >= self.fuzzy_match_threshold:
            return best_similarity, "fuzzy"

        # Name containment (one name contains the other)
        name1_lower = entity1.canonical_name.lower()
        name2_lower = entity2.canonical_name.lower()

        if name1_lower in name2_lower or name2_lower in name1_lower:
            # Score based on length ratio
            shorter = min(len(name1_lower), len(name2_lower))
            longer = max(len(name1_lower), len(name2_lower))
            return 0.7 + 0.2 * (shorter / longer), "containment"

        return best_similarity, "low_similarity"

    def _string_similarity(self, s1: str, s2: str) -> float:
        """
        Calculate string similarity using SequenceMatcher.

        Args:
            s1: First string.
            s2: Second string.

        Returns:
            Similarity score 0.0-1.0.
        """
        # Normalize strings
        s1 = self._normalize_name(s1)
        s2 = self._normalize_name(s2)

        # Use SequenceMatcher for similarity
        return SequenceMatcher(None, s1, s2).ratio()

    def _normalize_name(self, name: str, learn_patterns: bool = True) -> str:
        """
        Normalize a name for comparison.

        Uses both base patterns and learned patterns from the registry.

        Args:
            name: Name to normalize.
            learn_patterns: Whether to record new patterns found.

        Returns:
            Normalized name.
        """
        original_name = name

        # Lowercase
        name = name.lower()

        # Get titles and suffixes (base + learned)
        if self._normalization_patterns:
            titles = self._normalization_patterns.get_all_titles()
            suffixes = self._normalization_patterns.get_all_suffixes()
        else:
            titles = LearnedNormalizationPatterns.BASE_TITLES
            suffixes = LearnedNormalizationPatterns.BASE_SUFFIXES

        # Remove titles/prefixes
        removed_title = None
        for title in sorted(titles, key=len, reverse=True):  # Longest first
            if name.startswith(title + " "):
                removed_title = title
                name = name[len(title) + 1:]
                break

        # Remove suffixes
        removed_suffix = None
        for suffix in sorted(suffixes, key=len, reverse=True):  # Longest first
            if name.endswith(suffix):
                removed_suffix = suffix
                name = name[:-len(suffix)]
                break

        # Learn new patterns if enabled
        if learn_patterns and self._normalization_patterns and self.enable_pattern_learning:
            # Detect potential new titles (patterns like "X. Name" or "X Name")
            title_match = re.match(r'^([A-Za-z]{1,4}\.?)\s+[A-Z]', original_name)
            if title_match and not removed_title:
                potential_title = title_match.group(1).lower()
                if potential_title not in titles and len(potential_title) >= 2:
                    self._normalization_patterns.record_title(
                        potential_title,
                        domain="detected",
                        entity_example=original_name,
                    )

            # Detect potential new suffixes (patterns at end of company names)
            suffix_patterns = [
                r',?\s*(gmbh|ag|s\.a\.|pty\s*ltd|plc|bv|nv|asa|ab|as|oy|a/s|k\.k\.|co\.,?\s*ltd\.?)$',
                r',?\s*([A-Z]{2,4}\.?)$',  # Acronym suffixes
            ]
            for pattern in suffix_patterns:
                suffix_match = re.search(pattern, original_name, re.IGNORECASE)
                if suffix_match and not removed_suffix:
                    potential_suffix = suffix_match.group(1).lower()
                    if potential_suffix not in suffixes:
                        self._normalization_patterns.record_suffix(
                            potential_suffix,
                            domain="detected",
                            entity_example=original_name,
                        )

        # Normalize whitespace
        name = " ".join(name.split())

        return name.strip()

    def _disambiguate_with_llm(
        self,
        entity: Entity,
        candidates: list[tuple[Entity, float, str]],
    ) -> EntityLink | None:
        """
        Use LLM to disambiguate between multiple candidates.

        Args:
            entity: Entity to match.
            candidates: List of (candidate, score, method) tuples.

        Returns:
            EntityLink if a match is found, None otherwise.
        """
        llm = self._get_llm()
        if llm is None:
            return None

        # Get context from first mention
        entity_context = ""
        if entity.mentions:
            entity_context = entity.mentions[0].context

        # Try each candidate
        for candidate, score, method in candidates:
            candidate_context = ""
            if candidate.mentions:
                candidate_context = candidate.mentions[0].context

            prompt = DISAMBIGUATION_PROMPT.format(
                name1=entity.canonical_name,
                type1=entity.type.value,
                aliases1=", ".join(entity.aliases) or "None",
                context1=entity_context or "No context available",
                doc1=entity.source_doc_id or "Unknown",
                name2=candidate.canonical_name,
                type2=candidate.type.value,
                aliases2=", ".join(candidate.aliases) or "None",
                context2=candidate_context or "No context available",
                doc2=candidate.source_doc_id or "Unknown",
            )

            try:
                response = llm.complete(prompt)
                response_text = str(response) if not isinstance(response, str) else response

                result = self._parse_disambiguation_response(response_text)

                if result and result.get("same_entity") and result.get("confidence", 0) >= self.llm_confidence_threshold:
                    return EntityLink(
                        entity_id_1=entity.id,
                        entity_id_2=candidate.id,
                        confidence=result["confidence"],
                        link_method="llm",
                        evidence=result.get("reasoning", "LLM disambiguation"),
                    )

            except Exception as e:
                logger.warning(
                    "llm_disambiguation_failed",
                    entity=entity.canonical_name,
                    candidate=candidate.canonical_name,
                    error=str(e),
                )

        return None

    def _parse_disambiguation_response(
        self,
        response_text: str,
    ) -> dict[str, Any] | None:
        """
        Parse LLM disambiguation response.

        Args:
            response_text: Raw LLM response.

        Returns:
            Parsed result dict or None.
        """
        # Extract JSON from response
        json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', response_text)
        if json_match:
            json_str = json_match.group(1)
        else:
            json_match = re.search(r'\{[\s\S]*\}', response_text)
            if json_match:
                json_str = json_match.group(0)
            else:
                return None

        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return None

    def link_all_entities_in_document(
        self,
        doc_id: str,
    ) -> list[EntityLink]:
        """
        Link all entities within a document to each other.

        Useful for finding co-references within a single document.

        Args:
            doc_id: Document ID.

        Returns:
            List of EntityLink objects.
        """
        entities = self.kg.find_entities_in_document(doc_id)

        if len(entities) < 2:
            return []

        links = []

        # Group entities by type
        by_type: dict[EntityType, list[Entity]] = defaultdict(list)
        for entity in entities:
            by_type[entity.type].append(entity)

        # Link within each type group
        for entity_type, type_entities in by_type.items():
            for i, entity1 in enumerate(type_entities):
                for entity2 in type_entities[i + 1:]:
                    score, method = self._score_match(entity1, entity2)

                    if score >= self.fuzzy_match_threshold:
                        link = EntityLink(
                            entity_id_1=entity1.id,
                            entity_id_2=entity2.id,
                            confidence=score,
                            link_method=method,
                            evidence=f"Same document co-reference: {entity1.canonical_name} = {entity2.canonical_name}",
                        )
                        links.append(link)

                        self.kg.link_entities(
                            entity1.id,
                            entity2.id,
                            confidence=score,
                            link_method=method,
                            evidence=link.evidence,
                        )

        return links

    def link_across_documents(
        self,
        doc_id_1: str,
        doc_id_2: str,
    ) -> list[EntityLink]:
        """
        Link entities between two specific documents.

        Args:
            doc_id_1: First document ID.
            doc_id_2: Second document ID.

        Returns:
            List of EntityLink objects.
        """
        entities_1 = self.kg.find_entities_in_document(doc_id_1)
        entities_2 = self.kg.find_entities_in_document(doc_id_2)

        if not entities_1 or not entities_2:
            return []

        # Use the main linking method
        return self.link_entities(entities_1, target_doc_id=doc_id_2)
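
Taken together, a hedged end-to-end sketch of how this module is driven (document IDs are invented; the no-argument KnowledgeGraph() construction is an assumption, so consult rnsr.indexing.knowledge_graph for the actual signature; entities normally arrive via the extraction pipeline):

    from rnsr.extraction.entity_linker import EntityLinker
    from rnsr.indexing.knowledge_graph import KnowledgeGraph

    kg = KnowledgeGraph()  # assumed default construction; adjust to the real API

    linker = EntityLinker(
        knowledge_graph=kg,
        fuzzy_match_threshold=0.85,    # default, shown explicitly
        use_llm_disambiguation=False,  # skip LLM calls for a smoke test
    )

    # Co-references within one already-indexed document...
    intra = linker.link_all_entities_in_document("doc-1")

    # ...and links between two documents.
    cross = linker.link_across_documents("doc-1", "doc-2")
    for link in cross:
        print(link.entity_id_1, "<->", link.entity_id_2, link.confidence, link.link_method)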