rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/extraction/entity_extractor.py
@@ -0,0 +1,581 @@
"""
RNSR Entity Extractor

DEPRECATED: This extractor uses an LLM-first approach, which can hallucinate.
Use RLMUnifiedExtractor instead for grounded, accurate extraction.

LLM-based entity extraction from document sections.
Extracts people, organizations, dates, legal concepts, and other entities.

Features adaptive learning: when the LLM discovers new entity types, they are
stored in a learned types registry and used in future extraction prompts.
"""

from __future__ import annotations

import json
import re
import time
import warnings
from typing import Any

import structlog

from rnsr.extraction.models import (
    Entity,
    EntityType,
    ExtractionResult,
    Mention,
)
from rnsr.extraction.learned_types import (
    get_learned_type_registry,
    record_learned_type,
)
from rnsr.llm import get_llm

logger = structlog.get_logger(__name__)

# Deprecation warning
_DEPRECATION_WARNING = """
EntityExtractor is deprecated and may hallucinate entities.
Use RLMUnifiedExtractor instead for grounded, accurate extraction:

    from rnsr.extraction import RLMUnifiedExtractor
    extractor = RLMUnifiedExtractor()
    result = extractor.extract(node_id, doc_id, header, content)
"""


# Entity extraction prompt template
ENTITY_EXTRACTION_PROMPT = """You are an expert entity extractor for legal and business documents.

Analyze the following document section and extract all significant entities.

Document Section:
---
{content}
---

Section ID: {node_id}
Document ID: {doc_id}
Section Header: {header}

Extract entities of the following types:
- PERSON: Names of individuals, including their roles if mentioned (e.g., "plaintiff", "defendant", "witness", "CEO")
- ORGANIZATION: Companies, agencies, courts, government bodies
- LEGAL_CONCEPT: Legal claims, breaches, obligations, remedies, causes of action
- DATE: Specific dates, time periods, deadlines
- EVENT: Significant occurrences (hearings, signings, breaches, filings)
- LOCATION: Places, addresses, jurisdictions
- REFERENCE: Section references, exhibit numbers, document citations
- MONETARY: Dollar amounts, financial figures
- DOCUMENT: Referenced documents (contracts, exhibits, agreements)
{learned_types_section}
For each entity, provide:
1. type: One of the types above (or your own descriptive type if none fit)
2. canonical_name: The standardized/normalized name
3. aliases: Any alternative names or spellings found
4. context: The surrounding sentence or phrase where the entity appears
5. metadata: Any additional relevant information (roles, dates, amounts)

Return your response as a JSON array of entities:
```json
[
  {{
    "type": "PERSON",
    "canonical_name": "John Smith",
    "aliases": ["Mr. Smith", "J. Smith"],
    "context": "John Smith, the defendant, filed a motion...",
    "metadata": {{"role": "defendant"}}
  }},
  ...
]
```

If no entities are found, return an empty array: []

Important:
- Be thorough but precise - only extract clearly identifiable entities
- Normalize names (e.g., "Mr. John Smith" -> "John Smith")
- Include context that helps understand the entity's role
- For legal concepts, use standardized legal terminology
- If an entity doesn't fit the predefined types, use your own descriptive type name
"""


class EntityExtractor:
    """
    DEPRECATED: Extracts entities from document sections using an LLM-first approach.

    This extractor can hallucinate entities. Use RLMUnifiedExtractor instead.

    Supports batch processing, caching, and adaptive learning of entity types.
    When new entity types are discovered, they are stored and used in future prompts.
    """

    def __init__(
        self,
        llm: Any | None = None,
        min_content_length: int = 50,
        max_content_length: int = 8000,
        enable_type_learning: bool = True,
        learned_type_min_count: int = 2,
        suppress_deprecation_warning: bool = False,
    ):
        """
        Initialize the entity extractor.

        Args:
            llm: LLM instance to use. If None, uses get_llm().
            min_content_length: Minimum content length to process.
            max_content_length: Maximum content length per extraction call.
            enable_type_learning: Whether to learn new entity types.
            learned_type_min_count: Minimum occurrences before a learned type
                is included in extraction prompts.
            suppress_deprecation_warning: If True, skip the deprecation warning.
        """
        # Emit deprecation warning
        if not suppress_deprecation_warning:
            warnings.warn(
                _DEPRECATION_WARNING,
                DeprecationWarning,
                stacklevel=2,
            )
            logger.warning("deprecated_extractor_used", extractor="EntityExtractor")

        self.llm = llm or get_llm()
        self.min_content_length = min_content_length
        self.max_content_length = max_content_length
        self.enable_type_learning = enable_type_learning
        self.learned_type_min_count = learned_type_min_count

        # Cache for extracted entities (node_id -> entities)
        self._cache: dict[str, list[Entity]] = {}

        # Get learned type registry
        self._type_registry = get_learned_type_registry() if enable_type_learning else None
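
    # Usage sketch (illustrative; "sec-1", "doc-1", and section_text are
    # placeholders, and a configured LLM backend from get_llm() is assumed):
    #
    #     extractor = EntityExtractor(suppress_deprecation_warning=True)
    #     result = extractor.extract_from_node(
    #         node_id="sec-1",
    #         doc_id="doc-1",
    #         header="1. Parties",
    #         content=section_text,
    #     )
    #     for entity in result.entities:
    #         print(entity.type, entity.canonical_name, entity.aliases)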

    def extract_from_node(
        self,
        node_id: str,
        doc_id: str,
        header: str,
        content: str,
        page_num: int | None = None,
    ) -> ExtractionResult:
        """
        Extract entities from a single document node.

        Args:
            node_id: Skeleton node ID.
            doc_id: Document ID.
            header: Section header text.
            content: Full section content.
            page_num: Page number if available.

        Returns:
            ExtractionResult with extracted entities.
        """
        start_time = time.time()
        result = ExtractionResult(
            node_id=node_id,
            doc_id=doc_id,
            extraction_method="llm",
        )

        # Skip very short content
        if len(content.strip()) < self.min_content_length:
            logger.debug(
                "skipping_short_content",
                node_id=node_id,
                content_length=len(content),
            )
            return result

        # Check cache
        cache_key = f"{doc_id}:{node_id}"
        if cache_key in self._cache:
            result.entities = self._cache[cache_key]
            logger.debug("using_cached_entities", node_id=node_id)
            return result

        # Truncate content if too long (record the length first, so the
        # warning reports the pre-truncation size)
        if len(content) > self.max_content_length:
            original_length = len(content)
            content = content[:self.max_content_length] + "..."
            result.warnings.append(f"Content truncated from {original_length} chars")

        try:
            entities = self._extract_with_llm(
                node_id=node_id,
                doc_id=doc_id,
                header=header,
                content=content,
                page_num=page_num,
            )
            result.entities = entities

            # Cache results
            self._cache[cache_key] = entities

        except Exception as e:
            logger.error(
                "entity_extraction_failed",
                node_id=node_id,
                error=str(e),
            )
            result.warnings.append(f"Extraction failed: {str(e)}")

        result.processing_time_ms = (time.time() - start_time) * 1000

        logger.info(
            "entities_extracted",
            node_id=node_id,
            entity_count=len(result.entities),
            processing_time_ms=result.processing_time_ms,
        )

        return result

    def _extract_with_llm(
        self,
        node_id: str,
        doc_id: str,
        header: str,
        content: str,
        page_num: int | None = None,
    ) -> list[Entity]:
        """
        Use LLM to extract entities from content.

        Args:
            node_id: Skeleton node ID.
            doc_id: Document ID.
            header: Section header.
            content: Section content.
            page_num: Page number.

        Returns:
            List of extracted Entity objects.
        """
        # Build learned types section for prompt
        learned_types_section = ""
        if self._type_registry:
            learned_types = self._type_registry.get_types_for_prompt(
                min_count=self.learned_type_min_count,
                limit=15,
            )
            if learned_types:
                types_list = ", ".join(learned_types).upper()
                learned_types_section = f"\nAdditionally, these domain-specific types have been learned from previous documents:\n- {types_list}\n"

        prompt = ENTITY_EXTRACTION_PROMPT.format(
            content=content,
            node_id=node_id,
            doc_id=doc_id,
            header=header,
            learned_types_section=learned_types_section,
        )

        # Call LLM
        response = self.llm.complete(prompt)
        response_text = str(response) if not isinstance(response, str) else response

        # Parse JSON from response
        entities = self._parse_llm_response(
            response_text=response_text,
            node_id=node_id,
            doc_id=doc_id,
            page_num=page_num,
        )

        return entities

    def _parse_llm_response(
        self,
        response_text: str,
        node_id: str,
        doc_id: str,
        page_num: int | None = None,
    ) -> list[Entity]:
        """
        Parse LLM response into Entity objects.

        Args:
            response_text: Raw LLM response.
            node_id: Source node ID.
            doc_id: Source document ID.
            page_num: Page number.

        Returns:
            List of Entity objects.
        """
        # Extract JSON from response (may be wrapped in markdown code block)
        json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', response_text)
        if json_match:
            json_str = json_match.group(1)
        else:
            # Try to find a JSON array directly
            json_match = re.search(r'\[[\s\S]*\]', response_text)
            if json_match:
                json_str = json_match.group(0)
            else:
                logger.warning(
                    "no_json_found_in_response",
                    response_preview=response_text[:200],
                )
                return []

        try:
            raw_entities = json.loads(json_str)
        except json.JSONDecodeError as e:
            logger.warning(
                "json_parse_error",
                error=str(e),
                json_preview=json_str[:200],
            )
            return []

        if not isinstance(raw_entities, list):
            logger.warning("expected_list_of_entities", got=type(raw_entities).__name__)
            return []

        entities = []
        for raw in raw_entities:
            try:
                entity = self._create_entity_from_raw(
                    raw=raw,
                    node_id=node_id,
                    doc_id=doc_id,
                    page_num=page_num,
                )
                if entity:
                    entities.append(entity)
            except Exception as e:
                logger.debug(
                    "failed_to_create_entity",
                    raw=raw,
                    error=str(e),
                )

        return entities
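
    # Parsing sketch (illustrative): given the two regexes above, both a
    # fenced and a bare response are accepted, e.g.
    #
    #     '```json\n[{"type": "PERSON", "canonical_name": "John Smith"}]\n```'
    #     '[{"type": "PERSON", "canonical_name": "John Smith"}]'
    #
    # A response containing no bracketed JSON array yields [] plus a
    # "no_json_found_in_response" warning log.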

    def _create_entity_from_raw(
        self,
        raw: dict[str, Any],
        node_id: str,
        doc_id: str,
        page_num: int | None = None,
    ) -> Entity | None:
        """
        Create an Entity object from raw LLM output.

        Args:
            raw: Raw entity dict from LLM.
            node_id: Source node ID.
            doc_id: Source document ID.
            page_num: Page number.

        Returns:
            Entity object or None if invalid.
        """
        # Parse entity type
        type_str = raw.get("type", "").upper()
        original_type = type_str  # Preserve for metadata

        try:
            entity_type = EntityType(type_str.lower())
        except ValueError:
            # Try mapping common variations
            type_mapping = {
                "PERSON": EntityType.PERSON,
                "PEOPLE": EntityType.PERSON,
                "INDIVIDUAL": EntityType.PERSON,
                "NAME": EntityType.PERSON,
                "ORGANIZATION": EntityType.ORGANIZATION,
                "ORG": EntityType.ORGANIZATION,
                "COMPANY": EntityType.ORGANIZATION,
                "AGENCY": EntityType.ORGANIZATION,
                "COURT": EntityType.ORGANIZATION,
                "LEGAL_CONCEPT": EntityType.LEGAL_CONCEPT,
                "LEGAL": EntityType.LEGAL_CONCEPT,
                "CONCEPT": EntityType.LEGAL_CONCEPT,
                "CLAIM": EntityType.LEGAL_CONCEPT,
                "OBLIGATION": EntityType.LEGAL_CONCEPT,
                "DATE": EntityType.DATE,
                "TIME": EntityType.DATE,
                "DATETIME": EntityType.DATE,
                "PERIOD": EntityType.DATE,
                "EVENT": EntityType.EVENT,
                "OCCURRENCE": EntityType.EVENT,
                "INCIDENT": EntityType.EVENT,
                "LOCATION": EntityType.LOCATION,
                "PLACE": EntityType.LOCATION,
                "ADDRESS": EntityType.LOCATION,
                "JURISDICTION": EntityType.LOCATION,
                "REFERENCE": EntityType.REFERENCE,
                "REF": EntityType.REFERENCE,
                "CITATION": EntityType.REFERENCE,
                "SECTION": EntityType.REFERENCE,
                "MONETARY": EntityType.MONETARY,
                "MONEY": EntityType.MONETARY,
                "AMOUNT": EntityType.MONETARY,
                "CURRENCY": EntityType.MONETARY,
                "FINANCIAL": EntityType.MONETARY,
                "DOCUMENT": EntityType.DOCUMENT,
                "DOC": EntityType.DOCUMENT,
                "CONTRACT": EntityType.DOCUMENT,
                "AGREEMENT": EntityType.DOCUMENT,
                "EXHIBIT": EntityType.DOCUMENT,
            }
            entity_type = type_mapping.get(type_str)

            if not entity_type:
                # Check if we have a learned mapping for this type
                if self._type_registry:
                    mappings = self._type_registry.get_mappings()
                    if type_str.lower() in mappings:
                        mapped_type = mappings[type_str.lower()]
                        try:
                            entity_type = EntityType(mapped_type.lower())
                            logger.debug(
                                "using_learned_mapping",
                                original=type_str,
                                mapped_to=mapped_type,
                            )
                        except ValueError:
                            pass

            if not entity_type:
                # Use OTHER as fallback - never drop entities
                logger.debug("unmapped_entity_type_using_other", type=type_str)
                entity_type = EntityType.OTHER

        # Get canonical name
        canonical_name = raw.get("canonical_name", "").strip()
        if not canonical_name:
            canonical_name = raw.get("name", "").strip()
        if not canonical_name:
            return None

        # Get aliases
        aliases = raw.get("aliases", [])
        if isinstance(aliases, str):
            aliases = [aliases]
        aliases = [a.strip() for a in aliases if a and a.strip()]

        # Get context
        context = raw.get("context", "").strip()

        # Get metadata
        metadata = raw.get("metadata", {})
        if not isinstance(metadata, dict):
            metadata = {}

        # Preserve original type if we used the OTHER fallback
        if entity_type == EntityType.OTHER and original_type:
            metadata["original_type"] = original_type.lower()

        # Record this type for adaptive learning
        if self._type_registry and self.enable_type_learning:
            self._type_registry.record_type(
                type_name=original_type.lower(),
                context=context,
                entity_name=canonical_name,
            )

        # Create mention
        mention = Mention(
            node_id=node_id,
            doc_id=doc_id,
            context=context,
            page_num=page_num,
            confidence=1.0,
        )

        # Create entity
        entity = Entity(
            type=entity_type,
            canonical_name=canonical_name,
            aliases=aliases,
            mentions=[mention],
            metadata=metadata,
            source_doc_id=doc_id,
        )

        return entity

    def extract_batch(
        self,
        nodes: list[dict[str, Any]],
    ) -> list[ExtractionResult]:
        """
        Extract entities from multiple nodes.

        Args:
            nodes: List of node dicts with keys: node_id, doc_id, header, content, page_num

        Returns:
            List of ExtractionResult objects.
        """
        results = []

        for node in nodes:
            result = self.extract_from_node(
                node_id=node.get("node_id", ""),
                doc_id=node.get("doc_id", ""),
                header=node.get("header", ""),
                content=node.get("content", ""),
                page_num=node.get("page_num"),
            )
            results.append(result)

        return results

    def clear_cache(self) -> None:
        """Clear the entity cache."""
        self._cache.clear()


def merge_entities(entities: list[Entity]) -> list[Entity]:
    """
    Merge duplicate entities based on canonical name and type.

    Combines mentions and aliases from duplicates.

    Args:
        entities: List of entities to merge.

    Returns:
        Deduplicated list of entities.
    """
    # Group by (type, normalized canonical_name)
    grouped: dict[tuple[EntityType, str], list[Entity]] = {}

    for entity in entities:
        key = (entity.type, entity.canonical_name.lower().strip())
        if key not in grouped:
            grouped[key] = []
        grouped[key].append(entity)

    # Merge each group
    merged = []
    for entities_group in grouped.values():
        if len(entities_group) == 1:
            merged.append(entities_group[0])
        else:
            # Merge into first entity
            primary = entities_group[0]
            for other in entities_group[1:]:
                # Merge mentions
                primary.mentions.extend(other.mentions)

                # Merge aliases
                for alias in other.aliases:
                    primary.add_alias(alias)

                # Merge metadata (prefer primary's values on conflict)
                for k, v in other.metadata.items():
                    if k not in primary.metadata:
                        primary.metadata[k] = v

            merged.append(primary)

    return merged
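
# End-to-end sketch (illustrative; assumes `nodes` shaped as described in the
# extract_batch() docstring above):
#
#     extractor = EntityExtractor(suppress_deprecation_warning=True)
#     results = extractor.extract_batch(nodes)
#     all_entities = [e for r in results for e in r.entities]
#     deduped = merge_entities(all_entities)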