rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,990 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RNSR Unified RLM Extractor
|
|
3
|
+
|
|
4
|
+
The single, comprehensive extractor for BOTH entities AND relationships.
|
|
5
|
+
Always uses the most accurate approach:
|
|
6
|
+
|
|
7
|
+
1. LLM analyzes document and writes extraction code
|
|
8
|
+
2. Code executes on DOC_VAR (grounded in actual text)
|
|
9
|
+
3. ToT validation with probabilities
|
|
10
|
+
4. Cross-validation between entities and relationships
|
|
11
|
+
5. Adaptive learning for new types
|
|
12
|
+
|
|
13
|
+
This is the RECOMMENDED extractor - it consolidates all the best
|
|
14
|
+
practices from the RLM paper into a single, unified interface.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import re
|
|
21
|
+
import time
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from typing import Any, TYPE_CHECKING
|
|
24
|
+
|
|
25
|
+
import structlog
|
|
26
|
+
|
|
27
|
+
from rnsr.extraction.models import (
|
|
28
|
+
Entity,
|
|
29
|
+
EntityType,
|
|
30
|
+
ExtractionResult,
|
|
31
|
+
Mention,
|
|
32
|
+
Relationship,
|
|
33
|
+
RelationType,
|
|
34
|
+
)
|
|
35
|
+
from rnsr.extraction.learned_types import (
|
|
36
|
+
get_learned_type_registry,
|
|
37
|
+
get_learned_relationship_type_registry,
|
|
38
|
+
)
|
|
39
|
+
from rnsr.llm import get_llm
|
|
40
|
+
|
|
41
|
+
if TYPE_CHECKING:
|
|
42
|
+
from rnsr.agent.repl_env import REPLEnvironment
|
|
43
|
+
from rnsr.models import DocumentTree
|
|
44
|
+
|
|
45
|
+
logger = structlog.get_logger(__name__)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# =============================================================================
|
|
49
|
+
# Unified RLM Prompts
|
|
50
|
+
# =============================================================================
|
|
51
|
+
|
|
52
|
+
# System prompt for the unified extraction pass. NOTE: this template is
# rendered with str.format() in _generate_and_execute_code(), so every
# literal brace in the embedded code example MUST be doubled ({{ / }});
# only {learned_entity_types} and {learned_relationship_types} are real
# placeholders. The previous version left the example's dict braces
# unescaped, which made .format() raise at runtime.
RLM_UNIFIED_SYSTEM_PROMPT = """You are an RLM (Recursive Language Model) extracting entities AND relationships from a document.

CRITICAL: You do NOT have the full document in context. It is stored in DOC_VAR.
You must write Python code to extract both entities and relationships.

## Available Variables:
- DOC_VAR: The document text (string)
- SECTION_CONTENT: Current section content (string)
- KNOWN_ENTITY_TYPES: Entity types the system has learned (list)
- KNOWN_RELATIONSHIP_TYPES: Relationship types the system has learned (list)

## Available Functions:
- search_text(pattern): Search for regex pattern, returns list of (start, end, match)
- re.findall(pattern, text): Standard regex
- re.finditer(pattern, text): Iterate matches with positions
- store_variable(name, content): Store findings

## Your Task:
Write Python code that extracts:
1. ENTITIES: People, organizations, dates, money, locations, legal concepts, etc.
2. RELATIONSHIPS: How entities relate to each other and to the document

## Output Format:
```python
entities = []
relationships = []

# Extract entities with exact text positions
for match in re.finditer(r'pattern', SECTION_CONTENT):
    entities.append({{
        "text": match.group(),
        "canonical_name": "Normalized Name",
        "type": "ENTITY_TYPE",
        "start": match.start(),
        "end": match.end(),
        "confidence": 0.9
    }})

# Extract relationships between entities
# Look for patterns like "X is affiliated with Y", "X caused Y", etc.
for match in re.finditer(r'(\w+)\s+(?:is|was)\s+(?:employed|hired)\s+by\s+(\w+)', SECTION_CONTENT):
    relationships.append({{
        "source_text": match.group(1),
        "target_text": match.group(2),
        "type": "AFFILIATED_WITH",
        "evidence": match.group(),
        "start": match.start(),
        "end": match.end(),
        "confidence": 0.85
    }})

store_variable("ENTITIES", entities)
store_variable("RELATIONSHIPS", relationships)
```

## Entity Types:
PERSON, ORGANIZATION, DATE, MONETARY, LOCATION, REFERENCE, DOCUMENT, EVENT, LEGAL_CONCEPT
{learned_entity_types}

## Relationship Types:
MENTIONS, TEMPORAL_BEFORE, TEMPORAL_AFTER, CAUSAL, SUPPORTS, CONTRADICTS,
AFFILIATED_WITH, PARTY_TO, REFERENCES, SUPERSEDES, AMENDS
{learned_relationship_types}

Write code appropriate for this specific document."""
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
RLM_UNIFIED_EXTRACTION_PROMPT = """Document section to extract from:
|
|
120
|
+
|
|
121
|
+
Section Header: {header}
|
|
122
|
+
Section Content (first 3000 chars):
|
|
123
|
+
---
|
|
124
|
+
{content_preview}
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
Total section length: {content_length} characters
|
|
128
|
+
|
|
129
|
+
Write Python code to extract ALL entities and relationships.
|
|
130
|
+
Consider:
|
|
131
|
+
1. What types of entities appear? (people, companies, dates, money, etc.)
|
|
132
|
+
2. How are entities related? (affiliated_with, party_to, temporal, causal)
|
|
133
|
+
3. What domain-specific patterns exist? (legal terms, citations, etc.)
|
|
134
|
+
|
|
135
|
+
End with:
|
|
136
|
+
store_variable("ENTITIES", entities)
|
|
137
|
+
store_variable("RELATIONSHIPS", relationships)"""
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
RLM_TOT_VALIDATION_PROMPT = """You are validating extracted entities and relationships using Tree of Thoughts reasoning.
|
|
141
|
+
|
|
142
|
+
## Extracted Entities:
|
|
143
|
+
{entities_json}
|
|
144
|
+
|
|
145
|
+
## Extracted Relationships:
|
|
146
|
+
{relationships_json}
|
|
147
|
+
|
|
148
|
+
## Section Content (for verification):
|
|
149
|
+
{content_preview}
|
|
150
|
+
|
|
151
|
+
VALIDATION TASK:
|
|
152
|
+
For each entity and relationship, estimate probability (0.0-1.0) that it is valid.
|
|
153
|
+
|
|
154
|
+
ENTITY VALIDATION:
|
|
155
|
+
- Is this a real, specific entity (not a generic term)?
|
|
156
|
+
- Is the type correct?
|
|
157
|
+
- Is the canonical_name properly normalized?
|
|
158
|
+
|
|
159
|
+
RELATIONSHIP VALIDATION:
|
|
160
|
+
- Is there actual evidence for this relationship in the text?
|
|
161
|
+
- Is the relationship type correct?
|
|
162
|
+
- Are source and target correctly identified?
|
|
163
|
+
|
|
164
|
+
OUTPUT FORMAT (JSON):
|
|
165
|
+
{{
|
|
166
|
+
"entity_validations": [
|
|
167
|
+
{{"id": 0, "valid": true, "probability": 0.9, "type": "PERSON", "canonical_name": "John Smith", "reasoning": "Clear person name with title"}},
|
|
168
|
+
{{"id": 1, "valid": false, "probability": 0.2, "reasoning": "Generic term, not specific entity"}}
|
|
169
|
+
],
|
|
170
|
+
"relationship_validations": [
|
|
171
|
+
{{"id": 0, "valid": true, "probability": 0.85, "type": "AFFILIATED_WITH", "reasoning": "Evidence shows employment relationship"}},
|
|
172
|
+
{{"id": 1, "valid": false, "probability": 0.3, "reasoning": "Co-occurrence but no explicit relationship"}}
|
|
173
|
+
],
|
|
174
|
+
"cross_validation": {{
|
|
175
|
+
"entities_in_relationships": [0, 2],
|
|
176
|
+
"orphan_relationships": [],
|
|
177
|
+
"confidence_adjustments": []
|
|
178
|
+
}}
|
|
179
|
+
}}
|
|
180
|
+
|
|
181
|
+
Respond ONLY with JSON."""
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# =============================================================================
|
|
185
|
+
# Unified REPL for Extraction
|
|
186
|
+
# =============================================================================
|
|
187
|
+
|
|
188
|
+
class UnifiedREPL:
    """
    REPL for unified entity + relationship extraction.

    Holds the full document text, the current section content, and the
    learned type lists, and executes LLM-generated Python code in a
    namespace that exposes them (DOC_VAR, SECTION_CONTENT,
    KNOWN_ENTITY_TYPES, KNOWN_RELATIONSHIP_TYPES) together with a small
    set of helper functions and whitelisted builtins.
    """

    def __init__(
        self,
        document_text: str,
        section_content: str = "",
        known_entity_types: list[str] | None = None,
        known_relationship_types: list[str] | None = None,
    ):
        """Initialize with document and learned types."""
        self.document_text = document_text
        # Fall back to the whole document when no section is supplied.
        self.section_content = section_content or document_text
        self.known_entity_types = known_entity_types or []
        self.known_relationship_types = known_relationship_types or []
        # Findings stored by executed code via store_variable().
        self.variables: dict[str, Any] = {}

        # Built once; mutations made by executed code persist across
        # execute() calls because exec() always runs against this dict.
        self._namespace = self._build_namespace()

    def _build_namespace(self) -> dict[str, Any]:
        """Build Python namespace for code execution."""
        return {
            # Core variables
            "DOC_VAR": self.document_text,
            "SECTION_CONTENT": self.section_content,
            "KNOWN_ENTITY_TYPES": self.known_entity_types,
            "KNOWN_RELATIONSHIP_TYPES": self.known_relationship_types,
            "VARIABLES": self.variables,

            # Built-ins explicitly re-exposed by name for generated code.
            "len": len,
            "str": str,
            "int": int,
            "float": float,
            "list": list,
            "dict": dict,
            "set": set,
            "range": range,
            "enumerate": enumerate,
            "sorted": sorted,
            "min": min,
            "max": max,
            "any": any,
            "all": all,
            "re": re,

            # Helper functions bound to this REPL instance.
            "search_text": self._search_text,
            "store_variable": self._store_variable,
            "get_variable": self._get_variable,
        }

    def _search_text(self, pattern: str) -> list[tuple[int, int, str]]:
        """Search document for regex pattern.

        Case-insensitive search over the section content; returns
        (start, end, matched_text) tuples. An invalid pattern is logged
        and yields an empty result instead of raising.
        """
        results = []
        try:
            for match in re.finditer(pattern, self.section_content, re.IGNORECASE):
                results.append((match.start(), match.end(), match.group()))
        except re.error as e:
            logger.warning("regex_error", pattern=pattern, error=str(e))
        return results

    def _store_variable(self, name: str, content: Any) -> str:
        """Store a variable under *name* and return a confirmation string."""
        self.variables[name] = content
        return f"Stored ${name}"

    def _get_variable(self, name: str) -> Any:
        """Retrieve a previously stored variable (None when absent)."""
        return self.variables.get(name)

    def execute(self, code: str) -> dict[str, Any]:
        """Execute Python code.

        Returns a dict with keys success/entities/relationships/error/
        variables; entities and relationships are read back from the
        "ENTITIES"/"RELATIONSHIPS" variables the generated code is
        expected to store.

        NOTE(review): the exec() globals dict never defines
        "__builtins__", so CPython injects the real builtins module —
        the namespace above restricts nothing. This is NOT a sandbox;
        generated code is fully trusted here.
        """
        result = {
            "success": False,
            "entities": [],
            "relationships": [],
            "error": None,
            "variables": list(self.variables.keys()),
        }

        # Clean code
        code = self._clean_code(code)

        try:
            # Compile and execute
            compiled = compile(code, "<rlm_unified_extraction>", "exec")
            exec(compiled, self._namespace)

            result["success"] = True
            result["variables"] = list(self.variables.keys())
            result["entities"] = self.variables.get("ENTITIES", [])
            result["relationships"] = self.variables.get("RELATIONSHIPS", [])

        except Exception as e:
            # Any failure in generated code is reported, never raised.
            result["error"] = str(e)
            logger.warning("rlm_execution_error", error=str(e), code=code[:200])

        return result

    def _clean_code(self, code: str) -> str:
        """Remove markdown code blocks (```python fences) around the code."""
        code = re.sub(r'^```python\s*', '', code, flags=re.MULTILINE)
        code = re.sub(r'^```\s*$', '', code, flags=re.MULTILINE)
        return code.strip()
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
# =============================================================================
|
|
298
|
+
# Unified Extraction Result
|
|
299
|
+
# =============================================================================
|
|
300
|
+
|
|
301
|
+
@dataclass
class RLMUnifiedResult:
    """Result of unified RLM extraction.

    Aggregates everything a single extract() call produces for one
    section: the final model objects, the raw pre-validation candidate
    dicts, the generated code, and validation/timing metadata.
    """

    node_id: str = ""   # section node the extraction ran on
    doc_id: str = ""    # owning document
    entities: list[Entity] = field(default_factory=list)            # final, validated entities
    relationships: list[Relationship] = field(default_factory=list)  # final, validated relationships

    # Code generation
    code_generated: str = ""     # the Python code the LLM produced
    code_executed: bool = False  # whether that code ran without error

    # Raw candidates (before validation)
    raw_entities: list[dict] = field(default_factory=list)
    raw_relationships: list[dict] = field(default_factory=list)

    # Validation
    tot_validated: bool = False    # ToT validation step ran
    cross_validated: bool = False  # entity/relationship cross-validation ran

    # Stats
    processing_time_ms: float = 0.0
    warnings: list[str] = field(default_factory=list)  # non-fatal issues (no LLM, exec failure, ...)
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
# =============================================================================
|
|
328
|
+
# RLM Unified Extractor
|
|
329
|
+
# =============================================================================
|
|
330
|
+
|
|
331
|
+
class RLMUnifiedExtractor:
|
|
332
|
+
"""
|
|
333
|
+
Unified RLM Extractor for entities AND relationships.
|
|
334
|
+
|
|
335
|
+
This is the RECOMMENDED extractor. It uses:
|
|
336
|
+
1. RLM code generation (LLM writes extraction code)
|
|
337
|
+
2. ToT validation (probabilities + reasoning)
|
|
338
|
+
3. Cross-validation between entities and relationships
|
|
339
|
+
4. Adaptive learning for new types
|
|
340
|
+
|
|
341
|
+
Always grounded - all extractions tied to exact text spans.
|
|
342
|
+
"""
|
|
343
|
+
|
|
344
|
+
def __init__(
|
|
345
|
+
self,
|
|
346
|
+
llm: Any | None = None,
|
|
347
|
+
max_code_attempts: int = 3,
|
|
348
|
+
tot_selection_threshold: float = 0.6,
|
|
349
|
+
enable_type_learning: bool = True,
|
|
350
|
+
enable_tot_validation: bool = True,
|
|
351
|
+
enable_cross_validation: bool = True,
|
|
352
|
+
):
|
|
353
|
+
"""
|
|
354
|
+
Initialize the unified extractor.
|
|
355
|
+
|
|
356
|
+
Args:
|
|
357
|
+
llm: LLM instance.
|
|
358
|
+
max_code_attempts: Max attempts if code fails.
|
|
359
|
+
tot_selection_threshold: Threshold for ToT validation.
|
|
360
|
+
enable_type_learning: Learn new entity/relationship types.
|
|
361
|
+
enable_tot_validation: Use ToT for validation.
|
|
362
|
+
enable_cross_validation: Cross-validate entities and relationships.
|
|
363
|
+
"""
|
|
364
|
+
self.llm = llm
|
|
365
|
+
self.max_code_attempts = max_code_attempts
|
|
366
|
+
self.tot_selection_threshold = tot_selection_threshold
|
|
367
|
+
self.enable_type_learning = enable_type_learning
|
|
368
|
+
self.enable_tot_validation = enable_tot_validation
|
|
369
|
+
self.enable_cross_validation = enable_cross_validation
|
|
370
|
+
|
|
371
|
+
self._llm_initialized = False
|
|
372
|
+
|
|
373
|
+
# Learned type registries
|
|
374
|
+
self._entity_type_registry = None
|
|
375
|
+
self._relationship_type_registry = None
|
|
376
|
+
|
|
377
|
+
if enable_type_learning:
|
|
378
|
+
self._entity_type_registry = get_learned_type_registry()
|
|
379
|
+
try:
|
|
380
|
+
self._relationship_type_registry = get_learned_relationship_type_registry()
|
|
381
|
+
except Exception:
|
|
382
|
+
# Registry may not exist yet
|
|
383
|
+
self._relationship_type_registry = None
|
|
384
|
+
|
|
385
|
+
def _get_llm(self) -> Any:
|
|
386
|
+
"""Get or initialize LLM."""
|
|
387
|
+
if self.llm is None and not self._llm_initialized:
|
|
388
|
+
self.llm = get_llm()
|
|
389
|
+
self._llm_initialized = True
|
|
390
|
+
return self.llm
|
|
391
|
+
|
|
392
|
+
    def extract(
        self,
        node_id: str,
        doc_id: str,
        header: str,
        content: str,
        page_num: int | None = None,
        document_text: str | None = None,
    ) -> RLMUnifiedResult:
        """
        Extract entities AND relationships using unified RLM approach.

        Flow:
        1. LLM generates extraction code based on document
        2. Code executes on DOC_VAR (grounded)
        3. ToT validates candidates with probabilities
        4. Cross-validation boosts/filters
        5. Learn new types

        Args:
            node_id: Section node ID.
            doc_id: Document ID.
            header: Section header.
            content: Section content.
            page_num: Page number.
            document_text: Full document text for DOC_VAR (defaults to
                the section content when not provided).

        Returns:
            RLMUnifiedResult with entities and relationships. On early
            exit (short content, no LLM, failed code execution) the
            result carries empty lists and, where applicable, a warning.
        """
        start_time = time.time()

        result = RLMUnifiedResult(
            node_id=node_id,
            doc_id=doc_id,
        )

        # Sections under 50 chars are not worth an LLM round-trip.
        if len(content.strip()) < 50:
            return result

        llm = self._get_llm()
        if llm is None:
            result.warnings.append("No LLM available")
            return result

        # Get learned types for prompt (rendered as ready-made suffixes).
        learned_entity_types = self._get_learned_entity_types()
        learned_relationship_types = self._get_learned_relationship_types()

        # STEP 1: Generate and execute extraction code
        exec_result = self._generate_and_execute_code(
            header=header,
            content=content,
            document_text=document_text or content,
            learned_entity_types=learned_entity_types,
            learned_relationship_types=learned_relationship_types,
        )

        result.code_generated = exec_result.get("code", "")
        result.code_executed = exec_result.get("success", False)
        result.raw_entities = exec_result.get("entities", [])
        result.raw_relationships = exec_result.get("relationships", [])

        if not result.code_executed:
            result.warnings.append(f"Code execution failed: {exec_result.get('error', 'Unknown')}")
            result.processing_time_ms = (time.time() - start_time) * 1000
            return result

        # STEP 2: ToT Validation (falls back to the raw candidates when
        # validation is disabled or nothing was extracted).
        if self.enable_tot_validation and (result.raw_entities or result.raw_relationships):
            validated = self._tot_validate(
                entities=result.raw_entities,
                relationships=result.raw_relationships,
                content=content,
            )
            entities = validated.get("entities", result.raw_entities)
            relationships = validated.get("relationships", result.raw_relationships)
            result.tot_validated = True
        else:
            entities = result.raw_entities
            relationships = result.raw_relationships

        # STEP 3: Convert candidate dicts to model objects
        result.entities = self._candidates_to_entities(
            candidates=entities,
            node_id=node_id,
            doc_id=doc_id,
            content=content,
            page_num=page_num,
        )

        # Relationships are resolved against the freshly built entities.
        result.relationships = self._candidates_to_relationships(
            candidates=relationships,
            entities=result.entities,
            node_id=node_id,
            doc_id=doc_id,
        )

        # STEP 4: Cross-validation (only meaningful when both sides exist)
        if self.enable_cross_validation and result.entities and result.relationships:
            result.entities, result.relationships = self._cross_validate(
                result.entities, result.relationships
            )
            result.cross_validated = True

        # STEP 5: Learn new types from what was just extracted
        if self.enable_type_learning:
            self._learn_new_types(result.entities, result.relationships)

        result.processing_time_ms = (time.time() - start_time) * 1000

        logger.info(
            "rlm_unified_extraction_complete",
            node_id=node_id,
            entities=len(result.entities),
            relationships=len(result.relationships),
            time_ms=result.processing_time_ms,
        )

        return result
|
|
512
|
+
|
|
513
|
+
def _get_learned_entity_types(self) -> str:
|
|
514
|
+
"""Get learned entity types for prompt."""
|
|
515
|
+
if not self._entity_type_registry:
|
|
516
|
+
return ""
|
|
517
|
+
|
|
518
|
+
types = self._entity_type_registry.get_types_for_prompt()
|
|
519
|
+
if not types:
|
|
520
|
+
return ""
|
|
521
|
+
|
|
522
|
+
return f"\nAlso consider these learned types: {', '.join(types)}"
|
|
523
|
+
|
|
524
|
+
def _get_learned_relationship_types(self) -> str:
|
|
525
|
+
"""Get learned relationship types for prompt."""
|
|
526
|
+
if not self._relationship_type_registry:
|
|
527
|
+
return ""
|
|
528
|
+
|
|
529
|
+
try:
|
|
530
|
+
types = self._relationship_type_registry.get_types_for_prompt()
|
|
531
|
+
if not types:
|
|
532
|
+
return ""
|
|
533
|
+
return f"\nAlso consider these learned types: {', '.join(types)}"
|
|
534
|
+
except Exception:
|
|
535
|
+
return ""
|
|
536
|
+
|
|
537
|
+
def _generate_and_execute_code(
|
|
538
|
+
self,
|
|
539
|
+
header: str,
|
|
540
|
+
content: str,
|
|
541
|
+
document_text: str,
|
|
542
|
+
learned_entity_types: str,
|
|
543
|
+
learned_relationship_types: str,
|
|
544
|
+
) -> dict[str, Any]:
|
|
545
|
+
"""Generate extraction code and execute it."""
|
|
546
|
+
llm = self._get_llm()
|
|
547
|
+
|
|
548
|
+
# Get learned types for REPL
|
|
549
|
+
entity_types = []
|
|
550
|
+
relationship_types = []
|
|
551
|
+
|
|
552
|
+
if self._entity_type_registry:
|
|
553
|
+
entity_types = self._entity_type_registry.get_types_for_prompt()
|
|
554
|
+
if self._relationship_type_registry:
|
|
555
|
+
try:
|
|
556
|
+
relationship_types = self._relationship_type_registry.get_types_for_prompt()
|
|
557
|
+
except Exception:
|
|
558
|
+
pass
|
|
559
|
+
|
|
560
|
+
# Create REPL
|
|
561
|
+
repl = UnifiedREPL(
|
|
562
|
+
document_text=document_text,
|
|
563
|
+
section_content=content,
|
|
564
|
+
known_entity_types=entity_types,
|
|
565
|
+
known_relationship_types=relationship_types,
|
|
566
|
+
)
|
|
567
|
+
|
|
568
|
+
# Build prompt
|
|
569
|
+
system_prompt = RLM_UNIFIED_SYSTEM_PROMPT.format(
|
|
570
|
+
learned_entity_types=learned_entity_types,
|
|
571
|
+
learned_relationship_types=learned_relationship_types,
|
|
572
|
+
)
|
|
573
|
+
|
|
574
|
+
extraction_prompt = RLM_UNIFIED_EXTRACTION_PROMPT.format(
|
|
575
|
+
header=header,
|
|
576
|
+
content_preview=content[:3000],
|
|
577
|
+
content_length=len(content),
|
|
578
|
+
)
|
|
579
|
+
|
|
580
|
+
prompt = f"{system_prompt}\n\n{extraction_prompt}"
|
|
581
|
+
|
|
582
|
+
for attempt in range(self.max_code_attempts):
|
|
583
|
+
try:
|
|
584
|
+
# LLM generates code
|
|
585
|
+
response = llm.complete(prompt)
|
|
586
|
+
code = str(response) if not isinstance(response, str) else response
|
|
587
|
+
|
|
588
|
+
# Validate we got actual code
|
|
589
|
+
if not code or len(code.strip()) < 20:
|
|
590
|
+
logger.warning("empty_or_short_code_response", attempt=attempt, length=len(code) if code else 0)
|
|
591
|
+
continue
|
|
592
|
+
|
|
593
|
+
# Check if response looks like code (not just JSON or text)
|
|
594
|
+
if "store_variable" not in code and "entities" not in code.lower():
|
|
595
|
+
logger.warning("response_not_code", attempt=attempt, preview=code[:100])
|
|
596
|
+
prompt += "\n\nPlease respond ONLY with Python code that extracts entities and relationships."
|
|
597
|
+
continue
|
|
598
|
+
|
|
599
|
+
# Execute
|
|
600
|
+
exec_result = repl.execute(code)
|
|
601
|
+
|
|
602
|
+
if exec_result["success"]:
|
|
603
|
+
entities = exec_result.get("entities", [])
|
|
604
|
+
relationships = exec_result.get("relationships", [])
|
|
605
|
+
|
|
606
|
+
# Validate entities are properly structured
|
|
607
|
+
valid_entities = []
|
|
608
|
+
for e in entities:
|
|
609
|
+
if isinstance(e, dict) and e.get("text"):
|
|
610
|
+
valid_entities.append(e)
|
|
611
|
+
|
|
612
|
+
valid_relationships = []
|
|
613
|
+
for r in relationships:
|
|
614
|
+
if isinstance(r, dict) and (r.get("source_text") or r.get("type")):
|
|
615
|
+
valid_relationships.append(r)
|
|
616
|
+
|
|
617
|
+
return {
|
|
618
|
+
"success": True,
|
|
619
|
+
"code": code,
|
|
620
|
+
"entities": valid_entities,
|
|
621
|
+
"relationships": valid_relationships,
|
|
622
|
+
}
|
|
623
|
+
else:
|
|
624
|
+
# Retry with error feedback
|
|
625
|
+
prompt += f"\n\nPrevious code had error: {exec_result['error']}\nPlease fix."
|
|
626
|
+
|
|
627
|
+
except Exception as e:
|
|
628
|
+
logger.warning("code_generation_failed", attempt=attempt, error=str(e))
|
|
629
|
+
|
|
630
|
+
return {"success": False, "error": "Max attempts exceeded"}
|
|
631
|
+
|
|
632
|
+
    def _tot_validate(
        self,
        entities: list[dict],
        relationships: list[dict],
        content: str,
    ) -> dict[str, list[dict]]:
        """Validate candidates with Tree of Thoughts reasoning.

        Sends a compact JSON view of the candidates (first 20 entities,
        first 15 relationships) plus a content preview to the LLM and
        applies the returned per-id validations: candidates judged
        invalid or below self.tot_selection_threshold are dropped; kept
        ones get their type/canonical_name/confidence/tot_reasoning
        updated IN PLACE (the input dicts are mutated).

        Candidates beyond the 20/15 prompt cutoff receive no validation
        entry and pass through on the defaults (valid=True, p=0.5 —
        which only survives a threshold <= 0.5).

        Returns:
            {"entities": [...], "relationships": [...]} — the filtered
            lists, or the originals unchanged when the LLM response
            cannot be parsed or any step raises.
        """
        llm = self._get_llm()

        entities_json = json.dumps([
            {"id": i, "text": e.get("text", ""), "type": e.get("type", ""), "confidence": e.get("confidence", 0.5)}
            for i, e in enumerate(entities[:20])
        ], indent=2)

        relationships_json = json.dumps([
            {"id": i, "source": r.get("source_text", ""), "target": r.get("target_text", ""),
             "type": r.get("type", ""), "evidence": r.get("evidence", "")[:100]}
            for i, r in enumerate(relationships[:15])
        ], indent=2)

        prompt = RLM_TOT_VALIDATION_PROMPT.format(
            entities_json=entities_json,
            relationships_json=relationships_json,
            content_preview=content[:2000],
        )

        try:
            response = llm.complete(prompt)
            response_text = str(response) if not isinstance(response, str) else response

            # Clean response - remove markdown code blocks if present
            response_text = re.sub(r'^```json\s*', '', response_text, flags=re.MULTILINE)
            response_text = re.sub(r'^```\s*$', '', response_text, flags=re.MULTILINE)
            response_text = response_text.strip()

            # Parse JSON - try multiple strategies
            data = None

            # Strategy 1: Direct parse
            try:
                data = json.loads(response_text)
            except json.JSONDecodeError:
                pass

            # Strategy 2: Extract the outermost {...} span and parse that
            if data is None:
                json_match = re.search(r'\{[\s\S]*\}', response_text)
                if json_match:
                    try:
                        data = json.loads(json_match.group())
                    except json.JSONDecodeError:
                        pass

            # Strategy 3: Try to fix common issues
            if data is None:
                # Try fixing trailing commas, missing quotes, etc.
                fixed = re.sub(r',(\s*[}\]])', r'\1', response_text)
                try:
                    data = json.loads(fixed)
                except json.JSONDecodeError:
                    pass

            if data is None:
                # Unparseable response: keep all candidates unvalidated.
                logger.debug("tot_json_parse_failed", response_preview=response_text[:200])
                return {"entities": entities, "relationships": relationships}

            # Index validations by candidate id for O(1) lookup.
            # NOTE: a validation entry without "id" raises KeyError here,
            # which the outer except turns into a no-op fallback.
            entity_validations = {v["id"]: v for v in data.get("entity_validations", [])}
            relationship_validations = {v["id"]: v for v in data.get("relationship_validations", [])}

            # Filter and update entities
            validated_entities = []
            for i, entity in enumerate(entities):
                validation = entity_validations.get(i, {})
                if validation.get("valid", True) and validation.get("probability", 0.5) >= self.tot_selection_threshold:
                    entity["type"] = validation.get("type", entity.get("type", "OTHER"))
                    entity["canonical_name"] = validation.get("canonical_name", entity.get("canonical_name", entity.get("text", "")))
                    entity["confidence"] = validation.get("probability", entity.get("confidence", 0.5))
                    entity["tot_reasoning"] = validation.get("reasoning", "")
                    validated_entities.append(entity)

            # Filter and update relationships
            validated_relationships = []
            for i, rel in enumerate(relationships):
                validation = relationship_validations.get(i, {})
                if validation.get("valid", True) and validation.get("probability", 0.5) >= self.tot_selection_threshold:
                    rel["type"] = validation.get("type", rel.get("type", "MENTIONS"))
                    rel["confidence"] = validation.get("probability", rel.get("confidence", 0.5))
                    rel["tot_reasoning"] = validation.get("reasoning", "")
                    validated_relationships.append(rel)

            return {"entities": validated_entities, "relationships": validated_relationships}

        except Exception as e:
            # Validation is best-effort: any failure returns the originals.
            logger.warning("tot_validation_failed", error=str(e))
            return {"entities": entities, "relationships": relationships}
|
|
728
|
+
|
|
729
|
+
def _candidates_to_entities(
    self,
    candidates: list[dict],
    node_id: str,
    doc_id: str,
    content: str,
    page_num: int | None,
) -> list[Entity]:
    """Turn raw candidate dicts into Entity objects, one Mention each.

    Candidates without a "text" field are dropped. When a candidate carries
    a character span, a +/-50 character context window is cut from *content*;
    otherwise the mention context is empty.
    """
    results: list[Entity] = []

    for cand in candidates:
        text = cand.get("text")
        if not text:
            continue

        mapped_type = self._map_entity_type(cand.get("type", "OTHER"))

        start = cand.get("start")
        end = cand.get("end")
        # Context window only exists for grounded (span-bearing) candidates.
        if start is not None:
            window = content[max(0, (start or 0) - 50):(end or 0) + 50]
        else:
            window = ""

        mention = Mention(
            node_id=node_id,
            doc_id=doc_id,
            span_start=start,
            span_end=end,
            context=window,
            page_num=page_num,
            confidence=cand.get("confidence", 0.5),
        )

        meta = {
            "rlm_extracted": True,
            "grounded": start is not None,
            "tot_validated": "tot_reasoning" in cand,
        }
        reasoning = cand.get("tot_reasoning")
        if reasoning:
            meta["tot_reasoning"] = reasoning
        if mapped_type == EntityType.OTHER:
            # Preserve the raw type so the learned-type registry can see it.
            meta["original_type"] = cand.get("type", "").lower()

        results.append(
            Entity(
                type=mapped_type,
                canonical_name=cand.get("canonical_name", cand.get("text", "")),
                # Keep the surface form as an alias when it differs from the
                # canonical name.
                aliases=[text] if cand.get("canonical_name") != text else [],
                mentions=[mention],
                metadata=meta,
                source_doc_id=doc_id,
            )
        )

    return results
|
|
782
|
+
|
|
783
|
+
def _candidates_to_relationships(
    self,
    candidates: list[dict],
    entities: list[Entity],
    node_id: str,
    doc_id: str,
) -> list[Relationship]:
    """Convert candidate dicts to Relationship objects, linking endpoints.

    Endpoint texts are matched case-insensitively against entity canonical
    names and aliases. Unmatched endpoints fall back to a ``text:<raw>``
    placeholder id with source/target type "text".
    """
    relationships: list[Relationship] = []

    # Case-insensitive text -> entity-id lookup, built once.
    entity_by_text: dict[str, str] = {}
    for entity in entities:
        entity_by_text[entity.canonical_name.lower()] = entity.id
        for alias in entity.aliases:
            entity_by_text[alias.lower()] = entity.id

    # Hoisted out of the loop: the previous version rebuilt
    # [e.id for e in entities] twice per candidate (accidental O(n*m));
    # a precomputed set gives O(1) membership tests with identical results.
    entity_ids = {e.id for e in entities}

    for candidate in candidates:
        rel_type = self._map_relationship_type(candidate.get("type", "MENTIONS"))

        source_text = candidate.get("source_text", "")
        target_text = candidate.get("target_text", "")

        source_id = entity_by_text.get(source_text.lower(), f"text:{source_text}")
        target_id = entity_by_text.get(target_text.lower(), f"text:{target_text}")

        source_type = "entity" if source_id in entity_ids else "text"
        target_type = "entity" if target_id in entity_ids else "text"

        metadata = {
            "rlm_extracted": True,
            "grounded": candidate.get("start") is not None,
            "tot_validated": "tot_reasoning" in candidate,
        }

        if candidate.get("tot_reasoning"):
            metadata["tot_reasoning"] = candidate["tot_reasoning"]

        if rel_type == RelationType.OTHER:
            # Preserve the raw type so the learned-type registry can see it.
            metadata["original_type"] = candidate.get("type", "").lower()

        relationships.append(
            Relationship(
                type=rel_type,
                source_id=source_id,
                source_type=source_type,
                target_id=target_id,
                target_type=target_type,
                confidence=candidate.get("confidence", 0.5),
                evidence=candidate.get("evidence", ""),
                doc_id=doc_id,
                node_id=node_id,
                metadata=metadata,
            )
        )

    return relationships
|
|
840
|
+
|
|
841
|
+
def _cross_validate(
|
|
842
|
+
self,
|
|
843
|
+
entities: list[Entity],
|
|
844
|
+
relationships: list[Relationship],
|
|
845
|
+
) -> tuple[list[Entity], list[Relationship]]:
|
|
846
|
+
"""Cross-validate entities and relationships."""
|
|
847
|
+
entity_ids = {e.id for e in entities}
|
|
848
|
+
|
|
849
|
+
# Find entities referenced in relationships
|
|
850
|
+
entities_in_rels = set()
|
|
851
|
+
for rel in relationships:
|
|
852
|
+
if rel.source_type == "entity" and rel.source_id in entity_ids:
|
|
853
|
+
entities_in_rels.add(rel.source_id)
|
|
854
|
+
if rel.target_type == "entity" and rel.target_id in entity_ids:
|
|
855
|
+
entities_in_rels.add(rel.target_id)
|
|
856
|
+
|
|
857
|
+
# Boost confidence for entities in relationships
|
|
858
|
+
for entity in entities:
|
|
859
|
+
if entity.id in entities_in_rels:
|
|
860
|
+
if entity.mentions:
|
|
861
|
+
entity.mentions[0].confidence = min(entity.mentions[0].confidence * 1.1, 1.0)
|
|
862
|
+
entity.metadata["cross_validated"] = True
|
|
863
|
+
|
|
864
|
+
# Boost confidence for relationships with validated entities
|
|
865
|
+
for rel in relationships:
|
|
866
|
+
both_valid = (
|
|
867
|
+
(rel.source_type == "entity" and rel.source_id in entity_ids) and
|
|
868
|
+
(rel.target_type == "entity" and rel.target_id in entity_ids)
|
|
869
|
+
)
|
|
870
|
+
if both_valid:
|
|
871
|
+
rel.confidence = min(rel.confidence * 1.1, 1.0)
|
|
872
|
+
rel.metadata["cross_validated"] = True
|
|
873
|
+
|
|
874
|
+
return entities, relationships
|
|
875
|
+
|
|
876
|
+
def _learn_new_types(
    self,
    entities: list[Entity],
    relationships: list[Relationship],
) -> None:
    """Record unmapped (OTHER-typed) entity/relationship types with the registries.

    Learning is best-effort: registry failures are swallowed so they can
    never abort an extraction run. The previous version guarded only the
    relationship path; the entity path is now guarded identically for
    consistency.
    """
    # Learn entity types that fell through to EntityType.OTHER.
    if self._entity_type_registry:
        for entity in entities:
            if entity.type != EntityType.OTHER:
                continue
            original_type = entity.metadata.get("original_type", "unknown")
            context = entity.mentions[0].context if entity.mentions else ""
            try:
                self._entity_type_registry.record_type(
                    type_name=original_type,
                    context=context,
                    entity_name=entity.canonical_name,
                )
            except Exception:
                # Best-effort, matching the relationship path below.
                pass

    # Learn relationship types that fell through to RelationType.OTHER.
    if self._relationship_type_registry:
        for rel in relationships:
            if rel.type != RelationType.OTHER:
                continue
            original_type = rel.metadata.get("original_type", "unknown")
            try:
                self._relationship_type_registry.record_type(
                    type_name=original_type,
                    context=rel.evidence,
                    relationship_description=f"{rel.source_id} -> {rel.target_id}",
                )
            except Exception:
                pass
|
|
907
|
+
|
|
908
|
+
def _map_entity_type(self, type_str: str) -> EntityType:
    """Map a free-form type string onto the EntityType enum.

    Tries the enum's own (lowercase) value constructor first, then a table
    of common aliases, and finally falls back to EntityType.OTHER.
    """
    normalized = type_str.upper()

    aliases = {
        "PERSON": EntityType.PERSON,
        "ORGANIZATION": EntityType.ORGANIZATION,
        "ORG": EntityType.ORGANIZATION,
        "COMPANY": EntityType.ORGANIZATION,
        "DATE": EntityType.DATE,
        "MONETARY": EntityType.MONETARY,
        "MONEY": EntityType.MONETARY,
        "LOCATION": EntityType.LOCATION,
        "REFERENCE": EntityType.REFERENCE,
        "DOCUMENT": EntityType.DOCUMENT,
        "EVENT": EntityType.EVENT,
        "LEGAL_CONCEPT": EntityType.LEGAL_CONCEPT,
        "LEGAL": EntityType.LEGAL_CONCEPT,
    }

    try:
        # Enum values are lowercase strings, so try a direct construction.
        return EntityType(normalized.lower())
    except ValueError:
        return aliases.get(normalized, EntityType.OTHER)
|
|
932
|
+
|
|
933
|
+
def _map_relationship_type(self, type_str: str) -> RelationType:
    """Map a free-form type string onto the RelationType enum.

    Tries the enum's own (lowercase) value constructor first, then a table
    of known names, and finally falls back to RelationType.OTHER.
    """
    normalized = type_str.upper()

    aliases = {
        "MENTIONS": RelationType.MENTIONS,
        "TEMPORAL_BEFORE": RelationType.TEMPORAL_BEFORE,
        "TEMPORAL_AFTER": RelationType.TEMPORAL_AFTER,
        "CAUSAL": RelationType.CAUSAL,
        "SUPPORTS": RelationType.SUPPORTS,
        "CONTRADICTS": RelationType.CONTRADICTS,
        "AFFILIATED_WITH": RelationType.AFFILIATED_WITH,
        "PARTY_TO": RelationType.PARTY_TO,
        "REFERENCES": RelationType.REFERENCES,
        "SUPERSEDES": RelationType.SUPERSEDES,
        "AMENDS": RelationType.AMENDS,
    }

    try:
        # Enum values are lowercase strings, so try a direct construction.
        return RelationType(normalized.lower())
    except ValueError:
        return aliases.get(normalized, RelationType.OTHER)
|
|
955
|
+
|
|
956
|
+
def to_extraction_result(self, unified_result: RLMUnifiedResult) -> ExtractionResult:
    """Repackage an RLMUnifiedResult as the standard ExtractionResult."""
    src = unified_result
    return ExtractionResult(
        node_id=src.node_id,
        doc_id=src.doc_id,
        entities=src.entities,
        relationships=src.relationships,
        processing_time_ms=src.processing_time_ms,
        # Fixed tag identifying this extraction pipeline.
        extraction_method="rlm_unified",
        warnings=src.warnings,
    )
|
|
967
|
+
|
|
968
|
+
|
|
969
|
+
# Convenience function
|
|
970
|
+
def extract_entities_and_relationships(
    node_id: str,
    doc_id: str,
    header: str,
    content: str,
    page_num: int | None = None,
) -> RLMUnifiedResult:
    """
    Extract entities and relationships using the unified RLM approach.

    This is the recommended way to extract - always uses the most
    accurate, grounded approach with ToT validation.
    """
    # Thin convenience wrapper: build a default extractor and delegate.
    return RLMUnifiedExtractor().extract(
        node_id=node_id,
        doc_id=doc_id,
        header=header,
        content=content,
        page_num=page_num,
    )
|