rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,589 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RNSR RLM Entity Extractor
|
|
3
|
+
|
|
4
|
+
Implements the TRUE RLM pattern for entity extraction:
|
|
5
|
+
1. LLM writes its own regex/Python code based on the document
|
|
6
|
+
2. Code executes on DOC_VAR (grounded in actual text)
|
|
7
|
+
3. LLM validates and classifies results
|
|
8
|
+
|
|
9
|
+
This is more powerful than pre-defined patterns because:
|
|
10
|
+
- LLM adapts to domain-specific patterns it discovers
|
|
11
|
+
- Can write complex extraction logic we didn't anticipate
|
|
12
|
+
- Still grounded because code executes on actual text
|
|
13
|
+
- Recursive - can use sub_llm for complex validation
|
|
14
|
+
|
|
15
|
+
From the RLM paper:
|
|
16
|
+
"The Neural Network generates code to interact with the document,
|
|
17
|
+
rather than having the document in its context window."
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import json
|
|
23
|
+
import re
|
|
24
|
+
import time
|
|
25
|
+
from dataclasses import dataclass, field
|
|
26
|
+
from typing import Any, TYPE_CHECKING
|
|
27
|
+
|
|
28
|
+
import structlog
|
|
29
|
+
|
|
30
|
+
from rnsr.extraction.models import Entity, EntityType, ExtractionResult, Mention
|
|
31
|
+
from rnsr.extraction.learned_types import get_learned_type_registry
|
|
32
|
+
from rnsr.llm import get_llm
|
|
33
|
+
|
|
34
|
+
if TYPE_CHECKING:
|
|
35
|
+
from rnsr.agent.repl_env import REPLEnvironment
|
|
36
|
+
from rnsr.models import DocumentTree
|
|
37
|
+
|
|
38
|
+
logger = structlog.get_logger(__name__)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# =============================================================================
# RLM Extraction Prompts
# =============================================================================

# System prompt for the code-generation step. Tells the LLM the document is
# NOT in context (it lives in DOC_VAR) and describes the names that
# LightweightREPL._build_namespace() exposes to the generated code.
# NOTE(review): it advertises search_text() as searching DOC_VAR, but the
# implementation scans SECTION_CONTENT — confirm which is intended.
RLM_ENTITY_EXTRACTION_SYSTEM = """You are an RLM (Recursive Language Model) extracting entities from a document.

CRITICAL: You do NOT have the full document in context. It is stored in DOC_VAR.
You must write Python code to extract entities from DOC_VAR.

## Available Variables:
- DOC_VAR: The document text (string). Use slicing, regex, etc.
- SECTION_CONTENT: Current section content (string, smaller than DOC_VAR)

## Available Functions:
- search_text(pattern): Search DOC_VAR for regex pattern, returns list of (start, end, match)
- len(DOC_VAR): Get document length
- DOC_VAR[i:j]: Slice document
- re.findall(pattern, text): Standard regex
- re.finditer(pattern, text): Iterate matches with positions
- store_variable(name, content): Store findings for later

## Your Task:
Extract entities (people, organizations, dates, etc.) by writing Python code.

IMPORTANT: Your code should:
1. Write regex patterns tailored to THIS document
2. Execute patterns to find matches (grounded in text)
3. Return structured results with exact positions

## Output Format:
Write Python code that produces a list of entity dictionaries:
```python
entities = []

# Example: Find person names with titles
for match in re.finditer(r'(?:Mr\.|Mrs\.|Dr\.)\s+([A-Z][a-z]+\s+[A-Z][a-z]+)', SECTION_CONTENT):
    entities.append({
        "text": match.group(),
        "canonical_name": match.group(1),  # Without title
        "type": "PERSON",
        "start": match.start(),
        "end": match.end(),
        "confidence": 0.9
    })

# Example: Find dollar amounts
for match in re.finditer(r'\$[\d,]+(?:\.\d{2})?(?:\s*(?:million|billion))?', SECTION_CONTENT):
    entities.append({
        "text": match.group(),
        "type": "MONETARY",
        "start": match.start(),
        "end": match.end(),
        "confidence": 0.95
    })

# Store results
store_variable("ENTITIES", entities)
```

Write code appropriate for the document type and content shown."""


# Per-section task prompt, filled in by _generate_and_execute_code() with the
# section header, a 2000-char preview, and the total length (so the LLM knows
# how much text the preview omits).
RLM_EXTRACTION_PROMPT = """Document section to extract entities from:

Section Header: {header}
Section Content (first 2000 chars):
---
{content_preview}
---

Total section length: {content_length} characters

Based on this content, write Python code to extract all significant entities.
Consider:
1. What types of entities appear in this document? (people, companies, dates, money, etc.)
2. What patterns would match them? (titles, suffixes, formats, etc.)
3. Are there domain-specific entities? (legal terms, technical concepts, etc.)

Write Python code that will execute on SECTION_CONTENT to extract entities.
End your code with: store_variable("ENTITIES", entities)"""


# Second-pass prompt used by _validate_candidates(). The doubled braces in the
# JSON example are str.format escapes; only {candidates_json} is substituted.
RLM_VALIDATION_PROMPT = """You extracted these entity candidates from the document.
Validate each one and determine if it's a real, significant entity.

Candidates:
{candidates_json}

For each candidate, provide:
1. valid: true if significant entity, false if noise
2. type: Entity type (PERSON, ORGANIZATION, DATE, MONETARY, LOCATION, etc.)
3. canonical_name: Cleaned/normalized name
4. confidence: 0.0-1.0

Return JSON array:
```json
[
  {{"id": 0, "valid": true, "type": "PERSON", "canonical_name": "John Smith", "confidence": 0.9}},
  {{"id": 1, "valid": false, "reason": "Generic term, not specific entity"}}
]
```"""
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# =============================================================================
|
|
145
|
+
# Lightweight REPL for Extraction (if full REPL not available)
|
|
146
|
+
# =============================================================================
|
|
147
|
+
|
|
148
|
+
class LightweightREPL:
|
|
149
|
+
"""
|
|
150
|
+
Lightweight REPL for entity extraction.
|
|
151
|
+
|
|
152
|
+
Provides the core DOC_VAR + code execution pattern
|
|
153
|
+
without the full REPL infrastructure.
|
|
154
|
+
"""
|
|
155
|
+
|
|
156
|
+
def __init__(self, document_text: str, section_content: str = ""):
|
|
157
|
+
"""Initialize with document text."""
|
|
158
|
+
self.document_text = document_text
|
|
159
|
+
self.section_content = section_content or document_text
|
|
160
|
+
self.variables: dict[str, Any] = {}
|
|
161
|
+
|
|
162
|
+
self._namespace = self._build_namespace()
|
|
163
|
+
|
|
164
|
+
def _build_namespace(self) -> dict[str, Any]:
|
|
165
|
+
"""Build Python namespace for code execution."""
|
|
166
|
+
return {
|
|
167
|
+
# Core variables
|
|
168
|
+
"DOC_VAR": self.document_text,
|
|
169
|
+
"SECTION_CONTENT": self.section_content,
|
|
170
|
+
"VARIABLES": self.variables,
|
|
171
|
+
|
|
172
|
+
# Built-ins
|
|
173
|
+
"len": len,
|
|
174
|
+
"str": str,
|
|
175
|
+
"int": int,
|
|
176
|
+
"float": float,
|
|
177
|
+
"list": list,
|
|
178
|
+
"dict": dict,
|
|
179
|
+
"range": range,
|
|
180
|
+
"enumerate": enumerate,
|
|
181
|
+
"sorted": sorted,
|
|
182
|
+
"min": min,
|
|
183
|
+
"max": max,
|
|
184
|
+
"re": re,
|
|
185
|
+
|
|
186
|
+
# Functions
|
|
187
|
+
"search_text": self._search_text,
|
|
188
|
+
"store_variable": self._store_variable,
|
|
189
|
+
"get_variable": self._get_variable,
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
def _search_text(self, pattern: str) -> list[tuple[int, int, str]]:
|
|
193
|
+
"""Search document for regex pattern."""
|
|
194
|
+
results = []
|
|
195
|
+
try:
|
|
196
|
+
for match in re.finditer(pattern, self.section_content, re.IGNORECASE):
|
|
197
|
+
results.append((match.start(), match.end(), match.group()))
|
|
198
|
+
except re.error as e:
|
|
199
|
+
logger.warning("regex_error", pattern=pattern, error=str(e))
|
|
200
|
+
return results
|
|
201
|
+
|
|
202
|
+
def _store_variable(self, name: str, content: Any) -> str:
|
|
203
|
+
"""Store a variable."""
|
|
204
|
+
self.variables[name] = content
|
|
205
|
+
return f"Stored ${name}"
|
|
206
|
+
|
|
207
|
+
def _get_variable(self, name: str) -> Any:
|
|
208
|
+
"""Retrieve a variable."""
|
|
209
|
+
return self.variables.get(name)
|
|
210
|
+
|
|
211
|
+
def execute(self, code: str) -> dict[str, Any]:
|
|
212
|
+
"""Execute Python code."""
|
|
213
|
+
result = {
|
|
214
|
+
"success": False,
|
|
215
|
+
"output": None,
|
|
216
|
+
"error": None,
|
|
217
|
+
"variables": list(self.variables.keys()),
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
# Clean code
|
|
221
|
+
code = self._clean_code(code)
|
|
222
|
+
|
|
223
|
+
try:
|
|
224
|
+
# Compile and execute
|
|
225
|
+
compiled = compile(code, "<rlm_extraction>", "exec")
|
|
226
|
+
exec(compiled, self._namespace)
|
|
227
|
+
|
|
228
|
+
result["success"] = True
|
|
229
|
+
result["variables"] = list(self.variables.keys())
|
|
230
|
+
result["output"] = self.variables.get("ENTITIES", [])
|
|
231
|
+
|
|
232
|
+
except Exception as e:
|
|
233
|
+
result["error"] = str(e)
|
|
234
|
+
logger.warning("rlm_execution_error", error=str(e), code=code[:200])
|
|
235
|
+
|
|
236
|
+
return result
|
|
237
|
+
|
|
238
|
+
def _clean_code(self, code: str) -> str:
|
|
239
|
+
"""Remove markdown code blocks."""
|
|
240
|
+
code = re.sub(r'^```python\s*', '', code, flags=re.MULTILINE)
|
|
241
|
+
code = re.sub(r'^```\s*$', '', code, flags=re.MULTILINE)
|
|
242
|
+
return code.strip()
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
# =============================================================================
|
|
246
|
+
# RLM Entity Extractor
|
|
247
|
+
# =============================================================================
|
|
248
|
+
|
|
249
|
+
@dataclass
class RLMExtractionResult:
    """Result of RLM-based extraction.

    Captures the final entities plus the intermediate artifacts
    (generated code, raw candidates, timings) for debugging and auditing.
    """

    # Final, validated Entity objects for the section.
    entities: list[Entity] = field(default_factory=list)
    # The Python code the LLM wrote for this section (last attempt).
    code_generated: str = ""
    # True when the generated code ran without raising.
    code_executed: bool = False
    # Raw value produced by the executed code (usually the ENTITIES list).
    execution_output: Any = None
    # Candidate dicts extracted by the code, before LLM validation.
    raw_candidates: list[dict] = field(default_factory=list)
    # Wall-clock processing time in milliseconds.
    processing_time_ms: float = 0.0
    # Human-readable notes about failed attempts or degraded behavior.
    warnings: list[str] = field(default_factory=list)
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
class RLMEntityExtractor:
    """
    RLM-based entity extractor.

    The LLM writes its own extraction code based on the document,
    then the code executes on DOC_VAR (grounded).

    Flow:
    1. Show LLM a preview of the document
    2. LLM writes Python code to extract entities
    3. Code executes on actual document (grounded)
    4. LLM validates/classifies the extracted candidates
    """

    def __init__(
        self,
        llm: Any | None = None,
        repl_environment: "REPLEnvironment | None" = None,
        enable_type_learning: bool = True,
        max_code_attempts: int = 3,
        validate_with_llm: bool = True,
    ):
        """
        Initialize the RLM extractor.

        Args:
            llm: LLM instance. Lazily created via get_llm() when None.
            repl_environment: Optional full REPL environment; when absent a
                LightweightREPL is built per extraction call.
            enable_type_learning: Record unrecognized entity types in the
                learned-type registry.
            max_code_attempts: Max attempts if generated code fails.
            validate_with_llm: Run a second LLM pass to validate candidates.
        """
        self.llm = llm
        self.repl_environment = repl_environment
        self.enable_type_learning = enable_type_learning
        self.max_code_attempts = max_code_attempts
        self.validate_with_llm = validate_with_llm

        # Guard so get_llm() is attempted at most once, even if it
        # returns None (avoids re-initializing on every call).
        self._llm_initialized = False
        self._type_registry = get_learned_type_registry() if enable_type_learning else None

    def _get_llm(self) -> Any:
        """Get or lazily initialize the LLM (single init attempt)."""
        if self.llm is None and not self._llm_initialized:
            self.llm = get_llm()
            self._llm_initialized = True
        return self.llm

    def extract_from_node(
        self,
        node_id: str,
        doc_id: str,
        header: str,
        content: str,
        page_num: int | None = None,
        document_text: str | None = None,
    ) -> ExtractionResult:
        """
        Extract entities using RLM approach.

        The LLM writes code to extract entities, which is then
        executed on the actual document text.

        Args:
            node_id: Section node ID.
            doc_id: Document ID.
            header: Section header.
            content: Section content.
            page_num: Page number.
            document_text: Full document text for DOC_VAR; the section
                content is used when not provided.

        Returns:
            ExtractionResult with extracted entities.
        """
        start_time = time.time()

        result = ExtractionResult(
            node_id=node_id,
            doc_id=doc_id,
            extraction_method="rlm",
        )

        # Sections this short rarely contain meaningful entities; skip the
        # LLM round-trip entirely.
        if len(content.strip()) < 50:
            return result

        llm = self._get_llm()
        if llm is None:
            result.warnings.append("No LLM available for RLM extraction")
            return result

        # STEP 1: LLM generates extraction code; it runs on the real text.
        rlm_result = self._generate_and_execute_code(
            header=header,
            content=content,
            document_text=document_text or content,
        )

        if not rlm_result.code_executed:
            result.warnings.append(f"Code execution failed: {rlm_result.warnings}")
            return result

        # STEP 2: Optional second LLM pass to validate/classify candidates.
        if self.validate_with_llm and rlm_result.raw_candidates:
            validated = self._validate_candidates(rlm_result.raw_candidates)
        else:
            validated = rlm_result.raw_candidates

        # STEP 3: Convert candidate dicts to Entity objects.
        entities = self._candidates_to_entities(
            candidates=validated,
            node_id=node_id,
            doc_id=doc_id,
            content=content,
            page_num=page_num,
        )

        result.entities = entities
        result.processing_time_ms = (time.time() - start_time) * 1000

        logger.info(
            "rlm_extraction_complete",
            node_id=node_id,
            candidates=len(rlm_result.raw_candidates),
            validated=len(entities),
            time_ms=result.processing_time_ms,
        )

        return result

    def _generate_and_execute_code(
        self,
        header: str,
        content: str,
        document_text: str,
    ) -> RLMExtractionResult:
        """Ask the LLM for extraction code and execute it, with retries.

        On failure the executor's error message is appended to the prompt so
        the next attempt can fix the code. Returns an RLMExtractionResult
        whose code_executed flag reflects whether any attempt succeeded.
        """
        result = RLMExtractionResult()

        llm = self._get_llm()

        # Create REPL environment
        repl = self.repl_environment or LightweightREPL(
            document_text=document_text,
            section_content=content,
        )

        # Build the prompt in two steps: a multi-line expression inside a
        # single-quoted f-string is PEP 701 syntax (Python 3.12+); this form
        # works on all supported versions.
        task = RLM_EXTRACTION_PROMPT.format(
            header=header,
            content_preview=content[:2000],
            content_length=len(content),
        )
        prompt = f"{RLM_ENTITY_EXTRACTION_SYSTEM}\n\n{task}"

        for attempt in range(self.max_code_attempts):
            try:
                # LLM generates extraction code.
                response = llm.complete(prompt)
                code = str(response) if not isinstance(response, str) else response
                result.code_generated = code

                # Execute the code against the (grounded) document text.
                exec_result = repl.execute(code)

                if exec_result["success"]:
                    result.code_executed = True
                    result.execution_output = exec_result["output"]
                    result.raw_candidates = exec_result.get("output", [])

                    # Fall back to the stored ENTITIES variable if the
                    # executor did not surface it as output.
                    if not result.raw_candidates and hasattr(repl, 'variables'):
                        result.raw_candidates = repl.variables.get("ENTITIES", [])

                    break
                else:
                    result.warnings.append(f"Attempt {attempt + 1}: {exec_result['error']}")
                    # Feed the error back so the next attempt can self-correct.
                    prompt += f"\n\nPrevious code had error: {exec_result['error']}\nPlease fix and try again."

            except Exception as e:
                # LLM call itself failed; record and retry.
                result.warnings.append(f"Attempt {attempt + 1}: {str(e)}")

        return result

    def _validate_candidates(
        self,
        candidates: list[dict],
    ) -> list[dict]:
        """Validate extracted candidates with the LLM.

        Only the first 30 candidates are sent for validation; candidates
        beyond that (and any the LLM omits) default to valid and pass
        through unchanged. On any parsing/LLM failure the original
        candidates are returned as-is (best effort).
        """
        if not candidates:
            return []

        llm = self._get_llm()

        # Format candidates for validation
        candidates_json = json.dumps([
            {
                "id": i,
                "text": c.get("text", ""),
                "type": c.get("type", "UNKNOWN"),
                "context": c.get("context", "")[:100] if c.get("context") else "",
            }
            for i, c in enumerate(candidates[:30])  # Limit
        ], indent=2)

        prompt = RLM_VALIDATION_PROMPT.format(candidates_json=candidates_json)

        try:
            response = llm.complete(prompt)
            response_text = str(response) if not isinstance(response, str) else response

            # Extract the first JSON array from the (possibly chatty) reply.
            json_match = re.search(r'\[[\s\S]*\]', response_text)
            if not json_match:
                return candidates

            validations = json.loads(json_match.group())

            # Merge validations back into candidates by id; unvalidated
            # candidates default to valid.
            validated = []
            validation_by_id = {v.get("id"): v for v in validations}

            for i, candidate in enumerate(candidates):
                validation = validation_by_id.get(i, {})

                if validation.get("valid", True):
                    candidate["type"] = validation.get("type", candidate.get("type", "OTHER"))
                    candidate["canonical_name"] = validation.get("canonical_name", candidate.get("text", ""))
                    candidate["confidence"] = validation.get("confidence", candidate.get("confidence", 0.5))
                    validated.append(candidate)

            return validated

        except Exception as e:
            logger.warning("rlm_validation_failed", error=str(e))
            return candidates

    def _candidates_to_entities(
        self,
        candidates: list[dict],
        node_id: str,
        doc_id: str,
        content: str,
        page_num: int | None,
    ) -> list[Entity]:
        """Convert validated candidate dicts to Entity objects.

        Candidates without a "text" value are skipped. Types that do not map
        onto EntityType are recorded in the learned-type registry (when
        enabled) and tagged OTHER with the original type kept in metadata.
        """
        entities = []

        for candidate in candidates:
            if not candidate.get("text"):
                continue

            # Map entity type
            entity_type = self._map_entity_type(candidate.get("type", "OTHER"))

            # Learn new types so repeated unknowns can become first-class.
            if entity_type == EntityType.OTHER and self._type_registry:
                self._type_registry.record_type(
                    type_name=candidate.get("type", "unknown").lower(),
                    context=candidate.get("context", content[:100]),
                    entity_name=candidate.get("text", ""),
                )

            # Create mention; the context window is +/-50 chars around the
            # span (positions may be None for ungrounded candidates).
            mention = Mention(
                node_id=node_id,
                doc_id=doc_id,
                span_start=candidate.get("start"),
                span_end=candidate.get("end"),
                context=candidate.get("context", content[
                    max(0, (candidate.get("start") or 0) - 50):
                    (candidate.get("end") or 0) + 50
                ]),
                page_num=page_num,
                confidence=candidate.get("confidence", 0.5),
            )

            # Grounded means the code reported an exact text position.
            metadata = {
                "rlm_extracted": True,
                "grounded": candidate.get("start") is not None,
            }

            if entity_type == EntityType.OTHER:
                metadata["original_type"] = candidate.get("type", "").lower()

            entity = Entity(
                type=entity_type,
                canonical_name=candidate.get("canonical_name", candidate.get("text", "")),
                aliases=[candidate.get("text")] if candidate.get("canonical_name") != candidate.get("text") else [],
                mentions=[mention],
                metadata=metadata,
                source_doc_id=doc_id,
            )
            entities.append(entity)

        return entities

    def _map_entity_type(self, type_str: str) -> EntityType:
        """Map a free-form type string to the EntityType enum.

        First tries a direct enum-value match on the lowercased string, then
        falls back to the synonym table below, then OTHER.
        """
        type_str = type_str.upper()

        mapping = {
            "PERSON": EntityType.PERSON,
            "PEOPLE": EntityType.PERSON,
            "NAME": EntityType.PERSON,
            "ORGANIZATION": EntityType.ORGANIZATION,
            "ORG": EntityType.ORGANIZATION,
            "COMPANY": EntityType.ORGANIZATION,
            "DATE": EntityType.DATE,
            "TIME": EntityType.DATE,
            "MONETARY": EntityType.MONETARY,
            "MONEY": EntityType.MONETARY,
            "AMOUNT": EntityType.MONETARY,
            "LOCATION": EntityType.LOCATION,
            "PLACE": EntityType.LOCATION,
            "ADDRESS": EntityType.LOCATION,
            "REFERENCE": EntityType.REFERENCE,
            "CITATION": EntityType.REFERENCE,
            "DOCUMENT": EntityType.DOCUMENT,
            "EVENT": EntityType.EVENT,
            "LEGAL_CONCEPT": EntityType.LEGAL_CONCEPT,
            "LEGAL": EntityType.LEGAL_CONCEPT,
        }

        try:
            return EntityType(type_str.lower())
        except ValueError:
            return mapping.get(type_str, EntityType.OTHER)
|