rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,990 @@
1
+ """
2
+ RNSR Unified RLM Extractor
3
+
4
+ The single, comprehensive extractor for BOTH entities AND relationships.
5
+ Always uses the most accurate approach:
6
+
7
+ 1. LLM analyzes document and writes extraction code
8
+ 2. Code executes on DOC_VAR (grounded in actual text)
9
+ 3. ToT validation with probabilities
10
+ 4. Cross-validation between entities and relationships
11
+ 5. Adaptive learning for new types
12
+
13
+ This is the RECOMMENDED extractor - it consolidates all the best
14
+ practices from the RLM paper into a single, unified interface.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import re
21
+ import time
22
+ from dataclasses import dataclass, field
23
+ from typing import Any, TYPE_CHECKING
24
+
25
+ import structlog
26
+
27
+ from rnsr.extraction.models import (
28
+ Entity,
29
+ EntityType,
30
+ ExtractionResult,
31
+ Mention,
32
+ Relationship,
33
+ RelationType,
34
+ )
35
+ from rnsr.extraction.learned_types import (
36
+ get_learned_type_registry,
37
+ get_learned_relationship_type_registry,
38
+ )
39
+ from rnsr.llm import get_llm
40
+
41
+ if TYPE_CHECKING:
42
+ from rnsr.agent.repl_env import REPLEnvironment
43
+ from rnsr.models import DocumentTree
44
+
45
+ logger = structlog.get_logger(__name__)
46
+
47
+
48
+ # =============================================================================
49
+ # Unified RLM Prompts
50
+ # =============================================================================
51
+
52
# Prompt for the code-generation step: instructs the LLM to WRITE Python
# extraction code (executed later inside UnifiedREPL) instead of extracting
# inline. Placeholders filled via str.format:
#   {learned_entity_types}, {learned_relationship_types}
RLM_UNIFIED_SYSTEM_PROMPT = """You are an RLM (Recursive Language Model) extracting entities AND relationships from a document.

CRITICAL: You do NOT have the full document in context. It is stored in DOC_VAR.
You must write Python code to extract both entities and relationships.

## Available Variables:
- DOC_VAR: The document text (string)
- SECTION_CONTENT: Current section content (string)
- KNOWN_ENTITY_TYPES: Entity types the system has learned (list)
- KNOWN_RELATIONSHIP_TYPES: Relationship types the system has learned (list)

## Available Functions:
- search_text(pattern): Search for regex pattern, returns list of (start, end, match)
- re.findall(pattern, text): Standard regex
- re.finditer(pattern, text): Iterate matches with positions
- store_variable(name, content): Store findings

## Your Task:
Write Python code that extracts:
1. ENTITIES: People, organizations, dates, money, locations, legal concepts, etc.
2. RELATIONSHIPS: How entities relate to each other and to the document

## Output Format:
```python
entities = []
relationships = []

# Extract entities with exact text positions
for match in re.finditer(r'pattern', SECTION_CONTENT):
    entities.append({
        "text": match.group(),
        "canonical_name": "Normalized Name",
        "type": "ENTITY_TYPE",
        "start": match.start(),
        "end": match.end(),
        "confidence": 0.9
    })

# Extract relationships between entities
# Look for patterns like "X is affiliated with Y", "X caused Y", etc.
for match in re.finditer(r'(\w+)\s+(?:is|was)\s+(?:employed|hired)\s+by\s+(\w+)', SECTION_CONTENT):
    relationships.append({
        "source_text": match.group(1),
        "target_text": match.group(2),
        "type": "AFFILIATED_WITH",
        "evidence": match.group(),
        "start": match.start(),
        "end": match.end(),
        "confidence": 0.85
    })

store_variable("ENTITIES", entities)
store_variable("RELATIONSHIPS", relationships)
```

## Entity Types:
PERSON, ORGANIZATION, DATE, MONETARY, LOCATION, REFERENCE, DOCUMENT, EVENT, LEGAL_CONCEPT
{learned_entity_types}

## Relationship Types:
MENTIONS, TEMPORAL_BEFORE, TEMPORAL_AFTER, CAUSAL, SUPPORTS, CONTRADICTS,
AFFILIATED_WITH, PARTY_TO, REFERENCES, SUPERSEDES, AMENDS
{learned_relationship_types}

Write code appropriate for this specific document."""


# Per-section prompt appended after the system prompt.
# Placeholders: {header}, {content_preview}, {content_length}.
RLM_UNIFIED_EXTRACTION_PROMPT = """Document section to extract from:

Section Header: {header}
Section Content (first 3000 chars):
---
{content_preview}
---

Total section length: {content_length} characters

Write Python code to extract ALL entities and relationships.
Consider:
1. What types of entities appear? (people, companies, dates, money, etc.)
2. How are entities related? (affiliated_with, party_to, temporal, causal)
3. What domain-specific patterns exist? (legal terms, citations, etc.)

End with:
store_variable("ENTITIES", entities)
store_variable("RELATIONSHIPS", relationships)"""


# Tree-of-Thoughts validation prompt. Literal JSON braces are doubled ({{ }})
# because the string goes through str.format. Placeholders:
#   {entities_json}, {relationships_json}, {content_preview}
RLM_TOT_VALIDATION_PROMPT = """You are validating extracted entities and relationships using Tree of Thoughts reasoning.

## Extracted Entities:
{entities_json}

## Extracted Relationships:
{relationships_json}

## Section Content (for verification):
{content_preview}

VALIDATION TASK:
For each entity and relationship, estimate probability (0.0-1.0) that it is valid.

ENTITY VALIDATION:
- Is this a real, specific entity (not a generic term)?
- Is the type correct?
- Is the canonical_name properly normalized?

RELATIONSHIP VALIDATION:
- Is there actual evidence for this relationship in the text?
- Is the relationship type correct?
- Are source and target correctly identified?

OUTPUT FORMAT (JSON):
{{
  "entity_validations": [
    {{"id": 0, "valid": true, "probability": 0.9, "type": "PERSON", "canonical_name": "John Smith", "reasoning": "Clear person name with title"}},
    {{"id": 1, "valid": false, "probability": 0.2, "reasoning": "Generic term, not specific entity"}}
  ],
  "relationship_validations": [
    {{"id": 0, "valid": true, "probability": 0.85, "type": "AFFILIATED_WITH", "reasoning": "Evidence shows employment relationship"}},
    {{"id": 1, "valid": false, "probability": 0.3, "reasoning": "Co-occurrence but no explicit relationship"}}
  ],
  "cross_validation": {{
    "entities_in_relationships": [0, 2],
    "orphan_relationships": [],
    "confidence_adjustments": []
  }}
}}

Respond ONLY with JSON."""
182
+
183
+
184
+ # =============================================================================
185
+ # Unified REPL for Extraction
186
+ # =============================================================================
187
+
188
class UnifiedREPL:
    """
    Execution environment for LLM-generated extraction code.

    Holds the document/section text plus learned type lists, exposes them
    (with a small set of helpers) in an exec namespace, and collects the
    ENTITIES / RELATIONSHIPS variables the generated code stores.
    """

    def __init__(
        self,
        document_text: str,
        section_content: str = "",
        known_entity_types: list[str] | None = None,
        known_relationship_types: list[str] | None = None,
    ):
        """Capture document state and build the exec namespace once."""
        self.document_text = document_text
        # No explicit section -> operate on the whole document.
        self.section_content = section_content if section_content else document_text
        self.known_entity_types = known_entity_types if known_entity_types else []
        self.known_relationship_types = (
            known_relationship_types if known_relationship_types else []
        )
        self.variables: dict[str, Any] = {}

        self._namespace = self._build_namespace()

    def _build_namespace(self) -> dict[str, Any]:
        """Assemble the names visible to executed extraction code."""
        namespace: dict[str, Any] = {
            # Helper functions exposed to generated code.
            "search_text": self._search_text,
            "store_variable": self._store_variable,
            "get_variable": self._get_variable,
            "re": re,
        }
        # Document state and learned types.
        namespace.update(
            DOC_VAR=self.document_text,
            SECTION_CONTENT=self.section_content,
            KNOWN_ENTITY_TYPES=self.known_entity_types,
            KNOWN_RELATIONSHIP_TYPES=self.known_relationship_types,
            VARIABLES=self.variables,
        )
        # Handy builtins, keyed by their own names.
        for builtin in (
            len, str, int, float, list, dict, set, range,
            enumerate, sorted, min, max, any, all,
        ):
            namespace[builtin.__name__] = builtin
        return namespace

    def _search_text(self, pattern: str) -> list[tuple[int, int, str]]:
        """Case-insensitively search the section; return (start, end, text) triples."""
        try:
            return [
                (m.start(), m.end(), m.group())
                for m in re.finditer(pattern, self.section_content, re.IGNORECASE)
            ]
        except re.error as exc:
            logger.warning("regex_error", pattern=pattern, error=str(exc))
            return []

    def _store_variable(self, name: str, content: Any) -> str:
        """Save *content* under *name* for later retrieval."""
        self.variables[name] = content
        return f"Stored ${name}"

    def _get_variable(self, name: str) -> Any:
        """Look up a previously stored variable (None when absent)."""
        return self.variables.get(name)

    def execute(self, code: str) -> dict[str, Any]:
        """
        Run *code* in the shared namespace and report what it stored.

        Returns a dict with success flag, stored ENTITIES/RELATIONSHIPS,
        error message (if any), and the list of stored variable names.
        """
        outcome: dict[str, Any] = {
            "success": False,
            "entities": [],
            "relationships": [],
            "error": None,
            "variables": list(self.variables.keys()),
        }

        stripped = self._clean_code(code)

        try:
            # NOTE(review): exec of model-written code. The namespace
            # whitelist is not a real sandbox — full builtins remain
            # reachable; upstream must treat generated code accordingly.
            compiled = compile(stripped, "<rlm_unified_extraction>", "exec")
            exec(compiled, self._namespace)
        except Exception as exc:
            outcome["error"] = str(exc)
            logger.warning("rlm_execution_error", error=str(exc), code=stripped[:200])
        else:
            outcome["success"] = True
            outcome["variables"] = list(self.variables.keys())
            outcome["entities"] = self.variables.get("ENTITIES", [])
            outcome["relationships"] = self.variables.get("RELATIONSHIPS", [])

        return outcome

    def _clean_code(self, code: str) -> str:
        """Strip markdown code fences the LLM may wrap around its reply."""
        for fence in (r'^```python\s*', r'^```\s*$'):
            code = re.sub(fence, '', code, flags=re.MULTILINE)
        return code.strip()
295
+
296
+
297
+ # =============================================================================
298
+ # Unified Extraction Result
299
+ # =============================================================================
300
+
301
@dataclass
class RLMUnifiedResult:
    """Result of unified RLM extraction for one document section.

    Carries the final model objects, the raw candidate dicts they were
    built from, the generated extraction code, validation flags, and
    timing/warning diagnostics.
    """

    # Provenance: which section/document this result belongs to.
    node_id: str = ""
    doc_id: str = ""
    # Final (post-validation) model objects.
    entities: list[Entity] = field(default_factory=list)
    relationships: list[Relationship] = field(default_factory=list)

    # Code generation: the Python the LLM wrote, and whether it ran cleanly.
    code_generated: str = ""
    code_executed: bool = False

    # Raw candidates (before validation) — plain dicts as produced by the
    # generated code, kept for debugging/auditing.
    raw_entities: list[dict] = field(default_factory=list)
    raw_relationships: list[dict] = field(default_factory=list)

    # Validation flags: which passes actually ran for this section.
    tot_validated: bool = False
    cross_validated: bool = False

    # Stats
    processing_time_ms: float = 0.0
    warnings: list[str] = field(default_factory=list)
325
+
326
+
327
+ # =============================================================================
328
+ # RLM Unified Extractor
329
+ # =============================================================================
330
+
331
+ class RLMUnifiedExtractor:
332
+ """
333
+ Unified RLM Extractor for entities AND relationships.
334
+
335
+ This is the RECOMMENDED extractor. It uses:
336
+ 1. RLM code generation (LLM writes extraction code)
337
+ 2. ToT validation (probabilities + reasoning)
338
+ 3. Cross-validation between entities and relationships
339
+ 4. Adaptive learning for new types
340
+
341
+ Always grounded - all extractions tied to exact text spans.
342
+ """
343
+
344
+ def __init__(
345
+ self,
346
+ llm: Any | None = None,
347
+ max_code_attempts: int = 3,
348
+ tot_selection_threshold: float = 0.6,
349
+ enable_type_learning: bool = True,
350
+ enable_tot_validation: bool = True,
351
+ enable_cross_validation: bool = True,
352
+ ):
353
+ """
354
+ Initialize the unified extractor.
355
+
356
+ Args:
357
+ llm: LLM instance.
358
+ max_code_attempts: Max attempts if code fails.
359
+ tot_selection_threshold: Threshold for ToT validation.
360
+ enable_type_learning: Learn new entity/relationship types.
361
+ enable_tot_validation: Use ToT for validation.
362
+ enable_cross_validation: Cross-validate entities and relationships.
363
+ """
364
+ self.llm = llm
365
+ self.max_code_attempts = max_code_attempts
366
+ self.tot_selection_threshold = tot_selection_threshold
367
+ self.enable_type_learning = enable_type_learning
368
+ self.enable_tot_validation = enable_tot_validation
369
+ self.enable_cross_validation = enable_cross_validation
370
+
371
+ self._llm_initialized = False
372
+
373
+ # Learned type registries
374
+ self._entity_type_registry = None
375
+ self._relationship_type_registry = None
376
+
377
+ if enable_type_learning:
378
+ self._entity_type_registry = get_learned_type_registry()
379
+ try:
380
+ self._relationship_type_registry = get_learned_relationship_type_registry()
381
+ except Exception:
382
+ # Registry may not exist yet
383
+ self._relationship_type_registry = None
384
+
385
+ def _get_llm(self) -> Any:
386
+ """Get or initialize LLM."""
387
+ if self.llm is None and not self._llm_initialized:
388
+ self.llm = get_llm()
389
+ self._llm_initialized = True
390
+ return self.llm
391
+
392
    def extract(
        self,
        node_id: str,
        doc_id: str,
        header: str,
        content: str,
        page_num: int | None = None,
        document_text: str | None = None,
    ) -> RLMUnifiedResult:
        """
        Extract entities AND relationships using unified RLM approach.

        Flow:
        1. LLM generates extraction code based on document
        2. Code executes on DOC_VAR (grounded)
        3. ToT validates candidates with probabilities
        4. Cross-validation boosts/filters
        5. Learn new types

        Args:
            node_id: Section node ID.
            doc_id: Document ID.
            header: Section header.
            content: Section content.
            page_num: Page number.
            document_text: Full document text for DOC_VAR.

        Returns:
            RLMUnifiedResult with entities and relationships.
        """
        start_time = time.time()

        result = RLMUnifiedResult(
            node_id=node_id,
            doc_id=doc_id,
        )

        # Too little text to extract from.
        # NOTE(review): this early return (and the no-LLM one below) leaves
        # processing_time_ms at its 0.0 default — confirm callers don't rely
        # on it being populated for skipped sections.
        if len(content.strip()) < 50:
            return result

        llm = self._get_llm()
        if llm is None:
            result.warnings.append("No LLM available")
            return result

        # Get learned types for prompt (empty strings when learning is off).
        learned_entity_types = self._get_learned_entity_types()
        learned_relationship_types = self._get_learned_relationship_types()

        # STEP 1: Generate and execute extraction code. DOC_VAR falls back to
        # the section content when no full-document text is supplied.
        exec_result = self._generate_and_execute_code(
            header=header,
            content=content,
            document_text=document_text or content,
            learned_entity_types=learned_entity_types,
            learned_relationship_types=learned_relationship_types,
        )

        result.code_generated = exec_result.get("code", "")
        result.code_executed = exec_result.get("success", False)
        result.raw_entities = exec_result.get("entities", [])
        result.raw_relationships = exec_result.get("relationships", [])

        if not result.code_executed:
            # All code attempts failed — return empty result with diagnostics.
            result.warnings.append(f"Code execution failed: {exec_result.get('error', 'Unknown')}")
            result.processing_time_ms = (time.time() - start_time) * 1000
            return result

        # STEP 2: ToT Validation — filters candidates below the probability
        # threshold and normalizes their type/canonical_name fields.
        if self.enable_tot_validation and (result.raw_entities or result.raw_relationships):
            validated = self._tot_validate(
                entities=result.raw_entities,
                relationships=result.raw_relationships,
                content=content,
            )
            entities = validated.get("entities", result.raw_entities)
            relationships = validated.get("relationships", result.raw_relationships)
            result.tot_validated = True
        else:
            entities = result.raw_entities
            relationships = result.raw_relationships

        # STEP 3: Convert candidate dicts to model objects.
        result.entities = self._candidates_to_entities(
            candidates=entities,
            node_id=node_id,
            doc_id=doc_id,
            content=content,
            page_num=page_num,
        )

        # Relationship endpoints are resolved against the entities just built.
        result.relationships = self._candidates_to_relationships(
            candidates=relationships,
            entities=result.entities,
            node_id=node_id,
            doc_id=doc_id,
        )

        # STEP 4: Cross-validation — mutual confidence boosting between
        # entities and the relationships that reference them.
        if self.enable_cross_validation and result.entities and result.relationships:
            result.entities, result.relationships = self._cross_validate(
                result.entities, result.relationships
            )
            result.cross_validated = True

        # STEP 5: Learn new types from anything mapped to OTHER.
        if self.enable_type_learning:
            self._learn_new_types(result.entities, result.relationships)

        result.processing_time_ms = (time.time() - start_time) * 1000

        logger.info(
            "rlm_unified_extraction_complete",
            node_id=node_id,
            entities=len(result.entities),
            relationships=len(result.relationships),
            time_ms=result.processing_time_ms,
        )

        return result
512
+
513
+ def _get_learned_entity_types(self) -> str:
514
+ """Get learned entity types for prompt."""
515
+ if not self._entity_type_registry:
516
+ return ""
517
+
518
+ types = self._entity_type_registry.get_types_for_prompt()
519
+ if not types:
520
+ return ""
521
+
522
+ return f"\nAlso consider these learned types: {', '.join(types)}"
523
+
524
+ def _get_learned_relationship_types(self) -> str:
525
+ """Get learned relationship types for prompt."""
526
+ if not self._relationship_type_registry:
527
+ return ""
528
+
529
+ try:
530
+ types = self._relationship_type_registry.get_types_for_prompt()
531
+ if not types:
532
+ return ""
533
+ return f"\nAlso consider these learned types: {', '.join(types)}"
534
+ except Exception:
535
+ return ""
536
+
537
+ def _generate_and_execute_code(
538
+ self,
539
+ header: str,
540
+ content: str,
541
+ document_text: str,
542
+ learned_entity_types: str,
543
+ learned_relationship_types: str,
544
+ ) -> dict[str, Any]:
545
+ """Generate extraction code and execute it."""
546
+ llm = self._get_llm()
547
+
548
+ # Get learned types for REPL
549
+ entity_types = []
550
+ relationship_types = []
551
+
552
+ if self._entity_type_registry:
553
+ entity_types = self._entity_type_registry.get_types_for_prompt()
554
+ if self._relationship_type_registry:
555
+ try:
556
+ relationship_types = self._relationship_type_registry.get_types_for_prompt()
557
+ except Exception:
558
+ pass
559
+
560
+ # Create REPL
561
+ repl = UnifiedREPL(
562
+ document_text=document_text,
563
+ section_content=content,
564
+ known_entity_types=entity_types,
565
+ known_relationship_types=relationship_types,
566
+ )
567
+
568
+ # Build prompt
569
+ system_prompt = RLM_UNIFIED_SYSTEM_PROMPT.format(
570
+ learned_entity_types=learned_entity_types,
571
+ learned_relationship_types=learned_relationship_types,
572
+ )
573
+
574
+ extraction_prompt = RLM_UNIFIED_EXTRACTION_PROMPT.format(
575
+ header=header,
576
+ content_preview=content[:3000],
577
+ content_length=len(content),
578
+ )
579
+
580
+ prompt = f"{system_prompt}\n\n{extraction_prompt}"
581
+
582
+ for attempt in range(self.max_code_attempts):
583
+ try:
584
+ # LLM generates code
585
+ response = llm.complete(prompt)
586
+ code = str(response) if not isinstance(response, str) else response
587
+
588
+ # Validate we got actual code
589
+ if not code or len(code.strip()) < 20:
590
+ logger.warning("empty_or_short_code_response", attempt=attempt, length=len(code) if code else 0)
591
+ continue
592
+
593
+ # Check if response looks like code (not just JSON or text)
594
+ if "store_variable" not in code and "entities" not in code.lower():
595
+ logger.warning("response_not_code", attempt=attempt, preview=code[:100])
596
+ prompt += "\n\nPlease respond ONLY with Python code that extracts entities and relationships."
597
+ continue
598
+
599
+ # Execute
600
+ exec_result = repl.execute(code)
601
+
602
+ if exec_result["success"]:
603
+ entities = exec_result.get("entities", [])
604
+ relationships = exec_result.get("relationships", [])
605
+
606
+ # Validate entities are properly structured
607
+ valid_entities = []
608
+ for e in entities:
609
+ if isinstance(e, dict) and e.get("text"):
610
+ valid_entities.append(e)
611
+
612
+ valid_relationships = []
613
+ for r in relationships:
614
+ if isinstance(r, dict) and (r.get("source_text") or r.get("type")):
615
+ valid_relationships.append(r)
616
+
617
+ return {
618
+ "success": True,
619
+ "code": code,
620
+ "entities": valid_entities,
621
+ "relationships": valid_relationships,
622
+ }
623
+ else:
624
+ # Retry with error feedback
625
+ prompt += f"\n\nPrevious code had error: {exec_result['error']}\nPlease fix."
626
+
627
+ except Exception as e:
628
+ logger.warning("code_generation_failed", attempt=attempt, error=str(e))
629
+
630
+ return {"success": False, "error": "Max attempts exceeded"}
631
+
632
    def _tot_validate(
        self,
        entities: list[dict],
        relationships: list[dict],
        content: str,
    ) -> dict[str, list[dict]]:
        """Validate candidates with Tree of Thoughts.

        Sends (truncated) candidate summaries plus a content preview to the
        LLM, parses its JSON verdicts, and keeps only candidates whose
        probability clears self.tot_selection_threshold. On any failure the
        original candidates are returned unchanged (best-effort validation).
        """
        llm = self._get_llm()

        # Cap the payload: only the first 20 entities / 15 relationships are
        # summarized for the LLM. NOTE(review): candidates beyond the cap get
        # no verdict and pass through via the valid=True default below.
        entities_json = json.dumps([
            {"id": i, "text": e.get("text", ""), "type": e.get("type", ""), "confidence": e.get("confidence", 0.5)}
            for i, e in enumerate(entities[:20])
        ], indent=2)

        relationships_json = json.dumps([
            {"id": i, "source": r.get("source_text", ""), "target": r.get("target_text", ""),
             "type": r.get("type", ""), "evidence": r.get("evidence", "")[:100]}
            for i, r in enumerate(relationships[:15])
        ], indent=2)

        prompt = RLM_TOT_VALIDATION_PROMPT.format(
            entities_json=entities_json,
            relationships_json=relationships_json,
            content_preview=content[:2000],
        )

        try:
            response = llm.complete(prompt)
            response_text = str(response) if not isinstance(response, str) else response

            # Clean response - remove markdown code blocks if present.
            response_text = re.sub(r'^```json\s*', '', response_text, flags=re.MULTILINE)
            response_text = re.sub(r'^```\s*$', '', response_text, flags=re.MULTILINE)
            response_text = response_text.strip()

            # Parse JSON - try multiple strategies, cheapest first.
            data = None

            # Strategy 1: Direct parse of the whole reply.
            try:
                data = json.loads(response_text)
            except json.JSONDecodeError:
                pass

            # Strategy 2: Extract the outermost {...} object and parse that.
            if data is None:
                json_match = re.search(r'\{[\s\S]*\}', response_text)
                if json_match:
                    try:
                        data = json.loads(json_match.group())
                    except json.JSONDecodeError:
                        pass

            # Strategy 3: Repair common LLM JSON mistakes (trailing commas
            # before a closing brace/bracket) and retry.
            if data is None:
                fixed = re.sub(r',(\s*[}\]])', r'\1', response_text)
                try:
                    data = json.loads(fixed)
                except json.JSONDecodeError:
                    pass

            if data is None:
                # Unparseable verdicts — keep everything as-is.
                logger.debug("tot_json_parse_failed", response_preview=response_text[:200])
                return {"entities": entities, "relationships": relationships}

            # Index verdicts by candidate id. NOTE(review): a verdict missing
            # its "id" raises KeyError here, which the outer except turns into
            # "return originals" — confirm that is the intended fallback.
            entity_validations = {v["id"]: v for v in data.get("entity_validations", [])}
            relationship_validations = {v["id"]: v for v in data.get("relationship_validations", [])}

            # Filter and update entities: keep a candidate when it is marked
            # valid (default True if unjudged) AND its probability (default
            # 0.5) clears the threshold; fold the verdict into the candidate.
            validated_entities = []
            for i, entity in enumerate(entities):
                validation = entity_validations.get(i, {})
                if validation.get("valid", True) and validation.get("probability", 0.5) >= self.tot_selection_threshold:
                    entity["type"] = validation.get("type", entity.get("type", "OTHER"))
                    entity["canonical_name"] = validation.get("canonical_name", entity.get("canonical_name", entity.get("text", "")))
                    entity["confidence"] = validation.get("probability", entity.get("confidence", 0.5))
                    entity["tot_reasoning"] = validation.get("reasoning", "")
                    validated_entities.append(entity)

            # Filter and update relationships with the same keep rule.
            validated_relationships = []
            for i, rel in enumerate(relationships):
                validation = relationship_validations.get(i, {})
                if validation.get("valid", True) and validation.get("probability", 0.5) >= self.tot_selection_threshold:
                    rel["type"] = validation.get("type", rel.get("type", "MENTIONS"))
                    rel["confidence"] = validation.get("probability", rel.get("confidence", 0.5))
                    rel["tot_reasoning"] = validation.get("reasoning", "")
                    validated_relationships.append(rel)

            return {"entities": validated_entities, "relationships": validated_relationships}

        except Exception as e:
            # Best-effort: validation failures never drop candidates.
            logger.warning("tot_validation_failed", error=str(e))
            return {"entities": entities, "relationships": relationships}
728
+
729
+ def _candidates_to_entities(
730
+ self,
731
+ candidates: list[dict],
732
+ node_id: str,
733
+ doc_id: str,
734
+ content: str,
735
+ page_num: int | None,
736
+ ) -> list[Entity]:
737
+ """Convert candidates to Entity objects."""
738
+ entities = []
739
+
740
+ for candidate in candidates:
741
+ if not candidate.get("text"):
742
+ continue
743
+
744
+ entity_type = self._map_entity_type(candidate.get("type", "OTHER"))
745
+
746
+ mention = Mention(
747
+ node_id=node_id,
748
+ doc_id=doc_id,
749
+ span_start=candidate.get("start"),
750
+ span_end=candidate.get("end"),
751
+ context=content[
752
+ max(0, (candidate.get("start") or 0) - 50):
753
+ (candidate.get("end") or 0) + 50
754
+ ] if candidate.get("start") is not None else "",
755
+ page_num=page_num,
756
+ confidence=candidate.get("confidence", 0.5),
757
+ )
758
+
759
+ metadata = {
760
+ "rlm_extracted": True,
761
+ "grounded": candidate.get("start") is not None,
762
+ "tot_validated": "tot_reasoning" in candidate,
763
+ }
764
+
765
+ if candidate.get("tot_reasoning"):
766
+ metadata["tot_reasoning"] = candidate["tot_reasoning"]
767
+
768
+ if entity_type == EntityType.OTHER:
769
+ metadata["original_type"] = candidate.get("type", "").lower()
770
+
771
+ entity = Entity(
772
+ type=entity_type,
773
+ canonical_name=candidate.get("canonical_name", candidate.get("text", "")),
774
+ aliases=[candidate.get("text")] if candidate.get("canonical_name") != candidate.get("text") else [],
775
+ mentions=[mention],
776
+ metadata=metadata,
777
+ source_doc_id=doc_id,
778
+ )
779
+ entities.append(entity)
780
+
781
+ return entities
782
+
783
+ def _candidates_to_relationships(
784
+ self,
785
+ candidates: list[dict],
786
+ entities: list[Entity],
787
+ node_id: str,
788
+ doc_id: str,
789
+ ) -> list[Relationship]:
790
+ """Convert candidates to Relationship objects."""
791
+ relationships = []
792
+
793
+ # Build entity lookup
794
+ entity_by_text = {}
795
+ for entity in entities:
796
+ entity_by_text[entity.canonical_name.lower()] = entity.id
797
+ for alias in entity.aliases:
798
+ entity_by_text[alias.lower()] = entity.id
799
+
800
+ for candidate in candidates:
801
+ rel_type = self._map_relationship_type(candidate.get("type", "MENTIONS"))
802
+
803
+ # Try to match source/target to entities
804
+ source_text = candidate.get("source_text", "")
805
+ target_text = candidate.get("target_text", "")
806
+
807
+ source_id = entity_by_text.get(source_text.lower(), f"text:{source_text}")
808
+ target_id = entity_by_text.get(target_text.lower(), f"text:{target_text}")
809
+
810
+ source_type = "entity" if source_id in [e.id for e in entities] else "text"
811
+ target_type = "entity" if target_id in [e.id for e in entities] else "text"
812
+
813
+ metadata = {
814
+ "rlm_extracted": True,
815
+ "grounded": candidate.get("start") is not None,
816
+ "tot_validated": "tot_reasoning" in candidate,
817
+ }
818
+
819
+ if candidate.get("tot_reasoning"):
820
+ metadata["tot_reasoning"] = candidate["tot_reasoning"]
821
+
822
+ if rel_type == RelationType.OTHER:
823
+ metadata["original_type"] = candidate.get("type", "").lower()
824
+
825
+ relationship = Relationship(
826
+ type=rel_type,
827
+ source_id=source_id,
828
+ source_type=source_type,
829
+ target_id=target_id,
830
+ target_type=target_type,
831
+ confidence=candidate.get("confidence", 0.5),
832
+ evidence=candidate.get("evidence", ""),
833
+ doc_id=doc_id,
834
+ node_id=node_id,
835
+ metadata=metadata,
836
+ )
837
+ relationships.append(relationship)
838
+
839
+ return relationships
840
+
841
+ def _cross_validate(
842
+ self,
843
+ entities: list[Entity],
844
+ relationships: list[Relationship],
845
+ ) -> tuple[list[Entity], list[Relationship]]:
846
+ """Cross-validate entities and relationships."""
847
+ entity_ids = {e.id for e in entities}
848
+
849
+ # Find entities referenced in relationships
850
+ entities_in_rels = set()
851
+ for rel in relationships:
852
+ if rel.source_type == "entity" and rel.source_id in entity_ids:
853
+ entities_in_rels.add(rel.source_id)
854
+ if rel.target_type == "entity" and rel.target_id in entity_ids:
855
+ entities_in_rels.add(rel.target_id)
856
+
857
+ # Boost confidence for entities in relationships
858
+ for entity in entities:
859
+ if entity.id in entities_in_rels:
860
+ if entity.mentions:
861
+ entity.mentions[0].confidence = min(entity.mentions[0].confidence * 1.1, 1.0)
862
+ entity.metadata["cross_validated"] = True
863
+
864
+ # Boost confidence for relationships with validated entities
865
+ for rel in relationships:
866
+ both_valid = (
867
+ (rel.source_type == "entity" and rel.source_id in entity_ids) and
868
+ (rel.target_type == "entity" and rel.target_id in entity_ids)
869
+ )
870
+ if both_valid:
871
+ rel.confidence = min(rel.confidence * 1.1, 1.0)
872
+ rel.metadata["cross_validated"] = True
873
+
874
+ return entities, relationships
875
+
876
+ def _learn_new_types(
877
+ self,
878
+ entities: list[Entity],
879
+ relationships: list[Relationship],
880
+ ) -> None:
881
+ """Learn new entity and relationship types."""
882
+ # Learn entity types
883
+ if self._entity_type_registry:
884
+ for entity in entities:
885
+ if entity.type == EntityType.OTHER:
886
+ original_type = entity.metadata.get("original_type", "unknown")
887
+ context = entity.mentions[0].context if entity.mentions else ""
888
+ self._entity_type_registry.record_type(
889
+ type_name=original_type,
890
+ context=context,
891
+ entity_name=entity.canonical_name,
892
+ )
893
+
894
+ # Learn relationship types
895
+ if self._relationship_type_registry:
896
+ for rel in relationships:
897
+ if rel.type == RelationType.OTHER:
898
+ original_type = rel.metadata.get("original_type", "unknown")
899
+ try:
900
+ self._relationship_type_registry.record_type(
901
+ type_name=original_type,
902
+ context=rel.evidence,
903
+ relationship_description=f"{rel.source_id} -> {rel.target_id}",
904
+ )
905
+ except Exception:
906
+ pass
907
+
908
+ def _map_entity_type(self, type_str: str) -> EntityType:
909
+ """Map type string to EntityType enum."""
910
+ type_str = type_str.upper()
911
+
912
+ mapping = {
913
+ "PERSON": EntityType.PERSON,
914
+ "ORGANIZATION": EntityType.ORGANIZATION,
915
+ "ORG": EntityType.ORGANIZATION,
916
+ "COMPANY": EntityType.ORGANIZATION,
917
+ "DATE": EntityType.DATE,
918
+ "MONETARY": EntityType.MONETARY,
919
+ "MONEY": EntityType.MONETARY,
920
+ "LOCATION": EntityType.LOCATION,
921
+ "REFERENCE": EntityType.REFERENCE,
922
+ "DOCUMENT": EntityType.DOCUMENT,
923
+ "EVENT": EntityType.EVENT,
924
+ "LEGAL_CONCEPT": EntityType.LEGAL_CONCEPT,
925
+ "LEGAL": EntityType.LEGAL_CONCEPT,
926
+ }
927
+
928
+ try:
929
+ return EntityType(type_str.lower())
930
+ except ValueError:
931
+ return mapping.get(type_str, EntityType.OTHER)
932
+
933
+ def _map_relationship_type(self, type_str: str) -> RelationType:
934
+ """Map type string to RelationType enum."""
935
+ type_str = type_str.upper()
936
+
937
+ mapping = {
938
+ "MENTIONS": RelationType.MENTIONS,
939
+ "TEMPORAL_BEFORE": RelationType.TEMPORAL_BEFORE,
940
+ "TEMPORAL_AFTER": RelationType.TEMPORAL_AFTER,
941
+ "CAUSAL": RelationType.CAUSAL,
942
+ "SUPPORTS": RelationType.SUPPORTS,
943
+ "CONTRADICTS": RelationType.CONTRADICTS,
944
+ "AFFILIATED_WITH": RelationType.AFFILIATED_WITH,
945
+ "PARTY_TO": RelationType.PARTY_TO,
946
+ "REFERENCES": RelationType.REFERENCES,
947
+ "SUPERSEDES": RelationType.SUPERSEDES,
948
+ "AMENDS": RelationType.AMENDS,
949
+ }
950
+
951
+ try:
952
+ return RelationType(type_str.lower())
953
+ except ValueError:
954
+ return mapping.get(type_str, RelationType.OTHER)
955
+
956
+ def to_extraction_result(self, unified_result: RLMUnifiedResult) -> ExtractionResult:
957
+ """Convert to standard ExtractionResult format."""
958
+ return ExtractionResult(
959
+ node_id=unified_result.node_id,
960
+ doc_id=unified_result.doc_id,
961
+ entities=unified_result.entities,
962
+ relationships=unified_result.relationships,
963
+ processing_time_ms=unified_result.processing_time_ms,
964
+ extraction_method="rlm_unified",
965
+ warnings=unified_result.warnings,
966
+ )
967
+
968
+
969
# Convenience function
def extract_entities_and_relationships(
    node_id: str,
    doc_id: str,
    header: str,
    content: str,
    page_num: int | None = None,
) -> RLMUnifiedResult:
    """
    Extract entities and relationships using the unified RLM approach.

    This is the recommended way to extract - always uses the most
    accurate, grounded approach with ToT validation.
    """
    # A fresh extractor per call keeps this helper stateless.
    return RLMUnifiedExtractor().extract(
        node_id=node_id,
        doc_id=doc_id,
        header=header,
        content=content,
        page_num=page_num,
    )