rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,610 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RNSR Tree of Thoughts Entity Validator
|
|
3
|
+
|
|
4
|
+
Applies the ToT pattern from the RLM Navigator to entity validation:
|
|
5
|
+
|
|
6
|
+
1. Given pre-extracted candidates, evaluate each with probability + reasoning
|
|
7
|
+
2. Navigate the document tree for additional context when uncertain
|
|
8
|
+
3. Make multi-step decisions (like backtracking in document navigation)
|
|
9
|
+
|
|
10
|
+
This prevents hallucination because:
|
|
11
|
+
- Candidates are already grounded in text (from pattern extraction)
|
|
12
|
+
- ToT provides structured evaluation with explicit probabilities
|
|
13
|
+
- Navigation provides additional context for ambiguous cases
|
|
14
|
+
- Same battle-tested pattern used for document Q&A
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import re
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
import structlog
|
|
25
|
+
|
|
26
|
+
from rnsr.extraction.candidate_extractor import EntityCandidate
|
|
27
|
+
from rnsr.extraction.models import Entity, EntityType, Mention
|
|
28
|
+
from rnsr.llm import get_llm
|
|
29
|
+
from rnsr.models import DocumentTree
|
|
30
|
+
|
|
31
|
+
logger = structlog.get_logger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ToT-style prompt for entity validation (mirrors graph.py ToT_SYSTEM_PROMPT pattern).
# Placeholders filled by _validate_batch: {section_header}, {section_content},
# {candidates_formatted}, {selection_threshold}, {rejection_threshold}.
# Doubled braces ({{ }}) emit literal braces in the JSON example.
TOT_ENTITY_VALIDATION_PROMPT = """You are validating entity candidates extracted from a document.

Current Section: {section_header}
Section Content:
---
{section_content}
---

Entity Candidates to Evaluate:
{candidates_formatted}

EVALUATION TASK:
For each candidate, estimate the probability (0.0 to 1.0) that it is a valid,
significant entity worth tracking, AND classify its type.

INSTRUCTIONS:
1. Evaluate: For each candidate, analyze its context and estimate validity probability.
2. Valid entities have: clear identity, specific name, significance to document.
3. Invalid entities are: generic terms, partial matches, noise, common words.
4. If probability >= {selection_threshold}, include in selected_entities.
5. If probability < {rejection_threshold}, mark as rejected.
6. Provide brief reasoning for each decision.
7. Classify type: PERSON, ORGANIZATION, DATE, LOCATION, MONETARY, REFERENCE,
DOCUMENT, EVENT, LEGAL_CONCEPT, or describe a custom type.

OUTPUT FORMAT (JSON):
{{
"evaluations": [
{{
"candidate_id": 0,
"probability": 0.85,
"is_valid": true,
"entity_type": "PERSON",
"canonical_name": "John Smith",
"role": "defendant",
"reasoning": "Clear person name with title, mentioned as party to case"
}},
{{
"candidate_id": 1,
"probability": 0.30,
"is_valid": false,
"entity_type": null,
"canonical_name": null,
"reasoning": "Generic reference to 'the agreement', not a specific entity"
}}
],
"selected_entities": [0],
"needs_more_context": [],
"high_confidence_count": 1,
"low_confidence_count": 1
}}

If uncertain about a candidate (probability 0.4-0.6), add its id to "needs_more_context".
We may navigate to related sections to gather more information.

Respond ONLY with the JSON, no other text."""
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Prompt for re-evaluating a single uncertain candidate with context
# gathered from related sections. Placeholders filled by
# _resolve_single_candidate: {candidate_text}, {type_hint},
# {original_section}, {related_sections}, {candidate_id}.
TOT_CONTEXT_GATHERING_PROMPT = """You need more context to validate an entity candidate.

Entity candidate: "{candidate_text}" (type hint: {type_hint})
Original section: {original_section}

Related sections found:
{related_sections}

Based on this additional context, provide your evaluation:
{{
"candidate_id": {candidate_id},
"probability": 0.XX,
"is_valid": true/false,
"entity_type": "TYPE",
"canonical_name": "Full Name",
"reasoning": "With additional context from section X, this is clearly a..."
}}

Respond ONLY with the JSON, no other text."""
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@dataclass
class TotValidationResult:
    """Result of ToT entity validation (one candidate's evaluation)."""

    candidate_id: int  # global index into the candidate list
    probability: float  # model-estimated validity probability (0.0-1.0)
    is_valid: bool  # True when the candidate was accepted as a real entity
    entity_type: str | None = None  # type label from the LLM (e.g. "PERSON")
    canonical_name: str | None = None  # normalized entity name, if provided
    role: str | None = None  # optional role in the document (e.g. "defendant")
    reasoning: str = ""  # brief LLM justification for the decision
    used_navigation: bool = False  # True if extra tree context was gathered
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@dataclass
class TotBatchResult:
    """Result of validating a batch of candidates."""

    # Per-candidate evaluations (ids are global indices, not batch-local).
    evaluations: list[TotValidationResult] = field(default_factory=list)
    # Ids of candidates accepted as entities.
    selected_entities: list[int] = field(default_factory=list)
    # Ids of uncertain candidates (probability ~0.4-0.6) awaiting more context.
    needs_more_context: list[int] = field(default_factory=list)
    # Count of evaluations with is_valid True.
    high_confidence_count: int = 0
    # Count of evaluations with is_valid False.
    low_confidence_count: int = 0
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class TotEntityValidator:
|
|
141
|
+
"""
|
|
142
|
+
Tree of Thoughts entity validator.
|
|
143
|
+
|
|
144
|
+
Uses the same ToT pattern as document navigation:
|
|
145
|
+
- Evaluate candidates with explicit probabilities
|
|
146
|
+
- Navigate for context when uncertain
|
|
147
|
+
- Structured JSON output for reliable parsing
|
|
148
|
+
"""
|
|
149
|
+
|
|
150
|
+
def __init__(
|
|
151
|
+
self,
|
|
152
|
+
llm: Any | None = None,
|
|
153
|
+
selection_threshold: float = 0.6,
|
|
154
|
+
rejection_threshold: float = 0.3,
|
|
155
|
+
enable_navigation: bool = True,
|
|
156
|
+
max_navigation_depth: int = 2,
|
|
157
|
+
max_candidates_per_batch: int = 20,
|
|
158
|
+
):
|
|
159
|
+
"""
|
|
160
|
+
Initialize the ToT validator.
|
|
161
|
+
|
|
162
|
+
Args:
|
|
163
|
+
llm: LLM instance.
|
|
164
|
+
selection_threshold: Probability threshold for accepting entity.
|
|
165
|
+
rejection_threshold: Probability threshold for rejecting entity.
|
|
166
|
+
enable_navigation: Navigate tree for uncertain candidates.
|
|
167
|
+
max_navigation_depth: Max depth to navigate for context.
|
|
168
|
+
max_candidates_per_batch: Max candidates per LLM call.
|
|
169
|
+
"""
|
|
170
|
+
self.llm = llm
|
|
171
|
+
self.selection_threshold = selection_threshold
|
|
172
|
+
self.rejection_threshold = rejection_threshold
|
|
173
|
+
self.enable_navigation = enable_navigation
|
|
174
|
+
self.max_navigation_depth = max_navigation_depth
|
|
175
|
+
self.max_candidates_per_batch = max_candidates_per_batch
|
|
176
|
+
|
|
177
|
+
self._llm_initialized = False
|
|
178
|
+
|
|
179
|
+
def _get_llm(self) -> Any:
|
|
180
|
+
"""Get or initialize LLM."""
|
|
181
|
+
if self.llm is None and not self._llm_initialized:
|
|
182
|
+
self.llm = get_llm()
|
|
183
|
+
self._llm_initialized = True
|
|
184
|
+
return self.llm
|
|
185
|
+
|
|
186
|
+
def validate_candidates(
|
|
187
|
+
self,
|
|
188
|
+
candidates: list[EntityCandidate],
|
|
189
|
+
section_header: str,
|
|
190
|
+
section_content: str,
|
|
191
|
+
document_tree: DocumentTree | None = None,
|
|
192
|
+
node_id: str | None = None,
|
|
193
|
+
) -> TotBatchResult:
|
|
194
|
+
"""
|
|
195
|
+
Validate entity candidates using ToT reasoning.
|
|
196
|
+
|
|
197
|
+
Args:
|
|
198
|
+
candidates: Pre-extracted candidates to validate.
|
|
199
|
+
section_header: Current section header.
|
|
200
|
+
section_content: Current section content.
|
|
201
|
+
document_tree: Optional tree for navigation.
|
|
202
|
+
node_id: Current node ID for navigation.
|
|
203
|
+
|
|
204
|
+
Returns:
|
|
205
|
+
TotBatchResult with validated entities.
|
|
206
|
+
"""
|
|
207
|
+
if not candidates:
|
|
208
|
+
return TotBatchResult()
|
|
209
|
+
|
|
210
|
+
llm = self._get_llm()
|
|
211
|
+
if llm is None:
|
|
212
|
+
# No LLM - accept all candidates with pattern-based types
|
|
213
|
+
return self._accept_all_candidates(candidates)
|
|
214
|
+
|
|
215
|
+
# Process in batches
|
|
216
|
+
all_results = TotBatchResult()
|
|
217
|
+
|
|
218
|
+
for i in range(0, len(candidates), self.max_candidates_per_batch):
|
|
219
|
+
batch = candidates[i:i + self.max_candidates_per_batch]
|
|
220
|
+
batch_offset = i
|
|
221
|
+
|
|
222
|
+
batch_result = self._validate_batch(
|
|
223
|
+
candidates=batch,
|
|
224
|
+
batch_offset=batch_offset,
|
|
225
|
+
section_header=section_header,
|
|
226
|
+
section_content=section_content,
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
# Merge results
|
|
230
|
+
all_results.evaluations.extend(batch_result.evaluations)
|
|
231
|
+
all_results.selected_entities.extend(batch_result.selected_entities)
|
|
232
|
+
all_results.needs_more_context.extend(batch_result.needs_more_context)
|
|
233
|
+
all_results.high_confidence_count += batch_result.high_confidence_count
|
|
234
|
+
all_results.low_confidence_count += batch_result.low_confidence_count
|
|
235
|
+
|
|
236
|
+
# Handle uncertain candidates if navigation is enabled
|
|
237
|
+
if self.enable_navigation and document_tree and all_results.needs_more_context:
|
|
238
|
+
all_results = self._resolve_uncertain_candidates(
|
|
239
|
+
candidates=candidates,
|
|
240
|
+
batch_result=all_results,
|
|
241
|
+
document_tree=document_tree,
|
|
242
|
+
current_node_id=node_id,
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
return all_results
|
|
246
|
+
|
|
247
|
+
def _validate_batch(
|
|
248
|
+
self,
|
|
249
|
+
candidates: list[EntityCandidate],
|
|
250
|
+
batch_offset: int,
|
|
251
|
+
section_header: str,
|
|
252
|
+
section_content: str,
|
|
253
|
+
) -> TotBatchResult:
|
|
254
|
+
"""Validate a batch of candidates with ToT."""
|
|
255
|
+
# Format candidates for prompt
|
|
256
|
+
candidates_formatted = "\n".join([
|
|
257
|
+
f"[{i + batch_offset}] Text: \"{c.text}\" | Type Hint: {c.candidate_type} | "
|
|
258
|
+
f"Context: \"...{c.context[:100]}...\""
|
|
259
|
+
for i, c in enumerate(candidates)
|
|
260
|
+
])
|
|
261
|
+
|
|
262
|
+
prompt = TOT_ENTITY_VALIDATION_PROMPT.format(
|
|
263
|
+
section_header=section_header,
|
|
264
|
+
section_content=section_content[:2500],
|
|
265
|
+
candidates_formatted=candidates_formatted,
|
|
266
|
+
selection_threshold=self.selection_threshold,
|
|
267
|
+
rejection_threshold=self.rejection_threshold,
|
|
268
|
+
)
|
|
269
|
+
|
|
270
|
+
try:
|
|
271
|
+
response = self.llm.complete(prompt)
|
|
272
|
+
response_text = str(response) if not isinstance(response, str) else response
|
|
273
|
+
|
|
274
|
+
return self._parse_validation_response(response_text, len(candidates), batch_offset)
|
|
275
|
+
|
|
276
|
+
except Exception as e:
|
|
277
|
+
logger.warning("tot_validation_failed", error=str(e))
|
|
278
|
+
return self._accept_all_candidates(candidates, offset=batch_offset)
|
|
279
|
+
|
|
280
|
+
def _parse_validation_response(
|
|
281
|
+
self,
|
|
282
|
+
response_text: str,
|
|
283
|
+
candidate_count: int,
|
|
284
|
+
batch_offset: int,
|
|
285
|
+
) -> TotBatchResult:
|
|
286
|
+
"""Parse ToT validation response."""
|
|
287
|
+
result = TotBatchResult()
|
|
288
|
+
|
|
289
|
+
# Extract JSON
|
|
290
|
+
json_match = re.search(r'\{[\s\S]*\}', response_text)
|
|
291
|
+
if not json_match:
|
|
292
|
+
logger.warning("tot_no_json_found")
|
|
293
|
+
return result
|
|
294
|
+
|
|
295
|
+
try:
|
|
296
|
+
data = json.loads(json_match.group())
|
|
297
|
+
except json.JSONDecodeError as e:
|
|
298
|
+
logger.warning("tot_json_parse_failed", error=str(e))
|
|
299
|
+
return result
|
|
300
|
+
|
|
301
|
+
# Parse evaluations
|
|
302
|
+
for eval_data in data.get("evaluations", []):
|
|
303
|
+
try:
|
|
304
|
+
validation = TotValidationResult(
|
|
305
|
+
candidate_id=eval_data.get("candidate_id", 0),
|
|
306
|
+
probability=float(eval_data.get("probability", 0.5)),
|
|
307
|
+
is_valid=eval_data.get("is_valid", False),
|
|
308
|
+
entity_type=eval_data.get("entity_type"),
|
|
309
|
+
canonical_name=eval_data.get("canonical_name"),
|
|
310
|
+
role=eval_data.get("role"),
|
|
311
|
+
reasoning=eval_data.get("reasoning", ""),
|
|
312
|
+
)
|
|
313
|
+
result.evaluations.append(validation)
|
|
314
|
+
|
|
315
|
+
if validation.is_valid:
|
|
316
|
+
result.high_confidence_count += 1
|
|
317
|
+
else:
|
|
318
|
+
result.low_confidence_count += 1
|
|
319
|
+
|
|
320
|
+
except (KeyError, TypeError, ValueError) as e:
|
|
321
|
+
logger.debug("tot_eval_parse_error", error=str(e))
|
|
322
|
+
continue
|
|
323
|
+
|
|
324
|
+
# Parse selected entities (adjust for batch offset)
|
|
325
|
+
result.selected_entities = [
|
|
326
|
+
idx for idx in data.get("selected_entities", [])
|
|
327
|
+
if isinstance(idx, int)
|
|
328
|
+
]
|
|
329
|
+
|
|
330
|
+
# Parse needs_more_context
|
|
331
|
+
result.needs_more_context = [
|
|
332
|
+
idx for idx in data.get("needs_more_context", [])
|
|
333
|
+
if isinstance(idx, int)
|
|
334
|
+
]
|
|
335
|
+
|
|
336
|
+
return result
|
|
337
|
+
|
|
338
|
+
    def _resolve_uncertain_candidates(
        self,
        candidates: list[EntityCandidate],
        batch_result: TotBatchResult,
        document_tree: DocumentTree,
        current_node_id: str | None,
    ) -> TotBatchResult:
        """
        Navigate document tree to resolve uncertain candidates.

        This is like backtracking in document Q&A - gather more context
        to make a better decision.

        Args:
            candidates: Full (global) candidate list; ids in
                batch_result.needs_more_context index into it.
            batch_result: Aggregated first-pass result; mutated in place.
            document_tree: Tree to walk for additional context sections.
            current_node_id: Node the candidates came from (excluded from
                the related-section search).

        Returns:
            The same batch_result, with uncertain candidates re-evaluated
            and needs_more_context cleared.
        """
        if not batch_result.needs_more_context:
            return batch_result

        logger.info(
            "tot_navigating_for_context",
            uncertain_count=len(batch_result.needs_more_context),
        )

        # Find related sections
        related_sections = self._find_related_sections(
            document_tree=document_tree,
            current_node_id=current_node_id,
            depth=self.max_navigation_depth,
        )

        if not related_sections:
            # No related sections - accept uncertain candidates with lower confidence
            for idx in batch_result.needs_more_context:
                if idx < len(candidates):
                    # Add as selected with moderate confidence
                    batch_result.selected_entities.append(idx)
            return batch_result

        # Re-evaluate uncertain candidates with additional context
        for idx in batch_result.needs_more_context:
            if idx >= len(candidates):
                continue

            candidate = candidates[idx]

            resolved = self._resolve_single_candidate(
                candidate=candidate,
                candidate_id=idx,
                related_sections=related_sections,
            )

            if resolved and resolved.is_valid:
                # Update the evaluation: replace the stale first-pass entry
                # if one exists for this id ...
                for i, eval_item in enumerate(batch_result.evaluations):
                    if eval_item.candidate_id == idx:
                        batch_result.evaluations[i] = resolved
                        break
                else:
                    # ... otherwise (loop finished without break) append it.
                    batch_result.evaluations.append(resolved)

                batch_result.selected_entities.append(idx)
                batch_result.high_confidence_count += 1

        # Clear needs_more_context since we've processed them
        batch_result.needs_more_context = []

        return batch_result
|
|
403
|
+
|
|
404
|
+
def _find_related_sections(
|
|
405
|
+
self,
|
|
406
|
+
document_tree: DocumentTree,
|
|
407
|
+
current_node_id: str | None,
|
|
408
|
+
depth: int,
|
|
409
|
+
) -> list[dict[str, str]]:
|
|
410
|
+
"""Find related sections for context gathering."""
|
|
411
|
+
sections = []
|
|
412
|
+
|
|
413
|
+
if not document_tree or not document_tree.root:
|
|
414
|
+
return sections
|
|
415
|
+
|
|
416
|
+
# Collect sections from tree (siblings and nearby nodes)
|
|
417
|
+
def collect_sections(node: Any, current_depth: int) -> None:
|
|
418
|
+
if current_depth > depth:
|
|
419
|
+
return
|
|
420
|
+
|
|
421
|
+
if hasattr(node, 'header') and hasattr(node, 'content'):
|
|
422
|
+
node_id = getattr(node, 'id', str(id(node)))
|
|
423
|
+
if node_id != current_node_id:
|
|
424
|
+
sections.append({
|
|
425
|
+
"header": node.header or "(no header)",
|
|
426
|
+
"content": (node.content or "")[:500],
|
|
427
|
+
})
|
|
428
|
+
|
|
429
|
+
if hasattr(node, 'children'):
|
|
430
|
+
for child in node.children[:5]: # Limit children
|
|
431
|
+
collect_sections(child, current_depth + 1)
|
|
432
|
+
|
|
433
|
+
collect_sections(document_tree.root, 0)
|
|
434
|
+
|
|
435
|
+
return sections[:10] # Limit total sections
|
|
436
|
+
|
|
437
|
+
def _resolve_single_candidate(
|
|
438
|
+
self,
|
|
439
|
+
candidate: EntityCandidate,
|
|
440
|
+
candidate_id: int,
|
|
441
|
+
related_sections: list[dict[str, str]],
|
|
442
|
+
) -> TotValidationResult | None:
|
|
443
|
+
"""Resolve a single uncertain candidate with additional context."""
|
|
444
|
+
llm = self._get_llm()
|
|
445
|
+
if llm is None:
|
|
446
|
+
return None
|
|
447
|
+
|
|
448
|
+
# Format related sections
|
|
449
|
+
sections_text = "\n\n".join([
|
|
450
|
+
f"### {s['header']}\n{s['content']}"
|
|
451
|
+
for s in related_sections[:5]
|
|
452
|
+
])
|
|
453
|
+
|
|
454
|
+
prompt = TOT_CONTEXT_GATHERING_PROMPT.format(
|
|
455
|
+
candidate_text=candidate.text,
|
|
456
|
+
type_hint=candidate.candidate_type,
|
|
457
|
+
original_section=candidate.context[:200],
|
|
458
|
+
related_sections=sections_text,
|
|
459
|
+
candidate_id=candidate_id,
|
|
460
|
+
)
|
|
461
|
+
|
|
462
|
+
try:
|
|
463
|
+
response = llm.complete(prompt)
|
|
464
|
+
response_text = str(response) if not isinstance(response, str) else response
|
|
465
|
+
|
|
466
|
+
# Parse response
|
|
467
|
+
json_match = re.search(r'\{[\s\S]*\}', response_text)
|
|
468
|
+
if not json_match:
|
|
469
|
+
return None
|
|
470
|
+
|
|
471
|
+
data = json.loads(json_match.group())
|
|
472
|
+
|
|
473
|
+
return TotValidationResult(
|
|
474
|
+
candidate_id=candidate_id,
|
|
475
|
+
probability=float(data.get("probability", 0.5)),
|
|
476
|
+
is_valid=data.get("is_valid", False),
|
|
477
|
+
entity_type=data.get("entity_type"),
|
|
478
|
+
canonical_name=data.get("canonical_name"),
|
|
479
|
+
role=data.get("role"),
|
|
480
|
+
reasoning=data.get("reasoning", ""),
|
|
481
|
+
used_navigation=True,
|
|
482
|
+
)
|
|
483
|
+
|
|
484
|
+
except Exception as e:
|
|
485
|
+
logger.debug("tot_context_resolution_failed", error=str(e))
|
|
486
|
+
return None
|
|
487
|
+
|
|
488
|
+
def _accept_all_candidates(
|
|
489
|
+
self,
|
|
490
|
+
candidates: list[EntityCandidate],
|
|
491
|
+
offset: int = 0,
|
|
492
|
+
) -> TotBatchResult:
|
|
493
|
+
"""Accept all candidates without validation (fallback)."""
|
|
494
|
+
result = TotBatchResult()
|
|
495
|
+
|
|
496
|
+
for i, candidate in enumerate(candidates):
|
|
497
|
+
idx = i + offset
|
|
498
|
+
result.evaluations.append(TotValidationResult(
|
|
499
|
+
candidate_id=idx,
|
|
500
|
+
probability=candidate.confidence,
|
|
501
|
+
is_valid=True,
|
|
502
|
+
entity_type=candidate.candidate_type.upper(),
|
|
503
|
+
canonical_name=candidate.text,
|
|
504
|
+
reasoning="Accepted without LLM validation",
|
|
505
|
+
))
|
|
506
|
+
result.selected_entities.append(idx)
|
|
507
|
+
|
|
508
|
+
result.high_confidence_count = len(candidates)
|
|
509
|
+
return result
|
|
510
|
+
|
|
511
|
+
    def candidates_to_entities(
        self,
        candidates: list[EntityCandidate],
        validation_result: TotBatchResult,
        node_id: str,
        doc_id: str,
        page_num: int | None = None,
    ) -> list[Entity]:
        """
        Convert validated candidates to Entity objects.

        Only includes candidates that passed ToT validation.

        Args:
            candidates: Full candidate list; validation ids index into it.
            validation_result: Output of validate_candidates().
            node_id: Tree node the mentions belong to.
            doc_id: Source document id.
            page_num: Optional page number recorded on each mention.

        Returns:
            One Entity (with a single grounded Mention) per accepted candidate.
        """
        entities = []

        # Build lookup for evaluations
        eval_by_id = {e.candidate_id: e for e in validation_result.evaluations}

        for idx in validation_result.selected_entities:
            # Guard against out-of-range ids.
            if idx >= len(candidates):
                continue

            candidate = candidates[idx]
            evaluation = eval_by_id.get(idx)

            # Selected ids without a matching valid evaluation are skipped.
            if not evaluation or not evaluation.is_valid:
                continue

            # Map entity type (LLM label takes precedence over pattern hint)
            entity_type = self._map_entity_type(
                evaluation.entity_type or candidate.candidate_type
            )

            # Get canonical name
            canonical_name = evaluation.canonical_name or candidate.text

            # Create mention anchored at the candidate's original text span
            mention = Mention(
                node_id=node_id,
                doc_id=doc_id,
                span_start=candidate.start,
                span_end=candidate.end,
                context=candidate.context,
                page_num=page_num,
                confidence=evaluation.probability,
            )

            # Build metadata recording how this entity was validated
            metadata = {
                "grounded": True,
                "tot_validated": True,
                "tot_probability": evaluation.probability,
                "tot_reasoning": evaluation.reasoning,
                "pattern": candidate.pattern_name,
            }

            if evaluation.role:
                metadata["role"] = evaluation.role

            if evaluation.used_navigation:
                metadata["used_context_navigation"] = True

            # Preserve the raw label when it did not map to a known type
            if entity_type == EntityType.OTHER:
                metadata["original_type"] = (evaluation.entity_type or "").lower()

            entity = Entity(
                type=entity_type,
                canonical_name=canonical_name,
                # Keep the surface form as an alias when it differs.
                aliases=[candidate.text] if candidate.text != canonical_name else [],
                mentions=[mention],
                metadata=metadata,
                source_doc_id=doc_id,
            )
            entities.append(entity)

        return entities
|
|
587
|
+
|
|
588
|
+
def _map_entity_type(self, type_str: str) -> EntityType:
|
|
589
|
+
"""Map type string to EntityType enum."""
|
|
590
|
+
type_str = type_str.upper()
|
|
591
|
+
|
|
592
|
+
mapping = {
|
|
593
|
+
"PERSON": EntityType.PERSON,
|
|
594
|
+
"ORGANIZATION": EntityType.ORGANIZATION,
|
|
595
|
+
"ORG": EntityType.ORGANIZATION,
|
|
596
|
+
"DATE": EntityType.DATE,
|
|
597
|
+
"LOCATION": EntityType.LOCATION,
|
|
598
|
+
"MONETARY": EntityType.MONETARY,
|
|
599
|
+
"MONEY": EntityType.MONETARY,
|
|
600
|
+
"REFERENCE": EntityType.REFERENCE,
|
|
601
|
+
"DOCUMENT": EntityType.DOCUMENT,
|
|
602
|
+
"EVENT": EntityType.EVENT,
|
|
603
|
+
"LEGAL_CONCEPT": EntityType.LEGAL_CONCEPT,
|
|
604
|
+
"LEGAL": EntityType.LEGAL_CONCEPT,
|
|
605
|
+
}
|
|
606
|
+
|
|
607
|
+
try:
|
|
608
|
+
return EntityType(type_str.lower())
|
|
609
|
+
except ValueError:
|
|
610
|
+
return mapping.get(type_str, EntityType.OTHER)
|