rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,610 @@
1
+ """
2
+ RNSR Tree of Thoughts Entity Validator
3
+
4
+ Applies the ToT pattern from the RLM Navigator to entity validation:
5
+
6
+ 1. Given pre-extracted candidates, evaluate each with probability + reasoning
7
+ 2. Navigate the document tree for additional context when uncertain
8
+ 3. Make multi-step decisions (like backtracking in document navigation)
9
+
10
+ This prevents hallucination because:
11
+ - Candidates are already grounded in text (from pattern extraction)
12
+ - ToT provides structured evaluation with explicit probabilities
13
+ - Navigation provides additional context for ambiguous cases
14
+ - Same battle-tested pattern used for document Q&A
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import re
21
+ from dataclasses import dataclass, field
22
+ from typing import Any
23
+
24
+ import structlog
25
+
26
+ from rnsr.extraction.candidate_extractor import EntityCandidate
27
+ from rnsr.extraction.models import Entity, EntityType, Mention
28
+ from rnsr.llm import get_llm
29
+ from rnsr.models import DocumentTree
30
+
31
+ logger = structlog.get_logger(__name__)
32
+
33
+
34
# ToT-style prompt for entity validation (mirrors graph.py ToT_SYSTEM_PROMPT pattern).
#
# This is a str.format() template. Placeholders: {section_header},
# {section_content}, {candidates_formatted}, {selection_threshold} and
# {rejection_threshold}. The braces of the JSON example are doubled ({{ }})
# so that str.format() emits them as literal braces.
#
# The model is asked to answer with a single JSON object holding the keys
# "evaluations", "selected_entities", "needs_more_context",
# "high_confidence_count" and "low_confidence_count".
TOT_ENTITY_VALIDATION_PROMPT = """You are validating entity candidates extracted from a document.

Current Section: {section_header}
Section Content:
---
{section_content}
---

Entity Candidates to Evaluate:
{candidates_formatted}

EVALUATION TASK:
For each candidate, estimate the probability (0.0 to 1.0) that it is a valid,
significant entity worth tracking, AND classify its type.

INSTRUCTIONS:
1. Evaluate: For each candidate, analyze its context and estimate validity probability.
2. Valid entities have: clear identity, specific name, significance to document.
3. Invalid entities are: generic terms, partial matches, noise, common words.
4. If probability >= {selection_threshold}, include in selected_entities.
5. If probability < {rejection_threshold}, mark as rejected.
6. Provide brief reasoning for each decision.
7. Classify type: PERSON, ORGANIZATION, DATE, LOCATION, MONETARY, REFERENCE,
DOCUMENT, EVENT, LEGAL_CONCEPT, or describe a custom type.

OUTPUT FORMAT (JSON):
{{
"evaluations": [
{{
"candidate_id": 0,
"probability": 0.85,
"is_valid": true,
"entity_type": "PERSON",
"canonical_name": "John Smith",
"role": "defendant",
"reasoning": "Clear person name with title, mentioned as party to case"
}},
{{
"candidate_id": 1,
"probability": 0.30,
"is_valid": false,
"entity_type": null,
"canonical_name": null,
"reasoning": "Generic reference to 'the agreement', not a specific entity"
}}
],
"selected_entities": [0],
"needs_more_context": [],
"high_confidence_count": 1,
"low_confidence_count": 1
}}

If uncertain about a candidate (probability 0.4-0.6), add its id to "needs_more_context".
We may navigate to related sections to gather more information.

Respond ONLY with the JSON, no other text."""
91
+
92
+
93
# Prompt for gathering context from related sections.
#
# This is a str.format() template used to re-evaluate a single uncertain
# candidate. Placeholders: {candidate_text}, {type_hint}, {original_section},
# {related_sections} and {candidate_id}. The braces of the JSON example are
# doubled ({{ }}) so that str.format() emits them as literal braces.
TOT_CONTEXT_GATHERING_PROMPT = """You need more context to validate an entity candidate.

Entity candidate: "{candidate_text}" (type hint: {type_hint})
Original section: {original_section}

Related sections found:
{related_sections}

Based on this additional context, provide your evaluation:
{{
"candidate_id": {candidate_id},
"probability": 0.XX,
"is_valid": true/false,
"entity_type": "TYPE",
"canonical_name": "Full Name",
"reasoning": "With additional context from section X, this is clearly a..."
}}

Respond ONLY with the JSON, no other text."""
113
+
114
+
115
@dataclass
class TotValidationResult:
    """Verdict produced by the ToT validator for a single entity candidate."""

    # Position of the candidate in the list handed to the validator.
    candidate_id: int
    # Estimated likelihood that the candidate is a valid entity.
    probability: float
    # Whether the candidate was judged to be a valid entity.
    is_valid: bool
    # Type label assigned to the candidate, if any.
    entity_type: str | None = None
    # Preferred display name for the entity, if any.
    canonical_name: str | None = None
    # Role of the entity, if one was reported.
    role: str | None = None
    # Short free-text justification for the verdict.
    reasoning: str = ""
    # True when the verdict came from a re-evaluation with extra context.
    used_navigation: bool = False
127
+
128
+
129
@dataclass
class TotBatchResult:
    """Aggregated outcome of validating a group of entity candidates."""

    # One verdict per evaluated candidate.
    evaluations: list[TotValidationResult] = field(default_factory=list)
    # Ids of the candidates chosen as entities.
    selected_entities: list[int] = field(default_factory=list)
    # Ids of the candidates flagged as needing additional context.
    needs_more_context: list[int] = field(default_factory=list)
    # Running tally of verdicts counted as high confidence.
    high_confidence_count: int = 0
    # Running tally of verdicts counted as low confidence.
    low_confidence_count: int = 0
138
+
139
+
140
class TotEntityValidator:
    """
    Tree of Thoughts (ToT) entity validator.

    Pre-extracted candidates are sent to an LLM in batches. The model answers
    with a JSON object giving, per candidate, a probability, a validity flag
    and a type. Candidates the model flags as uncertain can be re-evaluated
    with text gathered from the document tree.

    Candidate ids are positions in the list passed to
    ``validate_candidates``; every id reported by the model is checked
    against that list before it is trusted.
    """

    # Upper bound on the number of sections returned for context gathering.
    _MAX_RELATED_SECTIONS = 10
    # Upper bound on the number of children visited under each tree node.
    _MAX_CHILDREN_PER_NODE = 5
    # Number of characters kept from each related section's content.
    _MAX_SECTION_SNIPPET_CHARS = 500

    def __init__(
        self,
        llm: Any | None = None,
        selection_threshold: float = 0.6,
        rejection_threshold: float = 0.3,
        enable_navigation: bool = True,
        max_navigation_depth: int = 2,
        max_candidates_per_batch: int = 20,
    ):
        """
        Initialize the ToT validator.

        Args:
            llm: LLM instance exposing ``complete(prompt)``. When ``None``,
                one is obtained from ``get_llm()`` on first use.
            selection_threshold: Probability threshold quoted to the model
                for accepting an entity.
            rejection_threshold: Probability threshold quoted to the model
                for rejecting an entity.
            enable_navigation: Re-evaluate uncertain candidates using text
                from the document tree.
            max_navigation_depth: Maximum tree depth visited when gathering
                context (the root is depth 0).
            max_candidates_per_batch: Maximum candidates per LLM call.
                Values below 1 are treated as 1 when batching.
        """
        self.llm = llm
        self.selection_threshold = selection_threshold
        self.rejection_threshold = rejection_threshold
        self.enable_navigation = enable_navigation
        self.max_navigation_depth = max_navigation_depth
        self.max_candidates_per_batch = max_candidates_per_batch

        # Ensures get_llm() is attempted at most once, even if it yields None.
        self._llm_initialized = False

    def _get_llm(self) -> Any:
        """Return the configured LLM, creating it via ``get_llm()`` once if unset."""
        if self.llm is None and not self._llm_initialized:
            self.llm = get_llm()
            self._llm_initialized = True
        return self.llm

    @staticmethod
    def _coerce_probability(raw: Any) -> float:
        """
        Convert a model-supplied probability to a float clamped to [0.0, 1.0].

        Raises:
            TypeError, ValueError: If ``raw`` cannot be converted by ``float()``.
        """
        return min(1.0, max(0.0, float(raw)))

    @staticmethod
    def _coerce_candidate_id(raw: Any) -> int | None:
        """
        Convert a model-supplied candidate id to an ``int``.

        Accepts ints, integral floats and digit-only strings. Returns ``None``
        for anything else, including booleans (which are ``int`` subclasses)
        and missing values.
        """
        if isinstance(raw, bool):
            return None
        if isinstance(raw, int):
            return raw
        if isinstance(raw, float) and raw.is_integer():
            return int(raw)
        if isinstance(raw, str) and raw.strip().isdigit():
            return int(raw.strip())
        return None

    def validate_candidates(
        self,
        candidates: list[EntityCandidate],
        section_header: str,
        section_content: str,
        document_tree: DocumentTree | None = None,
        node_id: str | None = None,
    ) -> TotBatchResult:
        """
        Validate entity candidates using ToT reasoning.

        Args:
            candidates: Pre-extracted candidates to validate.
            section_header: Current section header.
            section_content: Current section content.
            document_tree: Optional tree used to gather extra context.
            node_id: Id of the current node; it is excluded from the
                gathered context.

        Returns:
            TotBatchResult merged across all batches. When no LLM is
            available every candidate is accepted unvalidated.
        """
        if not candidates:
            return TotBatchResult()

        if self._get_llm() is None:
            # No LLM - accept all candidates with pattern-based types.
            return self._accept_all_candidates(candidates)

        all_results = TotBatchResult()
        # Guard against a zero/negative batch size, which would break range().
        batch_size = max(1, self.max_candidates_per_batch)

        for batch_offset in range(0, len(candidates), batch_size):
            batch_result = self._validate_batch(
                candidates=candidates[batch_offset:batch_offset + batch_size],
                batch_offset=batch_offset,
                section_header=section_header,
                section_content=section_content,
            )

            all_results.evaluations.extend(batch_result.evaluations)
            all_results.selected_entities.extend(batch_result.selected_entities)
            all_results.needs_more_context.extend(batch_result.needs_more_context)
            all_results.high_confidence_count += batch_result.high_confidence_count
            all_results.low_confidence_count += batch_result.low_confidence_count

        # Handle uncertain candidates if navigation is enabled.
        if self.enable_navigation and document_tree and all_results.needs_more_context:
            all_results = self._resolve_uncertain_candidates(
                candidates=candidates,
                batch_result=all_results,
                document_tree=document_tree,
                current_node_id=node_id,
            )

        return all_results

    def _validate_batch(
        self,
        candidates: list[EntityCandidate],
        batch_offset: int,
        section_header: str,
        section_content: str,
    ) -> TotBatchResult:
        """
        Validate one batch of candidates with a single LLM call.

        Candidates are labelled in the prompt with their global id
        (``batch_offset`` + position), so the ids in the response refer to
        the full candidate list. If the call or the parsing raises, the
        whole batch is accepted unvalidated.
        """
        candidates_formatted = "\n".join([
            f"[{candidate_id}] Text: \"{c.text}\" | Type Hint: {c.candidate_type} | "
            f"Context: \"...{c.context[:100]}...\""
            for candidate_id, c in enumerate(candidates, start=batch_offset)
        ])

        prompt = TOT_ENTITY_VALIDATION_PROMPT.format(
            section_header=section_header,
            section_content=section_content[:2500],
            candidates_formatted=candidates_formatted,
            selection_threshold=self.selection_threshold,
            rejection_threshold=self.rejection_threshold,
        )

        try:
            response = self._get_llm().complete(prompt)
            response_text = str(response) if not isinstance(response, str) else response

            return self._parse_validation_response(response_text, len(candidates), batch_offset)

        except Exception as e:
            # Best-effort boundary around the LLM: fall back to accepting the batch.
            logger.warning("tot_validation_failed", error=str(e))
            return self._accept_all_candidates(candidates, offset=batch_offset)

    def _parse_validation_response(
        self,
        response_text: str,
        candidate_count: int,
        batch_offset: int,
    ) -> TotBatchResult:
        """
        Parse the JSON answer of a batch validation call.

        Only ids inside ``[batch_offset, batch_offset + candidate_count)`` are
        accepted; entries with a missing, malformed, out-of-batch or repeated
        id are dropped rather than attributed to another candidate.

        Returns:
            The parsed result, or an empty ``TotBatchResult`` when no JSON
            object can be extracted from ``response_text``.
        """
        result = TotBatchResult()

        # Take everything from the first "{" to the last "}".
        json_match = re.search(r'\{[\s\S]*\}', response_text)
        if not json_match:
            logger.warning("tot_no_json_found")
            return result

        try:
            data = json.loads(json_match.group())
        except json.JSONDecodeError as e:
            logger.warning("tot_json_parse_failed", error=str(e))
            return result

        if not isinstance(data, dict):
            logger.warning("tot_no_json_found")
            return result

        batch_ids = range(batch_offset, batch_offset + candidate_count)

        def valid_ids(raw_ids: Any) -> list[int]:
            """Keep in-batch ids, de-duplicated, in their original order."""
            if not isinstance(raw_ids, list):
                return []
            coerced = (self._coerce_candidate_id(raw) for raw in raw_ids)
            return list(dict.fromkeys(
                cid for cid in coerced if cid is not None and cid in batch_ids
            ))

        raw_evaluations = data.get("evaluations")
        if not isinstance(raw_evaluations, list):
            raw_evaluations = []

        seen_ids: set[int] = set()
        for eval_data in raw_evaluations:
            if not isinstance(eval_data, dict):
                logger.debug("tot_eval_parse_error", error="evaluation is not an object")
                continue

            candidate_id = self._coerce_candidate_id(eval_data.get("candidate_id"))
            if candidate_id is None or candidate_id not in batch_ids:
                logger.debug("tot_eval_parse_error", error="candidate_id missing or out of batch")
                continue
            if candidate_id in seen_ids:
                # Keep the first verdict for an id; later repeats are ignored.
                logger.debug("tot_eval_parse_error", error="duplicate candidate_id")
                continue

            try:
                probability = self._coerce_probability(eval_data.get("probability", 0.5))
            except (TypeError, ValueError) as e:
                logger.debug("tot_eval_parse_error", error=str(e))
                continue

            validation = TotValidationResult(
                candidate_id=candidate_id,
                probability=probability,
                is_valid=eval_data.get("is_valid", False),
                entity_type=eval_data.get("entity_type"),
                canonical_name=eval_data.get("canonical_name"),
                role=eval_data.get("role"),
                reasoning=eval_data.get("reasoning", ""),
            )
            seen_ids.add(candidate_id)
            result.evaluations.append(validation)

            if validation.is_valid:
                result.high_confidence_count += 1
            else:
                result.low_confidence_count += 1

        result.selected_entities = valid_ids(data.get("selected_entities"))
        result.needs_more_context = valid_ids(data.get("needs_more_context"))

        return result

    def _resolve_uncertain_candidates(
        self,
        candidates: list[EntityCandidate],
        batch_result: TotBatchResult,
        document_tree: DocumentTree,
        current_node_id: str | None,
    ) -> TotBatchResult:
        """
        Re-evaluate the candidates listed in ``needs_more_context``.

        ``batch_result`` is updated in place and returned. When the tree
        yields no related sections, the uncertain candidates are simply added
        to ``selected_entities``. Otherwise each one is re-evaluated; only a
        re-evaluation that comes back valid replaces the earlier verdict and
        selects the candidate. Ids are never added to ``selected_entities``
        twice.
        """
        if not batch_result.needs_more_context:
            return batch_result

        logger.info(
            "tot_navigating_for_context",
            uncertain_count=len(batch_result.needs_more_context),
        )

        # Drop repeated and out-of-range ids (negative ids would otherwise
        # index from the end of the candidate list).
        uncertain = [
            idx for idx in dict.fromkeys(batch_result.needs_more_context)
            if isinstance(idx, int) and 0 <= idx < len(candidates)
        ]
        already_selected = set(batch_result.selected_entities)

        def select(idx: int) -> None:
            """Add ``idx`` to the selection unless it is already there."""
            if idx not in already_selected:
                already_selected.add(idx)
                batch_result.selected_entities.append(idx)

        related_sections = self._find_related_sections(
            document_tree=document_tree,
            current_node_id=current_node_id,
            depth=self.max_navigation_depth,
        )

        if not related_sections:
            # No extra context available - keep the uncertain candidates.
            for idx in uncertain:
                select(idx)
            return batch_result

        for idx in uncertain:
            resolved = self._resolve_single_candidate(
                candidate=candidates[idx],
                candidate_id=idx,
                related_sections=related_sections,
            )

            if not resolved or not resolved.is_valid:
                continue

            for position, previous in enumerate(batch_result.evaluations):
                if previous.candidate_id == idx:
                    batch_result.evaluations[position] = resolved
                    if not previous.is_valid:
                        # The verdict flipped: move it between the tallies.
                        batch_result.high_confidence_count += 1
                        batch_result.low_confidence_count = max(
                            0, batch_result.low_confidence_count - 1
                        )
                    break
            else:
                batch_result.evaluations.append(resolved)
                batch_result.high_confidence_count += 1

            select(idx)

        # Clear needs_more_context since we've processed them.
        batch_result.needs_more_context = []

        return batch_result

    def _find_related_sections(
        self,
        document_tree: DocumentTree,
        current_node_id: str | None,
        depth: int,
    ) -> list[dict[str, str]]:
        """
        Collect header/content snippets from the document tree.

        The tree is walked depth-first from the root, visiting at most the
        first five children of each node and going no deeper than ``depth``
        (root is depth 0). The node whose id equals ``current_node_id`` is
        skipped, but its children are still visited.

        Returns:
            Up to ten dicts with keys ``"header"`` and ``"content"``
            (content truncated to 500 characters), in visiting order.
        """
        sections: list[dict[str, str]] = []

        if not document_tree or not document_tree.root:
            return sections

        def collect_sections(node: Any, current_depth: int) -> None:
            # Stop as soon as the result is full; later nodes would be discarded.
            if current_depth > depth or len(sections) >= self._MAX_RELATED_SECTIONS:
                return

            if hasattr(node, 'header') and hasattr(node, 'content'):
                node_id = getattr(node, 'id', str(id(node)))
                if node_id != current_node_id:
                    sections.append({
                        "header": node.header or "(no header)",
                        "content": (node.content or "")[:self._MAX_SECTION_SNIPPET_CHARS],
                    })

            # A node may carry children=None; treat that as "no children".
            children = getattr(node, 'children', None) or []
            for child in children[:self._MAX_CHILDREN_PER_NODE]:
                collect_sections(child, current_depth + 1)

        collect_sections(document_tree.root, 0)

        return sections[:self._MAX_RELATED_SECTIONS]

    def _resolve_single_candidate(
        self,
        candidate: EntityCandidate,
        candidate_id: int,
        related_sections: list[dict[str, str]],
    ) -> TotValidationResult | None:
        """
        Re-evaluate one uncertain candidate with additional context.

        Only the first five related sections are put into the prompt.

        Returns:
            A ``TotValidationResult`` with ``used_navigation=True``, or
            ``None`` when no LLM is available, the response contains no JSON
            object, or the call/parsing raises.
        """
        llm = self._get_llm()
        if llm is None:
            return None

        sections_text = "\n\n".join([
            f"### {s['header']}\n{s['content']}"
            for s in related_sections[:5]
        ])

        prompt = TOT_CONTEXT_GATHERING_PROMPT.format(
            candidate_text=candidate.text,
            type_hint=candidate.candidate_type,
            original_section=candidate.context[:200],
            related_sections=sections_text,
            candidate_id=candidate_id,
        )

        try:
            response = llm.complete(prompt)
            response_text = str(response) if not isinstance(response, str) else response

            json_match = re.search(r'\{[\s\S]*\}', response_text)
            if not json_match:
                return None

            data = json.loads(json_match.group())

            return TotValidationResult(
                # The caller's id is authoritative; the echoed id is ignored.
                candidate_id=candidate_id,
                probability=self._coerce_probability(data.get("probability", 0.5)),
                is_valid=data.get("is_valid", False),
                entity_type=data.get("entity_type"),
                canonical_name=data.get("canonical_name"),
                role=data.get("role"),
                reasoning=data.get("reasoning", ""),
                used_navigation=True,
            )

        except Exception as e:
            # Best-effort: an unresolved candidate keeps its earlier verdict.
            logger.debug("tot_context_resolution_failed", error=str(e))
            return None

    def _accept_all_candidates(
        self,
        candidates: list[EntityCandidate],
        offset: int = 0,
    ) -> TotBatchResult:
        """
        Accept all candidates without validation (fallback).

        Each candidate is marked valid with its own confidence as the
        probability, its upper-cased type hint as the type and its text as
        the canonical name. Ids start at ``offset``.
        """
        result = TotBatchResult()

        for idx, candidate in enumerate(candidates, start=offset):
            result.evaluations.append(TotValidationResult(
                candidate_id=idx,
                probability=candidate.confidence,
                is_valid=True,
                entity_type=candidate.candidate_type.upper(),
                canonical_name=candidate.text,
                reasoning="Accepted without LLM validation",
            ))
            result.selected_entities.append(idx)

        result.high_confidence_count = len(candidates)
        return result

    def candidates_to_entities(
        self,
        candidates: list[EntityCandidate],
        validation_result: TotBatchResult,
        node_id: str,
        doc_id: str,
        page_num: int | None = None,
    ) -> list[Entity]:
        """
        Convert validated candidates to Entity objects.

        A candidate becomes an entity only if its id is in
        ``selected_entities``, lies within ``candidates``, and has an
        evaluation marked valid. Each selected id is converted at most once.

        Args:
            candidates: The list that was passed to ``validate_candidates``.
            validation_result: Result returned for that list.
            node_id: Node id recorded on each mention.
            doc_id: Document id recorded on each mention and entity.
            page_num: Optional page number recorded on each mention.

        Returns:
            One Entity, carrying a single Mention, per accepted candidate.
        """
        entities: list[Entity] = []

        eval_by_id = {e.candidate_id: e for e in validation_result.evaluations}

        for idx in dict.fromkeys(validation_result.selected_entities):
            # Reject negative ids too: they would index from the end.
            if not isinstance(idx, int) or not 0 <= idx < len(candidates):
                continue

            candidate = candidates[idx]
            evaluation = eval_by_id.get(idx)

            if not evaluation or not evaluation.is_valid:
                continue

            # Prefer the evaluated type; fall back to the pattern's type hint.
            type_label = evaluation.entity_type or candidate.candidate_type
            entity_type = self._map_entity_type(type_label)

            canonical_name = evaluation.canonical_name or candidate.text

            mention = Mention(
                node_id=node_id,
                doc_id=doc_id,
                span_start=candidate.start,
                span_end=candidate.end,
                context=candidate.context,
                page_num=page_num,
                confidence=evaluation.probability,
            )

            metadata = {
                "grounded": True,
                "tot_validated": True,
                "tot_probability": evaluation.probability,
                "tot_reasoning": evaluation.reasoning,
                "pattern": candidate.pattern_name,
            }

            if evaluation.role:
                metadata["role"] = evaluation.role

            if evaluation.used_navigation:
                metadata["used_context_navigation"] = True

            if entity_type == EntityType.OTHER:
                # Record the label that was actually mapped, not only the
                # evaluated one, so a type-hint fallback is not lost.
                metadata["original_type"] = str(type_label or "").lower()

            entities.append(Entity(
                type=entity_type,
                canonical_name=canonical_name,
                aliases=[candidate.text] if candidate.text != canonical_name else [],
                mentions=[mention],
                metadata=metadata,
                source_doc_id=doc_id,
            ))

        return entities

    def _map_entity_type(self, type_str: str) -> EntityType:
        """
        Map a type string to an EntityType member.

        The lower-cased string is first tried as an EntityType value. If that
        fails, the string is upper-cased with runs of whitespace/hyphens
        turned into underscores and looked up in the alias table, then tried
        once more as an EntityType value. Anything still unmatched, including
        an empty or ``None`` input, maps to ``EntityType.OTHER``.
        """
        label = str(type_str or "").strip()

        try:
            return EntityType(label.lower())
        except ValueError:
            pass

        key = re.sub(r"[\s\-]+", "_", label.upper())

        mapping = {
            "PERSON": EntityType.PERSON,
            "ORGANIZATION": EntityType.ORGANIZATION,
            "ORG": EntityType.ORGANIZATION,
            "DATE": EntityType.DATE,
            "LOCATION": EntityType.LOCATION,
            "MONETARY": EntityType.MONETARY,
            "MONEY": EntityType.MONETARY,
            "REFERENCE": EntityType.REFERENCE,
            "DOCUMENT": EntityType.DOCUMENT,
            "EVENT": EntityType.EVENT,
            "LEGAL_CONCEPT": EntityType.LEGAL_CONCEPT,
            "LEGAL": EntityType.LEGAL_CONCEPT,
        }

        if key in mapping:
            return mapping[key]

        try:
            return EntityType(key.lower())
        except ValueError:
            return EntityType.OTHER