rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,357 @@
1
+ """
2
+ RNSR Candidate Extractor
3
+
4
+ Pre-extraction of entity candidates using regex and pattern matching.
5
+ This provides GROUNDED candidates that the LLM then classifies,
6
+ rather than asking the LLM to hallucinate entities from scratch.
7
+
8
+ The flow is:
9
+ 1. Extract candidates using regex/patterns (grounded in actual text)
10
+ 2. LLM classifies and validates candidates (not inventing, labeling)
11
+ 3. Merge and deduplicate
12
+
13
+ This approach prevents hallucination because:
14
+ - Every entity is tied to an exact text span in the document
15
+ - LLM's job is classification, not generation
16
+ - Candidates come from deterministic pattern matching
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import re
22
+ from dataclasses import dataclass, field
23
+ from typing import Any
24
+
25
+ import structlog
26
+
27
+ logger = structlog.get_logger(__name__)
28
+
29
+
30
@dataclass
class EntityCandidate:
    """
    A candidate entity extracted from text before LLM classification.

    This is GROUNDED - it points to exact text in the document: ``text``
    holds the matched span and ``start``/``end`` are its character
    offsets, so the candidate can always be re-located in the source.
    """

    text: str  # Exact text as it appears in the document
    start: int  # Character offset where the match starts
    end: int  # Character offset where the match ends (exclusive, Python slice style)
    candidate_type: str  # Suggested entity type (from the pattern group that matched)
    confidence: float = 0.5  # Pattern match confidence, 0.0-1.0
    context: str = ""  # Surrounding text captured around the match
    pattern_name: str = ""  # Name of the specific pattern that matched
    metadata: dict[str, Any] = field(default_factory=dict)  # Free-form extra info
46
+
47
+
48
# =============================================================================
# Pattern Definitions
# =============================================================================

# Person patterns (names)
PERSON_PATTERNS = [
    # Titles + Names: "Mr. John Smith", "Dr. Jane Doe"
    (r'\b(?:Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Hon\.|Rev\.)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', "title_name"),

    # Full names (First Last): "John Smith", "Mary Jane Watson"
    (r'\b[A-Z][a-z]+\s+(?:[A-Z]\.\s+)?[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\b', "full_name"),

    # Names with suffix: "John Smith Jr.", "Robert Johnson III"
    (r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\s+(?:Jr\.|Sr\.|II|III|IV|Esq\.)\b', "name_suffix"),

    # Role-based: "Plaintiff John Smith", "Defendant ABC Corp"
    (r'\b(?:Plaintiff|Defendant|Petitioner|Respondent|Appellant|Appellee)\s+[A-Z][A-Za-z\s,\.]+?(?=,|\.|;|and\b|\n)', "legal_party"),
]

# Organization patterns
ORGANIZATION_PATTERNS = [
    # Company suffixes: "Acme Inc.", "BigCorp LLC"
    (r'\b[A-Z][A-Za-z\s&]+?(?:Inc\.|LLC|Ltd\.|Corp\.|Corporation|Company|Co\.|L\.P\.|LLP|PLC|GmbH|Pty Ltd)\.?\b', "company_suffix"),

    # "The X Company/Organization/Association"
    (r'\bThe\s+[A-Z][A-Za-z\s]+(?:Company|Corporation|Organization|Association|Foundation|Institute|Agency|Department|Board|Commission|Committee)\b', "org_name"),

    # Courts: "Supreme Court", "District Court of..."
    (r'\b(?:Supreme|District|Circuit|Appeals?|Bankruptcy|Federal|State|County|Municipal)\s+Court(?:\s+of\s+[A-Za-z\s]+)?\b', "court"),

    # Government agencies
    (r'\b(?:Department|Bureau|Office|Agency|Administration)\s+of\s+[A-Z][A-Za-z\s]+\b', "gov_agency"),
]

# Date patterns
DATE_PATTERNS = [
    # Full dates: "January 15, 2024", "15 January 2024"
    (r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b', "full_date"),
    (r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b', "full_date"),

    # Numeric dates: "01/15/2024", "2024-01-15"
    (r'\b\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}\b', "numeric_date"),
    (r'\b\d{4}[/\-]\d{1,2}[/\-]\d{1,2}\b', "iso_date"),

    # Relative dates: "on or about January 2024"
    (r'\b(?:on\s+or\s+about|approximately|around)\s+[A-Z][a-z]+\s+\d{4}\b', "approx_date"),
]

# Monetary patterns
MONETARY_PATTERNS = [
    # Dollar amounts: "$1,234.56", "$1.5 million"
    (r'\$[\d,]+(?:\.\d{2})?\s*(?:million|billion|thousand|M|B|K)?\b', "dollar_amount"),

    # Written amounts: "One Million Dollars"
    (r'\b(?:One|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten|\d+)\s+(?:Hundred|Thousand|Million|Billion)\s+(?:Dollars|dollars|USD)\b', "written_amount"),

    # Currency codes: "USD 1,234", "EUR 500"
    (r'\b(?:USD|EUR|GBP|CAD|AUD)\s*[\d,]+(?:\.\d{2})?\b', "currency_code"),
]

# Location patterns
LOCATION_PATTERNS = [
    # US States
    (r'\b(?:Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\s+Hampshire|New\s+Jersey|New\s+Mexico|New\s+York|North\s+Carolina|North\s+Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode\s+Island|South\s+Carolina|South\s+Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\s+Virginia|Wisconsin|Wyoming)\b', "us_state"),

    # City, State format
    (r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?,\s*(?:AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY)\b', "city_state"),

    # Addresses
    (r'\b\d+\s+[A-Z][A-Za-z\s]+(?:Street|St\.|Avenue|Ave\.|Road|Rd\.|Boulevard|Blvd\.|Drive|Dr\.|Lane|Ln\.|Way|Place|Pl\.)\b', "street_address"),
]

# Reference patterns (legal citations, exhibits)
REFERENCE_PATTERNS = [
    # Section references: "Section 3.2", "§ 12"
    (r'\b(?:Section|§)\s*\d+(?:\.\d+)*\b', "section_ref"),

    # Exhibit references: "Exhibit A", "Attachment 1"
    (r'\b(?:Exhibit|Attachment|Appendix|Schedule|Annex)\s+[A-Z0-9]+\b', "exhibit_ref"),

    # Legal citations: "123 F.3d 456"
    (r'\b\d+\s+[A-Z]\.\s*(?:\d+[a-z]*)?\s+\d+\b', "legal_citation"),

    # Case citations: "Smith v. Jones"
    (r'\b[A-Z][a-z]+\s+v\.\s+[A-Z][a-z]+\b', "case_citation"),
]

# Document patterns
DOCUMENT_PATTERNS = [
    # Agreement types
    (r'\b(?:the\s+)?[A-Z][A-Za-z\s]*(?:Agreement|Contract|Lease|License|Deed|Will|Trust|Policy|Amendment|Addendum)\b', "agreement_type"),

    # Legal documents
    (r'\b(?:Complaint|Motion|Order|Judgment|Verdict|Subpoena|Affidavit|Declaration|Stipulation|Brief)\b', "legal_doc"),
]


def _compile_group(
    patterns: list[tuple[str, str]],
    *,
    ignorecase: bool,
) -> list[tuple[re.Pattern, str]]:
    """Compile one pattern group, optionally case-insensitively."""
    flags = re.IGNORECASE if ignorecase else 0
    return [(re.compile(p, flags), name) for p, name in patterns]


# Compile all patterns.
#
# FIX: previously every group was compiled with re.IGNORECASE, which defeated
# the capitalization anchors ([A-Z][a-z]+) that the person/organization/
# location/document patterns use to target proper nouns - under IGNORECASE,
# any two lowercase words matched "full_name". Those groups are now compiled
# case-sensitively; IGNORECASE is retained only for groups where case does not
# carry meaning (dates, monetary amounts, section/exhibit references).
COMPILED_PATTERNS: dict[str, list[tuple[re.Pattern, str]]] = {
    "person": _compile_group(PERSON_PATTERNS, ignorecase=False),
    "organization": _compile_group(ORGANIZATION_PATTERNS, ignorecase=False),
    "date": _compile_group(DATE_PATTERNS, ignorecase=True),
    "monetary": _compile_group(MONETARY_PATTERNS, ignorecase=True),
    "location": _compile_group(LOCATION_PATTERNS, ignorecase=False),
    "reference": _compile_group(REFERENCE_PATTERNS, ignorecase=True),
    "document": _compile_group(DOCUMENT_PATTERNS, ignorecase=False),
}
155
+
156
+
157
+ # =============================================================================
158
+ # Candidate Extractor
159
+ # =============================================================================
160
+
161
+
162
class CandidateExtractor:
    """
    Extracts entity candidates from text using regex patterns.

    This provides GROUNDED candidates - every candidate points to
    exact text in the document, preventing LLM hallucination. The
    invariant maintained here is ``candidate.text == text[start:end]``
    for every returned candidate.
    """

    def __init__(
        self,
        context_window: int = 100,
        min_confidence: float = 0.3,
        dedupe_overlap_threshold: float = 0.5,
    ):
        """
        Initialize the candidate extractor.

        Args:
            context_window: Characters of context to capture around matches.
            min_confidence: Minimum confidence to include a candidate.
            dedupe_overlap_threshold: Overlap ratio to consider duplicates.
        """
        self.context_window = context_window
        self.min_confidence = min_confidence
        self.dedupe_overlap_threshold = dedupe_overlap_threshold

    def extract_candidates(
        self,
        text: str,
        entity_types: list[str] | None = None,
    ) -> list[EntityCandidate]:
        """
        Extract all entity candidates from text.

        Args:
            text: The text to extract from.
            entity_types: Optional list of types to extract (default: all).

        Returns:
            List of EntityCandidate objects, sorted by position.
        """
        if not text:
            return []

        candidates: list[EntityCandidate] = []
        types_to_check = entity_types or list(COMPILED_PATTERNS.keys())

        for entity_type in types_to_check:
            for pattern, pattern_name in COMPILED_PATTERNS.get(entity_type, []):
                for match in pattern.finditer(text):
                    # Confidence is derived from pattern specificity / length.
                    confidence = self._calculate_confidence(match, pattern_name)
                    if confidence < self.min_confidence:
                        continue

                    # FIX: the stored text is stripped, so start/end must be
                    # adjusted to the stripped span. Previously the raw match
                    # offsets were stored, so a match with leading/trailing
                    # whitespace broke the grounding invariant
                    # (candidate.text != text[start:end]).
                    raw = match.group()
                    stripped = raw.strip()
                    if not stripped:
                        continue  # whitespace-only match carries no entity
                    lead = len(raw) - len(raw.lstrip())
                    span_start = match.start() + lead
                    span_end = span_start + len(stripped)

                    # Capture surrounding context, clamped to text bounds.
                    ctx_start = max(0, span_start - self.context_window)
                    ctx_end = min(len(text), span_end + self.context_window)

                    candidates.append(
                        EntityCandidate(
                            text=stripped,
                            start=span_start,
                            end=span_end,
                            candidate_type=entity_type,
                            confidence=confidence,
                            context=text[ctx_start:ctx_end],
                            pattern_name=pattern_name,
                        )
                    )

        # Deduplicate overlapping spans, then return in document order.
        candidates = self._deduplicate(candidates)
        candidates.sort(key=lambda c: c.start)

        logger.debug(
            "candidates_extracted",
            total=len(candidates),
            by_type={t: sum(1 for c in candidates if c.candidate_type == t)
                     for t in set(c.candidate_type for c in candidates)},
        )

        return candidates

    def _calculate_confidence(
        self,
        match: re.Match,
        pattern_name: str,
    ) -> float:
        """
        Calculate confidence score for a pattern match.

        Highly specific pattern names map to fixed scores; everything else
        starts at 0.5 with a small boost for longer (more specific) matches.
        """
        # Fixed scores for patterns that rarely produce false positives.
        high_confidence_patterns = {
            "title_name": 0.9,
            "company_suffix": 0.85,
            "full_date": 0.9,
            "dollar_amount": 0.95,
            "legal_citation": 0.9,
            "case_citation": 0.95,
            "iso_date": 0.9,
            "court": 0.85,
            "exhibit_ref": 0.9,
            "section_ref": 0.85,
        }
        if pattern_name in high_confidence_patterns:
            return high_confidence_patterns[pattern_name]

        # Length-based heuristic for everything else.
        base_confidence = 0.5
        match_length = len(match.group())
        if match_length > 30:
            base_confidence += 0.2
        elif match_length > 15:
            base_confidence += 0.1

        return min(base_confidence, 1.0)

    def _deduplicate(
        self,
        candidates: list[EntityCandidate],
    ) -> list[EntityCandidate]:
        """
        Remove overlapping candidates, keeping the higher confidence one.

        Candidates are considered in descending confidence order (ties
        broken by longer span); each is kept only if its overlap with every
        already-kept candidate stays at or below the threshold.
        """
        if not candidates:
            return []

        sorted_candidates = sorted(
            candidates,
            key=lambda c: (-c.confidence, -(c.end - c.start)),
        )

        kept: list[EntityCandidate] = []
        for candidate in sorted_candidates:
            overlaps = any(
                self._calculate_overlap(candidate, k) > self.dedupe_overlap_threshold
                for k in kept
            )
            if not overlaps:
                kept.append(candidate)

        return kept

    def _calculate_overlap(
        self,
        c1: EntityCandidate,
        c2: EntityCandidate,
    ) -> float:
        """Calculate overlap ratio between two candidates, relative to the
        shorter span (0.0 = disjoint, 1.0 = shorter span fully contained)."""
        start = max(c1.start, c2.start)
        end = min(c1.end, c2.end)

        if start >= end:
            return 0.0

        overlap = end - start
        min_length = min(c1.end - c1.start, c2.end - c2.start)

        return overlap / min_length if min_length > 0 else 0.0

    def extract_by_type(
        self,
        text: str,
        entity_type: str,
    ) -> list[EntityCandidate]:
        """Extract candidates of a specific type."""
        return self.extract_candidates(text, entity_types=[entity_type])
344
+
345
+
346
def extract_candidates_from_text(text: str) -> list[EntityCandidate]:
    """
    Convenience function to extract all candidates from text.

    Equivalent to calling ``CandidateExtractor().extract_candidates(text)``
    with the default extractor settings.

    Args:
        text: Text to extract from.

    Returns:
        List of EntityCandidate objects.
    """
    return CandidateExtractor().extract_candidates(text)