rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/ingestion/text_builder.py
@@ -0,0 +1,460 @@
+ """
+ Text Builder - Build Document Tree from Raw Text
+
+ This module enables RLM processing for text that doesn't come from PDFs.
+ Used by benchmarks (which provide raw text contexts) and APIs.
+
+ The key insight from the research paper:
+ - Traditional RAG stuffs all context into the LLM prompt → Context Rot
+ - RLM stores document as variable (DOC_VAR) and navigates via summaries
+ - This requires a hierarchical tree structure
+
+ For raw text, we:
+ 1. Apply semantic chunking to create logical segments
+ 2. Use hierarchical clustering to create a tree structure
+ 3. Generate summaries for navigation (skeleton index)
+ 4. Store full text in KV store (the DOC_VAR abstraction)
+
+ Usage:
+     from rnsr.ingestion.text_builder import build_tree_from_text
+
+     # From a single text string
+     tree = build_tree_from_text("Long document text here...")
+
+     # From multiple context chunks (benchmark datasets)
+     tree = build_tree_from_contexts(["context1", "context2", "context3"])
+
+     # Then use with skeleton index as normal
+     skeleton, kv_store = build_skeleton_index(tree)
+     answer = run_navigator(question, skeleton, kv_store)
+ """
+
+ from __future__ import annotations
+
+ import hashlib
+ import re
+ from dataclasses import dataclass, field
+ from typing import Any, Callable, Union
+ from uuid import uuid4
+
+ import structlog
+
+ from rnsr.models import DocumentNode, DocumentTree
+
+ logger = structlog.get_logger(__name__)
+
+
+ @dataclass
+ class TextSegment:
+     """A segment of text with metadata."""
+
+     text: str
+     start_char: int
+     end_char: int
+     level: int = 0
+     header: str = ""
+     segment_id: str = ""
+
+     def __post_init__(self):
+         if not self.segment_id:
+             # Generate stable ID from content hash
+             content_hash = hashlib.md5(self.text[:100].encode()).hexdigest()[:8]
+             self.segment_id = f"seg_{content_hash}"
+
+
+ # =============================================================================
+ # Header Detection Patterns
+ # =============================================================================
+
+ # Common header patterns in text documents
+ HEADER_PATTERNS: list[tuple[str, Union[int, Callable[[re.Match[str]], int]]]] = [
+     # Numbered headers: "1.", "1.1", "1.1.1", etc.
+     (r'^(\d+\.)+\s+(.+)$', 1),  # level based on dot count
+     # Markdown-style: "# Title", "## Subtitle"
+     (r'^(#{1,4})\s+(.+)$', lambda m: len(m.group(1))),
+     # ALL CAPS (likely headers)
+     (r'^([A-Z][A-Z\s]{4,50})$', 1),
+     # "Chapter X:", "Section X:", "Part X:"
+     (r'^(Chapter|Section|Part)\s+\d+[:\.]?\s*(.*)$', 1),
+     # Roman numerals: "I.", "II.", "III."
+     (r'^(I{1,3}|IV|V|VI{1,3}|IX|X)[\.\)]\s+(.+)$', 1),
+ ]
+
+
+ def _detect_headers_in_text(text: str) -> list[tuple[int, str, int]]:
+     """
+     Detect headers in text and their positions.
+
+     Returns list of (char_position, header_text, level).
+     """
+     headers: list[tuple[int, str, int]] = []
+     lines = text.split('\n')
+     char_pos = 0
+
+     for line in lines:
+         stripped = line.strip()
+
+         for pattern, level_info in HEADER_PATTERNS:
+             match = re.match(pattern, stripped)
+             if match:
+                 # Determine level
+                 if callable(level_info):
+                     level = int(level_info(match))
+                 elif pattern.startswith(r'^(\d+\.)+'):
+                     # Count dots for numbered headers
+                     level = stripped.count('.', 0, match.end(1))
+                 else:
+                     level = int(level_info)
+
+                 headers.append((char_pos, stripped, min(level, 3)))
+                 break
+
+         char_pos += len(line) + 1  # +1 for newline
+
+     return headers
+
+
+ # =============================================================================
+ # Semantic Chunking
+ # =============================================================================
+
+ def _semantic_chunk_text(
+     text: str,
+     chunk_size: int = 2000,
+     min_chunk_size: int = 100,
+ ) -> list[TextSegment]:
+     """
+     Chunk text semantically by detecting natural boundaries.
+
+     Strategy:
+     1. First try to split on detected headers
+     2. Then split on paragraph breaks
+     3. Finally split on sentences if still too large
+     """
+     if len(text) < chunk_size:
+         return [TextSegment(text=text, start_char=0, end_char=len(text))]
+
+     segments: list[TextSegment] = []
+     headers = _detect_headers_in_text(text)
+
+     if headers:
+         # Split on headers
+         for i, (pos, header_text, level) in enumerate(headers):
+             start = pos
+             end = headers[i + 1][0] if i + 1 < len(headers) else len(text)
+
+             segment_text = text[start:end].strip()
+             if len(segment_text) >= min_chunk_size:
+                 segments.append(TextSegment(
+                     text=segment_text,
+                     start_char=start,
+                     end_char=end,
+                     level=level,
+                     header=header_text,
+                 ))
+
+         # If we got good segments, use them
+         if len(segments) >= 2:
+             return segments
+
+     # Fallback: split on paragraph breaks
+     paragraphs = re.split(r'\n\n+', text)
+     current_chunk = ""
+     current_start = 0
+     char_pos = 0
+
+     for para in paragraphs:
+         para = para.strip()
+         if not para:
+             continue
+
+         # Check if adding this paragraph exceeds chunk size
+         if len(current_chunk) + len(para) > chunk_size and current_chunk:
+             segments.append(TextSegment(
+                 text=current_chunk,
+                 start_char=current_start,
+                 end_char=char_pos,
+             ))
+             current_chunk = para
+             current_start = char_pos
+         else:
+             if current_chunk:
+                 current_chunk += "\n\n" + para
+             else:
+                 current_chunk = para
+
+         char_pos += len(para) + 2  # Account for \n\n
+
+     # Add final chunk
+     if current_chunk and len(current_chunk) >= min_chunk_size:
+         segments.append(TextSegment(
+             text=current_chunk,
+             start_char=current_start,
+             end_char=len(text),
+         ))
+
+     # If we only got one segment, try harder to split it
+     if len(segments) <= 1 and len(text) > chunk_size:
+         # Split on sentences
+         sentences = re.split(r'(?<=[.!?])\s+', text)
+         segments = []
+         current_chunk = ""
+         current_start = 0
+         char_pos = 0
+
+         for sent in sentences:
+             if len(current_chunk) + len(sent) > chunk_size and current_chunk:
+                 segments.append(TextSegment(
+                     text=current_chunk,
+                     start_char=current_start,
+                     end_char=char_pos,
+                 ))
+                 current_chunk = sent
+                 current_start = char_pos
+             else:
+                 current_chunk = (current_chunk + " " + sent).strip() if current_chunk else sent
+
+             char_pos += len(sent) + 1
+
+         if current_chunk:
+             segments.append(TextSegment(
+                 text=current_chunk,
+                 start_char=current_start,
+                 end_char=len(text),
+             ))
+
+     return segments if segments else [TextSegment(text=text, start_char=0, end_char=len(text))]
+
+
+ # =============================================================================
+ # Tree Building
+ # =============================================================================
+
+ def _build_hierarchy_from_segments(
+     segments: list[TextSegment],
+     max_children: int = 10,
+ ) -> DocumentNode:
+     """
+     Build a hierarchical tree from flat segments.
+
+     Uses segment levels (from headers) to create proper nesting,
+     or creates a balanced tree if no levels detected.
+     """
+     if not segments:
+         return DocumentNode(
+             id="root",
+             content="",
+             header="Document",
+             level=0,
+             children=[],
+         )
+
+     # Check if we have level information
+     has_levels = any(s.level > 0 for s in segments)
+
+     if has_levels:
+         # Build tree using detected levels
+         root = DocumentNode(
+             id="root",
+             content="",
+             header="Document",
+             level=0,
+             children=[],
+         )
+
+         # Stack-based tree building
+         stack: list[DocumentNode] = [root]
+
+         for seg in segments:
+             node = DocumentNode(
+                 id=seg.segment_id,
+                 content=seg.text,
+                 header=seg.header or f"Section {len(root.children) + 1}",
+                 level=seg.level + 1,  # Root is level 0
+                 children=[],
+             )
+
+             # Find appropriate parent
+             while len(stack) > 1 and stack[-1].level >= node.level:
+                 stack.pop()
+
+             stack[-1].children.append(node)
+             stack.append(node)
+
+         return root
+
+     else:
+         # No levels detected - create balanced tree
+         root = DocumentNode(
+             id="root",
+             content="",
+             header="Document",
+             level=0,
+             children=[],
+         )
+
+         if len(segments) <= max_children:
+             # All segments as direct children
+             for i, seg in enumerate(segments):
+                 root.children.append(DocumentNode(
+                     id=seg.segment_id,
+                     content=seg.text,
+                     header=seg.header or f"Section {i + 1}",
+                     level=1,
+                     children=[],
+                 ))
+         else:
+             # Group into intermediate nodes
+             group_size = (len(segments) + max_children - 1) // max_children
+
+             for group_idx in range(0, len(segments), group_size):
+                 group_segments = segments[group_idx:group_idx + group_size]
+
+                 group_node = DocumentNode(
+                     id=f"group_{group_idx}",
+                     content="",
+                     header=f"Part {group_idx // group_size + 1}",
+                     level=1,
+                     children=[],
+                 )
+
+                 for seg in group_segments:
+                     group_node.children.append(DocumentNode(
+                         id=seg.segment_id,
+                         content=seg.text,
+                         header=seg.header or "Segment",
+                         level=2,
+                         children=[],
+                     ))
+
+                 root.children.append(group_node)
+
+         return root
+
+
+ def _count_nodes(node: DocumentNode) -> int:
+     """Count total nodes in tree."""
+     return 1 + sum(_count_nodes(child) for child in node.children)
+
+
+ def _get_tree_depth(node: DocumentNode) -> int:
+     """Get maximum depth of tree."""
+     if not node.children:
+         return 1
+     return 1 + max(_get_tree_depth(child) for child in node.children)
+
+
+ # =============================================================================
+ # Public API
+ # =============================================================================
+
+ def build_tree_from_text(
+     text: str | list[str],
+     chunk_size: int = 2000,
+     generate_summaries: bool = False,
+ ) -> DocumentTree:
+     """
+     Build a document tree from raw text for RLM processing.
+
+     This is the key function that enables RNSR/RLM on non-PDF inputs.
+     The resulting tree can be used with build_skeleton_index() and
+     run_navigator() for full RLM query processing.
+
+     Args:
+         text: Either a single text string or a list of context chunks.
+             Lists are common from benchmark datasets.
+         chunk_size: Target size for semantic chunks (characters).
+         generate_summaries: Whether to generate LLM summaries for
+             navigation. Generally not needed since
+             skeleton_index handles this.
+
+     Returns:
+         DocumentTree suitable for skeleton indexing.
+
+     Example:
+         # From benchmark contexts
+         tree = build_tree_from_text(["ctx1", "ctx2", "ctx3"])
+         skeleton, kv = build_skeleton_index(tree)
+         result = run_navigator("question", skeleton, kv)
+     """
+     # Handle list of contexts (common from benchmarks)
+     if isinstance(text, list):
+         if len(text) == 0:
+             root = DocumentNode(
+                 id="root",
+                 content="",
+                 header="Empty Document",
+                 level=0,
+                 children=[],
+             )
+             return DocumentTree(
+                 id=f"doc_{uuid4().hex[:8]}",
+                 title="Empty Document",
+                 root=root,
+                 total_nodes=1,
+                 ingestion_tier=2,
+             )
+
+         # If contexts are already chunked, use them directly
+         if all(len(ctx) < chunk_size * 2 for ctx in text):
+             segments = [
+                 TextSegment(
+                     text=ctx,
+                     start_char=0,
+                     end_char=len(ctx),
+                     header=f"Context {i + 1}",
+                 )
+                 for i, ctx in enumerate(text)
+                 if ctx.strip()
+             ]
+         else:
+             # Combine and re-chunk
+             combined = "\n\n---\n\n".join(text)
+             segments = _semantic_chunk_text(combined, chunk_size)
+     else:
+         segments = _semantic_chunk_text(text, chunk_size)
+
+     # Build hierarchical tree
+     root = _build_hierarchy_from_segments(segments)
+
+     # Create DocumentTree wrapper
+     tree = DocumentTree(
+         id=f"doc_{uuid4().hex[:8]}",
+         title="Text Document",
+         root=root,
+         total_nodes=_count_nodes(root),
+         ingestion_tier=2,  # Mark as semantic tier (not PDF-based)
+     )
+
+     logger.info(
+         "tree_built_from_text",
+         num_segments=len(segments),
+         tree_depth=_get_tree_depth(root),
+         total_nodes=tree.total_nodes,
+     )
+
+     return tree
+
+
+ def build_tree_from_contexts(
+     contexts: list[str],
+     question: str | None = None,
+ ) -> DocumentTree:
+     """
+     Build a tree optimized for benchmark evaluation.
+
+     This is a convenience wrapper for benchmark datasets that
+     provide pre-chunked context passages.
+
+     Args:
+         contexts: List of context passages from benchmark.
+         question: Optional question for context (unused currently).
+
+     Returns:
+         DocumentTree for skeleton indexing.
+     """
+     return build_tree_from_text(
+         contexts,
+         chunk_size=2000,
+         generate_summaries=False,  # Skip summaries for speed in benchmarks
+     )
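
For orientation, here is a minimal end-to-end sketch of how text_builder.py is meant to be used, assembled from the module docstring above. The import paths for build_skeleton_index and run_navigator are assumptions: the docstring calls them without showing where they live, and rnsr/indexing/skeleton_index.py and rnsr/agent/navigator_api.py in the file list are only plausible homes. Treat this as an illustrative sketch, not the package's documented API.

    # Sketch only: the import locations of build_skeleton_index and
    # run_navigator are assumed from the file list, not confirmed by this diff.
    from rnsr.ingestion.text_builder import build_tree_from_contexts
    from rnsr.indexing.skeleton_index import build_skeleton_index  # assumed location
    from rnsr.agent.navigator_api import run_navigator  # assumed location

    # Benchmark-style input: a list of pre-chunked context passages.
    contexts = [
        "First context passage from a dataset...",
        "Second context passage with more detail...",
    ]

    # 1. Build a hierarchical DocumentTree from raw text (no PDF required).
    tree = build_tree_from_contexts(contexts)

    # 2. Build the skeleton index and KV store (the DOC_VAR abstraction).
    skeleton, kv_store = build_skeleton_index(tree)

    # 3. Navigate the tree via summaries to answer a question instead of
    #    stuffing all of the text into the prompt.
    answer = run_navigator("What does the second passage describe?", skeleton, kv_store)
    print(answer)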