rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,417 @@
+ """
+ Semantic Fallback - TIER 2: For Flat Text Documents
+
+ When the Font Histogram Analyzer detects no font variance (flat text),
+ this module uses LlamaIndex's SemanticSplitterNodeParser to generate
+ "synthetic" sections based on embedding shifts.
+
+ Use this fallback when:
+ - Document has uniform font size throughout
+ - No headers can be detected via font analysis
+ - Document is machine-generated with no formatting
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ import fitz
+ import structlog
+
+ from rnsr.models import DocumentNode, DocumentTree
+
+ logger = structlog.get_logger(__name__)
+
+
+ def extract_raw_text(pdf_path: Path | str) -> str:
+     """
+     Extract all text from a PDF as a single string.
+
+     Args:
+         pdf_path: Path to the PDF file.
+
+     Returns:
+         Full text content of the document.
+     """
+     pdf_path = Path(pdf_path)
+     doc = fitz.open(pdf_path)
+
+     # get_text() returns str when called with no args or "text"
+     full_text = "\n\n".join(str(page.get_text()) for page in doc)
+     doc.close()
+
+     return full_text
+
+
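As a quick sanity check of the helper above, a minimal sketch (the sample.pdf path is a placeholder):

    from rnsr.ingestion.semantic_fallback import extract_raw_text

    # Pages are joined with blank lines, so a later split on "\n\n"
    # also breaks at page boundaries.
    text = extract_raw_text("sample.pdf")
    print(len(text), text[:120])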
+ def try_semantic_splitter_ingestion(
+     pdf_path: Path | str,
+     embed_provider: str | None = None,
+ ) -> DocumentTree:
+     """
+     TIER 2 Fallback: Use semantic splitting for flat text documents.
+
+     When Font Histogram detects no font variance, this method:
+     1. Extracts raw text from the PDF
+     2. Uses embedding-based splitting to find natural breaks
+     3. Generates synthetic section headers
+
+     Args:
+         pdf_path: Path to the PDF file.
+         embed_provider: Embedding provider ("openai", "gemini", or None for auto).
+
+     Returns:
+         DocumentTree with synthetic sections.
+     """
+     pdf_path = Path(pdf_path)
+
+     logger.info("using_semantic_splitter", path=str(pdf_path))
+
+     # Extract raw text
+     full_text = extract_raw_text(pdf_path)
+
+     if not full_text.strip():
+         logger.warning("no_text_extracted", path=str(pdf_path))
+         # Return minimal tree
+         root = DocumentNode(id="root", level=0, header="Document")
+         return DocumentTree(
+             title="Empty Document",
+             root=root,
+             total_nodes=1,
+             ingestion_tier=2,
+             ingestion_method="semantic_splitter",
+         )
+
+     # Try to import LlamaIndex components
+     try:
+         from llama_index.core import Document
+         from llama_index.core.node_parser import SemanticSplitterNodeParser
+
+         # Get embedding model (supports OpenAI, Gemini, auto-detect)
+         embed_model = _get_embedding_model(embed_provider)
+
+         # Create semantic splitter
+         splitter = SemanticSplitterNodeParser(
+             embed_model=embed_model,
+             breakpoint_percentile_threshold=95,
+             buffer_size=1,
+         )
+
+         # Split document
+         llama_doc = Document(text=full_text)
+         nodes = splitter.get_nodes_from_documents([llama_doc])
+
+         logger.info(
+             "semantic_split_complete",
+             chunks=len(nodes),
+         )
+
+         # Build tree from semantic chunks
+         return _build_tree_from_semantic_nodes(nodes, pdf_path.stem)
+
+     except ImportError as e:
+         logger.warning(
+             "llama_index_not_available",
+             error=str(e),
+             fallback="simple_chunking",
+         )
+         # Fall back to simple chunking
+         return _simple_chunk_fallback(full_text, pdf_path.stem)
+
+
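A hedged usage sketch for this TIER 2 entry point, assuming a flat-text contract.pdf and either GOOGLE_API_KEY or OPENAI_API_KEY in the environment:

    from rnsr.ingestion.semantic_fallback import try_semantic_splitter_ingestion

    tree = try_semantic_splitter_ingestion("contract.pdf")
    print(tree.ingestion_tier, tree.ingestion_method)  # 2 semantic_splitter

    # Level-1 nodes are synthetic sections (or parent groups, for longer documents).
    for section in tree.root.children:
        print(section.header, len(section.children))

If llama_index is not installed, the same call transparently returns the simple-chunking tree instead.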
+ def _get_embedding_model(provider: str | None = None):
+     """
+     Get embedding model with multi-provider support.
+
+     Supports: OpenAI, Gemini, auto-detect.
+
+     Args:
+         provider: "openai", "gemini", or None for auto-detect.
+
+     Returns:
+         LlamaIndex-compatible embedding model.
+     """
+     import os
+
+     # Auto-detect provider if not specified
+     if provider is None:
+         if os.getenv("GOOGLE_API_KEY"):
+             provider = "gemini"
+         elif os.getenv("OPENAI_API_KEY"):
+             provider = "openai"
+         else:
+             raise ValueError(
+                 "No embedding API key found. "
+                 "Set GOOGLE_API_KEY or OPENAI_API_KEY."
+             )
+
+     provider = provider.lower()
+
+     if provider == "gemini":
+         try:
+             from llama_index.embeddings.gemini import GeminiEmbedding
+
+             logger.info("using_gemini_embeddings")
+             return GeminiEmbedding(model_name="models/text-embedding-004")
+         except ImportError:
+             raise ImportError(
+                 "Gemini embeddings not installed. "
+                 "Install with: pip install llama-index-embeddings-gemini"
+             )
+
+     elif provider == "openai":
+         try:
+             from llama_index.embeddings.openai import OpenAIEmbedding
+
+             logger.info("using_openai_embeddings")
+             return OpenAIEmbedding(model="text-embedding-3-small")
+         except ImportError:
+             raise ImportError(
+                 "OpenAI embeddings not installed. "
+                 "Install with: pip install llama-index-embeddings-openai"
+             )
+
+     else:
+         raise ValueError(f"Unknown embedding provider: {provider}")
+
+
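Provider resolution above is environment-driven, and an explicit argument always wins over auto-detection. An illustrative sketch (the key value is a placeholder):

    import os

    os.environ["GOOGLE_API_KEY"] = "..."    # placeholder
    model = _get_embedding_model()           # auto-detects -> GeminiEmbedding
    model = _get_embedding_model("openai")   # explicit override -> OpenAIEmbedding
    _get_embedding_model("cohere")           # raises ValueError: unknown provider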
+ def _build_tree_from_semantic_nodes(nodes: list, title: str) -> DocumentTree:
+     """
+     Build a two-level tree structure from semantic splitter nodes.
+
+     This creates a hierarchy for plain text to give the navigator agent
+     a more meaningful structure to traverse.
+     """
+     root = DocumentNode(
+         id="root",
+         level=0,
+         header=title,
+     )
+
+     logger.info("generating_synthetic_headers", count=len(nodes))
+
+     # Helper to generate header (wrapped for potential parallelization later)
+     # For now, we add logging so the user knows it's not frozen
+     def get_header(text, index):
+         h = _generate_synthetic_header(text, index)
+         if h:
+             return h
+         return f"Section {index}"
+
+     # For plain text, create a two-level hierarchy.
+     group_size = 5  # Segments per group
+     if len(nodes) < group_size * 1.5:  # Don't group if it results in tiny groups
+         # Create a flat tree if not enough segments for meaningful grouping
+         for i, node in enumerate(nodes, 1):
+             if i % 5 == 0:
+                 logger.info("processing_node", current=i, total=len(nodes))
+
+             text = node.text.strip()
+             synthetic_header = get_header(text, i)
+             section = DocumentNode(
+                 id=f"sec_{i:03d}",
+                 level=1,
+                 header=synthetic_header,
+                 content=text,
+             )
+             root.children.append(section)
+     else:
+         # Create parent nodes to add hierarchy
+         num_groups = (len(nodes) + group_size - 1) // group_size
+         logger.info("processing_groups", total_groups=num_groups)
+
+         for i in range(num_groups):
+             logger.info("processing_group", current=i + 1, total=num_groups)
+
+             start_index = i * group_size
+             end_index = start_index + group_size
+             group_nodes = nodes[start_index:end_index]
+
+             # Use the header of the first node in the group for the parent
+             parent_header_text = group_nodes[0].text.strip()
+             parent_header = get_header(parent_header_text, i + 1)
+
+             parent_node = DocumentNode(
+                 id=f"group_{i}",
+                 level=1,
+                 header=parent_header,
+             )
+
+             for j, node in enumerate(group_nodes):
+                 text = node.text.strip()
+                 child_header = f"Paragraph {j + 1}"
+
+                 child_node = DocumentNode(
+                     id=f"sec_{(start_index + j):03d}",
+                     level=2,
+                     header=child_header,
+                     content=text,
+                 )
+                 parent_node.children.append(child_node)
+
+             root.children.append(parent_node)
+
+     return DocumentTree(
+         title=title,
+         root=root,
+         total_nodes=1 + len(root.children) + sum(len(c.children) for c in root.children),  # root + groups + leaves
+         ingestion_tier=2,
+         ingestion_method="semantic_splitter",
+     )
+
+
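The grouping thresholds are easiest to see with concrete numbers: with group_size = 5, at least 8 chunks (5 × 1.5 = 7.5) are needed before the two-level hierarchy kicks in. A worked sketch of just that arithmetic:

    group_size = 5

    for n in (4, 7, 8, 23):
        if n < group_size * 1.5:
            print(f"{n} chunks -> flat tree with {n} level-1 sections")
        else:
            num_groups = (n + group_size - 1) // group_size  # ceiling division
            print(f"{n} chunks -> {num_groups} parent groups of up to {group_size}")

    # 4 chunks -> flat tree with 4 level-1 sections
    # 7 chunks -> flat tree with 7 level-1 sections
    # 8 chunks -> 2 parent groups of up to 5
    # 23 chunks -> 5 parent groups of up to 5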
+ def _simple_chunk_fallback(text: str, title: str, chunk_size: int = 1000) -> DocumentTree:
+     """
+     Simple chunking fallback when LlamaIndex is not available.
+
+     Splits text into fixed-size chunks.
+     """
+     logger.info("using_simple_chunking", chunk_size=chunk_size)
+
+     root = DocumentNode(
+         id="root",
+         level=0,
+         header=title,
+     )
+
+     # Split into paragraphs first
+     paragraphs = text.split("\n\n")
+
+     # Group paragraphs into chunks
+     current_chunk = ""
+     chunk_num = 0
+
+     # Helper to generate header safely
+     def get_header(chunk_text, index):
+         h = _generate_synthetic_header(chunk_text, index)
+         if h:
+             return h
+         return f"Section {index}"
+
+     for para in paragraphs:
+         para = para.strip()
+         if not para:
+             continue
+
+         if len(current_chunk) + len(para) > chunk_size:
+             if current_chunk:
+                 chunk_num += 1
+                 if chunk_num % 5 == 0:
+                     logger.info("processing_chunk", current=chunk_num)
+
+                 section = DocumentNode(
+                     id=f"sec_{chunk_num:03d}",
+                     level=1,
+                     header=get_header(current_chunk, chunk_num),
+                     content=current_chunk,
+                 )
+                 root.children.append(section)
+             current_chunk = para
+         else:
+             current_chunk += "\n\n" + para if current_chunk else para
+
+     # Add final chunk
+     if current_chunk:
+         chunk_num += 1
+         section = DocumentNode(
+             id=f"sec_{chunk_num:03d}",
+             level=1,
+             header=get_header(current_chunk, chunk_num),
+             content=current_chunk,
+         )
+         root.children.append(section)
+
+     return DocumentTree(
+         title=title,
+         root=root,
+         total_nodes=chunk_num + 1,
+         ingestion_tier=2,
+         ingestion_method="simple_chunking",
+     )
+
+
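The packing loop above is greedy: a paragraph that would push the running chunk past chunk_size flushes the chunk and starts a new one, and an oversized single paragraph is kept whole rather than split. A small behavioral sketch with hypothetical input:

    paras = ["A" * 600, "B" * 600, "C" * 100]
    # With chunk_size=1000:
    #   "A"*600           -> starts chunk 1 (600 chars)
    #   600 + 600 > 1000  -> flush chunk 1, "B"*600 starts chunk 2
    #   600 + 100 <= 1000 -> chunk 2 becomes "B"*600 + "\n\n" + "C"*100
    tree = _simple_chunk_fallback("\n\n".join(paras), title="demo")
    print(len(tree.root.children))  # 2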
+ def _generate_synthetic_header(text: str, section_num: int) -> str:
+     """
+     Generate a synthetic header from text content using LLM.
+
+     Per the research paper (Section 6.3): "For each identified section,
+     we execute an LLM call with prompt: 'Generate a descriptive,
+     hierarchical title for it. Return ONLY the title.'"
+
+     Falls back to heuristic extraction if LLM fails.
+     """
+     # Try LLM-based header generation first
+     try:
+         header = _generate_header_via_llm(text, section_num)
+         if header:
+             return header
+     except Exception as e:
+         logger.debug("llm_header_generation_failed", error=str(e))
+
+     # Fallback: heuristic extraction
+     return _generate_header_heuristic(text, section_num)
+
+
+ def _generate_header_via_llm(text: str, section_num: int) -> str | None:
+     """
+     Use LLM to generate a concise, descriptive header for a text section.
+
+     This implements the Synthetic Header Generation from Section 4.2.2:
+     "The 'Title' of each node in this semantic tree is generated generatively:
+     we feed the text of the cluster to a summarization LLM with the prompt
+     'Generate a concise 5-word header for this text section.'"
+     """
+     from rnsr.llm import get_llm
+
+     # Truncate text to avoid token limits (first 1500 chars should be enough for context)
+     text_sample = text[:1500] if len(text) > 1500 else text
+
+     prompt = f"""Read the following text segment and generate a descriptive, hierarchical title for it.
+ The title should be concise (3-7 words) and capture the main topic of this section.
+
+ Text:
+ {text_sample}
+
+ Return ONLY the title, nothing else. Example format: "Section 3: Liability Limitations" or "Payment Terms and Conditions" """
+
+     try:
+         # Use centralized provider with retry logic
+         llm = get_llm()
+         # Note: LlamaIndex LLM.complete() usually returns a CompletionResponse,
+         # but our custom Gemini wrapper returns a string. str() handles both.
+         response = llm.complete(prompt)
+         header = str(response).strip().strip('"').strip("'")
+
+         # Validate: should be reasonable length
+         if 3 <= len(header) <= 100:
+             logger.debug("llm_header_generated", header=header[:50])
+             return header
+
+     except Exception as e:
+         logger.debug("synthetic_header_generation_failed", error=str(e))
+
+     return None
+
+
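The response clean-up (strip whitespace and wrapping quotes, then accept only 3-100 characters) can be exercised in isolation; _sanitize below is a hypothetical stand-in for those last few lines:

    def _sanitize(raw: str) -> str | None:
        header = raw.strip().strip('"').strip("'")
        return header if 3 <= len(header) <= 100 else None

    print(_sanitize(' "Payment Terms and Conditions" '))  # Payment Terms and Conditions
    print(_sanitize("OK"))                                # None -- too short, caller falls back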
+ def _generate_header_heuristic(text: str, section_num: int) -> str:
+     """
+     Fallback: Generate header from first sentence/words when LLM unavailable.
+     """
+     # Get first sentence or first N words
+     words = text.split()[:10]
+
+     if not words:
+         return f"Section {section_num}"
+
+     header = " ".join(words)
+
+     # Truncate at sentence end if present
+     for punct in ".!?":
+         if punct in header:
+             header = header.split(punct)[0] + punct
+             break
+
+     # Ensure reasonable length
+     if len(header) > 60:
+         header = header[:57] + "..."
+
+     return header
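A worked example of the heuristic (illustrative input): take the first ten words, cut back to the first sentence boundary if one appears, then cap at 60 characters:

    text = "This Agreement is made on 1 January 2024. The parties agree..."
    print(_generate_header_heuristic(text, section_num=1))
    # -> "This Agreement is made on 1 January 2024."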