rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/ingestion/semantic_fallback.py
@@ -0,0 +1,417 @@
"""
Semantic Fallback - TIER 2: For Flat Text Documents

When the Font Histogram Analyzer detects no font variance (flat text),
this module uses LlamaIndex's SemanticSplitterNodeParser to generate
"synthetic" sections based on embedding shifts.

Use this fallback when:
- Document has uniform font size throughout
- No headers can be detected via font analysis
- Document is machine-generated with no formatting
"""

from __future__ import annotations

from pathlib import Path

import fitz
import structlog

from rnsr.models import DocumentNode, DocumentTree

logger = structlog.get_logger(__name__)


def extract_raw_text(pdf_path: Path | str) -> str:
    """
    Extract all text from a PDF as a single string.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Full text content of the document.
    """
    pdf_path = Path(pdf_path)
    doc = fitz.open(pdf_path)

    # get_text() returns str when called with no args or "text"
    full_text = "\n\n".join(str(page.get_text()) for page in doc)
    doc.close()

    return full_text


def try_semantic_splitter_ingestion(
    pdf_path: Path | str,
    embed_provider: str | None = None,
) -> DocumentTree:
    """
    TIER 2 Fallback: Use semantic splitting for flat text documents.

    When Font Histogram detects no font variance, this method:
    1. Extracts raw text from the PDF
    2. Uses embedding-based splitting to find natural breaks
    3. Generates synthetic section headers

    Args:
        pdf_path: Path to the PDF file.
        embed_provider: Embedding provider ("openai", "gemini", or None for auto).

    Returns:
        DocumentTree with synthetic sections.
    """
    pdf_path = Path(pdf_path)

    logger.info("using_semantic_splitter", path=str(pdf_path))

    # Extract raw text
    full_text = extract_raw_text(pdf_path)

    if not full_text.strip():
        logger.warning("no_text_extracted", path=str(pdf_path))
        # Return minimal tree
        root = DocumentNode(id="root", level=0, header="Document")
        return DocumentTree(
            title="Empty Document",
            root=root,
            total_nodes=1,
            ingestion_tier=2,
            ingestion_method="semantic_splitter",
        )

    # Try to import LlamaIndex components
    try:
        from llama_index.core import Document
        from llama_index.core.node_parser import SemanticSplitterNodeParser

        # Get embedding model (supports OpenAI, Gemini, auto-detect)
        embed_model = _get_embedding_model(embed_provider)

        # Create semantic splitter
        splitter = SemanticSplitterNodeParser(
            embed_model=embed_model,
            breakpoint_percentile_threshold=95,
            buffer_size=1,
        )

        # Split document
        llama_doc = Document(text=full_text)
        nodes = splitter.get_nodes_from_documents([llama_doc])

        logger.info(
            "semantic_split_complete",
            chunks=len(nodes),
        )

        # Build tree from semantic chunks
        return _build_tree_from_semantic_nodes(nodes, pdf_path.stem)

    except ImportError as e:
        logger.warning(
            "llama_index_not_available",
            error=str(e),
            fallback="simple_chunking",
        )
        # Fall back to simple chunking
        return _simple_chunk_fallback(full_text, pdf_path.stem)
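
# Usage (illustrative sketch, not part of the packaged module; "flat_report.pdf"
# is a hypothetical path, and an OPENAI_API_KEY or GOOGLE_API_KEY is assumed to
# be set so the embedding model can be constructed):
#
#     tree = try_semantic_splitter_ingestion("flat_report.pdf")
#     print(tree.ingestion_tier)      # 2
#     print(tree.ingestion_method)    # "semantic_splitter"
#     print(len(tree.root.children))  # number of top-level synthetic sections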


def _get_embedding_model(provider: str | None = None):
    """
    Get embedding model with multi-provider support.

    Supports: OpenAI, Gemini, auto-detect.

    Args:
        provider: "openai", "gemini", or None for auto-detect.

    Returns:
        LlamaIndex-compatible embedding model.
    """
    import os

    # Auto-detect provider if not specified
    if provider is None:
        if os.getenv("GOOGLE_API_KEY"):
            provider = "gemini"
        elif os.getenv("OPENAI_API_KEY"):
            provider = "openai"
        else:
            raise ValueError(
                "No embedding API key found. "
                "Set GOOGLE_API_KEY or OPENAI_API_KEY."
            )

    provider = provider.lower()

    if provider == "gemini":
        try:
            from llama_index.embeddings.gemini import GeminiEmbedding

            logger.info("using_gemini_embeddings")
            return GeminiEmbedding(model_name="models/text-embedding-004")
        except ImportError:
            raise ImportError(
                "Gemini embeddings not installed. "
                "Install with: pip install llama-index-embeddings-gemini"
            )

    elif provider == "openai":
        try:
            from llama_index.embeddings.openai import OpenAIEmbedding

            logger.info("using_openai_embeddings")
            return OpenAIEmbedding(model="text-embedding-3-small")
        except ImportError:
            raise ImportError(
                "OpenAI embeddings not installed. "
                "Install with: pip install llama-index-embeddings-openai"
            )

    else:
        raise ValueError(f"Unknown embedding provider: {provider}")
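
# Provider selection (illustrative; note the auto-detect order above prefers
# Gemini when both GOOGLE_API_KEY and OPENAI_API_KEY are set):
#
#     _get_embedding_model()          # auto-detect from environment variables
#     _get_embedding_model("gemini")  # force models/text-embedding-004
#     _get_embedding_model("openai")  # force text-embedding-3-small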


def _build_tree_from_semantic_nodes(nodes: list, title: str) -> DocumentTree:
    """
    Build a two-level tree structure from semantic splitter nodes.

    This creates a hierarchy for plain text to give the navigator agent
    a more meaningful structure to traverse.
    """
    root = DocumentNode(
        id="root",
        level=0,
        header=title,
    )

    logger.info("generating_synthetic_headers", count=len(nodes))

    # Helper to generate a header (wrapped for potential parallelization later).
    # For now, we add logging so the user knows it's not frozen.
    def get_header(text, index):
        h = _generate_synthetic_header(text, index)
        if h:
            return h
        return f"Section {index}"

    # For plain text, create a two-level hierarchy.
    group_size = 5  # Segments per group
    if len(nodes) < group_size * 1.5:  # Don't group if it results in tiny groups
        # Create a flat tree if there are not enough segments for meaningful grouping
        for i, node in enumerate(nodes, 1):
            if i % 5 == 0:
                logger.info("processing_node", current=i, total=len(nodes))

            text = node.text.strip()
            synthetic_header = get_header(text, i)
            section = DocumentNode(
                id=f"sec_{i:03d}",
                level=1,
                header=synthetic_header,
                content=text,
            )
            root.children.append(section)
    else:
        # Create parent nodes to add hierarchy
        num_groups = (len(nodes) + group_size - 1) // group_size  # ceiling division
        logger.info("processing_groups", total_groups=num_groups)

        for i in range(num_groups):
            logger.info("processing_group", current=i + 1, total=num_groups)

            start_index = i * group_size
            end_index = start_index + group_size
            group_nodes = nodes[start_index:end_index]

            # Use the header of the first node in the group for the parent
            parent_header_text = group_nodes[0].text.strip()
            parent_header = get_header(parent_header_text, i + 1)

            parent_node = DocumentNode(
                id=f"group_{i}",
                level=1,
                header=parent_header,
            )

            for j, node in enumerate(group_nodes):
                text = node.text.strip()
                child_header = f"Paragraph {j + 1}"

                child_node = DocumentNode(
                    id=f"sec_{(start_index + j):03d}",
                    level=2,
                    header=child_header,
                    content=text,
                )
                parent_node.children.append(child_node)

            root.children.append(parent_node)

    return DocumentTree(
        title=title,
        root=root,
        total_nodes=len(nodes) + 1,  # Approximate: the grouped branch also adds parent nodes
        ingestion_tier=2,
        ingestion_method="semantic_splitter",
    )
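
# Grouping arithmetic (illustrative): with group_size = 5, twelve chunks clear
# the 5 * 1.5 threshold, so the grouped branch runs and ceiling division yields
# ceil(12 / 5) = 3 parents holding 5, 5, and 2 children:
#
#     nodes = list(range(12))
#     num_groups = (len(nodes) + 5 - 1) // 5                      # -> 3
#     [len(nodes[i * 5:(i + 1) * 5]) for i in range(num_groups)]  # -> [5, 5, 2]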


def _simple_chunk_fallback(text: str, title: str, chunk_size: int = 1000) -> DocumentTree:
    """
    Simple chunking fallback when LlamaIndex is not available.

    Splits text into fixed-size chunks.
    """
    logger.info("using_simple_chunking", chunk_size=chunk_size)

    root = DocumentNode(
        id="root",
        level=0,
        header=title,
    )

    # Split into paragraphs first
    paragraphs = text.split("\n\n")

    # Group paragraphs into chunks
    current_chunk = ""
    chunk_num = 0

    # Helper to generate a header safely
    def get_header(text, index):
        h = _generate_synthetic_header(text, index)
        if h:
            return h
        return f"Section {index}"

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        if len(current_chunk) + len(para) > chunk_size:
            if current_chunk:
                chunk_num += 1
                if chunk_num % 5 == 0:
                    logger.info("processing_chunk", current=chunk_num)

                section = DocumentNode(
                    id=f"sec_{chunk_num:03d}",
                    level=1,
                    header=get_header(current_chunk, chunk_num),
                    content=current_chunk,
                )
                root.children.append(section)
            current_chunk = para
        else:
            current_chunk += "\n\n" + para if current_chunk else para

    # Add final chunk
    if current_chunk:
        chunk_num += 1
        section = DocumentNode(
            id=f"sec_{chunk_num:03d}",
            level=1,
            header=get_header(current_chunk, chunk_num),
            content=current_chunk,
        )
        root.children.append(section)

    return DocumentTree(
        title=title,
        root=root,
        total_nodes=chunk_num + 1,
        ingestion_tier=2,
        ingestion_method="semantic_splitter",
    )
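
# Chunking behavior (illustrative): paragraphs are packed greedily, so section
# boundaries always fall between paragraphs, never inside one. With
# chunk_size=10 and paragraph lengths 6, 6, and 3, the first paragraph becomes
# sec_001 (adding the second would exceed 10), while the second and third
# share sec_002 (6 + 3 <= 10).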


def _generate_synthetic_header(text: str, section_num: int) -> str:
    """
    Generate a synthetic header from text content using an LLM.

    Per the research paper (Section 6.3): "For each identified section,
    we execute an LLM call with prompt: 'Generate a descriptive,
    hierarchical title for it. Return ONLY the title.'"

    Falls back to heuristic extraction if the LLM fails.
    """
    # Try LLM-based header generation first
    try:
        header = _generate_header_via_llm(text, section_num)
        if header:
            return header
    except Exception as e:
        logger.debug("llm_header_generation_failed", error=str(e))

    # Fallback: heuristic extraction
    return _generate_header_heuristic(text, section_num)


def _generate_header_via_llm(text: str, section_num: int) -> str | None:
    """
    Use an LLM to generate a concise, descriptive header for a text section.

    This implements the Synthetic Header Generation from Section 4.2.2:
    "The 'Title' of each node in this semantic tree is generated generatively:
    we feed the text of the cluster to a summarization LLM with the prompt
    'Generate a concise 5-word header for this text section.'"
    """
    from rnsr.llm import get_llm

    # Truncate text to avoid token limits (the first 1500 chars should be enough context)
    text_sample = text[:1500] if len(text) > 1500 else text

    prompt = f"""Read the following text segment and generate a descriptive, hierarchical title for it.
The title should be concise (3-7 words) and capture the main topic of this section.

Text:
{text_sample}

Return ONLY the title, nothing else. Example format: "Section 3: Liability Limitations" or "Payment Terms and Conditions" """

    try:
        # Use centralized provider with retry logic
        llm = get_llm()
        # Note: LlamaIndex LLM.complete() usually returns a CompletionResponse,
        # but our custom Gemini wrapper returns a string. str() handles both.
        response = llm.complete(prompt)
        header = str(response).strip().strip('"').strip("'")

        # Validate: should be a reasonable length
        if 3 <= len(header) <= 100:
            logger.debug("llm_header_generated", header=header[:50])
            return header

    except Exception as e:
        logger.debug("synthetic_header_generation_failed", error=str(e))

    return None
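
# Response cleanup (illustrative): the strip chain tolerates models that wrap
# the title in quotes:
#
#     str('"Payment Terms and Conditions"').strip().strip('"').strip("'")
#     # -> 'Payment Terms and Conditions'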


def _generate_header_heuristic(text: str, section_num: int) -> str:
    """
    Fallback: Generate header from first sentence/words when LLM unavailable.
    """
    # Get first sentence or first N words
    words = text.split()[:10]

    if not words:
        return f"Section {section_num}"

    header = " ".join(words)

    # Truncate at sentence end if present
    for punct in ".!?":
        if punct in header:
            header = header.split(punct)[0] + punct
            break

    # Ensure reasonable length
    if len(header) > 60:
        header = header[:57] + "..."

    return header
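
Taken together, the module turns a flat PDF into a navigable DocumentTree. A minimal traversal sketch (field names as used in the file above; the walk helper and "flat_report.pdf" are hypothetical):

    from rnsr.ingestion.semantic_fallback import try_semantic_splitter_ingestion

    def walk(node, depth=0):
        # Print the synthetic outline produced by the tier-2 fallback.
        print("  " * depth + f"[{node.id}] {node.header}")
        for child in node.children:
            walk(child, depth + 1)

    tree = try_semantic_splitter_ingestion("flat_report.pdf")
    walk(tree.root)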