rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0

rnsr/indexing/skeleton_index.py
@@ -0,0 +1,395 @@
"""
Skeleton Index - Summary-Only Vector Index with External Content

The Skeleton Index pattern implements a two-layer retrieval approach:

1. **Skeleton Layer** (Vector Index): Contains ONLY summaries and metadata
   - Each IndexNode's .text field contains a 50-100 word summary
   - Child node IDs stored in metadata for navigation
   - Used for initial retrieval and expand/traverse decisions

2. **Content Layer** (KV Store): Contains full text content
   - Stored separately to prevent context pollution
   - Only fetched during synthesis when explicitly needed
   - Accessed via node_id pointers

Agent Decision Protocol:
    if summary_answers_question(node.text):
        # EXPAND: Fetch full content from KV Store
        content = kv_store.get(node.node_id)
        store_as_variable(content)
    else:
        # TRAVERSE: Navigate to child nodes
        children = [get_node(cid) for cid in node.child_ids]
        continue_navigation(children)
"""

from __future__ import annotations

from typing import Any

import structlog

from rnsr.exceptions import IndexingError
from rnsr.indexing.kv_store import InMemoryKVStore, KVStore, SQLiteKVStore
from rnsr.models import DocumentNode, DocumentTree, SkeletonNode

logger = structlog.get_logger(__name__)


def generate_summary(content: str, max_words: int = 100) -> str:
    """
    Generate a summary for a node's content.

    EXTRACTIVE approach: Take first portion to preserve key facts,
    entities, and concrete details that ToT needs for evaluation.

    Args:
        content: Full text content.
        max_words: Maximum words in summary.

    Returns:
        Summary text (50-100 words).
    """
    if not content:
        return ""

    words = content.split()

    if len(words) <= max_words:
        return content

    # EXTRACTIVE SUMMARY: Take first max_words to preserve:
    # - Opening sentences (often contain key context)
    # - Named entities (people, places, concepts)
    # - Concrete facts (numbers, dates, specific actions)
    # This gives ToT better signal than arbitrary truncation
    return " ".join(words[:max_words]) + "..."


async def generate_summary_llm(
    content: str,
    llm: Any = None,
    max_words: int = 75,
    provider: str | None = None,
) -> str:
    """
    Generate a summary using an LLM.

    Supports OpenAI, Anthropic, and Gemini providers.

    Args:
        content: Full text content.
        llm: LlamaIndex LLM instance (optional). If None, creates one.
        max_words: Target word count.
        provider: LLM provider ("openai", "anthropic", "gemini", or None for auto).

    Returns:
        LLM-generated summary.
    """
    if not content or len(content.strip()) < 50:
        return content

    # If no LLM provided, try to create one
    if llm is None:
        llm = _get_llm_for_summary(provider)
        if llm is None:
            return generate_summary(content, max_words)

    prompt = f"""Summarize the following text in {max_words} words or less.

IMPORTANT: Use an EXTRACTIVE approach - preserve:
- Key facts, entities, names, and concrete details (who, what, when, where)
- Specific actions, events, and outcomes
- Numbers, dates, and measurements
- The main subject and what happens to/with it

Avoid:
- Vague generalizations ("discusses various topics")
- Meta-commentary ("this section explains...")
- Abstractions without specifics

TEXT:
{content}

EXTRACTIVE SUMMARY:"""

    try:
        response = await llm.acomplete(prompt)
        return str(response).strip()
    except Exception as e:
        logger.warning("llm_summary_failed", error=str(e))
        return generate_summary(content, max_words)


def _get_llm_for_summary(provider: str | None = None) -> Any:
    """
    Get an LLM instance for summary generation.

    Supports: OpenAI, Anthropic, Gemini, auto-detect.

    Args:
        provider: "openai", "anthropic", "gemini", or None for auto-detect.

    Returns:
        LlamaIndex-compatible LLM, or None if unavailable.
    """
    import os

    # Auto-detect provider if not specified
    if provider is None:
        if os.getenv("GOOGLE_API_KEY"):
            provider = "gemini"
        elif os.getenv("ANTHROPIC_API_KEY"):
            provider = "anthropic"
        elif os.getenv("OPENAI_API_KEY"):
            provider = "openai"
        else:
            logger.warning("no_llm_api_key_found")
            return None

    provider = provider.lower()

    try:
        if provider == "gemini":
            from llama_index.llms.gemini import Gemini

            logger.info("using_gemini_llm")
            return Gemini(model="gemini-2.5-flash")

        elif provider == "anthropic":
            from llama_index.llms.anthropic import Anthropic

            logger.info("using_anthropic_llm")
            return Anthropic(model="claude-sonnet-4-5")

        elif provider == "openai":
            from llama_index.llms.openai import OpenAI

            logger.info("using_openai_llm")
            return OpenAI(model="gpt-5-mini")

        else:
            logger.warning("unknown_llm_provider", provider=provider)
            return None

    except ImportError as e:
        logger.warning("llm_import_failed", provider=provider, error=str(e))
        return None


class SkeletonIndexBuilder:
    """
    Builds a Skeleton Index from a DocumentTree.

    The index consists of:
    1. SkeletonNode objects (summaries + metadata)
    2. KV Store entries (full content)

    Attributes:
        kv_store: Key-value store for full content.
        nodes: Dictionary of node_id -> SkeletonNode.
    """

    def __init__(self, kv_store: KVStore | None = None):
        """
        Initialize the builder.

        Args:
            kv_store: KV store instance. Defaults to InMemoryKVStore.
        """
        self.kv_store = kv_store or InMemoryKVStore()
        self.nodes: dict[str, SkeletonNode] = {}

        logger.info("skeleton_builder_initialized")

    def build_from_tree(self, tree: DocumentTree) -> dict[str, SkeletonNode]:
        """
        Build a skeleton index from a DocumentTree.

        Args:
            tree: The document tree to index.

        Returns:
            Dictionary mapping node_id to SkeletonNode.
        """
        self.nodes.clear()

        logger.info(
            "building_skeleton_index",
            doc_id=tree.id,
            total_nodes=tree.total_nodes,
        )

        # Recursively process the tree
        self._process_node(tree.root, parent_id=None)

        logger.info(
            "skeleton_index_complete",
            indexed_nodes=len(self.nodes),
            kv_entries=self.kv_store.count(),
        )

        return self.nodes

    def _process_node(
        self,
        node: DocumentNode,
        parent_id: str | None,
    ) -> SkeletonNode:
        """
        Recursively process a document node.

        1. Store full content in KV Store
        2. Generate summary
        3. Create SkeletonNode
        4. Process children
        """
        # Store full content in KV Store
        full_content = self._collect_content(node)
        if full_content:
            self.kv_store.put(node.id, full_content)

        # Generate summary (summary-only in skeleton!)
        summary = generate_summary(full_content)

        # Create skeleton node
        skeleton = SkeletonNode(
            node_id=node.id,
            parent_id=parent_id,
            level=node.level,
            header=node.header,
            summary=summary,  # ONLY summary in index
            child_ids=[c.id for c in node.children],
            page_num=node.page_num,
            metadata={
                "has_children": len(node.children) > 0,
                "content_chars": len(full_content),
            },
        )

        self.nodes[node.id] = skeleton

        # Process children
        for child in node.children:
            self._process_node(child, parent_id=node.id)

        return skeleton

    def _collect_content(self, node: DocumentNode) -> str:
        """
        Collect content from a node (header + body content).
        """
        parts = []

        if node.header:
            parts.append(node.header)

        if node.content:
            parts.append(node.content)

        return "\n\n".join(parts)

    def get_node(self, node_id: str) -> SkeletonNode | None:
        """Get a skeleton node by ID."""
        return self.nodes.get(node_id)

    def get_content(self, node_id: str) -> str | None:
        """Get full content for a node from KV Store."""
        return self.kv_store.get(node_id)

    def get_children(self, node_id: str) -> list[SkeletonNode]:
        """Get child skeleton nodes."""
        node = self.nodes.get(node_id)
        if node is None:
            return []

        return [
            self.nodes[cid]
            for cid in node.child_ids
            if cid in self.nodes
        ]

    def get_root(self) -> SkeletonNode | None:
        """Get the root node (level 0)."""
        for node in self.nodes.values():
            if node.level == 0:
                return node
        return None


def build_skeleton_index(
    tree: DocumentTree,
    kv_store: KVStore | None = None,
) -> tuple[dict[str, SkeletonNode], KVStore]:
    """
    Convenience function to build a skeleton index.

    Args:
        tree: Document tree to index.
        kv_store: Optional KV store (defaults to InMemoryKVStore).

    Returns:
        Tuple of (skeleton_nodes dict, kv_store).

    Example:
        tree = ingest_document("contract.pdf").tree
        skeleton, kv = build_skeleton_index(tree)

        # Navigate skeleton
        root = skeleton[tree.root.id]
        for child_id in root.child_ids:
            child = skeleton[child_id]
            print(f"{child.header}: {child.summary}")

        # Only fetch full content when needed
        if need_full_content:
            content = kv.get(child_id)
    """
    kv_store = kv_store or InMemoryKVStore()
    builder = SkeletonIndexBuilder(kv_store)
    nodes = builder.build_from_tree(tree)
    return nodes, kv_store


# For LlamaIndex integration
def create_llama_index_nodes(
    skeleton_nodes: dict[str, SkeletonNode],
) -> list:
    """
    Create LlamaIndex IndexNode objects from skeleton nodes.

    Each IndexNode's .text field contains ONLY the summary,
    with child_ids in metadata for navigation.

    Returns:
        List of LlamaIndex IndexNode objects.
    """
    try:
        from llama_index.core.schema import IndexNode
    except ImportError:
        raise IndexingError(
            "LlamaIndex not installed. "
            "Install with: pip install llama-index"
        )

    llama_nodes = []

    for skel in skeleton_nodes.values():
        # IndexNode.text = summary ONLY (not full content!)
        node = IndexNode(
            text=skel.summary,
            index_id=skel.node_id,
            obj={
                "node_id": skel.node_id,
                "parent_id": skel.parent_id,
                "level": skel.level,
                "header": skel.header,
                "child_ids": skel.child_ids,
                "has_children": len(skel.child_ids) > 0,
            },
        )
        llama_nodes.append(node)

    logger.info("llama_nodes_created", count=len(llama_nodes))
    return llama_nodes
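
For orientation, the expand/traverse protocol sketched in the module docstring maps directly onto the builder's public API. The sketch below is illustrative and not part of the package: summary_answers_question is a hypothetical stand-in (crude keyword overlap) for the LLM relevance decision the agent actually makes; every other call is a SkeletonIndexBuilder method defined above.

    # Illustrative sketch: breadth-first expand/traverse over a skeleton index.
    from rnsr.indexing.skeleton_index import SkeletonIndexBuilder

    def summary_answers_question(summary: str, question: str) -> bool:
        # Hypothetical placeholder for the agent's LLM decision:
        # crude keyword overlap between question terms and the summary.
        terms = {w.lower().strip(".,?") for w in question.split() if len(w) > 3}
        return any(t in summary.lower() for t in terms)

    def navigate(builder: SkeletonIndexBuilder, question: str) -> str | None:
        frontier = [builder.get_root()]
        while frontier:
            node = frontier.pop(0)
            if node is None:
                continue
            if summary_answers_question(node.summary, question):
                # EXPAND: the summary looks relevant, so fetch the full
                # content from the KV store (full text loads only here).
                return builder.get_content(node.node_id)
            # TRAVERSE: keep navigating summaries; full content stays external.
            frontier.extend(builder.get_children(node.node_id))
        return None

Note how the loop only ever touches summaries until the final get_content call, which is the point of keeping the content layer out of the vector index.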
rnsr/ingestion/__init__.py
@@ -0,0 +1,161 @@
"""
Ingestion Module - Latent TOC Reconstruction + Vision Retrieval

Implements the "Latent Hierarchy Generator" from the research paper (Section 4):
- Visual-Geometric Analysis (Font Histogram + XY-Cut)
- Semantic Boundary Detection (SemanticSplitter + Hierarchical Clustering)
- Synthetic Header Generation (LLM-based titles for flat documents)
- Vision-Based Retrieval (OCR-free page image analysis)
- Table and Chart Parsing (Deep structured data extraction)

Responsible for:
1. Font Histogram Analysis (PRIMARY - Section 6.1)
2. Recursive XY-Cut (Visual Segmentation - Section 4.1.1)
3. Hierarchical Clustering (Multi-resolution topics - Section 4.2.2)
4. Synthetic Header Generation (LLM titles - Section 6.3)
5. Graceful Degradation (3-tier fallback)
6. Vision-Based Retrieval (PageIndex-inspired OCR-free mode)
7. Table Parsing (NEW - SQL-like queries over tables)
8. Chart Parsing (NEW - Trend analysis and data extraction)

Primary Entry Point:
    ingest_document(pdf_path) -> IngestionResult

ALWAYS use ingest_document() - it handles fallbacks automatically.
"""

from rnsr.ingestion.font_histogram import FontHistogramAnalyzer, FontAnalysis
from rnsr.ingestion.header_classifier import (
    HeaderClassifier,
    LearnedHeaderThresholds,
    get_learned_header_thresholds,
)
from rnsr.ingestion.tree_builder import TreeBuilder, build_document_tree
from rnsr.ingestion.text_builder import build_tree_from_text, build_tree_from_contexts
from rnsr.ingestion.pipeline import ingest_document
from rnsr.ingestion.semantic_fallback import try_semantic_splitter_ingestion
from rnsr.ingestion.ocr_fallback import try_ocr_ingestion, check_ocr_available
from rnsr.ingestion.xy_cut import (
    RecursiveXYCutter,
    segment_pdf_with_xy_cut,
    SegmentNode,
    BoundingRegion,
)
from rnsr.ingestion.hierarchical_cluster import (
    HierarchicalSemanticClusterer,
    cluster_document_hierarchically,
    TextCluster,
)
from rnsr.ingestion.layout_detector import detect_layout_complexity, LayoutComplexity
from rnsr.ingestion.layout_model import (
    get_layout_model,
    classify_layout_blocks,
    check_layout_model_available,
    get_layout_model_info,
    LAYOUT_MODEL_BASE,
    LAYOUT_MODEL_LARGE,
)
from rnsr.ingestion.vision_retrieval import (
    VisionConfig,
    VisionNavigator,
    HybridVisionNavigator,
    PageImageExtractor,
    VisionLLM,
    create_vision_navigator,
    create_hybrid_navigator,
)
from rnsr.ingestion.table_parser import (
    TableParser,
    ParsedTable,
    TableCell,
    TableRow,
    TableColumn,
    TableQueryEngine,
    parse_tables_from_text,
    query_table,
)
from rnsr.ingestion.chart_parser import (
    ChartParser,
    ParsedChart,
    ChartSeries,
    DataPoint,
    ChartType,
    ChartAnalysis,
    describe_chart,
)
from rnsr.models import IngestionResult

__all__ = [
    # Pipeline (Primary Entry Point)
    "ingest_document",
    "IngestionResult",

    # Tier 1: Font Histogram
    "FontHistogramAnalyzer",
    "FontAnalysis",
    "HeaderClassifier",
    "LearnedHeaderThresholds",
    "get_learned_header_thresholds",
    "TreeBuilder",
    "build_document_tree",

    # Text-to-Tree (for benchmarks using raw text)
    "build_tree_from_text",
    "build_tree_from_contexts",

    # Tier 1b: Visual Analysis (LayoutLM + XY-Cut)
    "detect_layout_complexity",
    "LayoutComplexity",
    "get_layout_model",
    "classify_layout_blocks",
    "check_layout_model_available",
    "get_layout_model_info",
    "LAYOUT_MODEL_BASE",
    "LAYOUT_MODEL_LARGE",

    # Tier 1b: Recursive XY-Cut (Visual Segmentation)
    "RecursiveXYCutter",
    "segment_pdf_with_xy_cut",
    "SegmentNode",
    "BoundingRegion",

    # Tier 2: Semantic Splitter
    "try_semantic_splitter_ingestion",

    # Tier 2b: Hierarchical Clustering
    "HierarchicalSemanticClusterer",
    "cluster_document_hierarchically",
    "TextCluster",

    # Tier 3: OCR
    "try_ocr_ingestion",
    "check_ocr_available",

    # Vision-Based Retrieval (PageIndex-inspired)
    "VisionConfig",
    "VisionNavigator",
    "HybridVisionNavigator",
    "PageImageExtractor",
    "VisionLLM",
    "create_vision_navigator",
    "create_hybrid_navigator",

    # Table Parsing (NEW)
    "TableParser",
    "ParsedTable",
    "TableCell",
    "TableRow",
    "TableColumn",
    "TableQueryEngine",
    "parse_tables_from_text",
    "query_table",

    # Chart Parsing (NEW)
    "ChartParser",
    "ParsedChart",
    "ChartSeries",
    "DataPoint",
    "ChartType",
    "ChartAnalysis",
    "describe_chart",
]
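
Taken together, the two files above define the intended end-to-end flow: ingest a PDF (with the 3-tier fallback handled inside ingest_document), build the skeleton index from the resulting tree, and navigate summaries. A minimal sketch, assuming a local "contract.pdf" as in the build_skeleton_index docstring:

    # Minimal end-to-end sketch, assuming a local "contract.pdf".
    # ingest_document() applies the tiered fallback (font histogram ->
    # semantic splitter -> OCR) automatically, per its module docstring.
    from rnsr.ingestion import ingest_document
    from rnsr.indexing.skeleton_index import build_skeleton_index

    result = ingest_document("contract.pdf")          # -> IngestionResult
    skeleton, kv = build_skeleton_index(result.tree)  # summaries + KV store

    root = next(n for n in skeleton.values() if n.level == 0)
    for child_id in root.child_ids:
        child = skeleton[child_id]
        print(f"{child.header}: {child.summary}")     # summaries only
        # Full text is fetched lazily from the KV store when needed:
        # content = kv.get(child_id)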