rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/client.py
ADDED
|
@@ -0,0 +1,560 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RNSR Client - Simple High-Level API
|
|
3
|
+
|
|
4
|
+
Provides the simplest possible interface for using RNSR.
|
|
5
|
+
Handles all the complexity of ingestion, indexing, and navigation.
|
|
6
|
+
|
|
7
|
+
This is the state-of-the-art hybrid retrieval system combining:
|
|
8
|
+
- PageIndex: Vectorless, reasoning-based tree search
|
|
9
|
+
- RLMs: REPL environment with recursive sub-LLM calls
|
|
10
|
+
- Vision: OCR-free image-based document analysis
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
from rnsr import RNSRClient
|
|
14
|
+
|
|
15
|
+
# One-line document Q&A
|
|
16
|
+
client = RNSRClient()
|
|
17
|
+
answer = client.ask("contract.pdf", "What are the payment terms?")
|
|
18
|
+
|
|
19
|
+
# With caching (faster for repeated queries)
|
|
20
|
+
client = RNSRClient(cache_dir="./cache")
|
|
21
|
+
answer = client.ask("contract.pdf", "What are the terms?")
|
|
22
|
+
answer2 = client.ask("contract.pdf", "Who are the parties?") # Uses cache
|
|
23
|
+
|
|
24
|
+
# Advanced: RLM Navigator with full features
|
|
25
|
+
result = client.ask_advanced(
|
|
26
|
+
"complex_report.pdf",
|
|
27
|
+
"Compare revenue in Q1 vs Q2",
|
|
28
|
+
use_rlm=True,
|
|
29
|
+
enable_verification=True,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# Vision-based analysis (for scanned docs, charts)
|
|
33
|
+
result = client.ask_vision(
|
|
34
|
+
"scanned_document.pdf",
|
|
35
|
+
"What is shown in the chart?",
|
|
36
|
+
)
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
from __future__ import annotations
|
|
40
|
+
|
|
41
|
+
import hashlib
|
|
42
|
+
from pathlib import Path
|
|
43
|
+
from typing import Any
|
|
44
|
+
|
|
45
|
+
import structlog
|
|
46
|
+
|
|
47
|
+
from rnsr.agent import run_navigator
|
|
48
|
+
from rnsr.document_store import DocumentStore
|
|
49
|
+
from rnsr.indexing import build_skeleton_index, save_index, load_index
|
|
50
|
+
from rnsr.indexing.kv_store import InMemoryKVStore, KVStore
|
|
51
|
+
from rnsr.ingestion import ingest_document
|
|
52
|
+
from rnsr.models import SkeletonNode
|
|
53
|
+
|
|
54
|
+
logger = structlog.get_logger(__name__)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class RNSRClient:
    """
    High-level client for RNSR document Q&A.

    This is the simplest way to use RNSR. It handles:
    - Document ingestion
    - Skeleton index building
    - Optional caching/persistence (in-memory per session, and on disk
      when ``cache_dir`` is set)
    - Navigation and answer generation

    Example:
        # Basic usage (no caching)
        client = RNSRClient()
        answer = client.ask("document.pdf", "What is the main topic?")

        # With caching (recommended for production)
        client = RNSRClient(cache_dir="./rnsr_cache")
        answer = client.ask("document.pdf", "What is the main topic?")

        # From raw text
        answer = client.ask_text(
            "This is my document content...",
            "What is this about?"
        )
    """
|
|
82
|
+
|
|
83
|
+
def __init__(
|
|
84
|
+
self,
|
|
85
|
+
cache_dir: str | Path | None = None,
|
|
86
|
+
llm_provider: str | None = None,
|
|
87
|
+
llm_model: str | None = None,
|
|
88
|
+
):
|
|
89
|
+
"""
|
|
90
|
+
Initialize the RNSR client.
|
|
91
|
+
|
|
92
|
+
Args:
|
|
93
|
+
cache_dir: Optional directory for caching indexes.
|
|
94
|
+
If provided, indexes are persisted and reused.
|
|
95
|
+
llm_provider: LLM provider ("openai", "anthropic", "gemini")
|
|
96
|
+
llm_model: LLM model name
|
|
97
|
+
"""
|
|
98
|
+
self.cache_dir = Path(cache_dir) if cache_dir else None
|
|
99
|
+
self.llm_provider = llm_provider
|
|
100
|
+
self.llm_model = llm_model
|
|
101
|
+
|
|
102
|
+
# In-memory cache for session
|
|
103
|
+
self._session_cache: dict[str, tuple[dict[str, SkeletonNode], KVStore]] = {}
|
|
104
|
+
|
|
105
|
+
if self.cache_dir:
|
|
106
|
+
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
107
|
+
|
|
108
|
+
logger.info(
|
|
109
|
+
"rnsr_client_initialized",
|
|
110
|
+
cache_dir=str(self.cache_dir) if self.cache_dir else None,
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
def ask(
|
|
114
|
+
self,
|
|
115
|
+
document: str | Path,
|
|
116
|
+
question: str,
|
|
117
|
+
force_reindex: bool = False,
|
|
118
|
+
) -> dict[str, Any]:
|
|
119
|
+
"""
|
|
120
|
+
Ask a question about a PDF document.
|
|
121
|
+
|
|
122
|
+
Args:
|
|
123
|
+
document: Path to PDF file
|
|
124
|
+
question: Question to ask
|
|
125
|
+
force_reindex: If True, re-process even if cached
|
|
126
|
+
|
|
127
|
+
Returns:
|
|
128
|
+
Result dictionary from the navigator
|
|
129
|
+
|
|
130
|
+
Example:
|
|
131
|
+
answer = client.ask("contract.pdf", "What are the payment terms?")
|
|
132
|
+
"""
|
|
133
|
+
doc_path = Path(document)
|
|
134
|
+
if not doc_path.exists():
|
|
135
|
+
raise FileNotFoundError(f"Document not found: {doc_path}")
|
|
136
|
+
|
|
137
|
+
# Get or create index
|
|
138
|
+
skeleton, kv_store = self._get_or_create_index(doc_path, force_reindex)
|
|
139
|
+
|
|
140
|
+
# Run navigator
|
|
141
|
+
result = run_navigator(question, skeleton, kv_store)
|
|
142
|
+
return result.get("answer", "No answer found.")
|
|
143
|
+
|
|
144
|
+
def ask_text(
|
|
145
|
+
self,
|
|
146
|
+
text: str | list[str],
|
|
147
|
+
question: str,
|
|
148
|
+
cache_key: str | None = None,
|
|
149
|
+
) -> dict[str, Any]:
|
|
150
|
+
"""
|
|
151
|
+
Ask a question about raw text.
|
|
152
|
+
|
|
153
|
+
Args:
|
|
154
|
+
text: Text content or list of text chunks
|
|
155
|
+
question: Question to ask
|
|
156
|
+
cache_key: Optional key for caching (if cache_dir is set)
|
|
157
|
+
|
|
158
|
+
Returns:
|
|
159
|
+
Answer string
|
|
160
|
+
|
|
161
|
+
Example:
|
|
162
|
+
answer = client.ask_text(
|
|
163
|
+
"The company was founded in 2020...",
|
|
164
|
+
"When was the company founded?"
|
|
165
|
+
)
|
|
166
|
+
"""
|
|
167
|
+
from rnsr.ingestion import build_tree_from_text
|
|
168
|
+
|
|
169
|
+
# Generate cache key from content if not provided
|
|
170
|
+
if cache_key is None:
|
|
171
|
+
content = text if isinstance(text, str) else "\n".join(text)
|
|
172
|
+
cache_key = f"text_{hashlib.md5(content[:1000].encode()).hexdigest()[:12]}"
|
|
173
|
+
|
|
174
|
+
# Check caches
|
|
175
|
+
if cache_key in self._session_cache:
|
|
176
|
+
skeleton, kv_store = self._session_cache[cache_key]
|
|
177
|
+
elif self.cache_dir and (self.cache_dir / cache_key).exists():
|
|
178
|
+
skeleton, kv_store = load_index(self.cache_dir / cache_key)
|
|
179
|
+
self._session_cache[cache_key] = (skeleton, kv_store)
|
|
180
|
+
else:
|
|
181
|
+
# Build index
|
|
182
|
+
tree = build_tree_from_text(text)
|
|
183
|
+
skeleton, kv_store = build_skeleton_index(tree)
|
|
184
|
+
self._session_cache[cache_key] = (skeleton, kv_store)
|
|
185
|
+
|
|
186
|
+
# Persist if cache_dir is set
|
|
187
|
+
if self.cache_dir:
|
|
188
|
+
save_index(skeleton, kv_store, self.cache_dir / cache_key)
|
|
189
|
+
|
|
190
|
+
result = run_navigator(question, skeleton, kv_store)
|
|
191
|
+
return result.get("answer", "No answer found.")
|
|
192
|
+
|
|
193
|
+
def ask_multiple(
|
|
194
|
+
self,
|
|
195
|
+
document: str | Path,
|
|
196
|
+
questions: list[str],
|
|
197
|
+
force_reindex: bool = False,
|
|
198
|
+
) -> list[dict[str, Any]]:
|
|
199
|
+
"""
|
|
200
|
+
Ask multiple questions about a document.
|
|
201
|
+
|
|
202
|
+
More efficient than calling ask() multiple times because
|
|
203
|
+
the document is only indexed once.
|
|
204
|
+
|
|
205
|
+
Args:
|
|
206
|
+
document: Path to PDF file
|
|
207
|
+
questions: List of questions
|
|
208
|
+
force_reindex: If True, re-process even if cached
|
|
209
|
+
|
|
210
|
+
Returns:
|
|
211
|
+
List of answers
|
|
212
|
+
|
|
213
|
+
Example:
|
|
214
|
+
answers = client.ask_multiple(
|
|
215
|
+
"contract.pdf",
|
|
216
|
+
["What are the terms?", "Who are the parties?"]
|
|
217
|
+
)
|
|
218
|
+
"""
|
|
219
|
+
doc_path = Path(document)
|
|
220
|
+
skeleton, kv_store = self._get_or_create_index(doc_path, force_reindex)
|
|
221
|
+
|
|
222
|
+
return [
|
|
223
|
+
run_navigator(q, skeleton, kv_store).get("answer", "No answer found.")
|
|
224
|
+
for q in questions
|
|
225
|
+
]
|
|
226
|
+
|
|
227
|
+
def get_document_info(self, document: str | Path) -> dict[str, Any]:
|
|
228
|
+
"""
|
|
229
|
+
Get information about a document without querying it.
|
|
230
|
+
|
|
231
|
+
Args:
|
|
232
|
+
document: Path to PDF file
|
|
233
|
+
|
|
234
|
+
Returns:
|
|
235
|
+
Dictionary with document metadata
|
|
236
|
+
"""
|
|
237
|
+
doc_path = Path(document)
|
|
238
|
+
cache_key = self._get_cache_key(doc_path)
|
|
239
|
+
|
|
240
|
+
# Check if cached
|
|
241
|
+
if cache_key in self._session_cache:
|
|
242
|
+
skeleton, _ = self._session_cache[cache_key]
|
|
243
|
+
return {
|
|
244
|
+
"path": str(doc_path),
|
|
245
|
+
"cached": True,
|
|
246
|
+
"nodes": len(skeleton),
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
if self.cache_dir and (self.cache_dir / cache_key).exists():
|
|
250
|
+
from rnsr.indexing import get_index_info
|
|
251
|
+
info = get_index_info(self.cache_dir / cache_key)
|
|
252
|
+
info["path"] = str(doc_path)
|
|
253
|
+
info["cached"] = True
|
|
254
|
+
return info
|
|
255
|
+
|
|
256
|
+
return {
|
|
257
|
+
"path": str(doc_path),
|
|
258
|
+
"cached": False,
|
|
259
|
+
"exists": doc_path.exists(),
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
def clear_cache(self) -> int:
|
|
263
|
+
"""
|
|
264
|
+
Clear all cached indexes.
|
|
265
|
+
|
|
266
|
+
Returns:
|
|
267
|
+
Number of caches cleared
|
|
268
|
+
"""
|
|
269
|
+
count = len(self._session_cache)
|
|
270
|
+
self._session_cache.clear()
|
|
271
|
+
|
|
272
|
+
if self.cache_dir:
|
|
273
|
+
import shutil
|
|
274
|
+
for item in self.cache_dir.iterdir():
|
|
275
|
+
if item.is_dir():
|
|
276
|
+
shutil.rmtree(item)
|
|
277
|
+
count += 1
|
|
278
|
+
|
|
279
|
+
logger.info("cache_cleared", count=count)
|
|
280
|
+
return count
|
|
281
|
+
|
|
282
|
+
def _get_cache_key(self, doc_path: Path) -> str:
|
|
283
|
+
"""Generate a cache key for a document."""
|
|
284
|
+
stat = doc_path.stat()
|
|
285
|
+
hash_input = f"{doc_path.name}_{stat.st_size}_{stat.st_mtime}"
|
|
286
|
+
return hashlib.md5(hash_input.encode()).hexdigest()[:16]
|
|
287
|
+
|
|
288
|
+
def _get_or_create_index(
|
|
289
|
+
self,
|
|
290
|
+
doc_path: Path,
|
|
291
|
+
force_reindex: bool,
|
|
292
|
+
) -> tuple[dict[str, SkeletonNode], KVStore]:
|
|
293
|
+
"""Get cached index or create new one."""
|
|
294
|
+
cache_key = self._get_cache_key(doc_path)
|
|
295
|
+
|
|
296
|
+
# Check session cache first
|
|
297
|
+
if not force_reindex and cache_key in self._session_cache:
|
|
298
|
+
logger.debug("using_session_cache", key=cache_key)
|
|
299
|
+
return self._session_cache[cache_key]
|
|
300
|
+
|
|
301
|
+
# Check persistent cache
|
|
302
|
+
if not force_reindex and self.cache_dir:
|
|
303
|
+
cache_path = self.cache_dir / cache_key
|
|
304
|
+
if cache_path.exists():
|
|
305
|
+
logger.debug("using_persistent_cache", key=cache_key)
|
|
306
|
+
skeleton, kv_store = load_index(cache_path)
|
|
307
|
+
self._session_cache[cache_key] = (skeleton, kv_store)
|
|
308
|
+
return skeleton, kv_store
|
|
309
|
+
|
|
310
|
+
# Create new index
|
|
311
|
+
logger.info("creating_new_index", path=str(doc_path))
|
|
312
|
+
result = ingest_document(str(doc_path))
|
|
313
|
+
skeleton, kv_store = build_skeleton_index(result.tree)
|
|
314
|
+
|
|
315
|
+
# Store in session cache
|
|
316
|
+
self._session_cache[cache_key] = (skeleton, kv_store)
|
|
317
|
+
|
|
318
|
+
# Persist if cache_dir is set
|
|
319
|
+
if self.cache_dir:
|
|
320
|
+
save_index(skeleton, kv_store, self.cache_dir / cache_key)
|
|
321
|
+
|
|
322
|
+
return skeleton, kv_store
|
|
323
|
+
|
|
324
|
+
# =========================================================================
|
|
325
|
+
# Advanced Navigation Methods
|
|
326
|
+
# =========================================================================
|
|
327
|
+
|
|
328
|
+
def ask_advanced(
|
|
329
|
+
self,
|
|
330
|
+
document: str | Path,
|
|
331
|
+
question: str,
|
|
332
|
+
use_rlm: bool = True,
|
|
333
|
+
enable_pre_filtering: bool = True,
|
|
334
|
+
enable_verification: bool = True,
|
|
335
|
+
max_recursion_depth: int = 3,
|
|
336
|
+
force_reindex: bool = False,
|
|
337
|
+
metadata: dict[str, Any] | None = None,
|
|
338
|
+
) -> dict[str, Any]:
|
|
339
|
+
"""
|
|
340
|
+
Advanced Q&A using the full RLM Navigator.
|
|
341
|
+
|
|
342
|
+
This uses the state-of-the-art Recursive Language Model approach:
|
|
343
|
+
- Pre-filtering with keyword extraction before LLM calls
|
|
344
|
+
- Deep recursive sub-LLM calls (configurable depth)
|
|
345
|
+
- Answer verification with sub-LLM validation
|
|
346
|
+
|
|
347
|
+
Args:
|
|
348
|
+
document: Path to PDF file.
|
|
349
|
+
question: Question to ask.
|
|
350
|
+
use_rlm: Use RLM Navigator (True) or standard navigator (False).
|
|
351
|
+
enable_pre_filtering: Use keyword filtering before ToT evaluation.
|
|
352
|
+
enable_verification: Verify answers with sub-LLM.
|
|
353
|
+
max_recursion_depth: Max depth for recursive sub-LLM calls.
|
|
354
|
+
force_reindex: Re-process even if cached.
|
|
355
|
+
metadata: Optional metadata (e.g., multiple choice options).
|
|
356
|
+
|
|
357
|
+
Returns:
|
|
358
|
+
Full result dictionary with answer, confidence, trace.
|
|
359
|
+
|
|
360
|
+
Example:
|
|
361
|
+
result = client.ask_advanced(
|
|
362
|
+
"complex_report.pdf",
|
|
363
|
+
"Compare the liability clauses in section 5 and section 8",
|
|
364
|
+
enable_verification=True,
|
|
365
|
+
)
|
|
366
|
+
print(f"Answer: {result['answer']}")
|
|
367
|
+
print(f"Confidence: {result['confidence']}")
|
|
368
|
+
"""
|
|
369
|
+
doc_path = Path(document)
|
|
370
|
+
if not doc_path.exists():
|
|
371
|
+
raise FileNotFoundError(f"Document not found: {doc_path}")
|
|
372
|
+
|
|
373
|
+
# Get or create index
|
|
374
|
+
skeleton, kv_store = self._get_or_create_index(doc_path, force_reindex)
|
|
375
|
+
|
|
376
|
+
if use_rlm:
|
|
377
|
+
# Use the new RLM Navigator
|
|
378
|
+
from rnsr.agent.rlm_navigator import RLMConfig, run_rlm_navigator
|
|
379
|
+
|
|
380
|
+
config = RLMConfig(
|
|
381
|
+
max_recursion_depth=max_recursion_depth,
|
|
382
|
+
enable_pre_filtering=enable_pre_filtering,
|
|
383
|
+
enable_verification=enable_verification,
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
result = run_rlm_navigator(
|
|
387
|
+
question,
|
|
388
|
+
skeleton,
|
|
389
|
+
kv_store,
|
|
390
|
+
config=config,
|
|
391
|
+
metadata=metadata,
|
|
392
|
+
)
|
|
393
|
+
|
|
394
|
+
return result
|
|
395
|
+
else:
|
|
396
|
+
# Use standard navigator
|
|
397
|
+
result = run_navigator(question, skeleton, kv_store, metadata=metadata)
|
|
398
|
+
return result
|
|
399
|
+
|
|
400
|
+
def ask_vision(
|
|
401
|
+
self,
|
|
402
|
+
document: str | Path,
|
|
403
|
+
question: str,
|
|
404
|
+
use_hybrid: bool = True,
|
|
405
|
+
metadata: dict[str, Any] | None = None,
|
|
406
|
+
) -> dict[str, Any]:
|
|
407
|
+
"""
|
|
408
|
+
Vision-based Q&A working directly on page images.
|
|
409
|
+
|
|
410
|
+
This is ideal for:
|
|
411
|
+
- Scanned documents where OCR quality is poor
|
|
412
|
+
- Documents with charts, graphs, or diagrams
|
|
413
|
+
- Image-heavy presentations or reports
|
|
414
|
+
|
|
415
|
+
Args:
|
|
416
|
+
document: Path to PDF file.
|
|
417
|
+
question: Question to ask.
|
|
418
|
+
use_hybrid: Combine text+vision analysis (True) or vision-only (False).
|
|
419
|
+
metadata: Optional metadata (e.g., multiple choice options).
|
|
420
|
+
|
|
421
|
+
Returns:
|
|
422
|
+
Result dictionary with answer, confidence, selected_pages.
|
|
423
|
+
|
|
424
|
+
Example:
|
|
425
|
+
result = client.ask_vision(
|
|
426
|
+
"financial_report.pdf",
|
|
427
|
+
"What does the revenue chart show?",
|
|
428
|
+
)
|
|
429
|
+
print(f"Answer: {result['answer']}")
|
|
430
|
+
print(f"Pages analyzed: {result['selected_pages']}")
|
|
431
|
+
"""
|
|
432
|
+
doc_path = Path(document)
|
|
433
|
+
if not doc_path.exists():
|
|
434
|
+
raise FileNotFoundError(f"Document not found: {doc_path}")
|
|
435
|
+
|
|
436
|
+
from rnsr.ingestion.vision_retrieval import (
|
|
437
|
+
VisionConfig,
|
|
438
|
+
create_vision_navigator,
|
|
439
|
+
create_hybrid_navigator,
|
|
440
|
+
)
|
|
441
|
+
|
|
442
|
+
config = VisionConfig()
|
|
443
|
+
|
|
444
|
+
if use_hybrid:
|
|
445
|
+
# Get text-based index for hybrid mode
|
|
446
|
+
try:
|
|
447
|
+
skeleton, kv_store = self._get_or_create_index(doc_path, False)
|
|
448
|
+
except Exception:
|
|
449
|
+
skeleton, kv_store = None, None
|
|
450
|
+
|
|
451
|
+
navigator = create_hybrid_navigator(
|
|
452
|
+
doc_path,
|
|
453
|
+
skeleton=skeleton,
|
|
454
|
+
kv_store=kv_store,
|
|
455
|
+
vision_config=config,
|
|
456
|
+
)
|
|
457
|
+
|
|
458
|
+
result = navigator.navigate(question, metadata)
|
|
459
|
+
return {
|
|
460
|
+
"answer": result.get("combined_answer"),
|
|
461
|
+
"confidence": result.get("confidence", 0),
|
|
462
|
+
"method_used": result.get("method_used"),
|
|
463
|
+
"text_result": result.get("text_result"),
|
|
464
|
+
"vision_result": result.get("vision_result"),
|
|
465
|
+
}
|
|
466
|
+
else:
|
|
467
|
+
# Vision-only mode
|
|
468
|
+
navigator = create_vision_navigator(doc_path, config)
|
|
469
|
+
return navigator.navigate(question, metadata)
|
|
470
|
+
|
|
471
|
+
def analyze_document_structure(
|
|
472
|
+
self,
|
|
473
|
+
document: str | Path,
|
|
474
|
+
force_reindex: bool = False,
|
|
475
|
+
) -> dict[str, Any]:
|
|
476
|
+
"""
|
|
477
|
+
Analyze a document's structure without querying it.
|
|
478
|
+
|
|
479
|
+
Returns detailed information about the document hierarchy,
|
|
480
|
+
sections, and content distribution.
|
|
481
|
+
|
|
482
|
+
Args:
|
|
483
|
+
document: Path to PDF file.
|
|
484
|
+
force_reindex: Re-process even if cached.
|
|
485
|
+
|
|
486
|
+
Returns:
|
|
487
|
+
Dictionary with structure analysis.
|
|
488
|
+
|
|
489
|
+
Example:
|
|
490
|
+
info = client.analyze_document_structure("contract.pdf")
|
|
491
|
+
print(f"Sections: {info['section_count']}")
|
|
492
|
+
print(f"Max depth: {info['max_depth']}")
|
|
493
|
+
"""
|
|
494
|
+
doc_path = Path(document)
|
|
495
|
+
skeleton, kv_store = self._get_or_create_index(doc_path, force_reindex)
|
|
496
|
+
|
|
497
|
+
# Analyze structure
|
|
498
|
+
section_count = 0
|
|
499
|
+
max_depth = 0
|
|
500
|
+
total_chars = 0
|
|
501
|
+
level_counts: dict[int, int] = {}
|
|
502
|
+
|
|
503
|
+
for node_id, node in skeleton.items():
|
|
504
|
+
section_count += 1
|
|
505
|
+
max_depth = max(max_depth, node.level)
|
|
506
|
+
level_counts[node.level] = level_counts.get(node.level, 0) + 1
|
|
507
|
+
|
|
508
|
+
content = kv_store.get(node_id)
|
|
509
|
+
if content:
|
|
510
|
+
total_chars += len(content)
|
|
511
|
+
|
|
512
|
+
return {
|
|
513
|
+
"path": str(doc_path),
|
|
514
|
+
"section_count": section_count,
|
|
515
|
+
"max_depth": max_depth,
|
|
516
|
+
"level_distribution": level_counts,
|
|
517
|
+
"total_characters": total_chars,
|
|
518
|
+
"average_section_length": total_chars // max(section_count, 1),
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
def get_document_outline(
|
|
522
|
+
self,
|
|
523
|
+
document: str | Path,
|
|
524
|
+
max_depth: int = 2,
|
|
525
|
+
force_reindex: bool = False,
|
|
526
|
+
) -> list[dict[str, Any]]:
|
|
527
|
+
"""
|
|
528
|
+
Get a document outline (table of contents).
|
|
529
|
+
|
|
530
|
+
Args:
|
|
531
|
+
document: Path to PDF file.
|
|
532
|
+
max_depth: Maximum heading depth to include.
|
|
533
|
+
force_reindex: Re-process even if cached.
|
|
534
|
+
|
|
535
|
+
Returns:
|
|
536
|
+
List of section dictionaries with header and level.
|
|
537
|
+
|
|
538
|
+
Example:
|
|
539
|
+
outline = client.get_document_outline("report.pdf")
|
|
540
|
+
for section in outline:
|
|
541
|
+
indent = " " * section['level']
|
|
542
|
+
print(f"{indent}{section['header']}")
|
|
543
|
+
"""
|
|
544
|
+
doc_path = Path(document)
|
|
545
|
+
skeleton, _ = self._get_or_create_index(doc_path, force_reindex)
|
|
546
|
+
|
|
547
|
+
outline = []
|
|
548
|
+
for node in skeleton.values():
|
|
549
|
+
if node.level <= max_depth:
|
|
550
|
+
outline.append({
|
|
551
|
+
"id": node.node_id,
|
|
552
|
+
"header": node.header,
|
|
553
|
+
"level": node.level,
|
|
554
|
+
"summary": node.summary[:100] if node.summary else "",
|
|
555
|
+
"child_count": len(node.child_ids),
|
|
556
|
+
})
|
|
557
|
+
|
|
558
|
+
# Sort by node_id to maintain document order
|
|
559
|
+
outline.sort(key=lambda x: x["id"])
|
|
560
|
+
return outline
|