rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/models.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RNSR Data Models
|
|
3
|
+
|
|
4
|
+
Pydantic models for all data structures in the pipeline.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Any, Literal
|
|
10
|
+
from uuid import uuid4
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, Field
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# =============================================================================
|
|
16
|
+
# Ingestion Models
|
|
17
|
+
# =============================================================================
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class BoundingBox(BaseModel):
    """Rectangle locating a text element on a page.

    The box is stored as two corner points, ``(x0, y0)`` and ``(x1, y1)``.
    The derived values below are recomputed from those corners on every
    access; nothing is cached and no ordering of the corners is enforced.
    """

    x0: float
    y0: float
    x1: float
    y1: float

    @property
    def width(self) -> float:
        """Horizontal extent, ``x1 - x0``."""
        horizontal_extent = self.x1 - self.x0
        return horizontal_extent

    @property
    def height(self) -> float:
        """Vertical extent, ``y1 - y0``."""
        vertical_extent = self.y1 - self.y0
        return vertical_extent

    @property
    def center(self) -> tuple[float, float]:
        """Midpoint of the box as an ``(x, y)`` pair."""
        mid_x = (self.x0 + self.x1) / 2
        mid_y = (self.y0 + self.y1) / 2
        return (mid_x, mid_y)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class SpanInfo(BaseModel):
    """One text span as reported by PyMuPDF, with font attributes and location."""

    # The span's text.
    text: str

    # Font attributes of the span.
    # NOTE(review): the unit of font_size is not established in this module.
    font_size: float
    font_name: str
    is_bold: bool = False
    is_italic: bool = False

    # Where the span sits.
    # NOTE(review): whether page_num is 0- or 1-based is not shown here;
    # confirm against the code that builds spans.
    bbox: BoundingBox
    page_num: int
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class FontAnalysis(BaseModel):
    """Outcome of the font histogram analysis."""

    # NOTE(review): presumably the font size treated as body text -- confirm
    # against the analyzer that fills this model.
    body_size: float

    # NOTE(review): presumably the size cut-off used to tell headers from
    # body text; the comparison direction is not shown in this module.
    header_threshold: float

    # Float keys mapped to integer values.
    # NOTE(review): looks like font size -> number of occurrences; verify.
    size_histogram: dict[float, int]

    span_count: int
    unique_sizes: int
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class ClassifiedSpan(SpanInfo):
    """A ``SpanInfo`` extended with its classification (header level or body)."""

    # One of the four literal roles; spans default to "body".
    role: Literal["header", "body", "caption", "footnote"] = "body"

    # 0 = not a header, 1-3 = H1-H3
    header_level: int = 0
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class DocumentNode(BaseModel):
    """One node of the document tree; nodes nest through ``children``."""

    # Defaults to the first eight hex digits of a fresh random UUID.
    id: str = Field(default_factory=lambda: uuid4().hex[:8])

    # 0 = root, 1 = H1, 2 = H2, 3 = H3
    level: int

    header: str = ""

    # Full text content
    content: str = ""

    # Optional location information; both default to None.
    page_num: int | None = None
    bbox: BoundingBox | None = None

    # Direct child nodes; every instance gets its own fresh list.
    children: list[DocumentNode] = Field(default_factory=list)

    @property
    def child_ids(self) -> list[str]:
        """Return the ``id`` of each direct child, in ``children`` order."""
        return [node.id for node in self.children]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class DocumentTree(BaseModel):
    """A whole document: its root node plus document-level metadata."""

    # Defaults to "doc_" followed by the first eight hex digits of a fresh
    # random UUID.
    id: str = Field(default_factory=lambda: "doc_" + uuid4().hex[:8])

    title: str = ""

    # Top of the node hierarchy (required).
    root: DocumentNode

    # Plain stored value defaulting to 0; this model does not compute it
    # from ``root``.
    total_nodes: int = 0

    # Restricted to the literal values 1, 2 or 3; defaults to 1.
    ingestion_tier: Literal[1, 2, 3] = 1

    # NOTE(review): ``IngestionMethod`` is assigned at the bottom of this
    # module, after this class. The annotation is not evaluated here because
    # of ``from __future__ import annotations``; confirm the alias is
    # resolvable by the time the model is first used.
    ingestion_method: IngestionMethod | None = None
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# =============================================================================
|
|
98
|
+
# Indexing Models
|
|
99
|
+
# =============================================================================
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class SkeletonNode(BaseModel):
    """Lightweight node record used by the skeleton index.

    Relations to other nodes are held as id strings (``parent_id``,
    ``child_ids``), not as nested node objects.
    """

    node_id: str

    # Required field, but None is an accepted value.
    parent_id: str | None

    level: int
    header: str

    # 50-100 words max - this goes in vector store
    summary: str

    child_ids: list[str]
    page_num: int | None = None

    # Free-form extra data; every instance gets its own empty dict.
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class IngestionResult(BaseModel):
    """What document ingestion returns: the tree plus metadata about the run."""

    # The document tree that was built.
    tree: DocumentTree

    # Restricted to the literal values 1, 2 or 3.
    tier_used: Literal[1, 2, 3]

    # NOTE(review): ``IngestionMethod`` is assigned at the bottom of this
    # module, after this class. The annotation is not evaluated here because
    # of ``from __future__ import annotations``; confirm the alias is
    # resolvable by the time the model is first used.
    method: IngestionMethod

    # Both default to a fresh empty container per instance.
    warnings: list[str] = Field(default_factory=list)
    stats: dict[str, Any] = Field(default_factory=dict)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# =============================================================================
|
|
126
|
+
# Agent Models
|
|
127
|
+
# =============================================================================
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class StoredVariable(BaseModel):
    """Metadata record describing one variable held in the VariableStore.

    All fields are required; the model carries no defaults.
    """

    # e.g., "$LIABILITY_CLAUSE"
    pointer: str

    # Id of the node the variable came from.
    source_node_id: str

    # NOTE(review): the hash algorithm is not established in this module.
    content_hash: str

    char_count: int

    # Kept as a string.
    # NOTE(review): the timestamp format is not shown here; confirm with the
    # code that creates these records.
    created_at: str
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class TraceEntry(BaseModel):
    """One record in the retrieval trace log."""

    # Kept as a string.
    # NOTE(review): the timestamp format is not shown in this module.
    timestamp: str

    # Restricted to these four literal values.
    node_type: Literal["decomposition", "navigation", "variable_stitching", "synthesis"]

    action: str

    # Free-form extra data; every instance gets its own empty dict.
    details: dict[str, Any] = Field(default_factory=dict)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class RetrievalTrace(BaseModel):
    """Full trace of the agent's retrieval process for one query.

    Every field except ``entries`` is required.
    """

    query: str
    total_steps: int

    # NOTE(review): presumably node ids -- confirm against the agent code.
    nodes_visited: list[str]

    # One string-to-string dict per rejected node.
    # NOTE(review): the expected keys are not established in this module.
    nodes_rejected: list[dict[str, str]]

    variables_stored: list[str]
    final_path: str

    # Individual trace records; every instance gets its own empty list.
    entries: list[TraceEntry] = Field(default_factory=list)
|
|
159
|
+
|
|
160
|
+
# Type alias listing every valid ingestion method name. The models above
# refer to it in their annotations (``DocumentTree.ingestion_method`` and
# ``IngestionResult.method``).
IngestionMethod = Literal[
    "font_histogram",
    "semantic_splitter",
    "ocr",
    "xy_cut",
    "hierarchical_clustering",
]
|
rnsr/py.typed
ADDED