rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/models.py ADDED
@@ -0,0 +1,167 @@
1
+ """
2
+ RNSR Data Models
3
+
4
+ Pydantic models for all data structures in the pipeline.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any, Literal
10
+ from uuid import uuid4
11
+
12
+ from pydantic import BaseModel, Field
13
+
14
+
15
+ # =============================================================================
16
+ # Ingestion Models
17
+ # =============================================================================
18
+
19
+
20
class BoundingBox(BaseModel):
    """Bounding box for a text element on a page."""

    # Two corner points, (x0, y0) and (x1, y1). The derived properties below
    # always subtract the "0" coordinate from the "1" coordinate, so they are
    # non-negative only when x1 >= x0 and y1 >= y0; nothing in this model
    # enforces that ordering.
    x0: float
    y0: float
    x1: float
    y1: float

    @property
    def width(self) -> float:
        """Horizontal extent, computed as ``x1 - x0``."""
        start, end = self.x0, self.x1
        return end - start

    @property
    def height(self) -> float:
        """Vertical extent, computed as ``y1 - y0``."""
        start, end = self.y0, self.y1
        return end - start

    @property
    def center(self) -> tuple[float, float]:
        """Midpoint of the box as an ``(x, y)`` pair."""
        mid_x = (self.x0 + self.x1) / 2
        mid_y = (self.y0 + self.y1) / 2
        return (mid_x, mid_y)
39
+
40
+
41
class SpanInfo(BaseModel):
    """Information about a single text span from PyMuPDF."""

    text: str
    # NOTE(review): unit is not stated in this module — presumably points;
    # confirm against the code that builds spans.
    font_size: float
    font_name: str
    # Style flags are optional and default to "plain" when not supplied.
    is_bold: bool = False
    is_italic: bool = False
    # Required: a span cannot be constructed without its position.
    bbox: BoundingBox
    # NOTE(review): 0- vs 1-based numbering is not visible here — confirm
    # with the producer of these spans.
    page_num: int
51
+
52
+
53
class FontAnalysis(BaseModel):
    """Results of font histogram analysis."""

    # All five fields are required; this model only carries the values and
    # performs no computation of its own.
    body_size: float
    # NOTE(review): presumably the font size at or above which a span is
    # treated as a header — confirm in the histogram analyzer.
    header_threshold: float
    # NOTE(review): presumably maps font size -> number of occurrences.
    size_histogram: dict[float, int]
    span_count: int
    unique_sizes: int
61
+
62
+
63
class ClassifiedSpan(SpanInfo):
    """A span with its classification (header level or body)."""

    # Inherits every SpanInfo field and adds the classification result.
    # A span is "body" with header_level 0 unless told otherwise.
    role: Literal["header", "body", "caption", "footnote"] = "body"
    header_level: int = 0  # 0 = not a header, 1-3 = H1-H3
68
+
69
+
70
class DocumentNode(BaseModel):
    """A node in the document tree structure."""

    # Auto-generated short identifier: the first 8 hex digits of a random
    # UUID4 (identical to the first 8 characters of its string form).
    id: str = Field(default_factory=lambda: uuid4().hex[:8])
    level: int  # 0 = root, 1 = H1, 2 = H2, 3 = H3
    header: str = ""
    content: str = ""  # Full text content
    page_num: int | None = None
    bbox: BoundingBox | None = None
    # Self-referential: child nodes are full DocumentNode objects. The bare
    # class name works in the annotation because this module enables
    # postponed annotation evaluation (``from __future__ import annotations``).
    children: list[DocumentNode] = Field(default_factory=list)

    @property
    def child_ids(self) -> list[str]:
        """IDs of the direct children, in the order they appear in ``children``."""
        return [node.id for node in self.children]
84
+
85
+
86
class DocumentTree(BaseModel):
    """Complete document tree structure."""

    # Auto-generated identifier: "doc_" followed by the first 8 hex digits of
    # a random UUID4.
    id: str = Field(default_factory=lambda: "doc_" + uuid4().hex[:8])
    title: str = ""
    root: DocumentNode
    # Defaults to 0 and is not computed by this model.
    # NOTE(review): presumably filled in by whatever builds the tree — confirm.
    total_nodes: int = 0
    ingestion_tier: Literal[1, 2, 3] = 1
    # NOTE: ``IngestionMethod`` is assigned at the very end of this module,
    # after this class. The annotation is only valid because the module uses
    # ``from __future__ import annotations``; the name must exist in the
    # module namespace by the time the model resolves its field types.
    ingestion_method: IngestionMethod | None = None
95
+
96
+
97
+ # =============================================================================
98
+ # Indexing Models
99
+ # =============================================================================
100
+
101
+
102
class SkeletonNode(BaseModel):
    """A lightweight node for the skeleton index."""

    node_id: str
    # May be None. NOTE(review): presumably None marks the root node — confirm
    # against the index builder.
    parent_id: str | None
    level: int
    header: str
    summary: str  # 50-100 words max - this goes in vector store
    # Children are referenced by ID only; no node objects are embedded here.
    child_ids: list[str]
    page_num: int | None = None
    # Free-form extras; each instance gets its own fresh dict.
    metadata: dict[str, Any] = Field(default_factory=dict)
113
+
114
+
115
class IngestionResult(BaseModel):
    """Result of document ingestion including metadata."""

    tree: DocumentTree
    tier_used: Literal[1, 2, 3]
    # NOTE: ``IngestionMethod`` is assigned at the very end of this module,
    # after this class; the annotation relies on the module's
    # ``from __future__ import annotations``.
    method: IngestionMethod
    # Both collections default to empty and are created fresh per instance.
    warnings: list[str] = Field(default_factory=list)
    stats: dict[str, Any] = Field(default_factory=dict)
123
+
124
+
125
+ # =============================================================================
126
+ # Agent Models
127
+ # =============================================================================
128
+
129
+
130
class StoredVariable(BaseModel):
    """Metadata about a stored variable in the VariableStore."""

    # All fields are required. This model holds metadata only; it has no
    # field for the variable's content itself.
    pointer: str  # e.g., "$LIABILITY_CLAUSE"
    source_node_id: str
    # NOTE(review): hash algorithm is not specified in this module.
    content_hash: str
    char_count: int
    # Stored as a plain string. NOTE(review): format (e.g. ISO 8601) is not
    # specified here — confirm with the writer.
    created_at: str
138
+
139
+
140
class TraceEntry(BaseModel):
    """A single entry in the retrieval trace log."""

    # Stored as a plain string; the format is not specified in this module.
    timestamp: str
    # Restricted to exactly these four stage names.
    node_type: Literal["decomposition", "navigation", "variable_stitching", "synthesis"]
    action: str
    # Free-form payload; each entry gets its own fresh dict.
    details: dict[str, Any] = Field(default_factory=dict)
147
+
148
+
149
class RetrievalTrace(BaseModel):
    """Complete trace of agent's retrieval process."""

    # Every field except ``entries`` is required.
    query: str
    total_steps: int
    nodes_visited: list[str]
    # Each rejection is a str -> str mapping.
    # NOTE(review): the expected keys are not defined in this module.
    nodes_rejected: list[dict[str, str]]
    variables_stored: list[str]
    final_path: str
    # Individual log entries; defaults to a fresh empty list per instance.
    entries: list[TraceEntry] = Field(default_factory=list)
159
+
160
# Type alias listing every valid ingestion method name.
# It is referenced by the annotations of DocumentTree.ingestion_method and
# IngestionResult.method earlier in this module, so it must remain a
# module-level name.
IngestionMethod = Literal[
    "font_histogram",
    "semantic_splitter",
    "ocr",
    "xy_cut",
    "hierarchical_clustering",
]
rnsr/py.typed ADDED
@@ -0,0 +1,2 @@
1
+ # Marker file for PEP 561
2
+ # This package supports type checking