rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RNSR Extraction Data Models
|
|
3
|
+
|
|
4
|
+
Pydantic models for entity extraction, relationships, and ontological linking.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations

from datetime import datetime, timezone
from enum import Enum
from typing import Any
from uuid import uuid4

from pydantic import BaseModel, Field
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# =============================================================================
|
|
18
|
+
# Entity Types
|
|
19
|
+
# =============================================================================
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class EntityType(str, Enum):
    """
    Categories that an extracted entity can be assigned to.

    Each member is also a ``str`` whose value is the lowercase category name.

    Note: OTHER is the fallback for any entity type the LLM reports that
    does not match one of the predefined categories. In that case the
    original type string is preserved in entity.metadata["original_type"].
    """

    # Names, roles, parties (plaintiff, defendant, witness)
    PERSON = "person"
    # Companies, agencies, courts
    ORGANIZATION = "organization"
    # Claims, breaches, obligations, remedies
    LEGAL_CONCEPT = "legal_concept"
    # Key dates, events, deadlines
    DATE = "date"
    # Significant occurrences
    EVENT = "event"
    # Places, addresses, jurisdictions
    LOCATION = "location"
    # Section references, document citations
    REFERENCE = "reference"
    # Dollar amounts, financial figures
    MONETARY = "monetary"
    # Referenced documents (exhibits, contracts)
    DOCUMENT = "document"

    # Catch-all for novel/custom entity types not covered above
    OTHER = "other"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class RelationType(str, Enum):
    """
    Kinds of relationships that can connect entities and sections.

    Each member is also a ``str`` whose value is the lowercase relation name.

    Note: OTHER is the fallback for any relationship type the LLM reports
    that does not match one of the predefined categories. In that case the
    original type string is preserved in
    relationship.metadata["original_type"].
    """

    # --- Entity-to-Section relationships ---
    # Entity X is mentioned in Section Y
    MENTIONS = "mentions"
    # Entity X is defined/introduced in Section Y
    DEFINED_IN = "defined_in"

    # --- Entity-to-Entity relationships ---
    # Event X occurred before Event Y
    TEMPORAL_BEFORE = "temporal_before"
    # Event X occurred after Event Y
    TEMPORAL_AFTER = "temporal_after"
    # Action X caused/led to Outcome Y
    CAUSAL = "causal"
    # Person X is affiliated with Org Y
    AFFILIATED_WITH = "affiliated_with"
    # Entity X is party to Document/Event Y
    PARTY_TO = "party_to"

    # --- Section-to-Section relationships ---
    # Section X supports claim in Section Y
    SUPPORTS = "supports"
    # Section X contradicts Section Y
    CONTRADICTS = "contradicts"
    # Section X references Document/Section Y
    REFERENCES = "references"
    # Section X supersedes/overrides Section Y
    SUPERSEDES = "supersedes"
    # Section X amends Section Y
    AMENDS = "amends"

    # Catch-all for novel/custom relationship types not covered above
    OTHER = "other"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# =============================================================================
|
|
77
|
+
# Mention Model
|
|
78
|
+
# =============================================================================
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class Mention(BaseModel):
    """One located occurrence of an entity inside a document section."""

    # Auto-generated identifier: "mention_" followed by the first 8 hex
    # digits of a random UUID.
    id: str = Field(default_factory=lambda: "mention_" + uuid4().hex[:8])

    # Skeleton node that contains this mention.
    node_id: str
    # Document the node belongs to (for multi-doc bundles).
    doc_id: str

    # Optional character offsets of the mention (start / end).
    span_start: int | None = Field(default=None)
    span_end: int | None = Field(default=None)

    # Surrounding text snippet for grounding.
    context: str = Field(default="")
    # Page number if available.
    page_num: int | None = Field(default=None)
    # Extraction confidence; intended range is 0.0-1.0 (not validated here).
    confidence: float = Field(default=1.0)

    class Config:
        # Instances stay mutable after construction.
        frozen = False
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# =============================================================================
|
|
98
|
+
# Entity Model
|
|
99
|
+
# =============================================================================
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class Entity(BaseModel):
    """
    An extracted entity from a document.

    Entities represent named concepts (people, organizations, dates, etc.)
    that can be tracked across document sections and linked across documents.

    The model is mutable: ``add_mention`` and ``add_alias`` modify the
    instance's lists in place.
    """

    # Auto-generated identifier: "ent_" + first 8 characters of a random UUID.
    id: str = Field(default_factory=lambda: f"ent_{str(uuid4())[:8]}")
    type: EntityType
    canonical_name: str  # Normalized/canonical name
    aliases: list[str] = Field(default_factory=list)  # Alternative names/spellings
    mentions: list[Mention] = Field(default_factory=list)  # Where this entity appears
    metadata: dict[str, Any] = Field(default_factory=dict)  # Type-specific metadata

    # Tracking
    source_doc_id: str | None = None  # Original document where first extracted
    # Naive UTC timestamp. ``datetime.utcnow`` is deprecated since Python 3.12,
    # so the value is taken from an aware "now" with tzinfo stripped, which
    # keeps the stored value naive exactly as before.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )

    class Config:
        frozen = False

    def add_mention(self, mention: Mention) -> None:
        """Append ``mention`` to this entity's list of mentions."""
        self.mentions.append(mention)

    def add_alias(self, alias: str) -> None:
        """
        Add an alternative name for this entity.

        The alias is stripped of surrounding whitespace first. It is ignored
        when the stripped value is empty, already present in ``aliases``, or
        equal to ``canonical_name`` (exact, case-sensitive comparison).
        """
        normalized = alias.strip()
        if not normalized:
            return
        if normalized == self.canonical_name or normalized in self.aliases:
            return
        self.aliases.append(normalized)

    @property
    def all_names(self) -> list[str]:
        """Return a new list: the canonical name followed by all aliases."""
        return [self.canonical_name, *self.aliases]

    @property
    def document_ids(self) -> set[str]:
        """Return the set of ``doc_id`` values across all mentions."""
        return {m.doc_id for m in self.mentions}

    @property
    def node_ids(self) -> set[str]:
        """Return the set of ``node_id`` values across all mentions."""
        return {m.node_id for m in self.mentions}
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# =============================================================================
|
|
151
|
+
# Relationship Model
|
|
152
|
+
# =============================================================================
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
class Relationship(BaseModel):
    """
    A relationship between entities or between sections.

    Relationships capture semantic connections that enable cross-document
    understanding and complex query resolution.
    """

    # Auto-generated identifier: "rel_" + first 8 characters of a random UUID.
    id: str = Field(default_factory=lambda: f"rel_{str(uuid4())[:8]}")
    type: RelationType
    source_id: str  # Entity ID or Node ID
    target_id: str  # Entity ID or Node ID
    # Kind of endpoint each ID refers to: "entity" or "node" (not validated here).
    source_type: str = "entity"
    target_type: str = "entity"

    doc_id: str | None = None  # Source document
    confidence: float = 1.0  # Extraction confidence (0.0-1.0), not validated here
    evidence: str = ""  # Supporting text that establishes the relationship

    # Metadata
    metadata: dict[str, Any] = Field(default_factory=dict)
    # Naive UTC timestamp. ``datetime.utcnow`` is deprecated since Python 3.12,
    # so the value is taken from an aware "now" with tzinfo stripped, which
    # keeps the stored value naive exactly as before.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )

    class Config:
        frozen = False
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# =============================================================================
|
|
183
|
+
# Entity Link Model (for cross-document linking)
|
|
184
|
+
# =============================================================================
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
class EntityLink(BaseModel):
    """
    A link between two entities that represent the same real-world entity.

    Used for cross-document entity resolution.
    """

    entity_id_1: str  # First entity ID
    entity_id_2: str  # Second entity ID
    confidence: float = 1.0  # Link confidence (0.0-1.0), not validated here
    link_method: str = "exact"  # How the link was established (exact, fuzzy, llm)
    evidence: str = ""  # Why these entities are considered the same

    # Naive UTC timestamp. ``datetime.utcnow`` is deprecated since Python 3.12,
    # so the value is taken from an aware "now" with tzinfo stripped, which
    # keeps the stored value naive exactly as before.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )

    class Config:
        frozen = False

    @property
    def entity_ids(self) -> tuple[str, str]:
        """
        Return both entity IDs as a pair in sorted (string) order.

        The result does not depend on which ID was stored as ``entity_id_1``,
        so it can serve as an order-independent key for the link.
        """
        # Unpack into exactly two names so the return value matches the
        # declared ``tuple[str, str]`` rather than a variable-length tuple.
        first, second = sorted((self.entity_id_1, self.entity_id_2))
        return first, second
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# =============================================================================
|
|
212
|
+
# Extraction Result Model
|
|
213
|
+
# =============================================================================
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
class ExtractionResult(BaseModel):
    """
    Outcome of running entity/relationship extraction on one document section.
    """

    # Node that was processed, and the document it belongs to.
    node_id: str
    doc_id: str

    # Extracted items; both default to fresh empty lists.
    entities: list[Entity] = Field(default_factory=list)
    relationships: list[Relationship] = Field(default_factory=list)

    # --- Processing metadata ---
    # Method used (llm, rule-based, hybrid).
    extraction_method: str = Field(default="llm")
    # Processing time in milliseconds, per the field name.
    processing_time_ms: float = Field(default=0.0)
    # Warning messages attached to this result.
    warnings: list[str] = Field(default_factory=list)

    class Config:
        # Instances stay mutable after construction.
        frozen = False
|