rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,511 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RNSR Relationship Pattern Extractor
|
|
3
|
+
|
|
4
|
+
Pre-extracts relationship candidates using patterns, similar to entity extraction.
|
|
5
|
+
This provides GROUNDED relationship candidates that are validated by LLM/ToT.
|
|
6
|
+
|
|
7
|
+
Patterns detect:
|
|
8
|
+
1. Entity proximity (co-occurrence signals relationships)
|
|
9
|
+
2. Explicit relationship markers (verbs, prepositions)
|
|
10
|
+
3. Reference patterns (citations, exhibits)
|
|
11
|
+
4. Temporal markers (before, after, during)
|
|
12
|
+
5. Causal markers (caused, led to, resulted in)
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
import structlog
|
|
22
|
+
|
|
23
|
+
from rnsr.extraction.candidate_extractor import EntityCandidate
|
|
24
|
+
from rnsr.extraction.models import Entity, RelationType
|
|
25
|
+
|
|
26
|
+
logger = structlog.get_logger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class RelationshipCandidate:
    """
    A candidate relationship extracted from text before LLM validation.

    Grounded in actual text - tied to specific spans and patterns.
    """

    source_text: str  # Source entity text
    target_text: str  # Target entity text
    relationship_type: str  # Suggested relationship type
    evidence: str  # The text that indicates the relationship
    span_start: int  # Start of relationship evidence
    span_end: int  # End of relationship evidence
    confidence: float = 0.5  # Pattern match confidence
    pattern_name: str = ""  # Which pattern matched
    # Resolved Entity.id values when the extracted text matched a known entity;
    # left as None when no entity list was supplied or no match was found.
    source_entity_id: str | None = None
    target_entity_id: str | None = None
    # Extra context, e.g. "full_context" (windowed evidence) and "pattern_groups".
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# =============================================================================
# Relationship Pattern Definitions
# =============================================================================
# Each entry is a (regex_source, pattern_name) pair; the regexes are compiled
# into COMPILED_RELATIONSHIP_PATTERNS below with a relationship-type label.

# Affiliation patterns: "X of Y", "X at Y", "X, [title] of Y"
AFFILIATION_PATTERNS = [
    # "John Smith, CEO of Acme Corp"
    (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+),?\s+(?:CEO|President|Director|Manager|Partner|Attorney|Counsel|Agent|Representative)\s+(?:of|at|for)\s+([A-Z][A-Za-z\s&]+(?:Inc\.|LLC|Corp\.?|Company)?)', "title_of"),

    # "employed by", "works for"
    (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\s+(?:is\s+)?(?:employed|hired|engaged)\s+by\s+([A-Z][A-Za-z\s&]+)', "employed_by"),

    # "X, an employee of Y"
    (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+),?\s+(?:an?\s+)?(?:employee|officer|director|member)\s+of\s+([A-Z][A-Za-z\s&]+)', "member_of"),
]

# Party-to patterns: parties to agreements, cases
PARTY_TO_PATTERNS = [
    # "X entered into [agreement] with Y"
    (r'([A-Z][A-Za-z\s]+?)\s+(?:entered\s+into|executed|signed)\s+(?:the\s+)?(?:Agreement|Contract|Lease|License)\s+with\s+([A-Z][A-Za-z\s]+)', "entered_into"),

    # "between X and Y"
    (r'between\s+([A-Z][A-Za-z\s,]+?)\s+and\s+([A-Z][A-Za-z\s,]+?)(?:,|\.|;)', "between_parties"),

    # "X v. Y" (legal case)
    (r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+v\.\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', "versus"),

    # "Plaintiff X" / "Defendant Y" — single-group patterns; the extractor
    # substitutes "this_section" as the source for one-group matches.
    (r'(?:Plaintiff|Petitioner|Appellant)\s+([A-Z][A-Za-z\s]+?)(?:,|and|;|\.|filed)', "plaintiff"),
    (r'(?:Defendant|Respondent|Appellee)\s+([A-Z][A-Za-z\s]+?)(?:,|and|;|\.)', "defendant"),
]

# Temporal patterns: before, after, during
TEMPORAL_PATTERNS = [
    # "X before Y"
    (r'([A-Z][A-Za-z\s]+?|(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4})\s+(?:before|prior\s+to|preceding)\s+([A-Z][A-Za-z\s]+?|(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4})', "temporal_before"),

    # "X after Y"
    (r'([A-Z][A-Za-z\s]+?|(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4})\s+(?:after|following|subsequent\s+to)\s+([A-Z][A-Za-z\s]+?)', "temporal_after"),

    # "from X to Y"
    (r'from\s+([A-Z][A-Za-z\s,\d]+?)\s+(?:to|until|through)\s+([A-Z][A-Za-z\s,\d]+?)(?:,|\.)', "temporal_range"),
]

# Causal patterns: caused, led to, resulted in
CAUSAL_PATTERNS = [
    # "X caused Y"
    (r'([A-Z][A-Za-z\s]+?)\s+(?:caused|led\s+to|resulted\s+in|gave\s+rise\s+to)\s+([A-Z][A-Za-z\s]+?)(?:,|\.)', "caused"),

    # "X as a result of Y"
    (r'([A-Z][A-Za-z\s]+?)\s+(?:as\s+a\s+result\s+of|due\s+to|because\s+of|arising\s+from)\s+([A-Z][A-Za-z\s]+?)(?:,|\.)', "result_of"),

    # "X breach ... damages"
    (r'([A-Z][A-Za-z\s]+?)\s+(?:breach(?:ed)?|violat(?:ed|ion))\s+.{0,100}(damages|injury|harm|loss)', "breach_damages"),
]

# Reference patterns: citations, exhibits
REFERENCE_PATTERNS = [
    # "See Exhibit A"
    (r'(?:See|see|per|Per|As\s+(?:shown|stated|set\s+forth)\s+in)\s+(Exhibit\s+[A-Z0-9]+)', "see_exhibit"),

    # "pursuant to Section 3.2"
    (r'(?:pursuant\s+to|under|per|in\s+accordance\s+with)\s+(Section\s+[\d\.]+|Article\s+[IVX\d]+)', "pursuant_to"),

    # Legal citations "123 F.3d 456"
    (r'([A-Z][a-z]+\s+v\.\s+[A-Z][a-z]+),?\s+(\d+\s+[A-Z]\.\s*\d*[a-z]*\s+\d+)', "case_citation"),
]

# Support/Contradict patterns
SUPPORT_CONTRADICT_PATTERNS = [
    # "consistent with", "in accordance with"
    (r'(?:consistent\s+with|in\s+accordance\s+with|supports|confirms)\s+([A-Z][A-Za-z\s]+?)(?:,|\.)', "supports"),

    # "contrary to", "inconsistent with"
    (r'(?:contrary\s+to|inconsistent\s+with|contradicts|conflicts\s+with)\s+([A-Z][A-Za-z\s]+?)(?:,|\.)', "contradicts"),
]

# Supersedes/Amends patterns
AMENDMENT_PATTERNS = [
    # "supersedes"
    (r'([A-Z][A-Za-z\s]+?)\s+(?:supersedes|replaces|terminates)\s+([A-Z][A-Za-z\s]+?)(?:,|\.)', "supersedes"),

    # "amends"
    (r'([A-Z][A-Za-z\s]+?)\s+(?:amends|modifies|supplements)\s+([A-Z][A-Za-z\s]+?)(?:,|\.)', "amends"),
]


# Compile all patterns with relationship type mapping.
# Each value is a list of (compiled_pattern, pattern_name, relationship_type).
# NOTE(review): re.IGNORECASE neutralizes the [A-Z]/[a-z] case distinctions the
# raw patterns rely on to spot capitalized proper nouns — confirm this is
# intentional, as it makes e.g. ([A-Z][a-z]+) match all-lowercase words too.
# NOTE(review): in the TEMPORAL group, "temporal_range" does not contain
# "before", so it is labeled TEMPORAL_AFTER — verify that is the intended type.
COMPILED_RELATIONSHIP_PATTERNS: dict[str, list[tuple[re.Pattern, str, str]]] = {
    "AFFILIATED_WITH": [
        (re.compile(p, re.IGNORECASE), n, "AFFILIATED_WITH")
        for p, n in AFFILIATION_PATTERNS
    ],
    "PARTY_TO": [
        (re.compile(p, re.IGNORECASE), n, "PARTY_TO")
        for p, n in PARTY_TO_PATTERNS
    ],
    "TEMPORAL": [
        (re.compile(p, re.IGNORECASE), n,
         "TEMPORAL_BEFORE" if "before" in n else "TEMPORAL_AFTER")
        for p, n in TEMPORAL_PATTERNS
    ],
    "CAUSAL": [
        (re.compile(p, re.IGNORECASE), n, "CAUSAL")
        for p, n in CAUSAL_PATTERNS
    ],
    "REFERENCES": [
        (re.compile(p, re.IGNORECASE), n, "REFERENCES")
        for p, n in REFERENCE_PATTERNS
    ],
    "SUPPORT_CONTRADICT": [
        (re.compile(p, re.IGNORECASE), n,
         "SUPPORTS" if "support" in n else "CONTRADICTS")
        for p, n in SUPPORT_CONTRADICT_PATTERNS
    ],
    "AMENDMENT": [
        (re.compile(p, re.IGNORECASE), n,
         "SUPERSEDES" if "supersedes" in n else "AMENDS")
        for p, n in AMENDMENT_PATTERNS
    ],
}
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# =============================================================================
|
|
174
|
+
# Relationship Pattern Extractor
|
|
175
|
+
# =============================================================================
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class RelationshipPatternExtractor:
    """
    Extracts relationship candidates from text using patterns.

    Provides GROUNDED candidates - every relationship is tied to
    actual text evidence, preventing hallucination.
    """

    def __init__(
        self,
        context_window: int = 150,
        min_confidence: float = 0.4,
    ):
        """
        Initialize the relationship pattern extractor.

        Args:
            context_window: Characters of context around matches.
            min_confidence: Minimum confidence to include.
        """
        self.context_window = context_window
        self.min_confidence = min_confidence

    def extract_candidates(
        self,
        text: str,
        entities: list[Entity] | None = None,
        relationship_types: list[str] | None = None,
    ) -> list[RelationshipCandidate]:
        """
        Extract relationship candidates from text.

        Args:
            text: Text to extract from.
            entities: Optional list of known entities for matching.
            relationship_types: Optional filter for relationship types.
                These are the category keys of COMPILED_RELATIONSHIP_PATTERNS
                (e.g. "PARTY_TO"), not the final relationship_type labels.

        Returns:
            List of RelationshipCandidate objects.
        """
        if not text:
            return []

        candidates: list[RelationshipCandidate] = []
        types_to_check = relationship_types or list(COMPILED_RELATIONSHIP_PATTERNS.keys())

        for rel_category in types_to_check:
            # Unknown categories silently yield no patterns.
            patterns = COMPILED_RELATIONSHIP_PATTERNS.get(rel_category, [])

            for pattern, pattern_name, rel_type in patterns:
                for match in pattern.finditer(text):
                    candidate = self._create_candidate_from_match(
                        match=match,
                        pattern_name=pattern_name,
                        relationship_type=rel_type,
                        text=text,
                        entities=entities,
                    )

                    # Pattern-based candidates are gated on min_confidence.
                    if candidate and candidate.confidence >= self.min_confidence:
                        candidates.append(candidate)

        # Also extract co-occurrence relationships.
        # NOTE(review): these are appended without the min_confidence check
        # applied above; their confidence range is 0.4-0.6, so with a
        # min_confidence above 0.4 they can undercut the configured floor.
        if entities:
            cooccurrence_candidates = self._extract_cooccurrence_candidates(
                text=text,
                entities=entities,
            )
            candidates.extend(cooccurrence_candidates)

        logger.debug(
            "relationship_candidates_extracted",
            total=len(candidates),
            by_type={t: sum(1 for c in candidates if c.relationship_type == t)
                     for t in set(c.relationship_type for c in candidates)},
        )

        return candidates

    def _create_candidate_from_match(
        self,
        match: re.Match,
        pattern_name: str,
        relationship_type: str,
        text: str,
        entities: list[Entity] | None,
    ) -> RelationshipCandidate | None:
        """Create a relationship candidate from a regex match.

        Returns None when the match has no capture groups or when either
        endpoint text is empty after stripping.
        """
        groups = match.groups()

        if len(groups) < 1:
            return None

        # For single-group patterns (like "supports X"), the target is the match
        if len(groups) == 1:
            source_text = "this_section"  # Will be resolved to node_id
            target_text = groups[0].strip()
        else:
            source_text = groups[0].strip()
            target_text = groups[1].strip() if len(groups) > 1 else ""

        if not source_text or not target_text:
            return None

        # Calculate confidence
        confidence = self._calculate_confidence(match, pattern_name)

        # Get evidence context (a window around the match; the candidate's
        # `evidence` field itself carries only the exact matched span, and the
        # windowed context is stored in metadata["full_context"]).
        start = max(0, match.start() - self.context_window)
        end = min(len(text), match.end() + self.context_window)
        evidence = text[start:end]

        # Try to match to known entities
        source_entity_id = None
        target_entity_id = None

        if entities:
            source_entity_id = self._match_to_entity(source_text, entities)
            target_entity_id = self._match_to_entity(target_text, entities)

        return RelationshipCandidate(
            source_text=source_text,
            target_text=target_text,
            relationship_type=relationship_type,
            evidence=match.group(),
            span_start=match.start(),
            span_end=match.end(),
            confidence=confidence,
            pattern_name=pattern_name,
            source_entity_id=source_entity_id,
            target_entity_id=target_entity_id,
            metadata={
                "full_context": evidence,
                "pattern_groups": groups,
            },
        )

    def _calculate_confidence(
        self,
        match: re.Match,
        pattern_name: str,
    ) -> float:
        """Calculate confidence for a pattern match.

        Known high-precision patterns get a fixed score; everything else
        starts at 0.6 and gets a small boost for longer matches, capped at 0.95.
        """
        base_confidence = 0.6

        # High confidence patterns
        high_confidence_patterns = {
            "versus": 0.95,  # X v. Y is very explicit
            "see_exhibit": 0.9,
            "case_citation": 0.9,
            "entered_into": 0.85,
            "caused": 0.8,
            "supersedes": 0.85,
        }

        if pattern_name in high_confidence_patterns:
            return high_confidence_patterns[pattern_name]

        # Boost for longer, more specific matches
        match_length = len(match.group())
        if match_length > 50:
            base_confidence += 0.15
        elif match_length > 25:
            base_confidence += 0.1

        return min(base_confidence, 0.95)

    def _match_to_entity(
        self,
        text: str,
        entities: list[Entity],
    ) -> str | None:
        """Try to match extracted text to a known entity.

        Checks, in order: exact canonical-name match, exact alias match,
        then substring containment in either direction. Returns the first
        matching entity's id, or None.
        """
        text_lower = text.lower().strip()

        for entity in entities:
            # Check canonical name
            if entity.canonical_name.lower() == text_lower:
                return entity.id

            # Check aliases
            for alias in entity.aliases:
                if alias.lower() == text_lower:
                    return entity.id

            # Fuzzy match (one contains the other)
            # NOTE(review): containment can over-match for very short names
            # (e.g. a one-word entity inside unrelated text) — consider a
            # minimum-length guard if false links show up downstream.
            if text_lower in entity.canonical_name.lower() or \
               entity.canonical_name.lower() in text_lower:
                return entity.id

        return None

    def _extract_cooccurrence_candidates(
        self,
        text: str,
        entities: list[Entity],
        window_size: int = 100,
    ) -> list[RelationshipCandidate]:
        """
        Extract relationship candidates based on entity co-occurrence.

        Entities mentioned close together often have relationships.

        Args:
            text: Text to scan for entity mentions.
            entities: Known entities; canonical names and aliases are searched.
            window_size: Max character gap between two mentions to pair them.
        """
        candidates = []

        # Find all entity mentions in text
        entity_positions: list[dict[str, Any]] = []
        for entity in entities:
            # Search for canonical name (literal, case-insensitive match)
            for match in re.finditer(re.escape(entity.canonical_name), text, re.IGNORECASE):
                entity_positions.append({
                    "entity": entity,
                    "start": match.start(),
                    "end": match.end(),
                    "text": match.group(),
                })

            # Search for aliases
            for alias in entity.aliases:
                for match in re.finditer(re.escape(alias), text, re.IGNORECASE):
                    entity_positions.append({
                        "entity": entity,
                        "start": match.start(),
                        "end": match.end(),
                        "text": match.group(),
                    })

        # Sort by position
        entity_positions.sort(key=lambda x: x["start"])

        # Find co-occurring pairs within window
        for i, pos1 in enumerate(entity_positions):
            for pos2 in entity_positions[i+1:]:
                # Skip if same entity
                if pos1["entity"].id == pos2["entity"].id:
                    continue

                # Check if within window; positions are sorted by start, so
                # once one pos2 is out of range all later ones are too.
                distance = pos2["start"] - pos1["end"]
                if distance > window_size:
                    break  # Too far, no need to check further

                if distance < 0:
                    continue  # Overlapping, skip

                # Create co-occurrence candidate spanning both mentions
                evidence_start = pos1["start"]
                evidence_end = pos2["end"]
                evidence = text[evidence_start:evidence_end]

                # Determine relationship type based on entity types
                rel_type = self._infer_cooccurrence_type(
                    pos1["entity"], pos2["entity"], evidence
                )

                # Lower confidence for co-occurrence (needs validation):
                # linear in proximity, from 0.4 (at window edge) to 0.6 (adjacent).
                confidence = 0.4 + (1 - distance / window_size) * 0.2

                candidates.append(RelationshipCandidate(
                    source_text=pos1["text"],
                    target_text=pos2["text"],
                    relationship_type=rel_type,
                    evidence=evidence,
                    span_start=evidence_start,
                    span_end=evidence_end,
                    confidence=confidence,
                    pattern_name="co_occurrence",
                    source_entity_id=pos1["entity"].id,
                    target_entity_id=pos2["entity"].id,
                    metadata={
                        "distance": distance,
                        "source_type": pos1["entity"].type.value,
                        "target_type": pos2["entity"].type.value,
                    },
                ))

        return candidates

    def _infer_cooccurrence_type(
        self,
        entity1: Entity,
        entity2: Entity,
        evidence: str,
    ) -> str:
        """Infer relationship type from co-occurring entities.

        Heuristic mapping on the pair's EntityTypes, with MENTIONS as the
        catch-all default.
        """
        # Local import — presumably deferred to avoid an import cycle; confirm.
        from rnsr.extraction.models import EntityType

        type1 = entity1.type
        type2 = entity2.type
        evidence_lower = evidence.lower()

        # Person + Organization → likely AFFILIATED_WITH
        if (type1 == EntityType.PERSON and type2 == EntityType.ORGANIZATION) or \
           (type1 == EntityType.ORGANIZATION and type2 == EntityType.PERSON):
            return "AFFILIATED_WITH"

        # Date + Event → likely TEMPORAL
        if type1 == EntityType.DATE or type2 == EntityType.DATE:
            return "TEMPORAL_BEFORE"  # Will be refined by validator

        # Event + Event → could be CAUSAL
        if type1 == EntityType.EVENT and type2 == EntityType.EVENT:
            if any(word in evidence_lower for word in ["caused", "led", "resulted"]):
                return "CAUSAL"
            return "TEMPORAL_BEFORE"

        # Reference patterns
        if type1 == EntityType.REFERENCE or type2 == EntityType.REFERENCE:
            return "REFERENCES"

        # Document + anything → likely MENTIONS
        if type1 == EntityType.DOCUMENT or type2 == EntityType.DOCUMENT:
            return "MENTIONS"

        # Default
        return "MENTIONS"
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def extract_relationship_candidates(
    text: str,
    entities: list[Entity] | None = None,
) -> list[RelationshipCandidate]:
    """Run relationship-candidate extraction with a default-configured extractor.

    Thin convenience wrapper around RelationshipPatternExtractor for callers
    that do not need to tune context_window or min_confidence.

    Args:
        text: Text to extract from.
        entities: Optional known entities for matching.

    Returns:
        List of RelationshipCandidate objects.
    """
    return RelationshipPatternExtractor().extract_candidates(text, entities)
|