rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/extraction/learned_types.py
@@ -0,0 +1,599 @@
+"""
+RNSR Learned Entity Types
+
+Adaptive learning system for entity types. When the LLM discovers entity types
+that don't match the predefined EntityType enum, they are stored in a flat file.
+Over time, this builds a domain-specific vocabulary of entity types.
+
+The learned types are:
+1. Stored in a JSON file (configurable location)
+2. Loaded at startup and used in extraction prompts
+3. Updated with frequency counts when new types are discovered
+4. Can be promoted to "suggested" types for the LLM
+
+Usage:
+    from rnsr.extraction.learned_types import LearnedTypeRegistry
+
+    registry = LearnedTypeRegistry()
+
+    # Record a new type
+    registry.record_type("witness", context="John Doe, the witness, testified...")
+
+    # Get learned types for prompts
+    learned = registry.get_learned_types(min_count=3)
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+from threading import Lock
+from typing import Any
+
+import structlog
+
+logger = structlog.get_logger(__name__)
+
+# Default locations for the learned types files
+DEFAULT_LEARNED_TYPES_PATH = Path.home() / ".rnsr" / "learned_entity_types.json"
+DEFAULT_LEARNED_RELATIONSHIP_TYPES_PATH = Path.home() / ".rnsr" / "learned_relationship_types.json"
+
+
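Editor's note: for orientation, the JSON file the registries below persist (by default ~/.rnsr/learned_entity_types.json) has roughly the shape sketched here. The entry values are illustrative, not taken from the package; only the keys follow the code.

{
  "version": "1.0",
  "updated_at": "2024-01-01T00:00:00",
  "types": {
    "witness": {
      "count": 3,
      "first_seen": "2024-01-01T00:00:00",
      "last_seen": "2024-01-01T00:00:00",
      "examples": [
        {
          "entity": "John Doe",
          "context": "John Doe, the witness, testified...",
          "timestamp": "2024-01-01T00:00:00"
        }
      ],
      "suggested_mapping": null
    }
  }
}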
+class LearnedTypeRegistry:
+    """
+    Registry for learning and storing custom entity types discovered during extraction.
+
+    The registry maintains:
+    - Type name and frequency count
+    - Example contexts where the type was found
+    - First/last seen timestamps
+    - Optional mapping to existing EntityType (for future promotion)
+
+    Thread-safe for concurrent access.
+    """
+
+    def __init__(
+        self,
+        storage_path: Path | str | None = None,
+        auto_save: bool = True,
+        max_examples_per_type: int = 5,
+    ):
+        """
+        Initialize the learned type registry.
+
+        Args:
+            storage_path: Path to the JSON file for persistence.
+                Defaults to ~/.rnsr/learned_entity_types.json
+            auto_save: Whether to save after each new type is recorded.
+            max_examples_per_type: Maximum example contexts to store per type.
+        """
+        self.storage_path = Path(storage_path) if storage_path else DEFAULT_LEARNED_TYPES_PATH
+        self.auto_save = auto_save
+        self.max_examples_per_type = max_examples_per_type
+
+        self._lock = Lock()
+        self._types: dict[str, dict[str, Any]] = {}
+        self._dirty = False
+
+        # Load existing types
+        self._load()
+
+    def _load(self) -> None:
+        """Load learned types from storage."""
+        if not self.storage_path.exists():
+            logger.debug("no_learned_types_file", path=str(self.storage_path))
+            return
+
+        try:
+            with open(self.storage_path, "r") as f:
+                data = json.load(f)
+
+            self._types = data.get("types", {})
+
+            logger.info(
+                "learned_types_loaded",
+                path=str(self.storage_path),
+                count=len(self._types),
+            )
+
+        except Exception as e:
+            logger.warning("failed_to_load_learned_types", error=str(e))
+
+    def _save(self) -> None:
+        """Save learned types to storage."""
+        if not self._dirty:
+            return
+
+        try:
+            # Ensure directory exists
+            self.storage_path.parent.mkdir(parents=True, exist_ok=True)
+
+            data = {
+                "version": "1.0",
+                "updated_at": datetime.utcnow().isoformat(),
+                "types": self._types,
+            }
+
+            with open(self.storage_path, "w") as f:
+                json.dump(data, f, indent=2)
+
+            self._dirty = False
+
+            logger.debug(
+                "learned_types_saved",
+                path=str(self.storage_path),
+                count=len(self._types),
+            )
+
+        except Exception as e:
+            logger.warning("failed_to_save_learned_types", error=str(e))
+
+    def record_type(
+        self,
+        type_name: str,
+        context: str = "",
+        entity_name: str = "",
+    ) -> None:
+        """
+        Record a discovered entity type.
+
+        Args:
+            type_name: The entity type name (e.g., "witness", "clause").
+            context: Example context where this type was found.
+            entity_name: Name of the entity with this type.
+        """
+        type_name = type_name.lower().strip()
+
+        if not type_name:
+            return
+
+        with self._lock:
+            now = datetime.utcnow().isoformat()
+
+            if type_name not in self._types:
+                # New type
+                self._types[type_name] = {
+                    "count": 0,
+                    "first_seen": now,
+                    "last_seen": now,
+                    "examples": [],
+                    "suggested_mapping": None,
+                }
+
+                logger.info("new_entity_type_discovered", type=type_name)
+
+            # Update existing type
+            type_data = self._types[type_name]
+            type_data["count"] += 1
+            type_data["last_seen"] = now
+
+            # Add example if we have context
+            if context and len(type_data["examples"]) < self.max_examples_per_type:
+                example = {
+                    "entity": entity_name,
+                    "context": context[:200],  # Truncate long contexts
+                    "timestamp": now,
+                }
+                type_data["examples"].append(example)
+
+            self._dirty = True
+
+        if self.auto_save:
+            self._save()
+
+    def get_type(self, type_name: str) -> dict[str, Any] | None:
+        """
+        Get information about a learned type.
+
+        Args:
+            type_name: The type name to look up.
+
+        Returns:
+            Type data dict or None if not found.
+        """
+        return self._types.get(type_name.lower().strip())
+
+    def get_learned_types(
+        self,
+        min_count: int = 1,
+        limit: int = 50,
+    ) -> list[dict[str, Any]]:
+        """
+        Get learned types sorted by frequency.
+
+        Args:
+            min_count: Minimum occurrence count to include.
+            limit: Maximum number of types to return.
+
+        Returns:
+            List of type dicts with name and count.
+        """
+        with self._lock:
+            filtered = [
+                {"name": name, **data}
+                for name, data in self._types.items()
+                if data["count"] >= min_count
+            ]
+
+            # Sort by count descending
+            filtered.sort(key=lambda x: -x["count"])
+
+            return filtered[:limit]
+
+    def get_types_for_prompt(
+        self,
+        min_count: int = 2,
+        limit: int = 20,
+    ) -> list[str]:
+        """
+        Get type names suitable for including in extraction prompts.
+
+        Only returns types that have been seen multiple times,
+        indicating they are likely relevant for this workload.
+
+        Args:
+            min_count: Minimum occurrences to be considered "learned".
+            limit: Maximum types to include.
+
+        Returns:
+            List of type name strings.
+        """
+        learned = self.get_learned_types(min_count=min_count, limit=limit)
+        return [t["name"] for t in learned]
+
+    def suggest_mapping(
+        self,
+        type_name: str,
+        map_to: str,
+    ) -> None:
+        """
+        Suggest a mapping from a learned type to a standard EntityType.
+
+        This allows users to map frequently occurring custom types
+        to one of the predefined EntityType values.
+
+        Args:
+            type_name: The learned type name.
+            map_to: The EntityType value to map to.
+        """
+        type_name = type_name.lower().strip()
+
+        with self._lock:
+            if type_name in self._types:
+                self._types[type_name]["suggested_mapping"] = map_to
+                self._dirty = True
+
+        if self.auto_save:
+            self._save()
+
+    def get_mappings(self) -> dict[str, str]:
+        """
+        Get all suggested type mappings.
+
+        Returns:
+            Dict mapping learned type names to EntityType values.
+        """
+        with self._lock:
+            return {
+                name: data["suggested_mapping"]
+                for name, data in self._types.items()
+                if data.get("suggested_mapping")
+            }
+
+    def clear(self) -> None:
+        """Clear all learned types."""
+        with self._lock:
+            self._types.clear()
+            self._dirty = True
+            self._save()
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get statistics about learned types."""
+        with self._lock:
+            total_types = len(self._types)
+            total_occurrences = sum(t["count"] for t in self._types.values())
+
+            if self._types:
+                most_common = max(self._types.items(), key=lambda x: x[1]["count"])
+                most_common_name = most_common[0]
+                most_common_count = most_common[1]["count"]
+            else:
+                most_common_name = None
+                most_common_count = 0
+
+            return {
+                "total_types": total_types,
+                "total_occurrences": total_occurrences,
+                "most_common_type": most_common_name,
+                "most_common_count": most_common_count,
+                "storage_path": str(self.storage_path),
+            }
+
+    def force_save(self) -> None:
+        """Force save to disk."""
+        self._dirty = True
+        self._save()
+
+
+# Global registry instance (lazily initialized)
+_global_registry: LearnedTypeRegistry | None = None
+
+
+def get_learned_type_registry() -> LearnedTypeRegistry:
+    """
+    Get the global learned type registry.
+
+    Returns:
+        The singleton LearnedTypeRegistry instance.
+    """
+    global _global_registry
+
+    if _global_registry is None:
+        # Check for custom path in environment
+        custom_path = os.getenv("RNSR_LEARNED_TYPES_PATH")
+        _global_registry = LearnedTypeRegistry(
+            storage_path=custom_path if custom_path else None
+        )
+
+    return _global_registry
+
+
+def record_learned_type(
+    type_name: str,
+    context: str = "",
+    entity_name: str = "",
+) -> None:
+    """
+    Convenience function to record a learned type using the global registry.
+
+    Args:
+        type_name: The entity type name.
+        context: Example context.
+        entity_name: Entity name.
+    """
+    registry = get_learned_type_registry()
+    registry.record_type(type_name, context, entity_name)
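Editor's note: before the relationship-type registry, a minimal usage sketch of the entity-type registry above. The /tmp path and the "person" mapping value are assumptions made for illustration, not defaults or enum values confirmed by the package.

from rnsr.extraction.learned_types import LearnedTypeRegistry

# Explicit path so the sketch does not touch the real ~/.rnsr file
registry = LearnedTypeRegistry(storage_path="/tmp/learned_entity_types.json")

# Counts and example contexts accumulate as extraction runs
registry.record_type("witness", context="John Doe, the witness, testified...", entity_name="John Doe")
registry.record_type("witness", context="The witness was cross-examined.")

# Only types seen at least min_count times are offered to extraction prompts
print(registry.get_types_for_prompt(min_count=2))  # ['witness']

# Optionally map a recurring custom type onto a predefined EntityType value
registry.suggest_mapping("witness", "person")  # "person" assumed to be a valid EntityType value
print(registry.get_mappings())                 # {'witness': 'person'}
print(registry.get_stats()["total_occurrences"])  # 2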
+
+
+# =============================================================================
+# Learned Relationship Types Registry
+# =============================================================================
+
+
+class LearnedRelationshipTypeRegistry:
+    """
+    Registry for learning and storing custom relationship types discovered during extraction.
+
+    Same pattern as LearnedTypeRegistry but for relationships.
+    Learns types like "testified_against", "represented_by", "prescribed_by".
+    """
+
+    def __init__(
+        self,
+        storage_path: Path | str | None = None,
+        auto_save: bool = True,
+        max_examples_per_type: int = 5,
+    ):
+        """
+        Initialize the learned relationship type registry.
+
+        Args:
+            storage_path: Path to the JSON file for persistence.
+            auto_save: Whether to save after each new type is recorded.
+            max_examples_per_type: Maximum example contexts to store per type.
+        """
+        self.storage_path = Path(storage_path) if storage_path else DEFAULT_LEARNED_RELATIONSHIP_TYPES_PATH
+        self.auto_save = auto_save
+        self.max_examples_per_type = max_examples_per_type
+
+        self._lock = Lock()
+        self._types: dict[str, dict[str, Any]] = {}
+        self._dirty = False
+
+        self._load()
+
+    def _load(self) -> None:
+        """Load learned types from storage."""
+        if not self.storage_path.exists():
+            logger.debug("no_learned_relationship_types_file", path=str(self.storage_path))
+            return
+
+        try:
+            with open(self.storage_path, "r") as f:
+                data = json.load(f)
+
+            self._types = data.get("types", {})
+
+            logger.info(
+                "learned_relationship_types_loaded",
+                path=str(self.storage_path),
+                count=len(self._types),
+            )
+
+        except Exception as e:
+            logger.warning("failed_to_load_learned_relationship_types", error=str(e))
+
+    def _save(self) -> None:
+        """Save learned types to storage."""
+        if not self._dirty:
+            return
+
+        try:
+            self.storage_path.parent.mkdir(parents=True, exist_ok=True)
+
+            data = {
+                "version": "1.0",
+                "updated_at": datetime.utcnow().isoformat(),
+                "types": self._types,
+            }
+
+            with open(self.storage_path, "w") as f:
+                json.dump(data, f, indent=2)
+
+            self._dirty = False
+
+            logger.debug(
+                "learned_relationship_types_saved",
+                path=str(self.storage_path),
+                count=len(self._types),
+            )
+
+        except Exception as e:
+            logger.warning("failed_to_save_learned_relationship_types", error=str(e))
+
+    def record_type(
+        self,
+        type_name: str,
+        context: str = "",
+        relationship_description: str = "",
+    ) -> None:
+        """
+        Record a discovered relationship type.
+
+        Args:
+            type_name: The relationship type name (e.g., "testified_against").
+            context: Example evidence text.
+            relationship_description: Description of the relationship (source -> target).
+        """
+        type_name = type_name.lower().strip()
+
+        if not type_name:
+            return
+
+        with self._lock:
+            now = datetime.utcnow().isoformat()
+
+            if type_name not in self._types:
+                self._types[type_name] = {
+                    "count": 0,
+                    "first_seen": now,
+                    "last_seen": now,
+                    "examples": [],
+                    "suggested_mapping": None,
+                }
+
+                logger.info("new_relationship_type_discovered", type=type_name)
+
+            type_data = self._types[type_name]
+            type_data["count"] += 1
+            type_data["last_seen"] = now
+
+            if context and len(type_data["examples"]) < self.max_examples_per_type:
+                example = {
+                    "description": relationship_description,
+                    "context": context[:200],
+                    "timestamp": now,
+                }
+                type_data["examples"].append(example)
+
+            self._dirty = True
+
+        if self.auto_save:
+            self._save()
+
+    def get_types_for_prompt(
+        self,
+        min_count: int = 2,
+        limit: int = 20,
+    ) -> list[str]:
+        """
+        Get type names suitable for including in extraction prompts.
+        """
+        with self._lock:
+            filtered = [
+                (name, data["count"])
+                for name, data in self._types.items()
+                if data["count"] >= min_count
+            ]
+
+            filtered.sort(key=lambda x: -x[1])
+            return [name for name, _ in filtered[:limit]]
+
+    def get_mappings(self) -> dict[str, str]:
+        """Get all suggested type mappings."""
+        with self._lock:
+            return {
+                name: data["suggested_mapping"]
+                for name, data in self._types.items()
+                if data.get("suggested_mapping")
+            }
+
+    def suggest_mapping(self, type_name: str, map_to: str) -> None:
+        """Suggest a mapping from a learned type to a standard RelationType."""
+        type_name = type_name.lower().strip()
+
+        with self._lock:
+            if type_name in self._types:
+                self._types[type_name]["suggested_mapping"] = map_to
+                self._dirty = True
+
+        if self.auto_save:
+            self._save()
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get statistics about learned relationship types."""
+        with self._lock:
+            total_types = len(self._types)
+            total_occurrences = sum(t["count"] for t in self._types.values())
+
+            if self._types:
+                most_common = max(self._types.items(), key=lambda x: x[1]["count"])
+                most_common_name = most_common[0]
+                most_common_count = most_common[1]["count"]
+            else:
+                most_common_name = None
+                most_common_count = 0
+
+            return {
+                "total_types": total_types,
+                "total_occurrences": total_occurrences,
+                "most_common_type": most_common_name,
+                "most_common_count": most_common_count,
+                "storage_path": str(self.storage_path),
+            }
+
+    def force_save(self) -> None:
+        """Force save to disk."""
+        self._dirty = True
+        self._save()
+
+
+# Global relationship registry instance
+_global_relationship_registry: LearnedRelationshipTypeRegistry | None = None
+
+
+def get_learned_relationship_type_registry() -> LearnedRelationshipTypeRegistry:
+    """
+    Get the global learned relationship type registry.
+
+    Returns:
+        The singleton LearnedRelationshipTypeRegistry instance.
+    """
+    global _global_relationship_registry
+
+    if _global_relationship_registry is None:
+        custom_path = os.getenv("RNSR_LEARNED_RELATIONSHIP_TYPES_PATH")
+        _global_relationship_registry = LearnedRelationshipTypeRegistry(
+            storage_path=custom_path if custom_path else None
+        )
+
+    return _global_relationship_registry
+
+
+def record_learned_relationship_type(
+    type_name: str,
+    context: str = "",
+    relationship_description: str = "",
+) -> None:
+    """
+    Convenience function to record a learned relationship type.
+
+    Args:
+        type_name: The relationship type name.
+        context: Example evidence.
+        relationship_description: Source -> target description.
+    """
+    registry = get_learned_relationship_type_registry()
+    registry.record_type(type_name, context, relationship_description)
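Editor's note: a matching sketch for the relationship side, using the module-level helpers. The environment-variable value, the scratch path, and the example relationship text are illustrative assumptions; only the function names and env var come from the module above.

import os

# Point the global relationship registry at a scratch file before its first use
os.environ["RNSR_LEARNED_RELATIONSHIP_TYPES_PATH"] = "/tmp/learned_relationship_types.json"

from rnsr.extraction.learned_types import (
    get_learned_relationship_type_registry,
    record_learned_relationship_type,
)

# Record a relationship type discovered during extraction
record_learned_relationship_type(
    "testified_against",
    context="The witness testified against the defendant.",
    relationship_description="John Doe -> Acme Corp",
)

registry = get_learned_relationship_type_registry()
print(registry.get_types_for_prompt(min_count=1))  # ['testified_against']
print(registry.get_stats()["most_common_type"])    # 'testified_against'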