rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,595 @@
1
+ """
2
+ Header Classifier - Detect and Classify Headers by Level
3
+
4
+ This module classifies text spans into:
5
+ - Headers (H1, H2, H3) based on font size magnitude
6
+ - Body text (most frequent font size)
7
+ - Captions/footnotes (smaller than body)
8
+
9
+ Header Level Mapping (adaptive):
10
+ - Defaults: >= 24pt: H1, 18-23pt: H2, 14-17pt: H3
11
+ - Learns from document types (legal, academic, marketing, etc.)
12
+ """
13
+
14
from __future__ import annotations

import json
import os
from datetime import datetime, timezone
from pathlib import Path
from threading import Lock
from typing import Any, Literal

import numpy as np
import structlog
from sklearn.cluster import KMeans

from rnsr.models import ClassifiedSpan, FontAnalysis, SpanInfo
28
+
29
+ logger = structlog.get_logger(__name__)
30
+
31
+
32
+ # =============================================================================
33
+ # Learned Header Thresholds Registry
34
+ # =============================================================================
35
+
36
+ DEFAULT_HEADER_THRESHOLDS_PATH = Path.home() / ".rnsr" / "learned_header_thresholds.json"
37
+
38
+
39
class LearnedHeaderThresholds:
    """
    Registry for learning document-type-specific header thresholds.

    Different document types use different conventions:
    - Legal briefs: 12pt bold = header
    - Academic papers: 11pt = everything
    - Marketing: 36pt+ = titles

    Observed thresholds are accumulated as running sums per document
    type and averaged on retrieval; state is persisted to a JSON file
    (best-effort — load/save failures are logged, never raised).
    """

    # Fallback thresholds (in points) used when nothing has been learned.
    DEFAULT_H1_MIN = 24.0
    DEFAULT_H2_MIN = 18.0
    DEFAULT_H3_MIN = 14.0

    def __init__(
        self,
        storage_path: Path | str | None = None,
        auto_save: bool = True,
    ):
        """
        Initialize the header thresholds registry.

        Args:
            storage_path: Path to JSON file for persistence. Defaults to
                ~/.rnsr/learned_header_thresholds.json.
            auto_save: Whether to persist automatically after each change.
        """
        self.storage_path = Path(storage_path) if storage_path else DEFAULT_HEADER_THRESHOLDS_PATH
        self.auto_save = auto_save

        # The lock guards _document_types and _dirty; file I/O itself is
        # best-effort and not required to be atomic across processes.
        self._lock = Lock()
        self._document_types: dict[str, dict[str, Any]] = {}
        self._dirty = False

        self._load()

    @staticmethod
    def _now_iso() -> str:
        """Return the current UTC time as an ISO-8601 string.

        Uses timezone-aware ``datetime.now(timezone.utc)``;
        ``datetime.utcnow()`` is deprecated since Python 3.12.
        """
        return datetime.now(timezone.utc).isoformat()

    def _load(self) -> None:
        """Load learned thresholds from storage (best-effort)."""
        if not self.storage_path.exists():
            return

        try:
            with open(self.storage_path, "r") as f:
                data = json.load(f)

            self._document_types = data.get("document_types", {})

            logger.info(
                "header_thresholds_loaded",
                document_types=len(self._document_types),
            )

        except Exception as e:
            # Corrupt or unreadable file: start fresh rather than crash.
            logger.warning("failed_to_load_header_thresholds", error=str(e))

    def _save(self) -> None:
        """Persist to storage if there are unsaved changes (best-effort)."""
        if not self._dirty:
            return

        try:
            self.storage_path.parent.mkdir(parents=True, exist_ok=True)

            data = {
                "version": "1.0",
                "updated_at": self._now_iso(),
                "document_types": self._document_types,
            }

            with open(self.storage_path, "w") as f:
                json.dump(data, f, indent=2)

            self._dirty = False

        except Exception as e:
            logger.warning("failed_to_save_header_thresholds", error=str(e))

    def record_thresholds(
        self,
        document_type: str,
        h1_min: float,
        h2_min: float,
        h3_min: float,
        body_size: float = 12.0,
        document_example: str = "",
    ) -> None:
        """
        Record observed header thresholds for a document type.

        The system averages thresholds across multiple documents
        of the same type to learn optimal values.

        Args:
            document_type: Type of document (legal, academic, marketing, etc.)
            h1_min: Observed H1 minimum size.
            h2_min: Observed H2 minimum size.
            h3_min: Observed H3 minimum size.
            body_size: Observed body text size.
            document_example: Example document filename (up to 3 kept).
        """
        document_type = document_type.lower().strip()

        if not document_type:
            return

        with self._lock:
            now = self._now_iso()

            if document_type not in self._document_types:
                self._document_types[document_type] = {
                    "count": 0,
                    "h1_min_sum": 0.0,
                    "h2_min_sum": 0.0,
                    "h3_min_sum": 0.0,
                    "body_size_sum": 0.0,
                    "first_seen": now,
                    "last_seen": now,
                    "examples": [],
                }
                logger.info("new_document_type_learned", document_type=document_type)

            dt = self._document_types[document_type]
            dt["count"] += 1
            dt["h1_min_sum"] += h1_min
            dt["h2_min_sum"] += h2_min
            dt["h3_min_sum"] += h3_min
            dt["body_size_sum"] += body_size
            dt["last_seen"] = now

            if document_example and len(dt["examples"]) < 3:
                dt["examples"].append(document_example)

            self._dirty = True

        # Save outside the lock: _save does not take it, and failures are
        # logged rather than raised.
        if self.auto_save:
            self._save()

    def get_thresholds(
        self,
        document_type: str | None = None,
    ) -> dict[str, float]:
        """
        Get header thresholds for a document type.

        Args:
            document_type: Type of document. If None or unknown, returns
                the class defaults.

        Returns:
            Dict with h1_min, h2_min, h3_min, body_size (averaged over
            all recorded observations for the type, when available).
        """
        defaults = {
            "h1_min": self.DEFAULT_H1_MIN,
            "h2_min": self.DEFAULT_H2_MIN,
            "h3_min": self.DEFAULT_H3_MIN,
            "body_size": 12.0,
        }

        if not document_type:
            return defaults

        document_type = document_type.lower().strip()

        with self._lock:
            dt = self._document_types.get(document_type)
            if dt is not None:
                count = dt["count"]
                if count > 0:
                    return {
                        "h1_min": dt["h1_min_sum"] / count,
                        "h2_min": dt["h2_min_sum"] / count,
                        "h3_min": dt["h3_min_sum"] / count,
                        "body_size": dt["body_size_sum"] / count,
                    }

        # Unknown type (or no observations yet): fall back to defaults.
        return defaults

    def detect_document_type(
        self,
        body_size: float,
        max_font_size: float,
        has_legal_terms: bool = False,
        has_academic_structure: bool = False,
    ) -> str:
        """
        Attempt to detect document type from characteristics.

        Args:
            body_size: Most common font size.
            max_font_size: Largest font in document.
            has_legal_terms: Whether document contains legal terminology.
            has_academic_structure: Whether document has academic structure.

        Returns:
            Detected document type string.
        """
        # Simple heuristics, checked in priority order.
        if has_legal_terms:
            if body_size <= 12 and max_font_size <= 16:
                return "legal_brief"
            return "legal_general"

        if has_academic_structure:
            return "academic"

        if max_font_size >= 36:
            return "marketing"

        if body_size >= 14:
            return "presentation"

        return "general"

    def get_known_document_types(self) -> list[str]:
        """Get list of document types we have learned."""
        with self._lock:
            return list(self._document_types.keys())

    def get_stats(self) -> dict[str, Any]:
        """Get statistics about learned thresholds."""
        with self._lock:
            return {
                "document_types_count": len(self._document_types),
                "document_types": list(self._document_types.keys()),
                "total_documents_analyzed": sum(
                    dt["count"] for dt in self._document_types.values()
                ),
            }
268
+
269
+
270
# Process-wide singleton so all classifiers share one learned registry.
_global_header_thresholds: LearnedHeaderThresholds | None = None


def get_learned_header_thresholds() -> LearnedHeaderThresholds:
    """Return the global learned header thresholds registry.

    The singleton is constructed lazily on first access.  Its storage
    location may be overridden through the RNSR_HEADER_THRESHOLDS_PATH
    environment variable; otherwise the registry default is used.
    """
    global _global_header_thresholds

    if _global_header_thresholds is None:
        _global_header_thresholds = LearnedHeaderThresholds(
            storage_path=os.getenv("RNSR_HEADER_THRESHOLDS_PATH") or None
        )

    return _global_header_thresholds
285
+
286
+
287
class HeaderClassifier:
    """
    Classifies text spans into headers, body text, and captions.

    Uses font size analysis and optional k-means clustering to assign
    header levels (H1-H3). Supports adaptive thresholds learned per
    document type via the learned-thresholds registry.
    """

    # Class-level default thresholds in points; instances shadow these
    # with adapted values in _update_thresholds().
    H1_MIN_SIZE = 24.0
    H2_MIN_SIZE = 18.0
    H3_MIN_SIZE = 14.0

    def __init__(
        self,
        use_clustering: bool = True,
        n_header_levels: int = 3,
        document_type: str | None = None,
        enable_threshold_learning: bool = True,
    ):
        """
        Initialize the Header Classifier.

        Args:
            use_clustering: Whether to use k-means clustering for header levels.
            n_header_levels: Number of header levels to detect (default: 3).
            document_type: Optional document type for adaptive thresholds.
            enable_threshold_learning: Whether to learn thresholds from documents.
        """
        self.use_clustering = use_clustering
        self.n_header_levels = n_header_levels
        self.document_type = document_type
        self.enable_threshold_learning = enable_threshold_learning

        # Registry is shared process-wide; None disables adaptation.
        self._threshold_registry = (
            get_learned_header_thresholds() if enable_threshold_learning else None
        )

        # Set instance thresholds based on document type (or class defaults).
        self._update_thresholds(document_type)

    def _update_thresholds(self, document_type: str | None) -> None:
        """Update instance thresholds based on document type."""
        if self._threshold_registry and document_type:
            thresholds = self._threshold_registry.get_thresholds(document_type)
            self.H1_MIN_SIZE = thresholds["h1_min"]
            self.H2_MIN_SIZE = thresholds["h2_min"]
            self.H3_MIN_SIZE = thresholds["h3_min"]
        else:
            # Reset instance attributes to the class defaults.
            self.H1_MIN_SIZE = HeaderClassifier.H1_MIN_SIZE
            self.H2_MIN_SIZE = HeaderClassifier.H2_MIN_SIZE
            self.H3_MIN_SIZE = HeaderClassifier.H3_MIN_SIZE

    def set_document_type(self, document_type: str) -> None:
        """Set document type and update thresholds accordingly."""
        self.document_type = document_type
        self._update_thresholds(document_type)

    def learn_from_analysis(
        self,
        analysis: FontAnalysis,
        detected_h1_size: float | None = None,
        detected_h2_size: float | None = None,
        detected_h3_size: float | None = None,
        document_name: str = "",
    ) -> None:
        """
        Learn thresholds from document analysis.

        Call this after processing a document to record observed values.
        No-op unless threshold learning is enabled and a document type
        has been set.
        """
        if not self._threshold_registry or not self.document_type:
            return

        # Use explicitly detected sizes; otherwise infer offsets from the
        # analysis header threshold (H3 at threshold, H2 +3pt, H1 +6pt).
        h1_size = detected_h1_size or analysis.header_threshold + 6
        h2_size = detected_h2_size or analysis.header_threshold + 3
        h3_size = detected_h3_size or analysis.header_threshold

        self._threshold_registry.record_thresholds(
            document_type=self.document_type,
            h1_min=h1_size,
            h2_min=h2_size,
            h3_min=h3_size,
            body_size=analysis.body_size,
            document_example=document_name,
        )

    def classify_spans(
        self,
        spans: list[SpanInfo],
        analysis: FontAnalysis,
    ) -> list[ClassifiedSpan]:
        """
        Classify all spans into headers or body text.

        Args:
            spans: List of SpanInfo from font analysis.
            analysis: FontAnalysis with body size and threshold.

        Returns:
            List of ClassifiedSpan with role and header_level assigned.
        """
        if not spans:
            return []

        # First pass: identify potential headers.
        potential_headers = [
            span for span in spans if self._is_header_candidate(span, analysis)
        ]

        # Determine header levels from the data when clustering is enabled.
        if potential_headers and self.use_clustering:
            level_mapping = self._cluster_header_levels(potential_headers)
        else:
            level_mapping = {}

        # Second pass: classify every span.
        classified: list[ClassifiedSpan] = []
        for span in spans:
            role, level = self._classify_single_span(span, analysis, level_mapping)

            classified.append(
                ClassifiedSpan(
                    text=span.text,
                    font_size=span.font_size,
                    font_name=span.font_name,
                    is_bold=span.is_bold,
                    is_italic=span.is_italic,
                    bbox=span.bbox,
                    page_num=span.page_num,
                    role=role,
                    header_level=level,
                )
            )

        # Log classification stats.
        header_count = sum(1 for s in classified if s.role == "header")
        logger.info(
            "spans_classified",
            total=len(classified),
            headers=header_count,
            body=len(classified) - header_count,
        )

        return classified

    def _is_header_candidate(
        self,
        span: SpanInfo,
        analysis: FontAnalysis,
    ) -> bool:
        """
        Determine if a span is a header candidate.

        A span is a header candidate if:
        1. Font size > header_threshold, OR
        2. Bold text at or above body size
        """
        # Size-based detection.
        if span.font_size > analysis.header_threshold:
            return True

        # Bold text at body size or larger.
        if span.is_bold and span.font_size >= analysis.body_size:
            return True

        return False

    def _classify_single_span(
        self,
        span: SpanInfo,
        analysis: FontAnalysis,
        level_mapping: dict[float, int],
    ) -> tuple[Literal["header", "body", "caption", "footnote"], int]:
        """
        Classify a single span.

        Returns:
            Tuple of (role, header_level); header_level is 0 for non-headers.
        """
        if self._is_header_candidate(span, analysis):
            # Prefer the clustering-based level when available.
            if span.font_size in level_mapping:
                level = level_mapping[span.font_size]
            else:
                # Fall back to absolute thresholds.
                level = self._get_level_by_size(span.font_size)

            return ("header", level)

        # Captions/footnotes: more than 20% smaller than body text.
        caption_threshold = analysis.body_size - (analysis.body_size * 0.2)
        if span.font_size < caption_threshold:
            return ("caption", 0)

        # Default to body text.
        return ("body", 0)

    def _get_level_by_size(self, font_size: float) -> int:
        """
        Get header level based on absolute font size thresholds.

        Args:
            font_size: The font size in points.

        Returns:
            Header level (1, 2, or 3). Anything below H2_MIN_SIZE maps
            to 3 — only spans already identified as headers reach here.
        """
        if font_size >= self.H1_MIN_SIZE:
            return 1
        elif font_size >= self.H2_MIN_SIZE:
            return 2
        else:
            return 3

    def _cluster_header_levels(
        self,
        headers: list[SpanInfo],
    ) -> dict[float, int]:
        """
        Use k-means clustering to determine header levels from actual data.

        This adapts to documents with non-standard font sizes.

        Args:
            headers: List of spans identified as headers.

        Returns:
            Dict mapping font_size to header_level (1, 2, or 3); empty
            when there is too little data or clustering fails.
        """
        if len(headers) < self.n_header_levels:
            # Not enough headers to cluster.
            return {}

        # Cluster over unique sizes only.
        unique_sizes = list({h.font_size for h in headers})

        if len(unique_sizes) < self.n_header_levels:
            # Fewer unique sizes than levels - assign directly by rank.
            sorted_sizes = sorted(unique_sizes, reverse=True)
            return {size: i + 1 for i, size in enumerate(sorted_sizes)}

        try:
            X = np.array(unique_sizes).reshape(-1, 1)
            n_clusters = min(self.n_header_levels, len(unique_sizes))

            kmeans = KMeans(
                n_clusters=n_clusters,
                random_state=42,
                n_init=10,
            ).fit(X)

            # Map clusters to levels by center size (largest = H1).
            cluster_centers = [
                (i, kmeans.cluster_centers_[i][0])
                for i in range(n_clusters)
            ]
            cluster_centers.sort(key=lambda x: -x[1])  # Descending by size

            cluster_to_level = {
                cluster: level + 1
                for level, (cluster, _) in enumerate(cluster_centers)
            }

            # labels_ already holds the cluster of each row of X in order,
            # so no per-size predict() calls are needed.
            size_to_level = {
                size: cluster_to_level[int(label)]
                for size, label in zip(unique_sizes, kmeans.labels_)
            }

            logger.debug(
                "header_levels_clustered",
                mapping=size_to_level,
            )

            return size_to_level

        except Exception as e:
            logger.warning("clustering_failed", error=str(e))
            return {}
571
+
572
+
573
def classify_headers(
    spans: list[SpanInfo],
    analysis: FontAnalysis,
    use_clustering: bool = True,
) -> list[ClassifiedSpan]:
    """
    Convenience wrapper: classify spans into headers and body text.

    Builds a default HeaderClassifier and delegates to classify_spans.

    Args:
        spans: List of SpanInfo from font analysis.
        analysis: FontAnalysis with body size and threshold.
        use_clustering: Whether to use k-means for header levels.

    Returns:
        List of ClassifiedSpan with roles assigned.

    Example:
        analysis, spans = analyze_font_histogram("doc.pdf")
        classified = classify_headers(spans, analysis)
        headers = [s for s in classified if s.role == "header"]
    """
    return HeaderClassifier(use_clustering=use_clustering).classify_spans(
        spans, analysis
    )