rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/ingestion/hierarchical_cluster.py (new file)
@@ -0,0 +1,515 @@
"""
Hierarchical Clustering for Semantic Segmentation (H-SBM)

Implements Section 4.2.2 of the research paper:
"For more advanced segmentation, we can employ unsupervised hierarchical
clustering techniques. By clustering the sentence embeddings, we can
discover 'Latent Topics' at various resolutions."

Features:
- Micro-Clusters: Groups of 5-10 sentences forming paragraph-level thoughts
- Macro-Clusters: Groups of micro-clusters forming chapter-level themes
- Synthetic Header Generation: LLM-generated titles for each cluster

This provides multi-resolution topic discovery when:
- The font histogram detects no variance (flat text)
- The semantic splitter produces too many fine-grained chunks
- Hierarchical structure is needed beyond simple breakpoints
"""

from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import numpy as np
import structlog

from rnsr.models import DocumentNode, DocumentTree

logger = structlog.get_logger(__name__)


@dataclass
class TextCluster:
    """A cluster of semantically related text segments."""

    id: str
    texts: list[str]
    embeddings: np.ndarray | None = None
    centroid: np.ndarray | None = None
    children: list["TextCluster"] = field(default_factory=list)
    synthetic_header: str = ""
    level: int = 0  # 0 = leaf, 1 = micro, 2 = macro

    @property
    def full_text(self) -> str:
        """Concatenate all texts in this cluster."""
        return "\n\n".join(self.texts)

    @property
    def text_preview(self) -> str:
        """First 200 chars for summary."""
        full = self.full_text
        return (full[:200] + "...") if len(full) > 200 else full
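
# Usage sketch (illustrative, not part of the module): a macro-level cluster
# wraps its micro-clusters via `children`, e.g.
#
#   micro = TextCluster(id="micro_1", texts=["One sentence.", "Another."], level=1)
#   macro = TextCluster(id="macro_1", texts=micro.texts, children=[micro], level=2)
#   assert macro.full_text == "One sentence.\n\nAnother."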


class HierarchicalSemanticClusterer:
    """
    Multi-resolution topic discovery via hierarchical clustering.

    Creates a two-level hierarchy:
    1. Micro-clusters (5-10 sentences) - paragraph-level thoughts
    2. Macro-clusters (groups of micro-clusters) - chapter-level themes
    """

    def __init__(
        self,
        micro_cluster_size: int = 7,  # Target sentences per micro-cluster
        macro_cluster_ratio: float = 0.3,  # Macro = 30% of micro count
        embed_provider: str | None = None,
        generate_headers: bool = True,
    ):
        """
        Initialize the clusterer.

        Args:
            micro_cluster_size: Target number of sentences per micro-cluster.
            macro_cluster_ratio: Ratio of macro to micro clusters.
            embed_provider: "gemini", "openai", or None for auto-detect.
            generate_headers: Whether to generate LLM headers for clusters.
        """
        self.micro_cluster_size = micro_cluster_size
        self.macro_cluster_ratio = macro_cluster_ratio
        self.embed_provider = embed_provider
        self.generate_headers = generate_headers
        self._embed_model = None

    def cluster_text(self, text: str) -> list[TextCluster]:
        """
        Perform hierarchical clustering on text.

        Args:
            text: Full document text.

        Returns:
            List of macro-clusters, each containing micro-clusters.
        """
        # Split into sentences
        sentences = self._split_sentences(text)

        if len(sentences) < 3:
            # Too short for clustering
            return [TextCluster(
                id="cluster_0",
                texts=sentences,
                synthetic_header="Document Content",
            )]

        logger.info("clustering_text", sentences=len(sentences))

        # Get embeddings
        embeddings = self._get_embeddings(sentences)

        if embeddings is None:
            # Fall back to simple chunking
            return self._simple_cluster_fallback(sentences)

        # Step 1: Create micro-clusters (sentence-level → paragraph-level)
        micro_clusters = self._create_micro_clusters(sentences, embeddings)

        # Step 2: Create macro-clusters (micro-level → chapter-level)
        macro_clusters = self._create_macro_clusters(micro_clusters)

        # Step 3: Generate synthetic headers
        if self.generate_headers:
            self._generate_headers_for_clusters(macro_clusters)

        logger.info(
            "clustering_complete",
            micro_clusters=len(micro_clusters),
            macro_clusters=len(macro_clusters),
        )

        return macro_clusters
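
    # Usage sketch (illustrative; `flat_document_text` is a stand-in): the
    # returned macro-clusters carry their micro-clusters in `children`, e.g.
    #
    #   clusterer = HierarchicalSemanticClusterer(generate_headers=False)
    #   for macro in clusterer.cluster_text(flat_document_text):
    #       print(macro.synthetic_header, len(macro.children))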

    def cluster_to_tree(self, text: str, title: str) -> DocumentTree:
        """
        Cluster text and convert to DocumentTree.

        Args:
            text: Full document text.
            title: Document title.

        Returns:
            DocumentTree with hierarchical structure.
        """
        macro_clusters = self.cluster_text(text)

        # Build tree
        root = DocumentNode(
            id="root",
            level=0,
            header=title,
        )

        for i, macro in enumerate(macro_clusters):
            # Macro cluster becomes H2
            macro_node = DocumentNode(
                id=f"macro_{i:03d}",
                level=1,
                header=macro.synthetic_header or f"Section {i + 1}",
            )

            if macro.children:
                # Add micro-clusters as H3
                for j, micro in enumerate(macro.children):
                    micro_node = DocumentNode(
                        id=f"micro_{i:03d}_{j:03d}",
                        level=2,
                        header=micro.synthetic_header or f"Subsection {i + 1}.{j + 1}",
                        content=micro.full_text,
                    )
                    macro_node.children.append(micro_node)
            else:
                # Macro is a leaf (no sub-clusters)
                macro_node.content = macro.full_text

            root.children.append(macro_node)

        return DocumentTree(
            title=title,
            root=root,
            total_nodes=self._count_nodes(root),
            ingestion_tier=2,
            ingestion_method="hierarchical_clustering",
        )
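
    # Resulting tree shape (sketch): root (level 0, document title)
    #   -> macro_000 ... (level 1, synthetic or "Section N" headers)
    #        -> micro_000_000 ... (level 2, content-bearing leaves)
    # A macro-cluster with no children keeps its text directly on the
    # level-1 node instead.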

    def _split_sentences(self, text: str) -> list[str]:
        """Split text into sentences."""
        import re

        # Simple sentence splitting. Strip the period from common
        # abbreviations so they do not trigger sentence breaks. Note: this
        # mutates the stored sentence text too ("Dr." is kept as "Dr").
        text = text.replace("Dr.", "Dr")
        text = text.replace("Mr.", "Mr")
        text = text.replace("Mrs.", "Mrs")
        text = text.replace("Ms.", "Ms")
        text = text.replace("vs.", "vs")
        text = text.replace("etc.", "etc")
        text = text.replace("i.e.", "ie")
        text = text.replace("e.g.", "eg")

        # Split on sentence endings
        sentences = re.split(r'(?<=[.!?])\s+', text)

        # Filter empty and very short sentences
        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]

        return sentences
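
    # Worked example (illustrative): the abbreviation handling strips the
    # trailing period before splitting, so
    #   _split_sentences("Dr. Smith arrived. It was raining!")
    # yields ["Dr Smith arrived.", "It was raining!"]; both survive the
    # 10-character minimum-length filter.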

    def _get_embeddings(self, texts: list[str]) -> np.ndarray | None:
        """Get embeddings for a list of texts."""
        import os

        # Auto-detect provider
        provider = self.embed_provider
        if provider is None:
            if os.getenv("GOOGLE_API_KEY"):
                provider = "gemini"
            elif os.getenv("OPENAI_API_KEY"):
                provider = "openai"
            else:
                logger.warning("no_embedding_api_key")
                return None

        try:
            if provider == "gemini":
                return self._get_gemini_embeddings(texts)
            elif provider == "openai":
                return self._get_openai_embeddings(texts)
            else:
                logger.warning("unknown_embed_provider", provider=provider)
                return None
        except Exception as e:
            logger.warning("embedding_failed", error=str(e))
            return None

    def _get_gemini_embeddings(self, texts: list[str]) -> np.ndarray:
        """Get embeddings using Gemini."""
        from google import genai

        client = genai.Client()

        # Embed texts individually
        embeddings = []

        for text in texts:
            result = client.models.embed_content(
                model="models/text-embedding-004",
                contents=text,
            )

            if result.embeddings is None or len(result.embeddings) == 0:
                # Silently skipping a sentence would misalign sentences and
                # embeddings downstream; raise so _get_embeddings falls back.
                raise ValueError("empty embedding response")

            embeddings.append(result.embeddings[0].values)

        return np.array(embeddings)

    def _get_openai_embeddings(self, texts: list[str]) -> np.ndarray:
        """Get embeddings using OpenAI."""
        from openai import OpenAI

        client = OpenAI()

        # Batch texts
        embeddings = []
        batch_size = 100

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]

            response = client.embeddings.create(
                model="text-embedding-3-small",
                input=batch,
            )

            for item in response.data:
                embeddings.append(item.embedding)

        return np.array(embeddings)
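
    # Shape sketch (illustrative): for 250 input texts this issues three
    # requests (batches of 100, 100, 50) and, with text-embedding-3-small's
    # default 1536 dimensions, returns an array of shape (250, 1536).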

    def _create_micro_clusters(
        self,
        sentences: list[str],
        embeddings: np.ndarray,
    ) -> list[TextCluster]:
        """
        Create micro-clusters (paragraph-level) via agglomerative clustering.
        """
        from scipy.cluster.hierarchy import fcluster, linkage
        from scipy.spatial.distance import pdist

        n_sentences = len(sentences)
        target_clusters = max(3, n_sentences // self.micro_cluster_size)

        # Compute linkage. Ward linkage formally assumes Euclidean distances;
        # applying it to cosine distances here is a pragmatic heuristic.
        distances = pdist(embeddings, metric='cosine')
        Z = linkage(distances, method='ward')

        # Cut tree to get target number of clusters
        labels = fcluster(Z, t=target_clusters, criterion='maxclust')

        # Group sentences by cluster
        clusters = {}
        for i, label in enumerate(labels):
            if label not in clusters:
                clusters[label] = {
                    'texts': [],
                    'embeddings': [],
                    'indices': [],
                }
            clusters[label]['texts'].append(sentences[i])
            clusters[label]['embeddings'].append(embeddings[i])
            clusters[label]['indices'].append(i)

        # Convert to TextCluster objects, sorted by first sentence index
        micro_clusters = []
        for label, data in sorted(clusters.items(), key=lambda x: min(x[1]['indices'])):
            emb_array = np.array(data['embeddings'])
            cluster = TextCluster(
                id=f"micro_{label}",
                texts=data['texts'],
                embeddings=emb_array,
                centroid=np.mean(emb_array, axis=0),
                level=1,
            )
            micro_clusters.append(cluster)

        logger.debug("micro_clusters_created", count=len(micro_clusters))
        return micro_clusters
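
    # Worked example (illustrative): with 21 sentences and the default
    # micro_cluster_size of 7, target_clusters = max(3, 21 // 7) = 3, so the
    # dendrogram is cut into at most 3 groups; clusters are then emitted in
    # reading order via the min() of their sentence indices.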

    def _create_macro_clusters(
        self,
        micro_clusters: list[TextCluster],
    ) -> list[TextCluster]:
        """
        Create macro-clusters (chapter-level) from micro-clusters.
        """
        if len(micro_clusters) <= 3:
            # Too few for macro clustering
            for i, micro in enumerate(micro_clusters):
                micro.level = 2  # Treat as macro
                micro.id = f"macro_{i}"
            return micro_clusters

        from scipy.cluster.hierarchy import fcluster, linkage
        from scipy.spatial.distance import pdist

        # Get centroids; keep the filtered list so the cluster labels below
        # stay aligned with the micro-clusters they were computed from.
        clusterable = [c for c in micro_clusters if c.centroid is not None]
        centroids = np.array([c.centroid for c in clusterable])

        if len(centroids) < 3:
            return micro_clusters

        # Target macro clusters
        target_macros = max(2, int(len(micro_clusters) * self.macro_cluster_ratio))

        # Cluster centroids
        distances = pdist(centroids, metric='cosine')
        Z = linkage(distances, method='ward')
        labels = fcluster(Z, t=target_macros, criterion='maxclust')

        # Group micro-clusters by macro label
        macro_groups = {}
        for i, label in enumerate(labels):
            if label not in macro_groups:
                macro_groups[label] = []
            macro_groups[label].append(clusterable[i])

        # Create macro clusters
        macro_clusters = []
        for label, micros in sorted(macro_groups.items()):
            # Combine all texts
            all_texts = []
            for micro in micros:
                all_texts.extend(micro.texts)

            macro = TextCluster(
                id=f"macro_{label}",
                texts=all_texts,
                children=micros,
                level=2,
            )
            macro_clusters.append(macro)

        logger.debug("macro_clusters_created", count=len(macro_clusters))
        return macro_clusters
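
    # Worked example (illustrative): 10 micro-clusters with the default
    # macro_cluster_ratio of 0.3 give target_macros = max(2, int(10 * 0.3)) = 3,
    # so their centroids are re-clustered into at most 3 chapter-level themes.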

    def _generate_headers_for_clusters(
        self,
        clusters: list[TextCluster],
    ) -> None:
        """Generate synthetic headers for all clusters using LLM."""
        for cluster in clusters:
            # Generate header for macro cluster
            cluster.synthetic_header = self._generate_single_header(cluster.text_preview)

            # Generate headers for children
            for child in cluster.children:
                child.synthetic_header = self._generate_single_header(child.text_preview)

    def _generate_single_header(self, text_preview: str) -> str:
        """Generate a single header via LLM."""
        import os

        prompt = f"""Generate a concise 3-7 word descriptive title for this text section:

{text_preview}

Return ONLY the title, nothing else."""

        # Try Gemini
        if os.getenv("GOOGLE_API_KEY"):
            try:
                from google import genai

                client = genai.Client()
                response = client.models.generate_content(
                    model="gemini-2.0-flash",
                    contents=prompt,
                )
                if response.text is not None:
                    header = response.text.strip().strip('"').strip("'")
                    if 3 <= len(header) <= 100:
                        return header
            except Exception:
                pass

        # Try OpenAI
        if os.getenv("OPENAI_API_KEY"):
            try:
                from openai import OpenAI

                client = OpenAI()
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=30,
                    temperature=0.3,
                )
                content = response.choices[0].message.content
                if content is not None:
                    header = content.strip().strip('"').strip("'")
                    if 3 <= len(header) <= 100:
                        return header
            except Exception:
                pass

        # Fallback: first few words
        words = text_preview.split()[:5]
        return (" ".join(words) + "...") if words else "Section"

    def _simple_cluster_fallback(
        self,
        sentences: list[str],
    ) -> list[TextCluster]:
        """Simple clustering when embeddings are unavailable."""
        # Group sentences into fixed-size chunks
        chunk_size = self.micro_cluster_size
        clusters = []

        for i in range(0, len(sentences), chunk_size):
            chunk = sentences[i:i + chunk_size]
            cluster = TextCluster(
                id=f"cluster_{i // chunk_size}",
                texts=chunk,
                synthetic_header=f"Section {i // chunk_size + 1}",
                level=2,
            )
            clusters.append(cluster)

        return clusters
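
    # Worked example (illustrative): 17 sentences with chunk_size 7 produce
    # three fixed-size clusters of 7, 7, and 3 sentences, headed
    # "Section 1" through "Section 3".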

    def _count_nodes(self, node: DocumentNode) -> int:
        """Count total nodes in tree."""
        return 1 + sum(self._count_nodes(c) for c in node.children)


def cluster_document_hierarchically(
    pdf_path: Path | str,
    title: str | None = None,
) -> DocumentTree:
    """
    Convenience function for hierarchical clustering of a PDF.

    Args:
        pdf_path: Path to the PDF file.
        title: Document title (defaults to filename).

    Returns:
        DocumentTree with hierarchical structure.

    Example:
        tree = cluster_document_hierarchically("flat_document.pdf")
        for section in tree.root.children:
            print(f"{section.header}: {len(section.children)} subsections")
    """
    import fitz  # PyMuPDF

    pdf_path = Path(pdf_path)
    title = title or pdf_path.stem

    # Extract text
    doc = fitz.open(pdf_path)
    pages_text = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        pages_text.append(page.get_text("text"))
    doc.close()

    full_text = "\n\n".join(pages_text)

    # Cluster
    clusterer = HierarchicalSemanticClusterer()
    return clusterer.cluster_to_tree(full_text, title)
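
A minimal end-to-end sketch of the module above (illustrative, not from the package docs; assumes PyMuPDF is installed, a GOOGLE_API_KEY or OPENAI_API_KEY is set in the environment, and "report.pdf" is a stand-in path):

    from rnsr.ingestion.hierarchical_cluster import cluster_document_hierarchically

    # Tier-2 fallback: cluster a flat PDF into synthetic sections.
    tree = cluster_document_hierarchically("report.pdf")
    print(tree.ingestion_method)            # "hierarchical_clustering"
    for section in tree.root.children:      # macro-clusters (level 1)
        print(section.header)
        for sub in section.children:        # micro-clusters (level 2)
            print("  ", sub.header)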