rnsr-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/ingestion/hierarchical_cluster.py
@@ -0,0 +1,515 @@
+"""
+Hierarchical Clustering for Semantic Segmentation (H-SBM)
+
+Implements Section 4.2.2 of the research paper:
+"For more advanced segmentation, we can employ unsupervised hierarchical
+clustering techniques. By clustering the sentence embeddings, we can
+discover 'Latent Topics' at various resolutions."
+
+Features:
+- Micro-Clusters: Groups of 5-10 sentences forming paragraph-level thoughts
+- Macro-Clusters: Groups of micro-clusters forming chapter-level themes
+- Synthetic Header Generation: LLM-generated titles for each cluster
+
+This provides multi-resolution topic discovery when:
+- Font histogram detects no variance (flat text)
+- Semantic splitter produces too many fine-grained chunks
+- Need hierarchical structure beyond simple breakpoints
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import structlog
+
+from rnsr.models import DocumentNode, DocumentTree
+
+logger = structlog.get_logger(__name__)
+
+
+@dataclass
+class TextCluster:
+    """A cluster of semantically related text segments."""
+
+    id: str
+    texts: list[str]
+    embeddings: np.ndarray | None = None
+    centroid: np.ndarray | None = None
+    children: list["TextCluster"] = field(default_factory=list)
+    synthetic_header: str = ""
+    level: int = 0  # 0 = leaf, 1 = micro, 2 = macro
+
+    @property
+    def full_text(self) -> str:
+        """Concatenate all texts in this cluster."""
+        return "\n\n".join(self.texts)
+
+    @property
+    def text_preview(self) -> str:
+        """First 200 chars for summary."""
+        full = self.full_text
+        return full[:200] + "..." if len(full) > 200 else full
+
+
+class HierarchicalSemanticClusterer:
+    """
+    Multi-resolution topic discovery via hierarchical clustering.
+
+    Creates a two-level hierarchy:
+    1. Micro-clusters (5-10 sentences) - paragraph-level thoughts
+    2. Macro-clusters (groups of micro-clusters) - chapter-level themes
+    """
+
+    def __init__(
+        self,
+        micro_cluster_size: int = 7,  # Target sentences per micro-cluster
+        macro_cluster_ratio: float = 0.3,  # Macro = 30% of micro count
+        embed_provider: str | None = None,
+        generate_headers: bool = True,
+    ):
+        """
+        Initialize the clusterer.
+
+        Args:
+            micro_cluster_size: Target number of sentences per micro-cluster.
+            macro_cluster_ratio: Ratio of macro to micro clusters.
+            embed_provider: "gemini", "openai", or None for auto-detect.
+            generate_headers: Whether to generate LLM headers for clusters.
+        """
+        self.micro_cluster_size = micro_cluster_size
+        self.macro_cluster_ratio = macro_cluster_ratio
+        self.embed_provider = embed_provider
+        self.generate_headers = generate_headers
+        self._embed_model = None
+
+    def cluster_text(self, text: str) -> list[TextCluster]:
+        """
+        Perform hierarchical clustering on text.
+
+        Args:
+            text: Full document text.
+
+        Returns:
+            List of macro-clusters, each containing micro-clusters.
+        """
+        # Split into sentences
+        sentences = self._split_sentences(text)
+
+        if len(sentences) < 3:
+            # Too short for clustering
+            return [TextCluster(
+                id="cluster_0",
+                texts=sentences,
+                synthetic_header="Document Content",
+            )]
+
+        logger.info("clustering_text", sentences=len(sentences))
+
+        # Get embeddings
+        embeddings = self._get_embeddings(sentences)
+
+        if embeddings is None:
+            # Fallback to simple chunking
+            return self._simple_cluster_fallback(sentences)
+
+        # Step 1: Create micro-clusters (sentence-level → paragraph-level)
+        micro_clusters = self._create_micro_clusters(sentences, embeddings)
+
+        # Step 2: Create macro-clusters (micro-level → chapter-level)
+        macro_clusters = self._create_macro_clusters(micro_clusters)
+
+        # Step 3: Generate synthetic headers
+        if self.generate_headers:
+            self._generate_headers_for_clusters(macro_clusters)
+
+        logger.info(
+            "clustering_complete",
+            micro_clusters=len(micro_clusters),
+            macro_clusters=len(macro_clusters),
+        )
+
+        return macro_clusters
+
+    def cluster_to_tree(self, text: str, title: str) -> DocumentTree:
+        """
+        Cluster text and convert to DocumentTree.
+
+        Args:
+            text: Full document text.
+            title: Document title.
+
+        Returns:
+            DocumentTree with hierarchical structure.
+        """
+        macro_clusters = self.cluster_text(text)
+
+        # Build tree
+        root = DocumentNode(
+            id="root",
+            level=0,
+            header=title,
+        )
+
+        for i, macro in enumerate(macro_clusters):
+            # Macro cluster becomes H2
+            macro_node = DocumentNode(
+                id=f"macro_{i:03d}",
+                level=1,
+                header=macro.synthetic_header or f"Section {i + 1}",
+            )
+
+            if macro.children:
+                # Add micro-clusters as H3
+                for j, micro in enumerate(macro.children):
+                    micro_node = DocumentNode(
+                        id=f"micro_{i:03d}_{j:03d}",
+                        level=2,
+                        header=micro.synthetic_header or f"Subsection {i + 1}.{j + 1}",
+                        content=micro.full_text,
+                    )
+                    macro_node.children.append(micro_node)
+            else:
+                # Macro is a leaf (no sub-clusters)
+                macro_node.content = macro.full_text
+
+            root.children.append(macro_node)
+
+        return DocumentTree(
+            title=title,
+            root=root,
+            total_nodes=self._count_nodes(root),
+            ingestion_tier=2,
+            ingestion_method="hierarchical_clustering",
+        )
+
+    def _split_sentences(self, text: str) -> list[str]:
+        """Split text into sentences."""
+        import re
+
+        # Simple sentence splitting
+        # Handle common abbreviations
+        text = text.replace("Dr.", "Dr")
+        text = text.replace("Mr.", "Mr")
+        text = text.replace("Mrs.", "Mrs")
+        text = text.replace("Ms.", "Ms")
+        text = text.replace("vs.", "vs")
+        text = text.replace("etc.", "etc")
+        text = text.replace("i.e.", "ie")
+        text = text.replace("e.g.", "eg")
+
+        # Split on sentence endings
+        sentences = re.split(r'(?<=[.!?])\s+', text)
+
+        # Filter empty and very short
+        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
+
+        return sentences
+
+    def _get_embeddings(self, texts: list[str]) -> np.ndarray | None:
+        """Get embeddings for a list of texts."""
+        import os
+
+        # Auto-detect provider
+        provider = self.embed_provider
+        if provider is None:
+            if os.getenv("GOOGLE_API_KEY"):
+                provider = "gemini"
+            elif os.getenv("OPENAI_API_KEY"):
+                provider = "openai"
+            else:
+                logger.warning("no_embedding_api_key")
+                return None
+
+        try:
+            if provider == "gemini":
+                return self._get_gemini_embeddings(texts)
+            elif provider == "openai":
+                return self._get_openai_embeddings(texts)
+            else:
+                logger.warning("unknown_embed_provider", provider=provider)
+                return None
+        except Exception as e:
+            logger.warning("embedding_failed", error=str(e))
+            return None
+
+    def _get_gemini_embeddings(self, texts: list[str]) -> np.ndarray:
+        """Get embeddings using Gemini."""
+        from google import genai
+
+        client = genai.Client()
+
+        # Embed texts individually
+        embeddings = []
+
+        for text in texts:
+            result = client.models.embed_content(
+                model="models/text-embedding-004",
+                contents=text,
+            )
+
+            if result.embeddings is not None and len(result.embeddings) > 0:
+                embeddings.append(result.embeddings[0].values)
+
+        return np.array(embeddings)
+
+    def _get_openai_embeddings(self, texts: list[str]) -> np.ndarray:
+        """Get embeddings using OpenAI."""
+        from openai import OpenAI
+
+        client = OpenAI()
+
+        # Batch texts
+        embeddings = []
+        batch_size = 100
+
+        for i in range(0, len(texts), batch_size):
+            batch = texts[i:i + batch_size]
+
+            response = client.embeddings.create(
+                model="text-embedding-3-small",
+                input=batch,
+            )
+
+            for item in response.data:
+                embeddings.append(item.embedding)
+
+        return np.array(embeddings)
+
+    def _create_micro_clusters(
+        self,
+        sentences: list[str],
+        embeddings: np.ndarray,
+    ) -> list[TextCluster]:
+        """
+        Create micro-clusters (paragraph-level) via agglomerative clustering.
+        """
+        from scipy.cluster.hierarchy import fcluster, linkage
+        from scipy.spatial.distance import pdist
+
+        n_sentences = len(sentences)
+        target_clusters = max(3, n_sentences // self.micro_cluster_size)
+
+        # Compute linkage
+        distances = pdist(embeddings, metric='cosine')
+        Z = linkage(distances, method='ward')
+
+        # Cut tree to get target number of clusters
+        labels = fcluster(Z, t=target_clusters, criterion='maxclust')
+
+        # Group sentences by cluster
+        clusters = {}
+        for i, label in enumerate(labels):
+            if label not in clusters:
+                clusters[label] = {
+                    'texts': [],
+                    'embeddings': [],
+                    'indices': [],
+                }
+            clusters[label]['texts'].append(sentences[i])
+            clusters[label]['embeddings'].append(embeddings[i])
+            clusters[label]['indices'].append(i)
+
+        # Convert to TextCluster objects, sorted by first sentence index
+        micro_clusters = []
+        for label, data in sorted(clusters.items(), key=lambda x: min(x[1]['indices'])):
+            emb_array = np.array(data['embeddings'])
+            cluster = TextCluster(
+                id=f"micro_{label}",
+                texts=data['texts'],
+                embeddings=emb_array,
+                centroid=np.mean(emb_array, axis=0),
+                level=1,
+            )
+            micro_clusters.append(cluster)
+
+        logger.debug("micro_clusters_created", count=len(micro_clusters))
+        return micro_clusters
+
+    def _create_macro_clusters(
+        self,
+        micro_clusters: list[TextCluster],
+    ) -> list[TextCluster]:
+        """
+        Create macro-clusters (chapter-level) from micro-clusters.
+        """
+        if len(micro_clusters) <= 3:
+            # Too few for macro clustering
+            for i, micro in enumerate(micro_clusters):
+                micro.level = 2  # Treat as macro
+                micro.id = f"macro_{i}"
+            return micro_clusters
+
+        from scipy.cluster.hierarchy import fcluster, linkage
+        from scipy.spatial.distance import pdist
+
+        # Get centroids
+        centroids = np.array([c.centroid for c in micro_clusters if c.centroid is not None])
+
+        if len(centroids) < 3:
+            return micro_clusters
+
+        # Target macro clusters
+        target_macros = max(2, int(len(micro_clusters) * self.macro_cluster_ratio))
+
+        # Cluster centroids
+        distances = pdist(centroids, metric='cosine')
+        Z = linkage(distances, method='ward')
+        labels = fcluster(Z, t=target_macros, criterion='maxclust')
+
+        # Group micro-clusters by macro label
+        macro_groups = {}
+        for i, label in enumerate(labels):
+            if label not in macro_groups:
+                macro_groups[label] = []
+            macro_groups[label].append(micro_clusters[i])
+
+        # Create macro clusters
+        macro_clusters = []
+        for label, micros in sorted(macro_groups.items()):
+            # Combine all texts
+            all_texts = []
+            for micro in micros:
+                all_texts.extend(micro.texts)
+
+            macro = TextCluster(
+                id=f"macro_{label}",
+                texts=all_texts,
+                children=micros,
+                level=2,
+            )
+            macro_clusters.append(macro)
+
+        logger.debug("macro_clusters_created", count=len(macro_clusters))
+        return macro_clusters
+
+    def _generate_headers_for_clusters(
+        self,
+        clusters: list[TextCluster],
+    ) -> None:
+        """Generate synthetic headers for all clusters using LLM."""
+        for cluster in clusters:
+            # Generate header for macro cluster
+            cluster.synthetic_header = self._generate_single_header(cluster.text_preview)
+
+            # Generate headers for children
+            for child in cluster.children:
+                child.synthetic_header = self._generate_single_header(child.text_preview)
+
+    def _generate_single_header(self, text_preview: str) -> str:
+        """Generate a single header via LLM."""
+        import os
+
+        prompt = f"""Generate a concise 3-7 word descriptive title for this text section:
+
+{text_preview}
+
+Return ONLY the title, nothing else."""
+
+        # Try Gemini
+        if os.getenv("GOOGLE_API_KEY"):
+            try:
+                from google import genai
+
+                client = genai.Client()
+                response = client.models.generate_content(
+                    model="gemini-2.0-flash",
+                    contents=prompt,
+                )
+                if response.text is not None:
+                    header = response.text.strip().strip('"').strip("'")
+                    if 3 <= len(header) <= 100:
+                        return header
+            except Exception:
+                pass
+
+        # Try OpenAI
+        if os.getenv("OPENAI_API_KEY"):
+            try:
+                from openai import OpenAI
+
+                client = OpenAI()
+                response = client.chat.completions.create(
+                    model="gpt-4o-mini",
+                    messages=[{"role": "user", "content": prompt}],
+                    max_tokens=30,
+                    temperature=0.3,
+                )
+                content = response.choices[0].message.content
+                if content is not None:
+                    header = content.strip().strip('"').strip("'")
+                    if 3 <= len(header) <= 100:
+                        return header
+            except Exception:
+                pass
+
+        # Fallback: first few words
+        words = text_preview.split()[:5]
+        return " ".join(words) + "..." if words else "Section"
+
+    def _simple_cluster_fallback(
+        self,
+        sentences: list[str],
+    ) -> list[TextCluster]:
+        """Simple clustering when embeddings unavailable."""
+        # Group sentences into chunks
+        chunk_size = self.micro_cluster_size
+        clusters = []
+
+        for i in range(0, len(sentences), chunk_size):
+            chunk = sentences[i:i + chunk_size]
+            cluster = TextCluster(
+                id=f"cluster_{i // chunk_size}",
+                texts=chunk,
+                synthetic_header=f"Section {i // chunk_size + 1}",
+                level=2,
+            )
+            clusters.append(cluster)
+
+        return clusters
+
+    def _count_nodes(self, node: DocumentNode) -> int:
+        """Count total nodes in tree."""
+        return 1 + sum(self._count_nodes(c) for c in node.children)
+
+
+def cluster_document_hierarchically(
+    pdf_path: Path | str,
+    title: str | None = None,
+) -> DocumentTree:
+    """
+    Convenience function for hierarchical clustering of a PDF.
+
+    Args:
+        pdf_path: Path to the PDF file.
+        title: Document title (defaults to filename).
+
+    Returns:
+        DocumentTree with hierarchical structure.
+
+    Example:
+        tree = cluster_document_hierarchically("flat_document.pdf")
+        for section in tree.root.children:
+            print(f"{section.header}: {len(section.children)} subsections")
+    """
+    import fitz
+
+    pdf_path = Path(pdf_path)
+    title = title or pdf_path.stem
+
+    # Extract text
+    doc = fitz.open(pdf_path)
+    pages_text = []
+    for page_num in range(len(doc)):
+        page = doc.load_page(page_num)
+        pages_text.append(page.get_text("text"))
+    doc.close()
+
+    full_text = "\n\n".join(pages_text)
+
+    # Cluster
+    clusterer = HierarchicalSemanticClusterer()
+    return clusterer.cluster_to_tree(full_text, title)