rnsr-0.1.0-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,595 @@
"""
Header Classifier - Detect and Classify Headers by Level

This module classifies text spans into:
- Headers (H1, H2, H3) based on font size magnitude
- Body text (most frequent font size)
- Captions/footnotes (smaller than body)

Header Level Mapping (adaptive):
- Defaults: >= 24pt: H1, 18-23pt: H2, 14-17pt: H3
- Learns from document types (legal, academic, marketing, etc.)
"""

from __future__ import annotations

import json
import os
from datetime import datetime
from pathlib import Path
from threading import Lock
from typing import Any, Literal

import numpy as np
import structlog
from sklearn.cluster import KMeans

from rnsr.models import ClassifiedSpan, FontAnalysis, SpanInfo

logger = structlog.get_logger(__name__)


# =============================================================================
# Learned Header Thresholds Registry
# =============================================================================

DEFAULT_HEADER_THRESHOLDS_PATH = Path.home() / ".rnsr" / "learned_header_thresholds.json"


class LearnedHeaderThresholds:
    """
    Registry for learning document-type-specific header thresholds.

    Different document types use different conventions:
    - Legal briefs: 12pt bold = header
    - Academic papers: 11pt = everything
    - Marketing: 36pt+ = titles

    This class learns optimal thresholds from document analysis.
    """

    # Default thresholds
    DEFAULT_H1_MIN = 24.0
    DEFAULT_H2_MIN = 18.0
    DEFAULT_H3_MIN = 14.0

    def __init__(
        self,
        storage_path: Path | str | None = None,
        auto_save: bool = True,
    ):
        """
        Initialize the header thresholds registry.

        Args:
            storage_path: Path to JSON file for persistence.
            auto_save: Whether to save after changes.
        """
        self.storage_path = Path(storage_path) if storage_path else DEFAULT_HEADER_THRESHOLDS_PATH
        self.auto_save = auto_save

        self._lock = Lock()
        self._document_types: dict[str, dict[str, Any]] = {}
        self._dirty = False

        self._load()

    def _load(self) -> None:
        """Load learned thresholds from storage."""
        if not self.storage_path.exists():
            return

        try:
            with open(self.storage_path, "r") as f:
                data = json.load(f)

            self._document_types = data.get("document_types", {})

            logger.info(
                "header_thresholds_loaded",
                document_types=len(self._document_types),
            )

        except Exception as e:
            logger.warning("failed_to_load_header_thresholds", error=str(e))

    def _save(self) -> None:
        """Save to storage."""
        if not self._dirty:
            return

        try:
            self.storage_path.parent.mkdir(parents=True, exist_ok=True)

            data = {
                "version": "1.0",
                "updated_at": datetime.utcnow().isoformat(),
                "document_types": self._document_types,
            }

            with open(self.storage_path, "w") as f:
                json.dump(data, f, indent=2)

            self._dirty = False

        except Exception as e:
            logger.warning("failed_to_save_header_thresholds", error=str(e))

    def record_thresholds(
        self,
        document_type: str,
        h1_min: float,
        h2_min: float,
        h3_min: float,
        body_size: float = 12.0,
        document_example: str = "",
    ) -> None:
        """
        Record observed header thresholds for a document type.

        The system averages thresholds across multiple documents
        of the same type to learn optimal values.

        Args:
            document_type: Type of document (legal, academic, marketing, etc.)
            h1_min: Observed H1 minimum size.
            h2_min: Observed H2 minimum size.
            h3_min: Observed H3 minimum size.
            body_size: Observed body text size.
            document_example: Example document filename.
        """
        document_type = document_type.lower().strip()

        if not document_type:
            return

        with self._lock:
            now = datetime.utcnow().isoformat()

            if document_type not in self._document_types:
                self._document_types[document_type] = {
                    "count": 0,
                    "h1_min_sum": 0.0,
                    "h2_min_sum": 0.0,
                    "h3_min_sum": 0.0,
                    "body_size_sum": 0.0,
                    "first_seen": now,
                    "last_seen": now,
                    "examples": [],
                }
                logger.info("new_document_type_learned", document_type=document_type)

            dt = self._document_types[document_type]
            dt["count"] += 1
            dt["h1_min_sum"] += h1_min
            dt["h2_min_sum"] += h2_min
            dt["h3_min_sum"] += h3_min
            dt["body_size_sum"] += body_size
            dt["last_seen"] = now

            if document_example and len(dt["examples"]) < 3:
                dt["examples"].append(document_example)

            self._dirty = True

            if self.auto_save:
                self._save()

    def get_thresholds(
        self,
        document_type: str | None = None,
    ) -> dict[str, float]:
        """
        Get header thresholds for a document type.

        Args:
            document_type: Type of document. If None, returns defaults.

        Returns:
            Dict with h1_min, h2_min, h3_min, body_size.
        """
        if not document_type:
            return {
                "h1_min": self.DEFAULT_H1_MIN,
                "h2_min": self.DEFAULT_H2_MIN,
                "h3_min": self.DEFAULT_H3_MIN,
                "body_size": 12.0,
            }

        document_type = document_type.lower().strip()

        with self._lock:
            if document_type in self._document_types:
                dt = self._document_types[document_type]
                count = dt["count"]

                if count > 0:
                    return {
                        "h1_min": dt["h1_min_sum"] / count,
                        "h2_min": dt["h2_min_sum"] / count,
                        "h3_min": dt["h3_min_sum"] / count,
                        "body_size": dt["body_size_sum"] / count,
                    }

        # Return defaults if not found
        return self.get_thresholds(None)

    def detect_document_type(
        self,
        body_size: float,
        max_font_size: float,
        has_legal_terms: bool = False,
        has_academic_structure: bool = False,
    ) -> str:
        """
        Attempt to detect document type from characteristics.

        Args:
            body_size: Most common font size.
            max_font_size: Largest font in document.
            has_legal_terms: Whether document contains legal terminology.
            has_academic_structure: Whether document has academic structure.

        Returns:
            Detected document type string.
        """
        # Simple heuristics
        if has_legal_terms:
            if body_size <= 12 and max_font_size <= 16:
                return "legal_brief"
            return "legal_general"

        if has_academic_structure:
            return "academic"

        if max_font_size >= 36:
            return "marketing"

        if body_size >= 14:
            return "presentation"

        return "general"

    def get_known_document_types(self) -> list[str]:
        """Get list of document types we have learned."""
        with self._lock:
            return list(self._document_types.keys())

    def get_stats(self) -> dict[str, Any]:
        """Get statistics about learned thresholds."""
        with self._lock:
            return {
                "document_types_count": len(self._document_types),
                "document_types": list(self._document_types.keys()),
                "total_documents_analyzed": sum(
                    dt["count"] for dt in self._document_types.values()
                ),
            }


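# Illustrative usage of the registry (an editorial sketch, not from the
# released file; the sizes below are hypothetical). With a fresh storage
# path, get_thresholds simply returns the running average of what
# record_thresholds has observed for that document type:
#
#     >>> reg = LearnedHeaderThresholds(storage_path="/tmp/rnsr_thresholds.json")
#     >>> reg.record_thresholds("legal_brief", h1_min=14, h2_min=13, h3_min=12, body_size=11)
#     >>> reg.record_thresholds("legal_brief", h1_min=16, h2_min=13, h3_min=12, body_size=11)
#     >>> reg.get_thresholds("legal_brief")
#     {'h1_min': 15.0, 'h2_min': 13.0, 'h3_min': 12.0, 'body_size': 11.0}
#     >>> reg.detect_document_type(body_size=11, max_font_size=40)
#     'marketing'
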
# Global header thresholds registry
_global_header_thresholds: LearnedHeaderThresholds | None = None


def get_learned_header_thresholds() -> LearnedHeaderThresholds:
    """Get the global learned header thresholds registry."""
    global _global_header_thresholds

    if _global_header_thresholds is None:
        custom_path = os.getenv("RNSR_HEADER_THRESHOLDS_PATH")
        _global_header_thresholds = LearnedHeaderThresholds(
            storage_path=custom_path if custom_path else None
        )

    return _global_header_thresholds


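# A minimal sketch of relocating the global registry (the repr shown assumes
# a POSIX platform). RNSR_HEADER_THRESHOLDS_PATH is read once, on the first
# access in the process, so it must be set before that first call:
#
#     >>> import os
#     >>> os.environ["RNSR_HEADER_THRESHOLDS_PATH"] = "/tmp/custom_thresholds.json"
#     >>> get_learned_header_thresholds().storage_path
#     PosixPath('/tmp/custom_thresholds.json')
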
class HeaderClassifier:
    """
    Classifies text spans into headers and body text.

    Uses font size analysis and optional k-means clustering
    to determine header levels. Supports adaptive thresholds
    that learn from document types.
    """

    # Default thresholds for header levels (in points)
    H1_MIN_SIZE = 24.0
    H2_MIN_SIZE = 18.0
    H3_MIN_SIZE = 14.0

    def __init__(
        self,
        use_clustering: bool = True,
        n_header_levels: int = 3,
        document_type: str | None = None,
        enable_threshold_learning: bool = True,
    ):
        """
        Initialize the Header Classifier.

        Args:
            use_clustering: Whether to use k-means clustering for header levels.
            n_header_levels: Number of header levels to detect (default: 3).
            document_type: Optional document type for adaptive thresholds.
            enable_threshold_learning: Whether to learn thresholds from documents.
        """
        self.use_clustering = use_clustering
        self.n_header_levels = n_header_levels
        self.document_type = document_type
        self.enable_threshold_learning = enable_threshold_learning

        # Get learned thresholds registry
        self._threshold_registry = get_learned_header_thresholds() if enable_threshold_learning else None

        # Set thresholds based on document type
        self._update_thresholds(document_type)

    def _update_thresholds(self, document_type: str | None) -> None:
        """Update thresholds based on document type."""
        if self._threshold_registry and document_type:
            thresholds = self._threshold_registry.get_thresholds(document_type)
            self.H1_MIN_SIZE = thresholds["h1_min"]
            self.H2_MIN_SIZE = thresholds["h2_min"]
            self.H3_MIN_SIZE = thresholds["h3_min"]
        else:
            # Use class defaults
            self.H1_MIN_SIZE = HeaderClassifier.H1_MIN_SIZE
            self.H2_MIN_SIZE = HeaderClassifier.H2_MIN_SIZE
            self.H3_MIN_SIZE = HeaderClassifier.H3_MIN_SIZE

    def set_document_type(self, document_type: str) -> None:
        """Set document type and update thresholds."""
        self.document_type = document_type
        self._update_thresholds(document_type)

    def learn_from_analysis(
        self,
        analysis: FontAnalysis,
        detected_h1_size: float | None = None,
        detected_h2_size: float | None = None,
        detected_h3_size: float | None = None,
        document_name: str = "",
    ) -> None:
        """
        Learn thresholds from document analysis.

        Call this after processing a document to record observed values.
        """
        if not self._threshold_registry or not self.document_type:
            return

        # Use detected sizes or infer from analysis
        h1_size = detected_h1_size or analysis.header_threshold + 6
        h2_size = detected_h2_size or analysis.header_threshold + 3
        h3_size = detected_h3_size or analysis.header_threshold

        self._threshold_registry.record_thresholds(
            document_type=self.document_type,
            h1_min=h1_size,
            h2_min=h2_size,
            h3_min=h3_size,
            body_size=analysis.body_size,
            document_example=document_name,
        )

    def classify_spans(
        self,
        spans: list[SpanInfo],
        analysis: FontAnalysis,
    ) -> list[ClassifiedSpan]:
        """
        Classify all spans into headers or body text.

        Args:
            spans: List of SpanInfo from font analysis.
            analysis: FontAnalysis with body size and threshold.

        Returns:
            List of ClassifiedSpan with role and header_level assigned.
        """
        if not spans:
            return []

        # First pass: identify potential headers
        potential_headers: list[SpanInfo] = []
        for span in spans:
            if self._is_header_candidate(span, analysis):
                potential_headers.append(span)

        # Determine header levels
        if potential_headers and self.use_clustering:
            level_mapping = self._cluster_header_levels(potential_headers)
        else:
            level_mapping = {}

        # Classify all spans
        classified: list[ClassifiedSpan] = []
        for span in spans:
            role, level = self._classify_single_span(span, analysis, level_mapping)

            classified_span = ClassifiedSpan(
                text=span.text,
                font_size=span.font_size,
                font_name=span.font_name,
                is_bold=span.is_bold,
                is_italic=span.is_italic,
                bbox=span.bbox,
                page_num=span.page_num,
                role=role,
                header_level=level,
            )
            classified.append(classified_span)

        # Log classification stats
        header_count = sum(1 for s in classified if s.role == "header")
        logger.info(
            "spans_classified",
            total=len(classified),
            headers=header_count,
            body=len(classified) - header_count,
        )

        return classified

    def _is_header_candidate(
        self,
        span: SpanInfo,
        analysis: FontAnalysis,
    ) -> bool:
        """
        Determine if a span is a header candidate.

        A span is a header candidate if:
        1. Font size > header_threshold, OR
        2. Bold text at or above body size
        """
        # Size-based detection
        if span.font_size > analysis.header_threshold:
            return True

        # Bold text at body size or larger
        if span.is_bold and span.font_size >= analysis.body_size:
            return True

        return False

    def _classify_single_span(
        self,
        span: SpanInfo,
        analysis: FontAnalysis,
        level_mapping: dict[float, int],
    ) -> tuple[Literal["header", "body", "caption", "footnote"], int]:
        """
        Classify a single span.

        Returns:
            Tuple of (role, header_level).
        """
        # Check if it's a header
        if self._is_header_candidate(span, analysis):
            # Use clustering-based level if available
            if span.font_size in level_mapping:
                level = level_mapping[span.font_size]
            else:
                # Fall back to absolute thresholds
                level = self._get_level_by_size(span.font_size)

            return ("header", level)

        # Check for captions/footnotes (smaller than body)
        caption_threshold = analysis.body_size - (analysis.body_size * 0.2)
        if span.font_size < caption_threshold:
            return ("caption", 0)

        # Default to body text
        return ("body", 0)

    def _get_level_by_size(self, font_size: float) -> int:
        """
        Get header level based on absolute font size thresholds.

        Args:
            font_size: The font size in points.

        Returns:
            Header level (1, 2, or 3).
        """
        if font_size >= self.H1_MIN_SIZE:
            return 1
        elif font_size >= self.H2_MIN_SIZE:
            return 2
        else:
            return 3

    def _cluster_header_levels(
        self,
        headers: list[SpanInfo],
    ) -> dict[float, int]:
        """
        Use k-means clustering to determine header levels from actual data.

        This adapts to documents with non-standard font sizes.

        Args:
            headers: List of spans identified as headers.

        Returns:
            Dict mapping font_size to header_level (1, 2, or 3).
        """
        if len(headers) < self.n_header_levels:
            # Not enough headers to cluster
            return {}

        # Get unique font sizes
        unique_sizes = list(set(h.font_size for h in headers))

        if len(unique_sizes) < self.n_header_levels:
            # Fewer unique sizes than levels - assign directly
            sorted_sizes = sorted(unique_sizes, reverse=True)
            return {size: i + 1 for i, size in enumerate(sorted_sizes)}

        # Perform k-means clustering
        try:
            X = np.array(unique_sizes).reshape(-1, 1)
            n_clusters = min(self.n_header_levels, len(unique_sizes))

            kmeans = KMeans(
                n_clusters=n_clusters,
                random_state=42,
                n_init=10,
            ).fit(X)

            # Map clusters to levels by size (largest = H1)
            cluster_centers = [
                (i, kmeans.cluster_centers_[i][0])
                for i in range(n_clusters)
            ]
            cluster_centers.sort(key=lambda x: -x[1])  # Descending by size

            cluster_to_level = {
                cluster: level + 1
                for level, (cluster, _) in enumerate(cluster_centers)
            }

            # Map each unique size to its level
            size_to_level = {}
            for size in unique_sizes:
                cluster = kmeans.predict([[size]])[0]
                size_to_level[size] = cluster_to_level[cluster]

            logger.debug(
                "header_levels_clustered",
                mapping=size_to_level,
            )

            return size_to_level

        except Exception as e:
            logger.warning("clustering_failed", error=str(e))
            return {}


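# Worked example of the absolute-threshold fallback (an editorial sketch; the
# private method is called directly only for illustration). With the class
# defaults of H1 >= 24pt and H2 >= 18pt, anything smaller that still
# qualifies as a header becomes H3:
#
#     >>> clf = HeaderClassifier(enable_threshold_learning=False)
#     >>> [clf._get_level_by_size(s) for s in (26.0, 19.0, 15.0)]
#     [1, 2, 3]
#
# When clustering is active and a document has at least n_header_levels
# distinct header sizes, _cluster_header_levels replaces this mapping, so a
# slide deck whose headers run 40pt/28pt/20pt still yields H1/H2/H3.
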
def classify_headers(
    spans: list[SpanInfo],
    analysis: FontAnalysis,
    use_clustering: bool = True,
) -> list[ClassifiedSpan]:
    """
    Convenience function to classify spans into headers and body text.

    Args:
        spans: List of SpanInfo from font analysis.
        analysis: FontAnalysis with body size and threshold.
        use_clustering: Whether to use k-means for header levels.

    Returns:
        List of ClassifiedSpan with roles assigned.

    Example:
        analysis, spans = analyze_font_histogram("doc.pdf")
        classified = classify_headers(spans, analysis)
        headers = [s for s in classified if s.role == "header"]
    """
    classifier = HeaderClassifier(use_clustering=use_clustering)
    return classifier.classify_spans(spans, analysis)