natural-pdf 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +7 -2
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +120 -40
- natural_pdf/core/page.py +4 -2
- natural_pdf/core/pdf.py +53 -38
- natural_pdf/elements/base.py +17 -0
- natural_pdf/elements/collections.py +203 -59
- natural_pdf/elements/region.py +43 -11
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +40 -61
- natural_pdf/exporters/hocr_font.py +7 -13
- natural_pdf/exporters/original_pdf.py +10 -13
- natural_pdf/exporters/searchable_pdf.py +0 -10
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -49
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +28 -25
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py
CHANGED
@@ -51,10 +51,13 @@ ElementCollection = None
|
|
51
51
|
|
52
52
|
# Search options (if extras installed)
|
53
53
|
try:
|
54
|
-
from natural_pdf.search.search_options import
|
54
|
+
from natural_pdf.search.search_options import (
|
55
|
+
BaseSearchOptions,
|
56
|
+
MultiModalSearchOptions,
|
57
|
+
TextSearchOptions,
|
58
|
+
)
|
55
59
|
except ImportError:
|
56
60
|
# Define dummy classes if extras not installed, so imports don't break
|
57
|
-
# but using them will raise the ImportError from check_haystack_availability
|
58
61
|
class BaseSearchOptions:
|
59
62
|
def __init__(self, *args, **kwargs):
|
60
63
|
pass
|
@@ -67,9 +70,11 @@ except ImportError:
|
|
67
70
|
def __init__(self, *args, **kwargs):
|
68
71
|
pass
|
69
72
|
|
73
|
+
|
70
74
|
# Import QA module if available
|
71
75
|
try:
|
72
76
|
from natural_pdf.qa import DocumentQA, get_qa_engine
|
77
|
+
|
73
78
|
HAS_QA = True
|
74
79
|
except ImportError:
|
75
80
|
HAS_QA = False
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import logging
|
2
2
|
from dataclasses import dataclass, field
|
3
|
-
from typing import List, Optional
|
3
|
+
from typing import List, Optional, Union
|
4
4
|
|
5
5
|
logger = logging.getLogger(__name__)
|
6
6
|
|
@@ -35,6 +35,14 @@ class TextStyleOptions:
|
|
35
35
|
# Available keys: size, fontname, is_bold, is_italic, color, weight, style, family
|
36
36
|
label_format: str = "{size}pt {weight}{style} {family}" # Default format without color
|
37
37
|
|
38
|
+
# Configuration for font size bucketing.
|
39
|
+
# - List[float]: Explicit bucket boundaries (e.g., [10.0, 18.0, 24.0]).
|
40
|
+
# Creates buckets: <10, 10-18, 18-24, >=24.
|
41
|
+
# - int: Number of buckets to determine automatically (e.g., 5).
|
42
|
+
# - str ('auto'): Automatically determine the optimal number of buckets.
|
43
|
+
# - None: No font size bucketing is applied (default).
|
44
|
+
font_size_buckets: Optional[Union[List[float], int, str]] = "auto"
|
45
|
+
|
38
46
|
def __post_init__(self):
|
39
47
|
# Validate size_tolerance
|
40
48
|
if self.size_tolerance <= 0:
|
@@ -5,7 +5,9 @@ Text structure analyzer for natural-pdf.
|
|
5
5
|
import logging
|
6
6
|
import re
|
7
7
|
from collections import defaultdict
|
8
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
8
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
9
|
+
|
10
|
+
import jenkspy # Added import for jenkspy
|
9
11
|
|
10
12
|
from natural_pdf.analyzers.text_options import TextStyleOptions
|
11
13
|
|
@@ -30,6 +32,13 @@ FONT_WEIGHTS = {
|
|
30
32
|
}
|
31
33
|
FONT_STYLES = {"italic": "Italic", "oblique": "Italic"}
|
32
34
|
|
35
|
+
# Constants for automatic font size bucketing
|
36
|
+
MAX_UNIQUE_SIZES_FOR_JENKS_INPUT = (
|
37
|
+
3000 # Max unique sizes to feed directly into Jenks; uses sampling above this
|
38
|
+
)
|
39
|
+
DEFAULT_MAX_AUTO_BUCKETS = 7 # Max number of buckets to try when font_size_buckets='auto'
|
40
|
+
MIN_BUCKETS_FOR_AUTO = 2
|
41
|
+
|
33
42
|
|
34
43
|
class TextStyleAnalyzer:
|
35
44
|
"""
|
@@ -50,20 +59,229 @@ class TextStyleAnalyzer:
|
|
50
59
|
self.options = options or TextStyleOptions()
|
51
60
|
logger.debug(f"Initialized TextStyleAnalyzer with options: {self.options}")
|
52
61
|
|
62
|
+
# To store the font size bucket mapper if bucketing is active
|
63
|
+
self._font_size_bucket_mapper = None
|
64
|
+
self._font_size_bucket_count = 0
|
65
|
+
|
66
|
+
def _calculate_jenks_breaks(self, data: List[float], num_classes: int) -> List[float]:
    """Return the inner Jenks natural-break boundaries for ``data``.

    Args:
        data: Values to classify (font sizes). Duplicates are collapsed and
            the values are sorted before being handed to jenkspy.
        num_classes: Number of classes requested from jenkspy.

    Returns:
        The sorted, de-duplicated boundaries that lie strictly between the
        first and last value returned by ``jenkspy.jenks_breaks``. An empty
        list is returned when ``data`` is empty, ``num_classes`` is 1 or
        less, fewer than two distinct values exist, jenkspy yields no inner
        boundary, or jenkspy raises.
    """
    if not data or num_classes <= 1:
        return []

    unique_data = sorted(set(data))
    if len(unique_data) < 2:
        # A single distinct value cannot be split into classes.
        return []

    # NOTE: when there are fewer distinct values than requested classes the
    # call is still attempted on purpose; whatever jenkspy does with such
    # sparse input (including raising) is handled by the except clause below.
    try:
        # Per the original author's note, jenks_breaks returns every boundary
        # including the data minimum and maximum, e.g. n_classes=5 yields
        # [min, break1, break2, break3, break4, max].
        all_boundaries = jenkspy.jenks_breaks(unique_data, n_classes=num_classes)

        if len(all_boundaries) > 2:
            # Keep only the inner breaks, unique and in ascending order.
            return sorted(set(all_boundaries[1:-1]))
        # Only the outer bounds came back: nothing to split on.
        return []
    except Exception as e:
        # Deliberately broad: this is a best-effort classification and the
        # caller treats "no breaks" as "do not bucket for this k".
        logger.warning(
            f"jenkspy.jenks_breaks failed with {num_classes} classes for data (first 10 shown): {unique_data[:10]}. Error: {e}. Falling back to no breaks for this k."
        )
        return []
|
113
|
+
|
114
|
+
def _calculate_gvf(self, data: List[float], breaks: List[float]) -> float:
|
115
|
+
if not data:
|
116
|
+
return 0.0
|
117
|
+
overall_mean = sum(data) / len(data)
|
118
|
+
sdam = sum([(x - overall_mean) ** 2 for x in data])
|
119
|
+
if sdam == 0:
|
120
|
+
return 1.0 # Perfect fit if all data points are the same
|
121
|
+
|
122
|
+
sdcm = 0.0
|
123
|
+
all_breaks = [-float("inf")] + breaks + [float("inf")]
|
124
|
+
for i in range(len(all_breaks) - 1):
|
125
|
+
lower_bound = all_breaks[i]
|
126
|
+
upper_bound = all_breaks[i + 1]
|
127
|
+
cluster = [x for x in data if x > lower_bound and x <= upper_bound]
|
128
|
+
if not cluster:
|
129
|
+
continue
|
130
|
+
cluster_mean = sum(cluster) / len(cluster)
|
131
|
+
sdcm += sum([(x - cluster_mean) ** 2 for x in cluster])
|
132
|
+
|
133
|
+
return (sdam - sdcm) / sdam if sdam > 0 else 1.0
|
134
|
+
|
135
|
+
def _get_font_size_bucket_mapper(
    self, all_font_sizes: List[float], config: Union[List[float], int, str]
) -> Tuple[Optional[Callable[[float], int]], int]:
    """Build a function that maps a font size to a 0-based bucket index.

    Args:
        all_font_sizes: Font sizes found on the page; ``None`` entries are
            ignored.
        config: Bucketing configuration.
            - ``list``: explicit bucket boundaries (de-duplicated, sorted).
            - ``int``: number of buckets to find with Jenks breaks.
            - ``"auto"``: try ``MIN_BUCKETS_FOR_AUTO`` up to
              ``DEFAULT_MAX_AUTO_BUCKETS`` classes (capped by the number of
              distinct sizes) and keep the breaks with the highest GVF.
            - anything else: bucketing is disabled.

    Returns:
        ``(mapper, bucket_count)``. ``(None, 0)`` means bucketing is
        disabled (no usable sizes, a non-positive bucket count, or an
        unrecognised ``config``). When no break could be determined a
        constant mapper returning ``0`` is returned with a count of ``1``.
        Otherwise the mapper returns the index of the first break that is
        ``>= size`` (a size equal to a break falls in the lower bucket),
        ``len(breaks)`` for sizes above every break, and ``-1`` for ``None``.
    """
    if not all_font_sizes:
        return None, 0

    unique_font_sizes = sorted({s for s in all_font_sizes if s is not None})
    if not unique_font_sizes:
        return None, 0

    def jenks_input() -> List[float]:
        # Values handed to Jenks; uniformly down-sampled (by index over the
        # sorted distinct sizes) when there are too many of them.
        if len(unique_font_sizes) <= MAX_UNIQUE_SIZES_FOR_JENKS_INPUT:
            return unique_font_sizes
        logger.debug(
            f"Sampling {MAX_UNIQUE_SIZES_FOR_JENKS_INPUT} from {len(unique_font_sizes)} unique font sizes for Jenks."
        )
        indices = [
            int(i * (len(unique_font_sizes) - 1) / (MAX_UNIQUE_SIZES_FOR_JENKS_INPUT - 1))
            for i in range(MAX_UNIQUE_SIZES_FOR_JENKS_INPUT)
        ]
        return sorted({unique_font_sizes[i] for i in indices})

    breaks: List[float] = []

    if isinstance(config, list):  # Explicit boundaries
        breaks = sorted(set(config))
    elif isinstance(config, int):  # User-defined number of buckets
        if config <= 0:
            logger.warning(f"Invalid number of buckets ({config}), disabling bucketing.")
            return None, 0
        if config == 1:
            return (lambda size: 0), 1  # All in one bucket

        jenks_input_data = jenks_input()
        num_buckets_to_find = config
        if len(jenks_input_data) < num_buckets_to_find:
            logger.debug(
                f"Not enough unique font sizes ({len(jenks_input_data)}) to create {num_buckets_to_find} distinct buckets based on input data. Adjusting."
            )
            # Fall back to one bucket fewer than the number of distinct sizes.
            num_buckets_to_find = max(1, len(jenks_input_data) - 1)
            if num_buckets_to_find == 1:
                return (lambda size: 0), 1

        breaks = self._calculate_jenks_breaks(jenks_input_data, num_buckets_to_find)
    elif config == "auto":
        jenks_input_data = jenks_input()
        if len(jenks_input_data) == 1:  # Only one unique font size
            return (lambda size: 0), 1

        # Never ask for more classes than there are distinct values.
        max_k_to_try = min(DEFAULT_MAX_AUTO_BUCKETS, len(jenks_input_data))
        best_gvf = -1.0
        best_breaks: List[float] = []
        for k_buckets in range(MIN_BUCKETS_FOR_AUTO, max_k_to_try + 1):
            current_breaks = self._calculate_jenks_breaks(jenks_input_data, k_buckets)
            if k_buckets > 1 and not current_breaks:
                # Several buckets requested but no break found: skip this k.
                continue
            # Fewer breaks than requested are still scored; they simply
            # describe fewer effective buckets.
            gvf = self._calculate_gvf(jenks_input_data, current_breaks)
            # Simple strategy: keep the candidate with the highest GVF.
            if gvf > best_gvf:
                best_gvf = gvf
                best_breaks = current_breaks

        breaks = best_breaks
        logger.debug(
            f"Auto bucketing: Chose {len(breaks) + 1} buckets with GVF {best_gvf:.4f}. Breaks: {breaks}"
        )
    else:
        return None, 0  # Invalid config or no bucketing

    final_breaks = sorted(set(breaks))  # Ensure unique and sorted

    # The bucket count always follows from the breaks actually found, so no
    # breaks means a single bucket regardless of how many were requested.
    if not final_breaks:
        return (lambda size: 0), 1
    num_buckets = len(final_breaks) + 1

    def mapper(size: float) -> int:
        if size is None:
            return -1  # Indicator for an unbucketable size
        # Index of the first break that is >= size; sizes above every break
        # belong to the last bucket.
        return next(
            (i for i, break_val in enumerate(final_breaks) if size <= break_val),
            len(final_breaks),
        )

    return mapper, num_buckets
|
281
|
+
|
53
282
|
def analyze(
|
54
283
|
self, page: "Page", options: Optional[TextStyleOptions] = None
|
55
284
|
) -> "ElementCollection":
|
56
|
-
"""
|
57
|
-
Analyze text styles on a page, group elements, and add style attributes.
|
58
|
-
|
59
|
-
Args:
|
60
|
-
page: The Page object to analyze.
|
61
|
-
options: Override the analyzer's default TextStyleOptions for this run.
|
62
|
-
|
63
|
-
Returns:
|
64
|
-
ElementCollection containing all processed text elements (typically words)
|
65
|
-
with added 'style_label', 'style_key', and 'style_properties' attributes.
|
66
|
-
"""
|
67
285
|
from natural_pdf.elements.collections import ElementCollection
|
68
286
|
|
69
287
|
current_options = options or self.options
|
@@ -71,27 +289,40 @@ class TextStyleAnalyzer:
|
|
71
289
|
f"Starting text style analysis for page {page.number} with options: {current_options}"
|
72
290
|
)
|
73
291
|
|
74
|
-
# Use page.words for better granularity
|
75
292
|
text_elements = page.words
|
76
|
-
# Fallback if words are somehow empty/not generated
|
77
293
|
if not text_elements:
|
78
|
-
text_elements = page.find_all("text").elements
|
294
|
+
text_elements = page.find_all("text").elements
|
79
295
|
|
80
|
-
# Skip empty pages or pages with no text elements
|
81
296
|
if not text_elements:
|
82
297
|
logger.warning(f"Page {page.number} has no text elements to analyze.")
|
83
298
|
return ElementCollection([])
|
84
299
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
300
|
+
# --- Font Size Bucketing Setup ---
|
301
|
+
self._font_size_bucket_mapper = None
|
302
|
+
self._font_size_bucket_count = 0
|
303
|
+
bucketing_config = getattr(current_options, "font_size_buckets", None)
|
304
|
+
|
305
|
+
if bucketing_config is not None:
|
306
|
+
all_page_font_sizes = [
|
307
|
+
el.size for el in text_elements if hasattr(el, "size") and el.size is not None
|
308
|
+
]
|
309
|
+
if all_page_font_sizes:
|
310
|
+
self._font_size_bucket_mapper, self._font_size_bucket_count = (
|
311
|
+
self._get_font_size_bucket_mapper(all_page_font_sizes, bucketing_config)
|
312
|
+
)
|
313
|
+
if self._font_size_bucket_mapper:
|
314
|
+
logger.debug(
|
315
|
+
f"Font size bucketing active with {self._font_size_bucket_count} buckets for page {page.number}."
|
316
|
+
)
|
317
|
+
else:
|
318
|
+
logger.debug("No font sizes found on page for bucketing.")
|
319
|
+
# --- End Bucketing Setup ---
|
89
320
|
|
90
|
-
|
321
|
+
style_cache: Dict[Tuple, Dict[str, Any]] = {}
|
322
|
+
processed_elements: List["Element"] = []
|
91
323
|
group_by_keys = sorted(current_options.group_by)
|
92
324
|
|
93
325
|
for element in text_elements:
|
94
|
-
# Skip elements without necessary attributes (e.g., non-text elements if find_all was used)
|
95
326
|
if not hasattr(element, "text") or not hasattr(element, "size"):
|
96
327
|
logger.debug(f"Skipping element without text/size: {element}")
|
97
328
|
continue
|
@@ -102,60 +333,64 @@ class TextStyleAnalyzer:
|
|
102
333
|
|
103
334
|
if style_key not in style_cache:
|
104
335
|
label = self._generate_style_label(
|
105
|
-
style_properties,
|
336
|
+
style_properties,
|
337
|
+
current_options,
|
338
|
+
len(style_cache) + 1,
|
339
|
+
self._font_size_bucket_count,
|
106
340
|
)
|
107
341
|
style_cache[style_key] = {"label": label, "properties": style_properties}
|
108
342
|
logger.debug(
|
109
343
|
f"New style detected (Key: {style_key}): Label='{label}', Props={style_properties}"
|
110
344
|
)
|
111
345
|
|
112
|
-
# Add attributes to the element
|
113
346
|
element.style_label = style_cache[style_key]["label"]
|
114
347
|
element.style_key = style_key
|
115
|
-
# Add the full properties dict for potential detailed inspection
|
116
348
|
element.style_properties = style_cache[style_key]["properties"]
|
117
|
-
|
349
|
+
element.font_bucket_name = style_cache[style_key]["properties"].get(
|
350
|
+
"font_bucket_name"
|
351
|
+
)
|
118
352
|
processed_elements.append(element)
|
119
|
-
|
120
353
|
except Exception as e:
|
121
354
|
logger.warning(
|
122
355
|
f"Error processing element {element} for text style: {e}", exc_info=True
|
123
356
|
)
|
124
|
-
# Optionally add element without style info or skip it
|
125
|
-
# processed_elements.append(element) # Add anyway?
|
126
357
|
|
127
|
-
# Optionally store a summary on the page
|
128
358
|
page._text_styles_summary = style_cache
|
129
359
|
logger.info(
|
130
360
|
f"Finished text style analysis for page {page.number}. Found {len(style_cache)} unique styles."
|
131
361
|
)
|
132
|
-
|
133
362
|
return ElementCollection(processed_elements)
|
134
363
|
|
135
364
|
def _extract_style_properties(
|
136
365
|
self, element: "Element", options: TextStyleOptions
|
137
366
|
) -> Dict[str, Any]:
|
138
|
-
"""
|
139
|
-
Extract style properties from a text element based on options.
|
140
|
-
|
141
|
-
Args:
|
142
|
-
element: Text element.
|
143
|
-
options: TextStyleOptions driving the extraction.
|
144
|
-
|
145
|
-
Returns:
|
146
|
-
Dictionary of extracted style properties.
|
147
|
-
"""
|
148
367
|
properties = {}
|
368
|
+
original_size = getattr(element, "size", None)
|
369
|
+
rounded_size = None
|
370
|
+
|
371
|
+
properties["original_size"] = original_size
|
149
372
|
|
150
|
-
|
151
|
-
font_size = None
|
152
|
-
if hasattr(element, "size") and element.size is not None:
|
153
|
-
# Round based on tolerance
|
373
|
+
if original_size is not None:
|
154
374
|
rounding_factor = 1.0 / options.size_tolerance
|
155
|
-
|
156
|
-
properties["size"] =
|
375
|
+
rounded_size = round(original_size * rounding_factor) / rounding_factor
|
376
|
+
properties["size"] = rounded_size # For display in labels
|
377
|
+
properties["rounded_size"] = rounded_size # Explicit storage
|
378
|
+
|
379
|
+
# Font size bucketing logic
|
380
|
+
properties["font_bucket_id"] = None
|
381
|
+
properties["font_bucket_name"] = None # Initialize font_bucket_name
|
382
|
+
size_for_keying = rounded_size
|
383
|
+
|
384
|
+
if self._font_size_bucket_mapper and original_size is not None:
|
385
|
+
bucket_id = self._font_size_bucket_mapper(original_size)
|
386
|
+
properties["font_bucket_id"] = bucket_id
|
387
|
+
properties["font_bucket_name"] = self._get_bucket_name(
|
388
|
+
bucket_id, self._font_size_bucket_count
|
389
|
+
)
|
390
|
+
size_for_keying = bucket_id
|
391
|
+
|
392
|
+
properties["size_for_keying"] = size_for_keying
|
157
393
|
|
158
|
-
# Font name
|
159
394
|
font_name = None
|
160
395
|
normalized_font_name = None
|
161
396
|
if hasattr(element, "fontname") and element.fontname is not None:
|
@@ -257,28 +492,50 @@ class TextStyleAnalyzer:
|
|
257
492
|
return {"family": family, "weight": weight, "style": style}
|
258
493
|
|
259
494
|
def _create_style_key(self, properties: Dict[str, Any], group_by_keys: List[str]) -> Tuple:
|
260
|
-
"""Create a hashable tuple key based on selected properties."""
|
261
495
|
key_parts = []
|
262
|
-
for key in group_by_keys:
|
263
|
-
|
264
|
-
|
265
|
-
|
496
|
+
for key in group_by_keys:
|
497
|
+
if key == "size":
|
498
|
+
value = properties.get("size_for_keying") # Use the correct size value for keying
|
499
|
+
else:
|
500
|
+
value = properties.get(key)
|
501
|
+
|
502
|
+
if isinstance(value, list):
|
266
503
|
value = tuple(value)
|
267
504
|
key_parts.append(value)
|
268
505
|
return tuple(key_parts)
|
269
506
|
|
270
507
|
def _generate_style_label(
|
271
|
-
self,
|
508
|
+
self,
|
509
|
+
properties: Dict[str, Any],
|
510
|
+
options: TextStyleOptions,
|
511
|
+
style_index: int,
|
512
|
+
num_font_buckets: int = 0,
|
272
513
|
) -> str:
|
273
|
-
"""Generate a style label based on properties and options."""
|
274
514
|
if not options.descriptive_labels:
|
515
|
+
# If bucketing is active and only 1 bucket, it's not very informative
|
516
|
+
is_meaningful_bucketing = (
|
517
|
+
self._font_size_bucket_mapper is not None and num_font_buckets > 1
|
518
|
+
)
|
519
|
+
bucket_id = properties.get("font_bucket_id")
|
520
|
+
if is_meaningful_bucketing and bucket_id is not None:
|
521
|
+
return f"{options.label_prefix} (Bucket {bucket_id + 1}) {style_index}"
|
275
522
|
return f"{options.label_prefix} {style_index}"
|
276
523
|
|
277
524
|
try:
|
278
525
|
font_details = self._parse_font_name(properties.get("fontname", ""))
|
526
|
+
bucket_label_part = ""
|
527
|
+
bucket_id = properties.get("font_bucket_id")
|
528
|
+
|
529
|
+
# Only add bucket info if bucketing is active and meaningful (more than 1 bucket)
|
530
|
+
if (
|
531
|
+
self._font_size_bucket_mapper is not None
|
532
|
+
and num_font_buckets > 1
|
533
|
+
and bucket_id is not None
|
534
|
+
):
|
535
|
+
bucket_label_part = f" (Bucket {bucket_id + 1})" # Simple numeric label for now
|
279
536
|
|
280
537
|
label_data = {
|
281
|
-
"size": properties.get("
|
538
|
+
"size": properties.get("rounded_size", "?"), # Use rounded_size for display
|
282
539
|
"fontname": properties.get("fontname", "Unknown"),
|
283
540
|
"is_bold": properties.get("is_bold", False),
|
284
541
|
"is_italic": properties.get("is_italic", False),
|
@@ -292,6 +549,8 @@ class TextStyleAnalyzer:
|
|
292
549
|
if label_data["weight"] and label_data["style"]:
|
293
550
|
label_data["style"] = " " + label_data["style"]
|
294
551
|
|
552
|
+
label_data["bucket_info"] = bucket_label_part
|
553
|
+
|
295
554
|
# Handle color formatting for label
|
296
555
|
color_val = label_data["color"]
|
297
556
|
if isinstance(color_val, tuple):
|
@@ -303,12 +562,66 @@ class TextStyleAnalyzer:
|
|
303
562
|
label_data["color_str"] = color_str
|
304
563
|
|
305
564
|
# Format the label, handle potential missing keys in format string gracefully
|
306
|
-
|
307
|
-
|
565
|
+
# Add {bucket_info} to default format string if not already customized by user?
|
566
|
+
# For now, user would need to add {bucket_info} to their custom label_format if they want it.
|
567
|
+
current_label_format = options.label_format
|
568
|
+
bucket_name_for_label = properties.get("font_bucket_name")
|
569
|
+
|
570
|
+
# Construct a bucket_info string if a bucket name exists and it's not already in the format
|
571
|
+
# And if there are multiple buckets to make it meaningful.
|
572
|
+
bucket_info_str = ""
|
573
|
+
if bucket_name_for_label and num_font_buckets > 1:
|
574
|
+
bucket_info_str = f" ({bucket_name_for_label})"
|
308
575
|
|
576
|
+
if "{bucket_info}" not in current_label_format and bucket_info_str:
|
577
|
+
current_label_format += " {bucket_info}" # Placeholder name for format_map
|
578
|
+
|
579
|
+
# Populate label_data with the actual bucket string for the {bucket_info} placeholder
|
580
|
+
label_data["bucket_info"] = bucket_info_str
|
581
|
+
|
582
|
+
label = current_label_format.format_map(defaultdict(str, label_data))
|
583
|
+
return label.strip().replace(" ", " ")
|
309
584
|
except Exception as e:
|
310
585
|
logger.warning(
|
311
586
|
f"Error generating descriptive label for style {properties}: {e}. Falling back to numeric label."
|
312
587
|
)
|
313
588
|
# Fallback to numeric label on error
|
314
589
|
return f"{options.label_prefix} {style_index}"
|
590
|
+
|
591
|
+
def _get_bucket_name(self, bucket_id: Optional[int], total_buckets: int) -> Optional[str]:
    """Return a human-readable name for a font-size bucket.

    Args:
        bucket_id: 0-based bucket index, or ``None``.
        total_buckets: Number of buckets in the current bucketing.

    Returns:
        ``None`` when ``bucket_id`` is ``None`` or outside
        ``0 <= bucket_id < total_buckets`` (which also covers a
        non-positive ``total_buckets``). For 1 to 8 buckets, the predefined
        name at index ``bucket_id``; for more than 8 buckets,
        ``"Size Group {bucket_id}"``.
    """
    if bucket_id is None or not (0 <= bucket_id < total_buckets):
        return None

    # Predefined names for up to 8 buckets. Each list has exactly
    # ``total_buckets`` entries, so indexing by a validated bucket_id is safe.
    bucket_name_sets = {
        1: ["standard"],
        2: ["small", "large"],
        3: ["small", "medium", "large"],
        4: ["small", "medium", "large", "x-large"],
        5: ["x-small", "small", "medium", "large", "x-large"],
        6: ["x-small", "small", "medium", "large", "x-large", "xx-large"],
        7: ["xx-small", "x-small", "small", "medium", "large", "x-large", "xx-large"],
        8: [
            "xx-small",
            "x-small",
            "small",
            "medium",
            "large",
            "x-large",
            "xx-large",
            "xxx-large",
        ],
    }

    names = bucket_name_sets.get(total_buckets)
    if names is None:
        # Fallback for more than 8 buckets.
        return f"Size Group {bucket_id}"
    return names[bucket_id]
|
@@ -69,7 +69,7 @@ class ClassificationManager:
|
|
69
69
|
if not _CLASSIFICATION_AVAILABLE:
|
70
70
|
raise ImportError(
|
71
71
|
"Classification dependencies missing. "
|
72
|
-
'Install with: pip install "natural-pdf[
|
72
|
+
'Install with: pip install "natural-pdf[core-ml]"'
|
73
73
|
)
|
74
74
|
|
75
75
|
self.pipelines: Dict[Tuple[str, str], "Pipeline"] = (
|
@@ -407,7 +407,17 @@ class ElementManager:
|
|
407
407
|
char_dict_data = ocr_char_dict # Use the one we already created
|
408
408
|
char_dict_data["object_type"] = "char" # Mark as char type
|
409
409
|
char_dict_data.setdefault("adv", char_dict_data.get("width", 0))
|
410
|
-
|
410
|
+
|
411
|
+
# Create a TextElement for the char representation
|
412
|
+
# Ensure _char_dicts is handled correctly by TextElement constructor
|
413
|
+
# For an OCR word represented as a char, its _char_dicts can be a list containing its own data
|
414
|
+
char_element_specific_data = char_dict_data.copy()
|
415
|
+
char_element_specific_data["_char_dicts"] = [char_dict_data.copy()]
|
416
|
+
|
417
|
+
ocr_char_as_element = TextElement(char_element_specific_data, self._page)
|
418
|
+
self._elements["chars"].append(
|
419
|
+
ocr_char_as_element
|
420
|
+
) # Append TextElement instance
|
411
421
|
|
412
422
|
except (KeyError, ValueError, TypeError) as e:
|
413
423
|
logger.error(f"Failed to process OCR result: {result}. Error: {e}", exc_info=True)
|