natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
Files changed (31)
  1. natural_pdf/__init__.py +29 -40
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +20 -18
  8. natural_pdf/core/pdf.py +146 -13
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +374 -30
  11. natural_pdf/elements/region.py +45 -14
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +519 -0
  16. natural_pdf/exporters/hocr_font.py +136 -0
  17. natural_pdf/exporters/original_pdf.py +127 -0
  18. natural_pdf/exporters/searchable_pdf.py +2 -12
  19. natural_pdf/ocr/engine_surya.py +1 -1
  20. natural_pdf/search/__init__.py +65 -52
  21. natural_pdf/search/lancedb_search_service.py +325 -0
  22. natural_pdf/search/numpy_search_service.py +255 -0
  23. natural_pdf/search/searchable_mixin.py +25 -71
  24. natural_pdf/widgets/viewer.py +22 -31
  25. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
  26. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
  27. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  28. natural_pdf/search/haystack_search_service.py +0 -687
  29. natural_pdf/search/haystack_utils.py +0 -474
  30. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  31. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py CHANGED
@@ -37,72 +37,61 @@ def configure_logging(level=logging.INFO, handler=None):
     logger.propagate = False
 
 
+# Version
+__version__ = "0.1.1"
+
+# Core imports
+from natural_pdf.collections.pdf_collection import PDFCollection
 from natural_pdf.core.page import Page
 from natural_pdf.core.pdf import PDF
 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
 
-# Import QA module if available
-try:
-    from natural_pdf.qa import DocumentQA, get_qa_engine
-
-    HAS_QA = True
-except ImportError:
-    HAS_QA = False
-
-__version__ = "0.1.1"
-
-__all__ = [
-    "PDF",
-    "PDFCollection",
-    "Page",
-    "Region",
-    "ElementCollection",
-    "TextSearchOptions",
-    "MultiModalSearchOptions",
-    "BaseSearchOptions",
-    "configure_logging",
-]
-
-if HAS_QA:
-    __all__.extend(["DocumentQA", "get_qa_engine"])
-
-
-from .collections.pdf_collection import PDFCollection
-
-# Core classes
-from .core.pdf import PDF
-from .elements.region import Region
+ElementCollection = None
 
 # Search options (if extras installed)
 try:
-    from .search.search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
+    from natural_pdf.search.search_options import (
+        BaseSearchOptions,
+        MultiModalSearchOptions,
+        TextSearchOptions,
+    )
 except ImportError:
     # Define dummy classes if extras not installed, so imports don't break
-    # but using them will raise the ImportError from check_haystack_availability
-    class TextSearchOptions:
+    class BaseSearchOptions:
         def __init__(self, *args, **kwargs):
             pass
 
-    class MultiModalSearchOptions:
+    class TextSearchOptions:
         def __init__(self, *args, **kwargs):
             pass
 
-    class BaseSearchOptions:
+    class MultiModalSearchOptions:
         def __init__(self, *args, **kwargs):
             pass
 
 
-# Expose logging setup? (Optional)
-# from . import logging_config
-# logging_config.setup_logging()
+# Import QA module if available
+try:
+    from natural_pdf.qa import DocumentQA, get_qa_engine
+
+    HAS_QA = True
+except ImportError:
+    HAS_QA = False
 
 # Explicitly define what gets imported with 'from natural_pdf import *'
 __all__ = [
     "PDF",
     "PDFCollection",
+    "Page",
     "Region",
-    "TextSearchOptions",  # Include search options
+    "ElementCollection",
+    "TextSearchOptions",
     "MultiModalSearchOptions",
     "BaseSearchOptions",
     "configure_logging",
 ]
+
+# Add QA components to __all__ if available
+if HAS_QA:
+    __all__.extend(["DocumentQA", "get_qa_engine"])
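The reorganized __init__.py leans on a guarded-import pattern: real classes are imported when the search extras are present, and inert placeholders are defined otherwise so `from natural_pdf import TextSearchOptions` never fails at import time. A minimal sketch of that pattern in isolation (the fallback body is copied from the diff; the standalone framing is illustrative):

# Guarded optional import: succeeds at import time even without the search extras.
try:
    from natural_pdf.search.search_options import TextSearchOptions
except ImportError:
    class TextSearchOptions:  # inert placeholder; accepts and ignores all arguments
        def __init__(self, *args, **kwargs):
            pass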
natural_pdf/analyzers/text_options.py CHANGED
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List, Optional, Union
 
 logger = logging.getLogger(__name__)
 
@@ -35,6 +35,14 @@ class TextStyleOptions:
     # Available keys: size, fontname, is_bold, is_italic, color, weight, style, family
     label_format: str = "{size}pt {weight}{style} {family}"  # Default format without color
 
+    # Configuration for font size bucketing.
+    # - List[float]: Explicit bucket boundaries (e.g., [10.0, 18.0, 24.0]).
+    #   Creates buckets: <10, 10-18, 18-24, >=24.
+    # - int: Number of buckets to determine automatically (e.g., 5).
+    # - str ('auto'): Automatically determine the optimal number of buckets (default).
+    # - None: No font size bucketing is applied.
+    font_size_buckets: Optional[Union[List[float], int, str]] = "auto"
+
     def __post_init__(self):
         # Validate size_tolerance
         if self.size_tolerance <= 0:
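For reference, the four accepted shapes of the new font_size_buckets field, matching the comments above. A hedged sketch (the boundary values are illustrative, and it assumes the remaining TextStyleOptions fields keep their defaults):

from natural_pdf.analyzers.text_options import TextStyleOptions

TextStyleOptions(font_size_buckets=[10.0, 18.0, 24.0])  # explicit boundaries: <10, 10-18, 18-24, >=24
TextStyleOptions(font_size_buckets=5)                   # find 5 buckets with Jenks
TextStyleOptions(font_size_buckets="auto")              # choose the bucket count automatically (default)
TextStyleOptions(font_size_buckets=None)                # disable bucketing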
natural_pdf/analyzers/text_structure.py CHANGED
@@ -5,7 +5,9 @@ Text structure analyzer for natural-pdf.
 import logging
 import re
 from collections import defaultdict
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+
+import jenkspy  # Added import for jenkspy
 
 from natural_pdf.analyzers.text_options import TextStyleOptions
 
@@ -30,6 +32,13 @@ FONT_WEIGHTS = {
 }
 FONT_STYLES = {"italic": "Italic", "oblique": "Italic"}
 
+# Constants for automatic font size bucketing
+MAX_UNIQUE_SIZES_FOR_JENKS_INPUT = (
+    3000  # Max unique sizes to feed directly into Jenks; uses sampling above this
+)
+DEFAULT_MAX_AUTO_BUCKETS = 7  # Max number of buckets to try when font_size_buckets='auto'
+MIN_BUCKETS_FOR_AUTO = 2
+
 
 class TextStyleAnalyzer:
     """
@@ -50,20 +59,229 @@ class TextStyleAnalyzer:
         self.options = options or TextStyleOptions()
         logger.debug(f"Initialized TextStyleAnalyzer with options: {self.options}")
 
+        # To store the font size bucket mapper if bucketing is active
+        self._font_size_bucket_mapper = None
+        self._font_size_bucket_count = 0
+
+    def _calculate_jenks_breaks(self, data: List[float], num_classes: int) -> List[float]:
+        if not data or num_classes <= 1:
+            return []
+
+        unique_data = sorted(
+            list(set(data))
+        )  # jenkspy works best with unique, sorted data for clarity of breaks
+        if len(unique_data) < 2 or len(unique_data) < num_classes:
+            # Not enough unique data points to form meaningful breaks for the requested number of classes,
+            # or no way to make breaks if fewer than 2 unique points.
+            # If len(unique_data) == 1 and num_classes > 1, jenkspy might error or give trivial breaks.
+            # If num_classes is 1, we already returned [].
+            # If len(unique_data) < num_classes, we cannot have num_classes distinct groups based on these unique points.
+            # The calling function _get_font_size_bucket_mapper already adjusts num_classes if it's > len(unique_data),
+            # so this condition here is a safeguard for cases where data is extremely sparse.
+            if (
+                len(unique_data) > 1 and num_classes > 1
+            ):  # Try to make at least one break if possible
+                # Fallback: create breaks between all unique points if Jenks is not suitable
+                # This ensures we get some division if possible, up to num_classes-1 breaks
+                # breaks = [(unique_data[i] + unique_data[i+1]) / 2.0 for i in range(len(unique_data)-1)]
+                # return sorted(list(set(breaks)))[:num_classes-1]
+                # However, with jenkspy, it might be better to let it try and handle its output.
+                # If jenkspy cannot form num_classes, it might return fewer breaks or specific values.
+                pass  # Let jenkspy attempt it; its behavior for sparse data is specific to its C implementation.
+            else:
+                return []  # Cannot form breaks
+
+        try:
+            # jenkspy.jenks_breaks returns all boundaries, including min and max of data
+            # e.g., for n_classes=5, it returns 6 values: [min, break1, break2, break3, break4, max]
+            all_boundaries = jenkspy.jenks_breaks(
+                unique_data, n_classes=num_classes
+            )  # Use unique_data
+
+            # We need the inner breaks: [break1, break2, break3, break4]
+            if len(all_boundaries) > 2:  # Ensure there are inner breaks
+                inner_breaks = all_boundaries[1:-1]
+                return sorted(list(set(inner_breaks)))  # Ensure breaks are unique and sorted
+            else:
+                # This case implies n_classes=1 or data was so uniform jenkspy couldn't break it
+                return []
+        except Exception as e:
+            logger.warning(
+                f"jenkspy.jenks_breaks failed with {num_classes} classes for data (first 10 shown): {unique_data[:10]}. Error: {e}. Falling back to no breaks for this k."
+            )
+            return []  # Fallback if jenkspy fails
+
+    def _calculate_gvf(self, data: List[float], breaks: List[float]) -> float:
+        if not data:
+            return 0.0
+        overall_mean = sum(data) / len(data)
+        sdam = sum([(x - overall_mean) ** 2 for x in data])
+        if sdam == 0:
+            return 1.0  # Perfect fit if all data points are the same
+
+        sdcm = 0.0
+        all_breaks = [-float("inf")] + breaks + [float("inf")]
+        for i in range(len(all_breaks) - 1):
+            lower_bound = all_breaks[i]
+            upper_bound = all_breaks[i + 1]
+            cluster = [x for x in data if x > lower_bound and x <= upper_bound]
+            if not cluster:
+                continue
+            cluster_mean = sum(cluster) / len(cluster)
+            sdcm += sum([(x - cluster_mean) ** 2 for x in cluster])
+
+        return (sdam - sdcm) / sdam if sdam > 0 else 1.0
+
+    def _get_font_size_bucket_mapper(
+        self, all_font_sizes: List[float], config: Union[List[float], int, str]
+    ) -> Tuple[Optional[Callable[[float], int]], int]:
+        if not all_font_sizes:
+            return None, 0
+
+        unique_font_sizes = sorted(list(set(s for s in all_font_sizes if s is not None)))
+        if not unique_font_sizes:
+            return None, 0
+
+        # Apply sampling if too many unique font sizes for Jenks input
+        jenks_input_data = unique_font_sizes
+        if len(unique_font_sizes) > MAX_UNIQUE_SIZES_FOR_JENKS_INPUT:
+            logger.debug(
+                f"Sampling {MAX_UNIQUE_SIZES_FOR_JENKS_INPUT} from {len(unique_font_sizes)} unique font sizes for Jenks."
+            )
+            # Simple uniform sampling from sorted unique values
+            indices = [
+                int(i * (len(unique_font_sizes) - 1) / (MAX_UNIQUE_SIZES_FOR_JENKS_INPUT - 1))
+                for i in range(MAX_UNIQUE_SIZES_FOR_JENKS_INPUT)
+            ]
+            jenks_input_data = [unique_font_sizes[i] for i in indices]
+            jenks_input_data = sorted(list(set(jenks_input_data)))  # Ensure still sorted and unique
+
+        breaks: List[float] = []
+        num_buckets = 0
+
+        if isinstance(config, list):  # Explicit boundaries
+            breaks = sorted(list(set(config)))  # Ensure sorted and unique
+            num_buckets = len(breaks) + 1
+        elif isinstance(config, int):  # User-defined number of buckets
+            num_buckets_to_find = config
+            if num_buckets_to_find <= 0:
+                logger.warning(f"Invalid number of buckets ({config}), disabling bucketing.")
+                return None, 0
+            if num_buckets_to_find == 1:
+                return (lambda size: 0), 1  # All in one bucket
+            if (
+                not jenks_input_data
+                or len(jenks_input_data) < num_buckets_to_find
+                and len(jenks_input_data) > 0
+            ):
+                logger.debug(
+                    f"Not enough unique font sizes ({len(jenks_input_data)}) to create {num_buckets_to_find} distinct buckets based on input data. Adjusting."
+                )
+                # Fallback to fewer buckets if not enough unique data points to separate
+                num_buckets_to_find = max(
+                    1, len(jenks_input_data) - 1 if len(jenks_input_data) > 1 else 1
+                )
+                if num_buckets_to_find == 1:
+                    return (lambda size: 0), 1
+
+            breaks = self._calculate_jenks_breaks(jenks_input_data, num_buckets_to_find)
+            num_buckets = len(breaks) + 1
+        elif config == "auto":
+            best_gvf = -1.0
+            best_breaks = []
+            best_k = 0
+            # Iterate from MIN_BUCKETS_FOR_AUTO up to a max (or len of data if smaller)
+            max_k_to_try = min(
+                DEFAULT_MAX_AUTO_BUCKETS,
+                len(jenks_input_data) if jenks_input_data else MIN_BUCKETS_FOR_AUTO,
+            )
+            if len(jenks_input_data) == 1:  # Only one unique font size
+                return (lambda size: 0), 1
+
+            for k_buckets in range(MIN_BUCKETS_FOR_AUTO, max_k_to_try + 1):
+                if k_buckets > len(
+                    jenks_input_data
+                ):  # Cannot have more buckets than unique data points
+                    break
+                current_breaks = self._calculate_jenks_breaks(jenks_input_data, k_buckets)
+                if (
+                    len(current_breaks) != k_buckets - 1
+                ):  # Jenks couldn't find enough distinct breaks
+                    # This can happen if data points are too few or clustered.
+                    # If we requested k_buckets, we expect k_buckets-1 breaks.
+                    # If we get fewer, it implies the effective number of buckets is less.
+                    # We should only proceed if number of breaks matches k_buckets-1 for a valid GVF.
+                    if (
+                        k_buckets > 1 and not current_breaks
+                    ):  # requested multiple buckets but got no breaks
+                        continue
+                    # else: proceed with fewer breaks, which means fewer effective buckets for GVF.
+
+                gvf = self._calculate_gvf(jenks_input_data, current_breaks)
+                # Simple strategy: pick k with highest GVF.
+                # More sophisticated: look for an elbow or significant GVF jump.
+                if gvf > best_gvf:
+                    best_gvf = gvf
+                    best_breaks = current_breaks
+                    best_k = len(current_breaks) + 1  # Number of buckets is breaks + 1
+
+            breaks = best_breaks
+            num_buckets = best_k if best_k > 0 else 1  # Ensure at least 1 bucket
+            if num_buckets == 1 and breaks:  # If only 1 bucket, there should be no breaks
+                breaks = []
+            logger.debug(
+                f"Auto bucketing: Chose {num_buckets} buckets with GVF {best_gvf:.4f}. Breaks: {breaks}"
+            )
+
+        else:
+            return None, 0  # Invalid config or no bucketing
+
+        if not breaks and num_buckets > 1 and len(unique_font_sizes) > 1:
+            # This can happen if Jenks fails to find breaks for N > 1 buckets but config specified N > 1,
+            # or if auto chose num_buckets > 1 but ended up with no breaks.
+            # Fallback to treating all as one bucket if no breaks were determined for multiple requested buckets.
+            logger.debug(
+                f"No breaks determined for {num_buckets} requested buckets. Treating as 1 bucket."
+            )
+            num_buckets = 1
+        elif num_buckets <= 1 and breaks:  # Contradiction: 1 bucket should have no breaks
+            breaks = []
+            num_buckets = 1
+
+        final_breaks = sorted(list(set(breaks)))  # Ensure unique and sorted
+
+        if not final_breaks and len(unique_font_sizes) > 1 and num_buckets > 1:
+            # If still no breaks but we expect multiple buckets (e.g. config=2, unique_sizes=[10,12]),
+            # this implies Jenks failed to produce breaks. Fallback to simpler split for 2 buckets.
+            if num_buckets == 2 and len(unique_font_sizes) >= 2:
+                mid_point = (unique_font_sizes[0] + unique_font_sizes[-1]) / 2.0
+                final_breaks = [mid_point]
+                logger.debug(f"Jenks failed for 2 buckets, using midpoint break: {final_breaks}")
+            else:  # For >2 buckets and no breaks, it defaults to 1 bucket effectively.
+                num_buckets = 1
+        elif final_breaks and num_buckets <= 1:
+            num_buckets = len(final_breaks) + 1  # Recalculate num_buckets from actual breaks
+
+        if num_buckets <= 1:  # If effectively one bucket (or no data to bucket)
+            return (lambda size: 0), 1
+
+        # Create a mapper function
+        def mapper(size: float) -> int:
+            if size is None:
+                return -1  # Or some other indicator for unbucketable
+            # Find which bucket the size falls into
+            # bisect_left finds insertion point, which corresponds to bucket index
+            bucket_index = 0
+            for i, break_val in enumerate(final_breaks):
+                if size <= break_val:
+                    return i
+            return len(final_breaks)  # Belongs to the last bucket
+
+        return mapper, num_buckets
+
     def analyze(
         self, page: "Page", options: Optional[TextStyleOptions] = None
     ) -> "ElementCollection":
-        """
-        Analyze text styles on a page, group elements, and add style attributes.
-
-        Args:
-            page: The Page object to analyze.
-            options: Override the analyzer's default TextStyleOptions for this run.
-
-        Returns:
-            ElementCollection containing all processed text elements (typically words)
-            with added 'style_label', 'style_key', and 'style_properties' attributes.
-        """
         from natural_pdf.elements.collections import ElementCollection
 
         current_options = options or self.options
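A toy illustration of how the two helpers above fit together (not part of the diff; it assumes only that jenkspy is installed): jenkspy.jenks_breaks returns n_classes + 1 boundaries including the data min and max, the analyzer keeps only the inner breaks, and goodness of variance fit (GVF = (SDAM - SDCM) / SDAM) scores how well those breaks cluster the sizes:

import jenkspy

sizes = [8.0, 9.0, 10.0, 18.0, 19.0, 30.0, 31.0]       # toy font sizes
boundaries = jenkspy.jenks_breaks(sorted(set(sizes)), n_classes=3)
inner_breaks = boundaries[1:-1]                         # drop the min/max endpoints

mean = sum(sizes) / len(sizes)
sdam = sum((x - mean) ** 2 for x in sizes)              # squared deviations from array mean
edges = [float("-inf")] + list(inner_breaks) + [float("inf")]
sdcm = 0.0                                              # squared deviations from class means
for lo, hi in zip(edges, edges[1:]):
    cluster = [x for x in sizes if lo < x <= hi]
    if cluster:
        m = sum(cluster) / len(cluster)
        sdcm += sum((x - m) ** 2 for x in cluster)
gvf = (sdam - sdcm) / sdam if sdam else 1.0             # closer to 1.0 means a better fit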
@@ -71,27 +289,40 @@
             f"Starting text style analysis for page {page.number} with options: {current_options}"
         )
 
-        # Use page.words for better granularity
         text_elements = page.words
-        # Fallback if words are somehow empty/not generated
         if not text_elements:
-            text_elements = page.find_all("text").elements  # Get list from collection
+            text_elements = page.find_all("text").elements
 
-        # Skip empty pages or pages with no text elements
        if not text_elements:
             logger.warning(f"Page {page.number} has no text elements to analyze.")
             return ElementCollection([])
 
-        style_cache: Dict[Tuple, Dict[str, Any]] = (
-            {}
-        )  # Maps style_key_tuple -> {'label': str, 'properties': dict}
-        processed_elements: List["Element"] = []
+        # --- Font Size Bucketing Setup ---
+        self._font_size_bucket_mapper = None
+        self._font_size_bucket_count = 0
+        bucketing_config = getattr(current_options, "font_size_buckets", None)
+
+        if bucketing_config is not None:
+            all_page_font_sizes = [
+                el.size for el in text_elements if hasattr(el, "size") and el.size is not None
+            ]
+            if all_page_font_sizes:
+                self._font_size_bucket_mapper, self._font_size_bucket_count = (
+                    self._get_font_size_bucket_mapper(all_page_font_sizes, bucketing_config)
+                )
+                if self._font_size_bucket_mapper:
+                    logger.debug(
+                        f"Font size bucketing active with {self._font_size_bucket_count} buckets for page {page.number}."
+                    )
+            else:
+                logger.debug("No font sizes found on page for bucketing.")
+        # --- End Bucketing Setup ---
 
-        # Ensure consistent ordering for style key creation
+        style_cache: Dict[Tuple, Dict[str, Any]] = {}
+        processed_elements: List["Element"] = []
         group_by_keys = sorted(current_options.group_by)
 
         for element in text_elements:
-            # Skip elements without necessary attributes (e.g., non-text elements if find_all was used)
             if not hasattr(element, "text") or not hasattr(element, "size"):
                 logger.debug(f"Skipping element without text/size: {element}")
                 continue
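The mapper built by _get_font_size_bucket_mapper scans the sorted break list and returns the first bucket whose upper break is >= the size; as the comment in the diff notes, this matches bisect_left. A standalone sketch with hypothetical break values:

import bisect

breaks = [10.0, 18.0, 24.0]              # hypothetical inner breaks

def bucket_of(size: float) -> int:
    # Equivalent to the diff's linear scan: first i with size <= breaks[i],
    # else the last bucket (len(breaks)).
    return bisect.bisect_left(breaks, size)

assert bucket_of(9.0) == 0               # below the first break
assert bucket_of(10.0) == 0              # a size equal to a break stays in that bucket
assert bucket_of(17.5) == 1
assert bucket_of(30.0) == 3              # past the last break -> last bucket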
@@ -102,60 +333,64 @@
 
                 if style_key not in style_cache:
                     label = self._generate_style_label(
-                        style_properties, current_options, len(style_cache) + 1
+                        style_properties,
+                        current_options,
+                        len(style_cache) + 1,
+                        self._font_size_bucket_count,
                     )
                     style_cache[style_key] = {"label": label, "properties": style_properties}
                     logger.debug(
                         f"New style detected (Key: {style_key}): Label='{label}', Props={style_properties}"
                     )
 
-                # Add attributes to the element
                 element.style_label = style_cache[style_key]["label"]
                 element.style_key = style_key
-                # Add the full properties dict for potential detailed inspection
                 element.style_properties = style_cache[style_key]["properties"]
-
+                element.font_bucket_name = style_cache[style_key]["properties"].get(
+                    "font_bucket_name"
+                )
                 processed_elements.append(element)
-
             except Exception as e:
                 logger.warning(
                     f"Error processing element {element} for text style: {e}", exc_info=True
                 )
-                # Optionally add element without style info or skip it
-                # processed_elements.append(element) # Add anyway?
 
-        # Optionally store a summary on the page
         page._text_styles_summary = style_cache
         logger.info(
            f"Finished text style analysis for page {page.number}. Found {len(style_cache)} unique styles."
         )
-
         return ElementCollection(processed_elements)
 
     def _extract_style_properties(
         self, element: "Element", options: TextStyleOptions
     ) -> Dict[str, Any]:
-        """
-        Extract style properties from a text element based on options.
-
-        Args:
-            element: Text element.
-            options: TextStyleOptions driving the extraction.
-
-        Returns:
-            Dictionary of extracted style properties.
-        """
         properties = {}
+        original_size = getattr(element, "size", None)
+        rounded_size = None
+
+        properties["original_size"] = original_size
 
-        # Font size
-        font_size = None
-        if hasattr(element, "size") and element.size is not None:
-            # Round based on tolerance
+        if original_size is not None:
             rounding_factor = 1.0 / options.size_tolerance
-            font_size = round(element.size * rounding_factor) / rounding_factor
-        properties["size"] = font_size
+            rounded_size = round(original_size * rounding_factor) / rounding_factor
+            properties["size"] = rounded_size  # For display in labels
+            properties["rounded_size"] = rounded_size  # Explicit storage
+
+        # Font size bucketing logic
+        properties["font_bucket_id"] = None
+        properties["font_bucket_name"] = None  # Initialize font_bucket_name
+        size_for_keying = rounded_size
+
+        if self._font_size_bucket_mapper and original_size is not None:
+            bucket_id = self._font_size_bucket_mapper(original_size)
+            properties["font_bucket_id"] = bucket_id
+            properties["font_bucket_name"] = self._get_bucket_name(
+                bucket_id, self._font_size_bucket_count
+            )
+            size_for_keying = bucket_id
+
+        properties["size_for_keying"] = size_for_keying
 
-        # Font name
         font_name = None
         normalized_font_name = None
         if hasattr(element, "fontname") and element.fontname is not None:
@@ -257,28 +492,50 @@
         return {"family": family, "weight": weight, "style": style}
 
     def _create_style_key(self, properties: Dict[str, Any], group_by_keys: List[str]) -> Tuple:
-        """Create a hashable tuple key based on selected properties."""
         key_parts = []
-        for key in group_by_keys:  # Use the pre-sorted list
-            value = properties.get(key)
-            # Ensure hashable - colors should already be tuples or basic types
-            if isinstance(value, list):  # Should not happen if _extract handled color correctly
+        for key in group_by_keys:
+            if key == "size":
+                value = properties.get("size_for_keying")  # Use the correct size value for keying
+            else:
+                value = properties.get(key)
+
+            if isinstance(value, list):
                 value = tuple(value)
             key_parts.append(value)
         return tuple(key_parts)
 
     def _generate_style_label(
-        self, properties: Dict[str, Any], options: TextStyleOptions, style_index: int
+        self,
+        properties: Dict[str, Any],
+        options: TextStyleOptions,
+        style_index: int,
+        num_font_buckets: int = 0,
     ) -> str:
-        """Generate a style label based on properties and options."""
         if not options.descriptive_labels:
+            # If bucketing is active and only 1 bucket, it's not very informative
+            is_meaningful_bucketing = (
+                self._font_size_bucket_mapper is not None and num_font_buckets > 1
+            )
+            bucket_id = properties.get("font_bucket_id")
+            if is_meaningful_bucketing and bucket_id is not None:
+                return f"{options.label_prefix} (Bucket {bucket_id + 1}) {style_index}"
             return f"{options.label_prefix} {style_index}"
 
         try:
             font_details = self._parse_font_name(properties.get("fontname", ""))
+            bucket_label_part = ""
+            bucket_id = properties.get("font_bucket_id")
+
+            # Only add bucket info if bucketing is active and meaningful (more than 1 bucket)
+            if (
+                self._font_size_bucket_mapper is not None
+                and num_font_buckets > 1
+                and bucket_id is not None
+            ):
+                bucket_label_part = f" (Bucket {bucket_id + 1})"  # Simple numeric label for now
 
             label_data = {
-                "size": properties.get("size", "?"),
+                "size": properties.get("rounded_size", "?"),  # Use rounded_size for display
                 "fontname": properties.get("fontname", "Unknown"),
                 "is_bold": properties.get("is_bold", False),
                 "is_italic": properties.get("is_italic", False),
@@ -292,6 +549,8 @@
             if label_data["weight"] and label_data["style"]:
                 label_data["style"] = " " + label_data["style"]
 
+            label_data["bucket_info"] = bucket_label_part
+
             # Handle color formatting for label
             color_val = label_data["color"]
             if isinstance(color_val, tuple):
@@ -303,12 +562,66 @@
                 label_data["color_str"] = color_str
 
             # Format the label, handle potential missing keys in format string gracefully
-            label = options.label_format.format_map(defaultdict(str, label_data))
-            return label.strip().replace("  ", " ")  # Cleanup extra spaces
+            # Add {bucket_info} to default format string if not already customized by user?
+            # For now, user would need to add {bucket_info} to their custom label_format if they want it.
+            current_label_format = options.label_format
+            bucket_name_for_label = properties.get("font_bucket_name")
+
+            # Construct a bucket_info string if a bucket name exists and it's not already in the format,
+            # and if there are multiple buckets to make it meaningful.
+            bucket_info_str = ""
+            if bucket_name_for_label and num_font_buckets > 1:
+                bucket_info_str = f" ({bucket_name_for_label})"
 
+            if "{bucket_info}" not in current_label_format and bucket_info_str:
+                current_label_format += " {bucket_info}"  # Placeholder name for format_map
+
+            # Populate label_data with the actual bucket string for the {bucket_info} placeholder
+            label_data["bucket_info"] = bucket_info_str
+
+            label = current_label_format.format_map(defaultdict(str, label_data))
+            return label.strip().replace("  ", " ")
         except Exception as e:
             logger.warning(
                 f"Error generating descriptive label for style {properties}: {e}. Falling back to numeric label."
             )
             # Fallback to numeric label on error
             return f"{options.label_prefix} {style_index}"
+
+    def _get_bucket_name(self, bucket_id: Optional[int], total_buckets: int) -> Optional[str]:
+        if bucket_id is None or not (0 <= bucket_id < total_buckets):
+            return None  # Or "N/A"
+
+        if total_buckets <= 0:  # Should not happen if called correctly
+            return f"Invalid Bucket {bucket_id}"
+
+        # Predefined human-readable names for up to 8 buckets.
+        # Buckets are 0-indexed internally; names correspond to that index.
+        bucket_name_sets = {
+            1: ["standard"],
+            2: ["small", "large"],
+            3: ["small", "medium", "large"],
+            4: ["small", "medium", "large", "x-large"],
+            5: ["x-small", "small", "medium", "large", "x-large"],
+            6: ["x-small", "small", "medium", "large", "x-large", "xx-large"],
+            7: ["xx-small", "x-small", "small", "medium", "large", "x-large", "xx-large"],
+            8: [
+                "xx-small",
+                "x-small",
+                "small",
+                "medium",
+                "large",
+                "x-large",
+                "xx-large",
+                "xxx-large",
+            ],
+        }
+
+        if total_buckets in bucket_name_sets:
+            names = bucket_name_sets[total_buckets]
+            if 0 <= bucket_id < len(names):
+                return names[bucket_id]
+            else:  # Should not happen if bucket_id is valid for total_buckets
+                return f"Size Group {bucket_id}"
+        else:  # Fallback for more than 8 buckets or unhandled cases
+            return f"Size Group {bucket_id}"
natural_pdf/classification/manager.py CHANGED
@@ -69,7 +69,7 @@ class ClassificationManager:
         if not _CLASSIFICATION_AVAILABLE:
             raise ImportError(
                 "Classification dependencies missing. "
-                'Install with: pip install "natural-pdf[classification]"'
+                'Install with: pip install "natural-pdf[core-ml]"'
             )
 
         self.pipelines: Dict[Tuple[str, str], "Pipeline"] = (