PyPI - natural-pdf - Versions diffs - 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl - Mend

natural-pdf 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

natural_pdf/__init__.py +7 -2
natural_pdf/analyzers/text_options.py +9 -1
natural_pdf/analyzers/text_structure.py +371 -58
natural_pdf/classification/manager.py +1 -1
natural_pdf/core/element_manager.py +11 -1
natural_pdf/core/highlighting_service.py +120 -40
natural_pdf/core/page.py +4 -2
natural_pdf/core/pdf.py +53 -38
natural_pdf/elements/base.py +17 -0
natural_pdf/elements/collections.py +203 -59
natural_pdf/elements/region.py +43 -11
natural_pdf/exporters/data/__init__.py +0 -0
natural_pdf/exporters/data/pdf.ttf +0 -0
natural_pdf/exporters/data/sRGB.icc +0 -0
natural_pdf/exporters/hocr.py +40 -61
natural_pdf/exporters/hocr_font.py +7 -13
natural_pdf/exporters/original_pdf.py +10 -13
natural_pdf/exporters/searchable_pdf.py +0 -10
natural_pdf/search/__init__.py +65 -52
natural_pdf/search/lancedb_search_service.py +325 -0
natural_pdf/search/numpy_search_service.py +255 -0
natural_pdf/search/searchable_mixin.py +25 -71
natural_pdf/widgets/viewer.py +22 -31
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -49
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +28 -25
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
natural_pdf/search/haystack_search_service.py +0 -687
natural_pdf/search/haystack_utils.py +0 -474
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0

natural_pdf/__init__.py CHANGED Viewed

@@ -51,10 +51,13 @@ ElementCollection = None
 # Search options (if extras installed)
 try:
-    from natural_pdf.search.search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
+    from natural_pdf.search.search_options import (
+        BaseSearchOptions,
+        MultiModalSearchOptions,
+        TextSearchOptions,
+    )
 except ImportError:
     # Define dummy classes if extras not installed, so imports don't break
-    # but using them will raise the ImportError from check_haystack_availability
     class BaseSearchOptions:
         def __init__(self, *args, **kwargs):
             pass
@@ -67,9 +70,11 @@ except ImportError:
         def __init__(self, *args, **kwargs):
             pass
 # Import QA module if available
 try:
     from natural_pdf.qa import DocumentQA, get_qa_engine
     HAS_QA = True
 except ImportError:
     HAS_QA = False

natural_pdf/analyzers/text_options.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List, Optional, Union
 logger = logging.getLogger(__name__)
@@ -35,6 +35,14 @@ class TextStyleOptions:
     # Available keys: size, fontname, is_bold, is_italic, color, weight, style, family
     label_format: str = "{size}pt {weight}{style} {family}"  # Default format without color
+    # Configuration for font size bucketing.
+    # - List[float]: Explicit bucket boundaries (e.g., [10.0, 18.0, 24.0]).
+    #                Creates buckets: <=10, (10, 18], (18, 24], >24
+    #                (each boundary value belongs to the lower bucket).
+    # - int: Number of buckets to determine automatically (e.g., 5).
+    # - str ('auto'): Automatically determine the optimal number of buckets (default).
+    # - None: No font size bucketing is applied.
+    font_size_buckets: Optional[Union[List[float], int, str]] = "auto"
     def __post_init__(self):
         # Validate size_tolerance
         if self.size_tolerance <= 0:

natural_pdf/analyzers/text_structure.py CHANGED Viewed

@@ -5,7 +5,9 @@ Text structure analyzer for natural-pdf.
 import logging
 import re
 from collections import defaultdict
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+import jenkspy  # Added import for jenkspy
 from natural_pdf.analyzers.text_options import TextStyleOptions
@@ -30,6 +32,13 @@ FONT_WEIGHTS = {
 }
 FONT_STYLES = {"italic": "Italic", "oblique": "Italic"}
+# Constants for automatic font size bucketing
+MAX_UNIQUE_SIZES_FOR_JENKS_INPUT = (
+    3000  # Max unique sizes to feed directly into Jenks; uses sampling above this
+)
+DEFAULT_MAX_AUTO_BUCKETS = 7  # Max number of buckets to try when font_size_buckets='auto'
+MIN_BUCKETS_FOR_AUTO = 2
 class TextStyleAnalyzer:
     """
@@ -50,20 +59,229 @@ class TextStyleAnalyzer:
         self.options = options or TextStyleOptions()
         logger.debug(f"Initialized TextStyleAnalyzer with options: {self.options}")
+        # To store the font size bucket mapper if bucketing is active
+        self._font_size_bucket_mapper = None
+        self._font_size_bucket_count = 0
+    def _calculate_jenks_breaks(self, data: List[float], num_classes: int) -> List[float]:
+        if not data or num_classes <= 1:
+            return []
+        unique_data = sorted(
+            list(set(data))
+        )  # jenkspy works best with unique, sorted data for clarity of breaks
+        if len(unique_data) < 2 or len(unique_data) < num_classes:
+            # Not enough unique data points to form meaningful breaks for the requested number of classes
+            # or no way to make breaks if fewer than 2 unique points.
+            # If len(unique_data) == 1, and num_classes > 1, jenkspy might error or give trivial breaks.
+            # If num_classes is 1, we already returned [].
+            # If len(unique_data) < num_classes, it means we cannot have num_classes distinct groups based on these unique points.
+            # The calling function _get_font_size_bucket_mapper already adjusts num_classes if it's > len(unique_data)
+            # so this condition here is a safeguard or handles cases where data is extremely sparse.
+            if (
+                len(unique_data) > 1 and num_classes > 1
+            ):  # Try to make at least one break if possible
+                # Fallback: create breaks between all unique points if Jenks is not suitable
+                # This ensures we get some division if possible, up to num_classes-1 breaks
+                # breaks = [(unique_data[i] + unique_data[i+1]) / 2.0 for i in range(len(unique_data)-1)]
+                # return sorted(list(set(breaks)))[:num_classes-1]
+                # However, with jenkspy, it might be better to let it try and handle its output.
+                # If jenkspy cannot form num_classes, it might return fewer breaks or specific values.
+                pass  # Let jenkspy attempt it, its behavior for sparse data is specific to its C implementation.
+            else:
+                return []  # Cannot form breaks
+        try:
+            # jenkspy.jenks_breaks returns all boundaries, including min and max of data
+            # e.g., for n_classes=5, it returns 6 values: [min, break1, break2, break3, break4, max]
+            all_boundaries = jenkspy.jenks_breaks(
+                unique_data, n_classes=num_classes
+            )  # Use unique_data
+            # We need the inner breaks: [break1, break2, break3, break4]
+            if len(all_boundaries) > 2:  # Ensure there are inner breaks
+                inner_breaks = all_boundaries[1:-1]
+                return sorted(list(set(inner_breaks)))  # Ensure breaks are unique and sorted
+            else:
+                # This case implies n_classes=1 or data was so uniform jenkspy couldn't break it
+                return []
+        except Exception as e:
+            logger.warning(
+                f"jenkspy.jenks_breaks failed with {num_classes} classes for data (first 10 shown): {unique_data[:10]}. Error: {e}. Falling back to no breaks for this k."
+            )
+            return []  # Fallback if jenkspy fails
+    def _calculate_gvf(self, data: List[float], breaks: List[float]) -> float:
+        if not data:
+            return 0.0
+        overall_mean = sum(data) / len(data)
+        sdam = sum([(x - overall_mean) ** 2 for x in data])
+        if sdam == 0:
+            return 1.0  # Perfect fit if all data points are the same
+        sdcm = 0.0
+        all_breaks = [-float("inf")] + breaks + [float("inf")]
+        for i in range(len(all_breaks) - 1):
+            lower_bound = all_breaks[i]
+            upper_bound = all_breaks[i + 1]
+            cluster = [x for x in data if x > lower_bound and x <= upper_bound]
+            if not cluster:
+                continue
+            cluster_mean = sum(cluster) / len(cluster)
+            sdcm += sum([(x - cluster_mean) ** 2 for x in cluster])
+        return (sdam - sdcm) / sdam if sdam > 0 else 1.0
+    def _get_font_size_bucket_mapper(
+        self, all_font_sizes: List[float], config: Union[List[float], int, str]
+    ) -> Tuple[Optional[Callable[[float], int]], int]:
+        if not all_font_sizes:
+            return None, 0
+        unique_font_sizes = sorted(list(set(s for s in all_font_sizes if s is not None)))
+        if not unique_font_sizes:
+            return None, 0
+        # Apply sampling if too many unique font sizes for Jenks input
+        jenks_input_data = unique_font_sizes
+        if len(unique_font_sizes) > MAX_UNIQUE_SIZES_FOR_JENKS_INPUT:
+            logger.debug(
+                f"Sampling {MAX_UNIQUE_SIZES_FOR_JENKS_INPUT} from {len(unique_font_sizes)} unique font sizes for Jenks."
+            )
+            # Simple uniform sampling from sorted unique values
+            indices = [
+                int(i * (len(unique_font_sizes) - 1) / (MAX_UNIQUE_SIZES_FOR_JENKS_INPUT - 1))
+                for i in range(MAX_UNIQUE_SIZES_FOR_JENKS_INPUT)
+            ]
+            jenks_input_data = [unique_font_sizes[i] for i in indices]
+            jenks_input_data = sorted(list(set(jenks_input_data)))  # Ensure still sorted and unique
+        breaks: List[float] = []
+        num_buckets = 0
+        if isinstance(config, list):  # Explicit boundaries
+            breaks = sorted(list(set(config)))  # Ensure sorted and unique
+            num_buckets = len(breaks) + 1
+        elif isinstance(config, int):  # User-defined number of buckets
+            num_buckets_to_find = config
+            if num_buckets_to_find <= 0:
+                logger.warning(f"Invalid number of buckets ({config}), disabling bucketing.")
+                return None, 0
+            if num_buckets_to_find == 1:
+                return (lambda size: 0), 1  # All in one bucket
+            if (
+                not jenks_input_data
+                or len(jenks_input_data) < num_buckets_to_find
+                and len(jenks_input_data) > 0
+            ):
+                logger.debug(
+                    f"Not enough unique font sizes ({len(jenks_input_data)}) to create {num_buckets_to_find} distinct buckets based on input data. Adjusting."
+                )
+                # Fallback to fewer buckets if not enough unique data points to separate
+                num_buckets_to_find = max(
+                    1, len(jenks_input_data) - 1 if len(jenks_input_data) > 1 else 1
+                )
+                if num_buckets_to_find == 1:
+                    return (lambda size: 0), 1
+            breaks = self._calculate_jenks_breaks(jenks_input_data, num_buckets_to_find)
+            num_buckets = len(breaks) + 1
+        elif config == "auto":
+            best_gvf = -1.0
+            best_breaks = []
+            best_k = 0
+            # Iterate from MIN_BUCKETS_FOR_AUTO up to a max (or len of data if smaller)
+            max_k_to_try = min(
+                DEFAULT_MAX_AUTO_BUCKETS,
+                len(jenks_input_data) if jenks_input_data else MIN_BUCKETS_FOR_AUTO,
+            )
+            if len(jenks_input_data) == 1:  # Only one unique font size
+                return (lambda size: 0), 1
+            for k_buckets in range(MIN_BUCKETS_FOR_AUTO, max_k_to_try + 1):
+                if k_buckets > len(
+                    jenks_input_data
+                ):  # Cannot have more buckets than unique data points
+                    break
+                current_breaks = self._calculate_jenks_breaks(jenks_input_data, k_buckets)
+                if (
+                    len(current_breaks) != k_buckets - 1
+                ):  # Jenks couldn't find enough distinct breaks
+                    # This can happen if data points are too few or clustered.
+                    # If we requested k_buckets, we expect k_buckets-1 breaks.
+                    # If we get fewer, it implies the effective number of buckets is less.
+                    # We should only proceed if number of breaks matches k_buckets-1 for a valid GVF.
+                    if (
+                        k_buckets > 1 and not current_breaks
+                    ):  # requested multiple buckets but got no breaks
+                        continue
+                    # else: proceed with fewer breaks which means fewer effective buckets for GVF.
+                gvf = self._calculate_gvf(jenks_input_data, current_breaks)
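+                # Simple strategy: pick k with highest GVF.
+                # More sophisticated: look for an elbow or significant GVF jump.
+                # NOTE(review): GVF generally does not decrease as k grows, so this
+                # strategy tends to select the largest k tried - confirm this is intended.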
+                if gvf > best_gvf:
+                    best_gvf = gvf
+                    best_breaks = current_breaks
+                    best_k = len(current_breaks) + 1  # Number of buckets is breaks + 1
+            breaks = best_breaks
+            num_buckets = best_k if best_k > 0 else 1  # Ensure at least 1 bucket
+            if num_buckets == 1 and breaks:  # If only 1 bucket, there should be no breaks
+                breaks = []
+            logger.debug(
+                f"Auto bucketing: Chose {num_buckets} buckets with GVF {best_gvf:.4f}. Breaks: {breaks}"
+            )
+        else:
+            return None, 0  # Invalid config or no bucketing
+        if not breaks and num_buckets > 1 and len(unique_font_sizes) > 1:
+            # This can happen if Jenks fails to find breaks for N > 1 buckets but config specified N > 1
+            # Or if auto chose num_buckets > 1 but ended up with no breaks.
+            # Fallback to treating all as one bucket if no breaks were determined for multiple requested buckets.
+            logger.debug(
+                f"No breaks determined for {num_buckets} requested buckets. Treating as 1 bucket."
+            )
+            num_buckets = 1
+        elif num_buckets <= 1 and breaks:  # Contradiction: 1 bucket should have no breaks
+            breaks = []
+            num_buckets = 1
+        final_breaks = sorted(list(set(breaks)))  # Ensure unique and sorted
+        if not final_breaks and len(unique_font_sizes) > 1 and num_buckets > 1:
+            # If still no breaks but we expect multiple buckets (e.g. config=2, unique_sizes=[10,12])
+            # This implies Jenks failed to produce breaks. Fallback to simpler split for 2 buckets.
+            if num_buckets == 2 and len(unique_font_sizes) >= 2:
+                mid_point = (unique_font_sizes[0] + unique_font_sizes[-1]) / 2.0
+                final_breaks = [mid_point]
+                logger.debug(f"Jenks failed for 2 buckets, using midpoint break: {final_breaks}")
+            else:  # For >2 buckets and no breaks, it defaults to 1 bucket effectively.
+                num_buckets = 1
+        elif final_breaks and num_buckets <= 1:
+            num_buckets = len(final_breaks) + 1  # Recalculate num_buckets from actual breaks
+        if num_buckets <= 1:  # If effectively one bucket (or no data to bucket)
+            return (lambda size: 0), 1
+        # Create a mapper function
+        def mapper(size: float) -> int:
+            if size is None:
+                return -1  # Or some other indicator for unbucketable
+            # Find which bucket the size falls into: scan the sorted breaks in order
+            # and return the index of the first break that is >= size (upper-inclusive).
+            bucket_index = 0
+            for i, break_val in enumerate(final_breaks):
+                if size <= break_val:
+                    return i
+            return len(final_breaks)  # Belongs to the last bucket
+        return mapper, num_buckets
     def analyze(
         self, page: "Page", options: Optional[TextStyleOptions] = None
     ) -> "ElementCollection":
-        """
-        Analyze text styles on a page, group elements, and add style attributes.
-        Args:
-            page: The Page object to analyze.
-            options: Override the analyzer's default TextStyleOptions for this run.
-        Returns:
-            ElementCollection containing all processed text elements (typically words)
-            with added 'style_label', 'style_key', and 'style_properties' attributes.
-        """
         from natural_pdf.elements.collections import ElementCollection
         current_options = options or self.options
@@ -71,27 +289,40 @@ class TextStyleAnalyzer:
             f"Starting text style analysis for page {page.number} with options: {current_options}"
         )
-        # Use page.words for better granularity
         text_elements = page.words
-        # Fallback if words are somehow empty/not generated
         if not text_elements:
-            text_elements = page.find_all("text").elements  # Get list from collection
+            text_elements = page.find_all("text").elements
-        # Skip empty pages or pages with no text elements
         if not text_elements:
             logger.warning(f"Page {page.number} has no text elements to analyze.")
             return ElementCollection([])
-        style_cache: Dict[Tuple, Dict[str, Any]] = (
-            {}
-        )  # Maps style_key_tuple -> {'label': str, 'properties': dict}
-        processed_elements: List["Element"] = []
+        # --- Font Size Bucketing Setup ---
+        self._font_size_bucket_mapper = None
+        self._font_size_bucket_count = 0
+        bucketing_config = getattr(current_options, "font_size_buckets", None)
+        if bucketing_config is not None:
+            all_page_font_sizes = [
+                el.size for el in text_elements if hasattr(el, "size") and el.size is not None
+            ]
+            if all_page_font_sizes:
+                self._font_size_bucket_mapper, self._font_size_bucket_count = (
+                    self._get_font_size_bucket_mapper(all_page_font_sizes, bucketing_config)
+                )
+                if self._font_size_bucket_mapper:
+                    logger.debug(
+                        f"Font size bucketing active with {self._font_size_bucket_count} buckets for page {page.number}."
+                    )
+            else:
+                logger.debug("No font sizes found on page for bucketing.")
+        # --- End Bucketing Setup ---
-        # Ensure consistent ordering for style key creation
+        style_cache: Dict[Tuple, Dict[str, Any]] = {}
+        processed_elements: List["Element"] = []
         group_by_keys = sorted(current_options.group_by)
         for element in text_elements:
-            # Skip elements without necessary attributes (e.g., non-text elements if find_all was used)
             if not hasattr(element, "text") or not hasattr(element, "size"):
                 logger.debug(f"Skipping element without text/size: {element}")
                 continue
@@ -102,60 +333,64 @@ class TextStyleAnalyzer:
                 if style_key not in style_cache:
                     label = self._generate_style_label(
-                        style_properties, current_options, len(style_cache) + 1
+                        style_properties,
+                        current_options,
+                        len(style_cache) + 1,
+                        self._font_size_bucket_count,
                     )
                     style_cache[style_key] = {"label": label, "properties": style_properties}
                     logger.debug(
                         f"New style detected (Key: {style_key}): Label='{label}', Props={style_properties}"
                     )
-                # Add attributes to the element
                 element.style_label = style_cache[style_key]["label"]
                 element.style_key = style_key
-                # Add the full properties dict for potential detailed inspection
                 element.style_properties = style_cache[style_key]["properties"]
+                element.font_bucket_name = style_cache[style_key]["properties"].get(
+                    "font_bucket_name"
+                )
                 processed_elements.append(element)
             except Exception as e:
                 logger.warning(
                     f"Error processing element {element} for text style: {e}", exc_info=True
                 )
-                # Optionally add element without style info or skip it
-                # processed_elements.append(element) # Add anyway?
-        # Optionally store a summary on the page
         page._text_styles_summary = style_cache
         logger.info(
             f"Finished text style analysis for page {page.number}. Found {len(style_cache)} unique styles."
         )
         return ElementCollection(processed_elements)
     def _extract_style_properties(
         self, element: "Element", options: TextStyleOptions
     ) -> Dict[str, Any]:
-        """
-        Extract style properties from a text element based on options.
-        Args:
-            element: Text element.
-            options: TextStyleOptions driving the extraction.
-        Returns:
-            Dictionary of extracted style properties.
-        """
         properties = {}
+        original_size = getattr(element, "size", None)
+        rounded_size = None
+        properties["original_size"] = original_size
-        # Font size
-        font_size = None
-        if hasattr(element, "size") and element.size is not None:
-            # Round based on tolerance
+        if original_size is not None:
             rounding_factor = 1.0 / options.size_tolerance
-            font_size = round(element.size * rounding_factor) / rounding_factor
-        properties["size"] = font_size
+            rounded_size = round(original_size * rounding_factor) / rounding_factor
+        properties["size"] = rounded_size  # For display in labels
+        properties["rounded_size"] = rounded_size  # Explicit storage
+        # Font size bucketing logic
+        properties["font_bucket_id"] = None
+        properties["font_bucket_name"] = None  # Initialize font_bucket_name
+        size_for_keying = rounded_size
+        if self._font_size_bucket_mapper and original_size is not None:
+            bucket_id = self._font_size_bucket_mapper(original_size)
+            properties["font_bucket_id"] = bucket_id
+            properties["font_bucket_name"] = self._get_bucket_name(
+                bucket_id, self._font_size_bucket_count
+            )
+            size_for_keying = bucket_id
+        properties["size_for_keying"] = size_for_keying
-        # Font name
         font_name = None
         normalized_font_name = None
         if hasattr(element, "fontname") and element.fontname is not None:
@@ -257,28 +492,50 @@ class TextStyleAnalyzer:
         return {"family": family, "weight": weight, "style": style}
     def _create_style_key(self, properties: Dict[str, Any], group_by_keys: List[str]) -> Tuple:
-        """Create a hashable tuple key based on selected properties."""
         key_parts = []
-        for key in group_by_keys:  # Use the pre-sorted list
-            value = properties.get(key)
-            # Ensure hashable - colors should already be tuples or basic types
-            if isinstance(value, list):  # Should not happen if _extract handled color correctly
+        for key in group_by_keys:
+            if key == "size":
+                value = properties.get("size_for_keying")  # Use the correct size value for keying
+            else:
+                value = properties.get(key)
+            if isinstance(value, list):
                 value = tuple(value)
             key_parts.append(value)
         return tuple(key_parts)
     def _generate_style_label(
-        self, properties: Dict[str, Any], options: TextStyleOptions, style_index: int
+        self,
+        properties: Dict[str, Any],
+        options: TextStyleOptions,
+        style_index: int,
+        num_font_buckets: int = 0,
     ) -> str:
-        """Generate a style label based on properties and options."""
         if not options.descriptive_labels:
+            # If bucketing is active and only 1 bucket, it's not very informative
+            is_meaningful_bucketing = (
+                self._font_size_bucket_mapper is not None and num_font_buckets > 1
+            )
+            bucket_id = properties.get("font_bucket_id")
+            if is_meaningful_bucketing and bucket_id is not None:
+                return f"{options.label_prefix} (Bucket {bucket_id + 1}) {style_index}"
             return f"{options.label_prefix} {style_index}"
         try:
             font_details = self._parse_font_name(properties.get("fontname", ""))
+            bucket_label_part = ""
+            bucket_id = properties.get("font_bucket_id")
+            # Only add bucket info if bucketing is active and meaningful (more than 1 bucket)
+            if (
+                self._font_size_bucket_mapper is not None
+                and num_font_buckets > 1
+                and bucket_id is not None
+            ):
+                bucket_label_part = f" (Bucket {bucket_id + 1})"  # Simple numeric label for now
             label_data = {
-                "size": properties.get("size", "?"),
+                "size": properties.get("rounded_size", "?"),  # Use rounded_size for display
                 "fontname": properties.get("fontname", "Unknown"),
                 "is_bold": properties.get("is_bold", False),
                 "is_italic": properties.get("is_italic", False),
@@ -292,6 +549,8 @@ class TextStyleAnalyzer:
             if label_data["weight"] and label_data["style"]:
                 label_data["style"] = " " + label_data["style"]
+            label_data["bucket_info"] = bucket_label_part
             # Handle color formatting for label
             color_val = label_data["color"]
             if isinstance(color_val, tuple):
@@ -303,12 +562,66 @@ class TextStyleAnalyzer:
             label_data["color_str"] = color_str
             # Format the label, handle potential missing keys in format string gracefully
-            label = options.label_format.format_map(defaultdict(str, label_data))
-            return label.strip().replace("  ", " ")  # Cleanup extra spaces
+            # Add {bucket_info} to default format string if not already customized by user?
+            # For now, user would need to add {bucket_info} to their custom label_format if they want it.
+            current_label_format = options.label_format
+            bucket_name_for_label = properties.get("font_bucket_name")
+            # Construct a bucket_info string if a bucket name exists and it's not already in the format
+            # And if there are multiple buckets to make it meaningful.
+            bucket_info_str = ""
+            if bucket_name_for_label and num_font_buckets > 1:
+                bucket_info_str = f" ({bucket_name_for_label})"
+            if "{bucket_info}" not in current_label_format and bucket_info_str:
+                current_label_format += " {bucket_info}"  # Placeholder name for format_map
+            # Populate label_data with the actual bucket string for the {bucket_info} placeholder
+            label_data["bucket_info"] = bucket_info_str
+            label = current_label_format.format_map(defaultdict(str, label_data))
+            return label.strip().replace("  ", " ")
         except Exception as e:
             logger.warning(
                 f"Error generating descriptive label for style {properties}: {e}. Falling back to numeric label."
             )
             # Fallback to numeric label on error
             return f"{options.label_prefix} {style_index}"
+    def _get_bucket_name(self, bucket_id: Optional[int], total_buckets: int) -> Optional[str]:
+        if bucket_id is None or not (0 <= bucket_id < total_buckets):
+            return None  # Or "N/A"
+        if total_buckets <= 0:  # Should not happen if called correctly
+            return f"Invalid Bucket {bucket_id}"
+        # Predefined human-readable names for up to 8 buckets
+        # Buckets are 0-indexed internally, names correspond to that index.
+        bucket_name_sets = {
+            1: ["standard"],
+            2: ["small", "large"],
+            3: ["small", "medium", "large"],
+            4: ["small", "medium", "large", "x-large"],
+            5: ["x-small", "small", "medium", "large", "x-large"],
+            6: ["x-small", "small", "medium", "large", "x-large", "xx-large"],
+            7: ["xx-small", "x-small", "small", "medium", "large", "x-large", "xx-large"],
+            8: [
+                "xx-small",
+                "x-small",
+                "small",
+                "medium",
+                "large",
+                "x-large",
+                "xx-large",
+                "xxx-large",
+            ],
+        }
+        if total_buckets in bucket_name_sets:
+            names = bucket_name_sets[total_buckets]
+            if 0 <= bucket_id < len(names):
+                return names[bucket_id]
+            else:  # Should not happen if bucket_id is valid for total_buckets
+                return f"Size Group {bucket_id}"
+        else:  # Fallback for more than 8 buckets or unhandled cases
+            return f"Size Group {bucket_id}"

natural_pdf/classification/manager.py CHANGED Viewed

@@ -69,7 +69,7 @@ class ClassificationManager:
         if not _CLASSIFICATION_AVAILABLE:
             raise ImportError(
                 "Classification dependencies missing. "
-                'Install with: pip install "natural-pdf[classification]"'
+                'Install with: pip install "natural-pdf[core-ml]"'
             )
         self.pipelines: Dict[Tuple[str, str], "Pipeline"] = (

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -407,7 +407,17 @@ class ElementManager:
                     char_dict_data = ocr_char_dict  # Use the one we already created
                     char_dict_data["object_type"] = "char"  # Mark as char type
                     char_dict_data.setdefault("adv", char_dict_data.get("width", 0))
-                    self._elements["chars"].append(char_dict_data)  # Append the dictionary
+                    # Create a TextElement for the char representation
+                    # Ensure _char_dicts is handled correctly by TextElement constructor
+                    # For an OCR word represented as a char, its _char_dicts can be a list containing its own data
+                    char_element_specific_data = char_dict_data.copy()
+                    char_element_specific_data["_char_dicts"] = [char_dict_data.copy()]
+                    ocr_char_as_element = TextElement(char_element_specific_data, self._page)
+                    self._elements["chars"].append(
+                        ocr_char_as_element
+                    )  # Append TextElement instance
             except (KeyError, ValueError, TypeError) as e:
                 logger.error(f"Failed to process OCR result: {result}. Error: {e}", exc_info=True)

natural-pdf 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

natural-pdf 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl