PyPI - natural-pdf - Versions diffs - 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl - Mend

natural-pdf 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49)

bad_pdf_analysis/analyze_10_more.py +300 -0
bad_pdf_analysis/analyze_final_10.py +552 -0
bad_pdf_analysis/analyze_specific_pages.py +394 -0
bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
natural_pdf/analyzers/layout/layout_manager.py +44 -0
natural_pdf/analyzers/layout/surya.py +1 -1
natural_pdf/analyzers/shape_detection_mixin.py +228 -0
natural_pdf/classification/manager.py +67 -0
natural_pdf/core/element_manager.py +556 -25
natural_pdf/core/highlighting_service.py +98 -43
natural_pdf/core/page.py +86 -20
natural_pdf/core/pdf.py +0 -2
natural_pdf/describe/base.py +40 -9
natural_pdf/describe/elements.py +11 -6
natural_pdf/elements/base.py +134 -20
natural_pdf/elements/collections.py +43 -11
natural_pdf/elements/image.py +43 -0
natural_pdf/elements/region.py +64 -19
natural_pdf/elements/text.py +89 -11
natural_pdf/flows/collections.py +4 -4
natural_pdf/flows/region.py +17 -2
natural_pdf/ocr/ocr_manager.py +50 -0
natural_pdf/selectors/parser.py +27 -7
natural_pdf/tables/__init__.py +5 -0
natural_pdf/tables/result.py +101 -0
natural_pdf/utils/bidi_mirror.py +36 -0
natural_pdf/utils/visualization.py +15 -1
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +48 -26
natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
optimization/memory_comparison.py +172 -0
optimization/pdf_analyzer.py +410 -0
optimization/performance_analysis.py +397 -0
optimization/test_cleanup_methods.py +155 -0
optimization/test_memory_fix.py +162 -0
tools/bad_pdf_eval/__init__.py +1 -0
tools/bad_pdf_eval/analyser.py +302 -0
tools/bad_pdf_eval/collate_summaries.py +130 -0
tools/bad_pdf_eval/eval_suite.py +116 -0
tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
tools/bad_pdf_eval/llm_enrich.py +273 -0
tools/bad_pdf_eval/reporter.py +17 -0
tools/bad_pdf_eval/utils.py +127 -0
tools/rtl_smoke_test.py +80 -0
natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -15,9 +15,37 @@ from pdfplumber.utils.text import WordExtractor
 from natural_pdf.elements.line import LineElement
 from natural_pdf.elements.rect import RectangleElement
 from natural_pdf.elements.text import TextElement
+from natural_pdf.elements.image import ImageElement
 logger = logging.getLogger(__name__)
+# ------------------------------------------------------------------
+#  Default decoration-detection parameters (magic numbers centralised)
+# ------------------------------------------------------------------
+STRIKE_DEFAULTS = {
+    "thickness_tol": 1.5,   # pt ; max height of line/rect to be considered strike
+    "horiz_tol": 1.0,       # pt ; vertical tolerance for horizontality
+    "coverage_ratio": 0.7,  # proportion of glyph width to be overlapped
+    "band_top_frac": 0.35,  # fraction of glyph height above top baseline band
+    "band_bottom_frac": 0.65,  # fraction below top (same used internally)
+}
+UNDERLINE_DEFAULTS = {
+    "thickness_tol": 1.5,
+    "horiz_tol": 1.0,
+    "coverage_ratio": 0.8,
+    "band_frac": 0.25,   # height fraction above baseline
+    "below_pad": 0.7,    # pt ; pad below baseline
+}
+HIGHLIGHT_DEFAULTS = {
+    "height_min_ratio": 0.6,  # rect height relative to char height lower bound
+    "height_max_ratio": 2.0,  # upper bound
+    "coverage_ratio": 0.6,    # horizontal overlap with glyph
+    "color_saturation_min": 0.4,  # HSV S >
+    "color_value_min": 0.4,        # HSV V >
+}
 class NaturalWordExtractor(WordExtractor):
     """
@@ -125,6 +153,54 @@ class ElementManager:
             f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
         )
+        # -------------------------------------------------------------
+        # Detect strikethrough (horizontal strike-out lines) on raw
+        # characters BEFORE we run any word-grouping.  This way the
+        # NaturalWordExtractor can use the presence/absence of a
+        # "strike" attribute to decide whether two neighbouring chars
+        # belong to the same word.
+        # -------------------------------------------------------------
+        try:
+            self._mark_strikethrough_chars(prepared_char_dicts)
+        except Exception as strike_err:  # pragma: no cover – strike detection must never crash loading
+            logger.warning(
+                f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
+                exc_info=True,
+            )
+        # -------------------------------------------------------------
+        # Detect underlines on raw characters (must come after strike so
+        # both attributes are present before word grouping).
+        # -------------------------------------------------------------
+        try:
+            self._mark_underline_chars(prepared_char_dicts)
+        except Exception as u_err:  # pragma: no cover
+            logger.warning(
+                f"Page {self._page.number}: Underline detection failed – {u_err}",
+                exc_info=True,
+            )
+        # Detect highlights
+        try:
+            self._mark_highlight_chars(prepared_char_dicts)
+        except Exception as h_err:
+            logger.warning(
+                f"Page {self._page.number}: Highlight detection failed – {h_err}",
+                exc_info=True,
+            )
+        # Create a mapping from character dict to index for efficient lookup
+        char_to_index = {}
+        for idx, char_dict in enumerate(prepared_char_dicts):
+            key = (
+                char_dict.get("x0", 0),
+                char_dict.get("top", 0),
+                char_dict.get("text", ""),
+            )
+            char_to_index[key] = idx
         # 2. Instantiate the custom word extractor
         # Get config settings from the parent PDF or use defaults
         pdf_config = getattr(self._page._parent, "_config", {})
@@ -134,45 +210,230 @@ class ElementManager:
         # Define which attributes to preserve on the merged word object
         # Should include split attributes + any others needed for filtering (like color)
-        attributes_to_preserve = list(set(self._word_split_attributes + ["non_stroking_color"]))
-        # Pass our configured attributes for splitting
-        extractor = NaturalWordExtractor(
-            word_split_attributes=self._word_split_attributes,
-            extra_attrs=attributes_to_preserve,
-            x_tolerance=xt,
-            y_tolerance=yt,
-            keep_blank_chars=True,
-            use_text_flow=use_flow,
-            # Assuming default directions are okay, configure if needed
-            # line_dir=..., char_dir=...
+        attributes_to_preserve = list(
+            set(
+                self._word_split_attributes
+                + [
+                    "non_stroking_color",
+                    "strike",
+                    "underline",
+                    "highlight",
+                    "highlight_color",
+                ]
+            )
         )
-        # 3. Generate words using the extractor
-        generated_words = []
-        if prepared_char_dicts:
-            # Sort chars primarily by upright status, then page reading order
-            # Grouping by upright is crucial for WordExtractor's direction logic
-            sorted_chars_for_extraction = sorted(
-                prepared_char_dicts,
-                key=lambda c: (c.get("upright", True), round(c.get("top", 0)), c.get("x0", 0)),
+        # -------------------------------------------------------------
+        # NEW: Detect direction (LTR vs RTL) per visual line and feed
+        #       pdfplumber's WordExtractor with the correct settings.
+        # -------------------------------------------------------------
+        import unicodedata
+        def _is_rtl_char(ch: str) -> bool:
+            """Return True if the character has an RTL bidi class."""
+            if not ch:
+                return False
+            # If string has more than one character take first (works for most PDFs)
+            first = ch[0]
+            try:
+                return unicodedata.bidirectional(first) in ("R", "AL", "AN")
+            except Exception:
+                return False
+        # Helper: group characters into visual lines using y-tolerance
+        sorted_chars_for_line_grouping = sorted(
+            prepared_char_dicts,
+            key=lambda c: (round(c.get("top", 0) / max(yt, 1)) * yt, c.get("x0", 0)),
+        )
+        lines: List[List[Dict[str, Any]]] = []
+        current_line_key = None
+        for char_dict in sorted_chars_for_line_grouping:
+            top_val = char_dict.get("top", 0)
+            line_key = round(top_val / max(yt, 1))  # bucket index
+            if current_line_key is None or line_key != current_line_key:
+                # start new line bucket
+                lines.append([])
+                current_line_key = line_key
+            lines[-1].append(char_dict)
+        word_elements: List[TextElement] = []
+        # Process each line separately with direction detection
+        for line_chars in lines:
+            if not line_chars:
+                continue
+            # Determine RTL ratio
+            rtl_count = sum(1 for ch in line_chars if _is_rtl_char(ch.get("text", "")))
+            ltr_count = len(line_chars) - rtl_count
+            # Consider RTL if it has strictly more RTL than LTR strong characters
+            is_rtl_line = rtl_count > ltr_count
+            # Build a WordExtractor tailored for this line's direction
+            if is_rtl_line:
+                line_dir = "ttb"  # horizontal lines stacked top→bottom
+                char_dir = "rtl"  # characters right→left within the line
+            else:
+                line_dir = "ttb"
+                char_dir = "ltr"
+            extractor = NaturalWordExtractor(
+                word_split_attributes=self._word_split_attributes + ["strike", "underline", "highlight"],
+                extra_attrs=attributes_to_preserve,
+                x_tolerance=xt,
+                y_tolerance=yt,
+                keep_blank_chars=True,
+                use_text_flow=use_flow,
+                line_dir=line_dir,
+                char_dir=char_dir,
             )
-            word_tuples = extractor.iter_extract_tuples(sorted_chars_for_extraction)
+            # Prepare character sequence for the extractor:
+            #  • For LTR lines -> left→right order (x0 ascending)
+            #  • For RTL lines -> feed **reversed** list so that neighbouring
+            #    characters appear adjacent when the extractor walks right→left.
+            line_chars_for_extractor = sorted(line_chars, key=lambda c: c.get("x0", 0))
+            try:
+                word_tuples = extractor.iter_extract_tuples(line_chars_for_extractor)
+            except Exception as e:  # pragma: no cover
+                logger.error(
+                    f"Word extraction failed on line (rtl={is_rtl_line}) of page {self._page.number}: {e}",
+                    exc_info=True,
+                )
+                word_tuples = []
             for word_dict, char_list in word_tuples:
-                # Convert the generated word_dict to a TextElement
-                word_dict["_char_dicts"] = char_list
+                # Memory optimisation for char indices
+                char_indices = []
+                for char_dict in char_list:
+                    key = (
+                        char_dict.get("x0", 0),
+                        char_dict.get("top", 0),
+                        char_dict.get("text", ""),
+                    )
+                    # char_to_index dict built earlier in load_elements
+                    if key in char_to_index:
+                        char_indices.append(char_to_index[key])
+                word_dict["_char_indices"] = char_indices
+                word_dict["_char_dicts"] = char_list  # keep for back-compat
+                # Create and append TextElement
                 word_element = self._create_word_element(word_dict)
-                generated_words.append(word_element)
+                word_elements.append(word_element)
+                # Decide if this individual word contains RTL characters; safer than relying
+                # on the whole-line heuristic.
+                rtl_in_word = any(_is_rtl_char(ch.get("text", "")) for ch in char_list)
+                if rtl_in_word:
+                    try:
+                        from bidi.algorithm import get_display  # type: ignore
+                        from natural_pdf.utils.bidi_mirror import mirror_brackets
+                        word_element.text = mirror_brackets(
+                            get_display(word_element.text, base_dir="R")
+                        )
+                    except Exception:
+                        # Fallback: keep original text if python-bidi fails
+                        pass
+        # ------------------------------------------------------------------
+        #  Propagate per-char strikethrough info up to word level.
+        # ------------------------------------------------------------------
+        if prepared_char_dicts:
+            for w in word_elements:
+                strike_chars = 0
+                total_chars = 0
+                if getattr(w, "_char_indices", None):
+                    for idx in w._char_indices:
+                        if 0 <= idx < len(prepared_char_dicts):
+                            total_chars += 1
+                            if prepared_char_dicts[idx].get("strike"):
+                                strike_chars += 1
+                elif getattr(w, "_char_dicts", None):
+                    for ch in w._char_dicts:
+                        total_chars += 1
+                        if ch.get("strike"):
+                            strike_chars += 1
+                if total_chars:
+                    w._obj["strike"] = (strike_chars / total_chars) >= 0.6
+                else:
+                    w._obj["strike"] = False
+                # underline propagation
+                ul_chars = 0
+                if getattr(w, "_char_indices", None):
+                    for idx in w._char_indices:
+                        if 0 <= idx < len(prepared_char_dicts):
+                            if prepared_char_dicts[idx].get("underline"):
+                                ul_chars += 1
+                elif getattr(w, "_char_dicts", None):
+                    ul_chars = sum(1 for ch in w._char_dicts if ch.get("underline"))
+                if total_chars:
+                    w._obj["underline"] = (ul_chars / total_chars) >= 0.6
+                else:
+                    w._obj["underline"] = False
+                # highlight propagation
+                hl_chars = 0
+                if getattr(w, "_char_indices", None):
+                    for idx in w._char_indices:
+                        if 0 <= idx < len(prepared_char_dicts):
+                            if prepared_char_dicts[idx].get("highlight"):
+                                hl_chars += 1
+                elif getattr(w, "_char_dicts", None):
+                    hl_chars = sum(1 for ch in w._char_dicts if ch.get("highlight"))
+                if total_chars:
+                    w._obj["highlight"] = (hl_chars / total_chars) >= 0.6
+                else:
+                    w._obj["highlight"] = False
+                # Determine dominant highlight color among chars
+                if w._obj.get("highlight"):
+                    color_counts = {}
+                    source_iter = (
+                        (prepared_char_dicts[idx] for idx in w._char_indices)
+                        if getattr(w, "_char_indices", None)
+                        else w._char_dicts if getattr(w, "_char_dicts", None) else []
+                    )
+                    for chd in source_iter:
+                        if chd.get("highlight") and chd.get("highlight_color") is not None:
+                            col = chd["highlight_color"]
+                            color_counts[col] = color_counts.get(col, 0) + 1
+                    if color_counts:
+                        dominant_color = max(color_counts.items(), key=lambda t: t[1])[0]
+                        try:
+                            w._obj["highlight_color"] = tuple(dominant_color) if isinstance(dominant_color, (list, tuple)) else dominant_color
+                        except Exception:
+                            w._obj["highlight_color"] = dominant_color
+        generated_words = word_elements
         logger.debug(
             f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
         )
+        # --- Post-processing pass to ensure every word containing RTL characters is
+        #     stored in logical order and with mirrored brackets.  This is a
+        #     safeguard in case the per-line loop above missed some tokens.
+        try:
+            from bidi.algorithm import get_display  # type: ignore
+            from natural_pdf.utils.bidi_mirror import mirror_brackets
+            for w in generated_words:
+                if any(_is_rtl_char(ch) for ch in w.text):
+                    w.text = mirror_brackets(get_display(w.text, base_dir="R"))
+        except Exception:
+            pass  # graceful degradation – keep original text
         # 4. Load other elements (rects, lines)
         rect_elements = [RectangleElement(r, self._page) for r in self._page._page.rects]
         line_elements = [LineElement(l, self._page) for l in self._page._page.lines]
+        image_elements = [ImageElement(i, self._page) for i in self._page._page.images]
         logger.debug(
-            f"Page {self._page.number}: Loaded {len(rect_elements)} rects, {len(line_elements)} lines."
+            f"Page {self._page.number}: Loaded {len(rect_elements)} rects, {len(line_elements)} lines, {len(image_elements)} images."
         )
         # 5. Create the final elements dictionary
@@ -183,6 +444,7 @@ class ElementManager:
             "words": generated_words,
             "rects": rect_elements,
             "lines": line_elements,
+            "images": image_elements,
         }
         # Add regions if they exist
@@ -238,6 +500,11 @@ class ElementManager:
             augmented_dict.setdefault("upright", True)
             augmented_dict.setdefault("fontname", "Unknown")
             augmented_dict.setdefault("size", 0)
+            augmented_dict.setdefault("highlight_color", None)
+            # Ensure decoration keys exist for safe grouping
+            augmented_dict.setdefault("strike", False)
+            augmented_dict.setdefault("underline", False)
+            augmented_dict.setdefault("highlight", False)
             prepared_dicts.append(augmented_dict)
             # Use a unique identifier if available (e.g., tuple of key properties)
@@ -385,12 +652,21 @@ class ElementManager:
                     "italic": False,
                     "upright": True,
                     "doctop": pdf_top + self._page._page.initial_doctop,
+                    "strike": False,
+                    "underline": False,
+                    "highlight": False,
+                    "highlight_color": None,
                 }
                 # Create the representative char dict for this OCR word
                 ocr_char_dict = word_element_data.copy()
                 ocr_char_dict["object_type"] = "char"
                 ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
+                # Ensure decoration keys
+                ocr_char_dict.setdefault("strike", False)
+                ocr_char_dict.setdefault("underline", False)
+                ocr_char_dict.setdefault("highlight", False)
+                ocr_char_dict.setdefault("highlight_color", None)
                 # Add the char dict list to the word data before creating TextElement
                 word_element_data["_char_dicts"] = [ocr_char_dict]  # Store itself as its only char
@@ -550,6 +826,12 @@ class ElementManager:
         self.load_elements()
         return self._elements.get("regions", [])
+    @property
+    def images(self):
+        """Get all image elements."""
+        self.load_elements()
+        return self._elements.get("images", [])
     def remove_ocr_elements(self):
         """
         Remove all elements with source="ocr" from the elements dictionary.
@@ -632,3 +914,252 @@ class ElementManager:
                 return True
         return False
+    # ------------------------------------------------------------------
+    #  Strikethrough detection (horizontal strike-out lines)
+    # ------------------------------------------------------------------
+    def _mark_strikethrough_chars(self, char_dicts: List[Dict[str, Any]], *,
+                                  thickness_tol: float = 1.5,
+                                  horiz_tol: float = 1.0,
+                                  coverage_ratio: float = 0.7,
+                                  band_top: float = 0.35,
+                                  band_bottom: float = 0.65) -> None:
+        """Annotate character dictionaries with a boolean ``strike`` flag.
+        Args
+        ----
+        char_dicts : list
+            The list that _prepare_char_dicts() returned – *modified in place*.
+        thickness_tol : float
+            Maximum height (in PDF pts) for a path to be considered a strike.
+        horiz_tol : float
+            Vertical tolerance when deciding if a pdfplumber ``line`` object
+            is horizontal (|y0-y1| ≤ horiz_tol).
+        coverage_ratio : float
+            Minimum proportion of the glyph's width that must be overlapped
+            by a candidate line.
+        band_top, band_bottom : float
+            Fractions of the glyph's height that define the central band in
+            which a line must fall to count as a strikethrough.  Defaults to
+            35–65 %.
+        """
+        # -------------------------------------------------------------
+        # Collect candidate horizontal primitives (lines + skinny rects)
+        # -------------------------------------------------------------
+        raw_lines = list(getattr(self._page._page, "lines", []))
+        raw_rects = list(getattr(self._page._page, "rects", []))
+        candidates: List[Tuple[float, float, float, float]] = []  # (x0, y0, x1, y1)
+        # pdfplumber line objects – treat those whose angle ≈ 0°
+        for ln in raw_lines:
+            y0 = min(ln.get("y0", 0), ln.get("y1", 0))
+            y1 = max(ln.get("y0", 0), ln.get("y1", 0))
+            if abs(y1 - y0) <= horiz_tol:  # horizontal
+                candidates.append((ln.get("x0", 0), y0, ln.get("x1", 0), y1))
+        # Thin rectangles that act as drawn lines
+        pg_height = self._page.height
+        for rc in raw_rects:
+            rb0 = rc.get("y0", 0)
+            rb1 = rc.get("y1", 0)
+            y0_raw = min(rb0, rb1)
+            y1_raw = max(rb0, rb1)
+            if (y1_raw - y0_raw) <= thickness_tol:
+                # Convert from PDF (origin bottom-left) to top-based coords used by chars
+                y0 = pg_height - y1_raw  # upper edge distance from top
+                y1 = pg_height - y0_raw  # lower edge distance from top
+                candidates.append((rc.get("x0", 0), y0, rc.get("x1", 0), y1))
+        if not candidates:
+            return  # nothing to mark
+        # -------------------------------------------------------------
+        # Walk through characters and flag those crossed by a candidate
+        # -------------------------------------------------------------
+        for ch in char_dicts:
+            ch.setdefault("strike", False)  # default value
+            try:
+                x0, top, x1, bottom = ch["x0"], ch["top"], ch["x1"], ch["bottom"]
+            except KeyError:
+                continue  # skip malformed char dict
+            width = x1 - x0
+            height = bottom - top
+            if width <= 0 or height <= 0:
+                continue
+            mid_y0 = top + band_top * height
+            mid_y1 = top + band_bottom * height
+            # Check each candidate line for overlap
+            for lx0, ly0, lx1, ly1 in candidates:
+                if (ly0 >= (mid_y0 - 1.0)) and (ly1 <= (mid_y1 + 1.0)):  # lies inside central band
+                    overlap = min(x1, lx1) - max(x0, lx0)
+                    if overlap > 0 and (overlap / width) >= coverage_ratio:
+                        ch["strike"] = True
+                        break  # no need to check further lines
+        # Done – char_dicts mutated in place
+    # ------------------------------------------------------------------
+    #  Underline detection
+    # ------------------------------------------------------------------
+    def _mark_underline_chars(
+        self,
+        char_dicts: List[Dict[str, Any]],
+        *,
+        thickness_tol: float = None,
+        horiz_tol: float = None,
+        coverage_ratio: float = None,
+        band_frac: float = None,
+        below_pad: float = None,
+    ) -> None:
+        """Annotate character dicts with ``underline`` flag."""
+        # Allow user overrides via PDF._config["underline_detection"]
+        pdf_cfg = getattr(self._page._parent, "_config", {}).get("underline_detection", {})
+        thickness_tol = thickness_tol if thickness_tol is not None else pdf_cfg.get("thickness_tol", UNDERLINE_DEFAULTS["thickness_tol"])
+        horiz_tol     = horiz_tol     if horiz_tol     is not None else pdf_cfg.get("horiz_tol", UNDERLINE_DEFAULTS["horiz_tol"])
+        coverage_ratio= coverage_ratio if coverage_ratio is not None else pdf_cfg.get("coverage_ratio", UNDERLINE_DEFAULTS["coverage_ratio"])
+        band_frac     = band_frac     if band_frac     is not None else pdf_cfg.get("band_frac", UNDERLINE_DEFAULTS["band_frac"])
+        below_pad     = below_pad     if below_pad     is not None else pdf_cfg.get("below_pad", UNDERLINE_DEFAULTS["below_pad"])
+        raw_lines = list(getattr(self._page._page, "lines", []))
+        raw_rects = list(getattr(self._page._page, "rects", []))
+        candidates: List[Tuple[float, float, float, float]] = []
+        for ln in raw_lines:
+            y0 = min(ln.get("y0", 0), ln.get("y1", 0))
+            y1 = max(ln.get("y0", 0), ln.get("y1", 0))
+            if abs(y1 - y0) <= horiz_tol and (
+                (ln.get("x1", 0) - ln.get("x0", 0)) < self._page.width * 0.95
+            ):  # ignore full-width rules
+                candidates.append((ln.get("x0", 0), y0, ln.get("x1", 0), y1))
+        pg_height = self._page.height
+        for rc in raw_rects:
+            rb0 = rc.get("y0", 0)
+            rb1 = rc.get("y1", 0)
+            y0_raw = min(rb0, rb1)
+            y1_raw = max(rb0, rb1)
+            if (y1_raw - y0_raw) <= thickness_tol and (
+                (rc.get("x1", 0) - rc.get("x0", 0)) < self._page.width * 0.95
+            ):
+                y0 = pg_height - y1_raw
+                y1 = pg_height - y0_raw
+                candidates.append((rc.get("x0", 0), y0, rc.get("x1", 0), y1))
+        if not candidates:
+            for ch in char_dicts:
+                ch.setdefault("underline", False)
+            return
+        # group candidates by y within tolerance 0.5 to detect repeating table borders
+        y_groups: Dict[int, int] = {}
+        for _, y0, _, y1 in candidates:
+            key = int((y0 + y1) / 2)
+            y_groups[key] = y_groups.get(key, 0) + 1
+        table_y = {k for k, v in y_groups.items() if v >= 3}
+        # filter out candidates on those y values
+        filtered_candidates = [c for c in candidates if int((c[1]+c[3])/2) not in table_y]
+        # annotate chars
+        for ch in char_dicts:
+            ch.setdefault("underline", False)
+            try:
+                x0, top, x1, bottom = ch["x0"], ch["top"], ch["x1"], ch["bottom"]
+            except KeyError:
+                continue
+            width = x1 - x0
+            height = bottom - top
+            if width <= 0 or height <= 0:
+                continue
+            band_top = bottom - band_frac * height
+            band_bottom = bottom + below_pad  # allow some distance below baseline
+            for lx0, ly0, lx1, ly1 in filtered_candidates:
+                if (ly0 >= band_top - 1) and (ly1 <= band_bottom + 1):
+                    overlap = min(x1, lx1) - max(x0, lx0)
+                    if overlap > 0 and (overlap / width) >= coverage_ratio:
+                        ch["underline"] = True
+                        break
+    # ------------------------------------------------------------------
+    #  Highlight detection
+    # ------------------------------------------------------------------
+    def _mark_highlight_chars(self, char_dicts: List[Dict[str, Any]]) -> None:
+        """Detect PDF marker-style highlights and set ``highlight`` on char dicts."""
+        cfg = getattr(self._page._parent, "_config", {}).get("highlight_detection", {})
+        height_min_ratio = cfg.get("height_min_ratio", HIGHLIGHT_DEFAULTS["height_min_ratio"])
+        height_max_ratio = cfg.get("height_max_ratio", HIGHLIGHT_DEFAULTS["height_max_ratio"])
+        coverage_ratio = cfg.get("coverage_ratio", HIGHLIGHT_DEFAULTS["coverage_ratio"])
+        raw_rects = list(getattr(self._page._page, "rects", []))
+        pg_height = self._page.height
+        # Build list of candidate highlight rectangles (convert to top-based coords)
+        highlight_rects = []
+        for rc in raw_rects:
+            if rc.get("stroke", False):
+                continue  # border stroke, not fill-only
+            if not rc.get("fill", False):
+                continue
+            fill_col = rc.get("non_stroking_color")
+            # We keep colour as metadata but no longer filter on it
+            if fill_col is None:
+                continue
+            y0_rect = min(rc.get("y0", 0), rc.get("y1", 0))
+            y1_rect = max(rc.get("y0", 0), rc.get("y1", 0))
+            rheight = y1_rect - y0_rect
+            highlight_rects.append((rc.get("x0", 0), y0_rect, rc.get("x1", 0), y1_rect, rheight, fill_col))
+        if not highlight_rects:
+            for ch in char_dicts:
+                ch.setdefault("highlight", False)
+            return
+        for ch in char_dicts:
+            ch.setdefault("highlight", False)
+            try:
+                x0_raw, y0_raw, x1_raw, y1_raw = ch["x0"], ch["y0"], ch["x1"], ch["y1"]
+            except KeyError:
+                continue
+            width = x1_raw - x0_raw
+            height = y1_raw - y0_raw
+            if width <= 0 or height <= 0:
+                continue
+            for rx0, ry0, rx1, ry1, rheight, rcolor in highlight_rects:
+                # height ratio check relative to char
+                ratio = rheight / height if height else 0
+                if ratio < height_min_ratio or ratio > height_max_ratio:
+                    continue
+                # vertical containment in raw coords
+                if not (y0_raw + 1 >= ry0 and y1_raw - 1 <= ry1):
+                    continue
+                overlap = min(x1_raw, rx1) - max(x0_raw, rx0)
+                if overlap > 0 and (overlap / width) >= coverage_ratio:
+                    ch["highlight"] = True
+                    try:
+                        ch["highlight_color"] = tuple(rcolor) if isinstance(rcolor, (list, tuple)) else rcolor
+                    except Exception:
+                        ch["highlight_color"] = rcolor
+                    break

natural-pdf 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl

natural-pdf 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl