natural-pdf 0.1.28__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +44 -0
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  9. natural_pdf/classification/manager.py +67 -0
  10. natural_pdf/core/element_manager.py +578 -27
  11. natural_pdf/core/highlighting_service.py +98 -43
  12. natural_pdf/core/page.py +86 -20
  13. natural_pdf/core/pdf.py +0 -2
  14. natural_pdf/describe/base.py +40 -9
  15. natural_pdf/describe/elements.py +11 -6
  16. natural_pdf/elements/base.py +134 -20
  17. natural_pdf/elements/collections.py +43 -11
  18. natural_pdf/elements/image.py +43 -0
  19. natural_pdf/elements/region.py +64 -19
  20. natural_pdf/elements/text.py +118 -11
  21. natural_pdf/flows/collections.py +4 -4
  22. natural_pdf/flows/region.py +17 -2
  23. natural_pdf/ocr/ocr_manager.py +50 -0
  24. natural_pdf/selectors/parser.py +27 -7
  25. natural_pdf/tables/__init__.py +5 -0
  26. natural_pdf/tables/result.py +101 -0
  27. natural_pdf/utils/bidi_mirror.py +36 -0
  28. natural_pdf/utils/visualization.py +15 -1
  29. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/METADATA +2 -1
  30. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/RECORD +48 -26
  31. natural_pdf-0.1.31.dist-info/top_level.txt +6 -0
  32. optimization/memory_comparison.py +172 -0
  33. optimization/pdf_analyzer.py +410 -0
  34. optimization/performance_analysis.py +397 -0
  35. optimization/test_cleanup_methods.py +155 -0
  36. optimization/test_memory_fix.py +162 -0
  37. tools/bad_pdf_eval/__init__.py +1 -0
  38. tools/bad_pdf_eval/analyser.py +302 -0
  39. tools/bad_pdf_eval/collate_summaries.py +130 -0
  40. tools/bad_pdf_eval/eval_suite.py +116 -0
  41. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  42. tools/bad_pdf_eval/llm_enrich.py +273 -0
  43. tools/bad_pdf_eval/reporter.py +17 -0
  44. tools/bad_pdf_eval/utils.py +127 -0
  45. tools/rtl_smoke_test.py +80 -0
  46. natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
  47. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/WHEEL +0 -0
  48. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/entry_points.txt +0 -0
  49. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/element_manager.py
@@ -7,6 +7,7 @@ characters, words, rectangles, and lines extracted from a page.
 
  import logging
  import re
+ from contextlib import contextmanager
  from itertools import groupby
  from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -15,9 +16,64 @@ from pdfplumber.utils.text import WordExtractor
  from natural_pdf.elements.line import LineElement
  from natural_pdf.elements.rect import RectangleElement
  from natural_pdf.elements.text import TextElement
+ from natural_pdf.elements.image import ImageElement
 
  logger = logging.getLogger(__name__)
 
+ # ------------------------------------------------------------------
+ # Default decoration-detection parameters (magic numbers centralised)
+ # ------------------------------------------------------------------
+
+ STRIKE_DEFAULTS = {
+     "thickness_tol": 1.5,  # pt ; max height of line/rect to be considered strike
+     "horiz_tol": 1.0,  # pt ; vertical tolerance for horizontality
+     "coverage_ratio": 0.7,  # proportion of glyph width to be overlapped
+     "band_top_frac": 0.35,  # fraction of glyph height above top baseline band
+     "band_bottom_frac": 0.65,  # fraction below top (same used internally)
+ }
+
+ UNDERLINE_DEFAULTS = {
+     "thickness_tol": 1.5,
+     "horiz_tol": 1.0,
+     "coverage_ratio": 0.8,
+     "band_frac": 0.25,  # height fraction above baseline
+     "below_pad": 0.7,  # pt ; pad below baseline
+ }
+
+ HIGHLIGHT_DEFAULTS = {
+     "height_min_ratio": 0.6,  # rect height relative to char height lower bound
+     "height_max_ratio": 2.0,  # upper bound
+     "coverage_ratio": 0.6,  # horizontal overlap with glyph
+     "color_saturation_min": 0.4,  # HSV S >
+     "color_value_min": 0.4,  # HSV V >
+ }
+
+
+ @contextmanager
+ def disable_text_sync():
+     """
+     Temporarily disable text synchronization for performance.
+
+     This is used when bulk-updating text content where character-level
+     synchronization is not needed, such as during bidi processing.
+     Fixes exponential recursion issue with Arabic/RTL text processing.
+     """
+     # Save original setter
+     original_setter = TextElement.text.fset
+
+     # Create a fast setter that skips sync
+     def fast_setter(self, value):
+         self._obj["text"] = value
+         # Skip character synchronization for performance
+
+     # Apply fast setter
+     TextElement.text = property(TextElement.text.fget, fast_setter)
+
+     try:
+         yield
+     finally:
+         # Restore original setter
+         TextElement.text = property(TextElement.text.fget, original_setter)
 
  class NaturalWordExtractor(WordExtractor):
      """
@@ -125,6 +181,54 @@ class ElementManager:
              f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
          )
 
+         # -------------------------------------------------------------
+         # Detect strikethrough (horizontal strike-out lines) on raw
+         # characters BEFORE we run any word-grouping. This way the
+         # NaturalWordExtractor can use the presence/absence of a
+         # "strike" attribute to decide whether two neighbouring chars
+         # belong to the same word.
+         # -------------------------------------------------------------
+
+         try:
+             self._mark_strikethrough_chars(prepared_char_dicts)
+         except Exception as strike_err:  # pragma: no cover – strike detection must never crash loading
+             logger.warning(
+                 f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
+                 exc_info=True,
+             )
+
+         # -------------------------------------------------------------
+         # Detect underlines on raw characters (must come after strike so
+         # both attributes are present before word grouping).
+         # -------------------------------------------------------------
+
+         try:
+             self._mark_underline_chars(prepared_char_dicts)
+         except Exception as u_err:  # pragma: no cover
+             logger.warning(
+                 f"Page {self._page.number}: Underline detection failed – {u_err}",
+                 exc_info=True,
+             )
+
+         # Detect highlights
+         try:
+             self._mark_highlight_chars(prepared_char_dicts)
+         except Exception as h_err:
+             logger.warning(
+                 f"Page {self._page.number}: Highlight detection failed – {h_err}",
+                 exc_info=True,
+             )
+
+         # Create a mapping from character dict to index for efficient lookup
+         char_to_index = {}
+         for idx, char_dict in enumerate(prepared_char_dicts):
+             key = (
+                 char_dict.get("x0", 0),
+                 char_dict.get("top", 0),
+                 char_dict.get("text", ""),
+             )
+             char_to_index[key] = idx
+
          # 2. Instantiate the custom word extractor
          # Get config settings from the parent PDF or use defaults
          pdf_config = getattr(self._page._parent, "_config", {})
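
The `char_to_index` mapping built above keys each character by `(x0, top, text)` so that later word-building code can store compact integer indices (`_char_indices`) instead of copying whole character dicts onto every word. A small sketch of that lookup idea with made-up sample characters (purely illustrative data):

chars = [
    {"x0": 10.0, "top": 50.0, "text": "H"},
    {"x0": 16.5, "top": 50.0, "text": "i"},
]

# Key each char by position + glyph so it can be found again in O(1).
char_to_index = {
    (c.get("x0", 0), c.get("top", 0), c.get("text", "")): i
    for i, c in enumerate(chars)
}

# Characters that a word extractor grouped together, possibly reordered:
word_chars = [chars[1], chars[0]]
indices = [
    char_to_index[(c.get("x0", 0), c.get("top", 0), c.get("text", ""))]
    for c in word_chars
    if (c.get("x0", 0), c.get("top", 0), c.get("text", "")) in char_to_index
]
print(indices)  # -> [1, 0]; the word stores small ints, not duplicate dicts
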
@@ -132,38 +236,212 @@ class ElementManager:
          yt = pdf_config.get("y_tolerance", 3)
          use_flow = pdf_config.get("use_text_flow", False)
 
-         # Define which attributes to preserve on the merged word object
-         # Should include split attributes + any others needed for filtering (like color)
-         attributes_to_preserve = list(set(self._word_split_attributes + ["non_stroking_color"]))
-
-         # Pass our configured attributes for splitting
-         extractor = NaturalWordExtractor(
-             word_split_attributes=self._word_split_attributes,
-             extra_attrs=attributes_to_preserve,
-             x_tolerance=xt,
-             y_tolerance=yt,
-             keep_blank_chars=True,
-             use_text_flow=use_flow,
-             # Assuming default directions are okay, configure if needed
-             # line_dir=..., char_dir=...
+         # List of attributes to preserve on word objects
+         attributes_to_preserve = list(
+             set(
+                 self._word_split_attributes
+                 + [
+                     "non_stroking_color",
+                     "strike",
+                     "underline",
+                     "highlight",
+                     "highlight_color",
+                 ]
+             )
          )
 
-         # 3. Generate words using the extractor
-         generated_words = []
-         if prepared_char_dicts:
-             # Sort chars primarily by upright status, then page reading order
-             # Grouping by upright is crucial for WordExtractor's direction logic
-             sorted_chars_for_extraction = sorted(
-                 prepared_char_dicts,
-                 key=lambda c: (c.get("upright", True), round(c.get("top", 0)), c.get("x0", 0)),
+         # ------------------------------------------------------------------
+         # NEW: Detect direction (LTR vs RTL) per visual line and feed
+         # pdfplumber's WordExtractor with the correct settings.
+         # -------------------------------------------------------------
+         import unicodedata
+
+         def _is_rtl_char(ch: str) -> bool:
+             """Return True if the character has an RTL bidi class."""
+             if not ch:
+                 return False
+             # If string has more than one character take first (works for most PDFs)
+             first = ch[0]
+             try:
+                 return unicodedata.bidirectional(first) in ("R", "AL", "AN")
+             except Exception:
+                 return False
+
+         # Helper: group characters into visual lines using y-tolerance
+         sorted_chars_for_line_grouping = sorted(
+             prepared_char_dicts,
+             key=lambda c: (round(c.get("top", 0) / max(yt, 1)) * yt, c.get("x0", 0)),
+         )
+
+         lines: List[List[Dict[str, Any]]] = []
+         current_line_key = None
+         for char_dict in sorted_chars_for_line_grouping:
+             top_val = char_dict.get("top", 0)
+             line_key = round(top_val / max(yt, 1))  # bucket index
+             if current_line_key is None or line_key != current_line_key:
+                 # start new line bucket
+                 lines.append([])
+                 current_line_key = line_key
+             lines[-1].append(char_dict)
+
+         word_elements: List[TextElement] = []
+         # Process each line separately with direction detection
+         for line_chars in lines:
+             if not line_chars:
+                 continue
+             # Determine RTL ratio
+             rtl_count = sum(1 for ch in line_chars if _is_rtl_char(ch.get("text", "")))
+             ltr_count = len(line_chars) - rtl_count
+             # Consider RTL if it has strictly more RTL than LTR strong characters
+             is_rtl_line = rtl_count > ltr_count
+
+             # Build a WordExtractor tailored for this line's direction
+             if is_rtl_line:
+                 line_dir = "ttb"  # horizontal lines stacked top→bottom
+                 # Feed characters in right→left x-order; extractor can then treat
+                 # them as left-to-right so that resulting text stays logical.
+                 char_dir = "ltr"
+             else:
+                 line_dir = "ttb"
+                 char_dir = "ltr"
+
+             extractor = NaturalWordExtractor(
+                 word_split_attributes=self._word_split_attributes + ["strike", "underline", "highlight"],
+                 extra_attrs=attributes_to_preserve,
+                 x_tolerance=xt,
+                 y_tolerance=yt,
+                 keep_blank_chars=True,
+                 use_text_flow=use_flow,
+                 line_dir=line_dir,
+                 char_dir=char_dir,
              )
 
-             word_tuples = extractor.iter_extract_tuples(sorted_chars_for_extraction)
+             # Prepare character sequence for the extractor:
+             # Always feed characters in spatial order (x0 ascending)
+             # PDF stores glyphs in visual order, so this gives us the visual sequence
+             line_chars_for_extractor = sorted(line_chars, key=lambda c: c.get("x0", 0))
+
+             try:
+                 word_tuples = extractor.iter_extract_tuples(line_chars_for_extractor)
+             except Exception as e:  # pragma: no cover
+                 logger.error(
+                     f"Word extraction failed on line (rtl={is_rtl_line}) of page {self._page.number}: {e}",
+                     exc_info=True,
+                 )
+                 word_tuples = []
+
              for word_dict, char_list in word_tuples:
-                 # Convert the generated word_dict to a TextElement
-                 word_dict["_char_dicts"] = char_list
+                 # Memory optimisation for char indices
+                 char_indices = []
+                 for char_dict in char_list:
+                     key = (
+                         char_dict.get("x0", 0),
+                         char_dict.get("top", 0),
+                         char_dict.get("text", ""),
+                     )
+                     # char_to_index dict built earlier in load_elements
+                     if key in char_to_index:
+                         char_indices.append(char_to_index[key])
+                 word_dict["_char_indices"] = char_indices
+                 word_dict["_char_dicts"] = char_list  # keep for back-compat
+                 # Create and append TextElement
                  word_element = self._create_word_element(word_dict)
-                 generated_words.append(word_element)
+                 word_elements.append(word_element)
+
+                 # Decide if this individual word contains RTL characters; safer than relying
+                 # on the whole-line heuristic.
+                 rtl_in_word = any(_is_rtl_char(ch.get("text", "")) for ch in char_list)
+                 if rtl_in_word:
+                     # Convert from visual order (from PDF) to logical order using bidi
+                     try:
+                         from bidi.algorithm import get_display  # type: ignore
+                         from natural_pdf.utils.bidi_mirror import mirror_brackets
+
+                         with disable_text_sync():
+                             # word_element.text is currently in visual order (from PDF)
+                             # Convert to logical order using bidi with auto direction detection
+                             logical_text = get_display(word_element.text, base_dir='L')
+                             # Apply bracket mirroring for logical order
+                             word_element.text = mirror_brackets(logical_text)
+                     except Exception:
+                         pass
+
+         # ------------------------------------------------------------------
+         # Propagate per-char strikethrough info up to word level.
+         # ------------------------------------------------------------------
+
+         if prepared_char_dicts:
+             for w in word_elements:
+                 strike_chars = 0
+                 total_chars = 0
+                 if getattr(w, "_char_indices", None):
+                     for idx in w._char_indices:
+                         if 0 <= idx < len(prepared_char_dicts):
+                             total_chars += 1
+                             if prepared_char_dicts[idx].get("strike"):
+                                 strike_chars += 1
+                 elif getattr(w, "_char_dicts", None):
+                     for ch in w._char_dicts:
+                         total_chars += 1
+                         if ch.get("strike"):
+                             strike_chars += 1
+
+                 if total_chars:
+                     w._obj["strike"] = (strike_chars / total_chars) >= 0.6
+                 else:
+                     w._obj["strike"] = False
+
+                 # underline propagation
+                 ul_chars = 0
+                 if getattr(w, "_char_indices", None):
+                     for idx in w._char_indices:
+                         if 0 <= idx < len(prepared_char_dicts):
+                             if prepared_char_dicts[idx].get("underline"):
+                                 ul_chars += 1
+                 elif getattr(w, "_char_dicts", None):
+                     ul_chars = sum(1 for ch in w._char_dicts if ch.get("underline"))
+
+                 if total_chars:
+                     w._obj["underline"] = (ul_chars / total_chars) >= 0.6
+                 else:
+                     w._obj["underline"] = False
+
+                 # highlight propagation
+                 hl_chars = 0
+                 if getattr(w, "_char_indices", None):
+                     for idx in w._char_indices:
+                         if 0 <= idx < len(prepared_char_dicts):
+                             if prepared_char_dicts[idx].get("highlight"):
+                                 hl_chars += 1
+                 elif getattr(w, "_char_dicts", None):
+                     hl_chars = sum(1 for ch in w._char_dicts if ch.get("highlight"))
+
+                 if total_chars:
+                     w._obj["highlight"] = (hl_chars / total_chars) >= 0.6
+                 else:
+                     w._obj["highlight"] = False
+
+                 # Determine dominant highlight color among chars
+                 if w._obj.get("highlight"):
+                     color_counts = {}
+                     source_iter = (
+                         (prepared_char_dicts[idx] for idx in w._char_indices)
+                         if getattr(w, "_char_indices", None)
+                         else w._char_dicts if getattr(w, "_char_dicts", None) else []
+                     )
+                     for chd in source_iter:
+                         if chd.get("highlight") and chd.get("highlight_color") is not None:
+                             col = chd["highlight_color"]
+                             color_counts[col] = color_counts.get(col, 0) + 1
+
+                     if color_counts:
+                         dominant_color = max(color_counts.items(), key=lambda t: t[1])[0]
+                         try:
+                             w._obj["highlight_color"] = tuple(dominant_color) if isinstance(dominant_color, (list, tuple)) else dominant_color
+                         except Exception:
+                             w._obj["highlight_color"] = dominant_color
+
+         generated_words = word_elements
          logger.debug(
              f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
          )
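
The rewritten word loop above leans on two external pieces: `unicodedata.bidirectional` to decide whether a line is predominantly RTL, and python-bidi's `get_display` to turn the visual-order glyph sequence a PDF stores into logical-order text. A self-contained sketch of both steps, using escape sequences for the Arabic sample so the codepoint order is unambiguous (the sample data is illustrative; `mirror_brackets` from `natural_pdf.utils.bidi_mirror` is not reproduced here):

import unicodedata
from bidi.algorithm import get_display  # pip install python-bidi

def is_rtl_char(ch):
    """True for characters whose bidi class is strongly RTL (R, AL) or Arabic-number (AN)."""
    return bool(ch) and unicodedata.bidirectional(ch[0]) in ("R", "AL", "AN")

def line_is_rtl(chars):
    """Majority vote over a line, mirroring the rtl_count > ltr_count test above."""
    rtl = sum(1 for c in chars if is_rtl_char(c))
    return rtl > len(chars) - rtl

# "salam" stored the way a PDF lays it out: rightmost glyph first (visual order).
visual = "\u0645\u0627\u0644\u0633"           # MEEM, ALEF, LAM, SEEN
print(line_is_rtl(list(visual)))              # -> True, the line is treated as RTL
logical = get_display(visual, base_dir="L")   # bidi reordering restores logical order
print(logical == "\u0633\u0644\u0627\u0645")  # -> True: SEEN, LAM, ALEF, MEEM

Because bidi reordering of a pure RTL run is its own inverse, feeding visual-order text through `get_display` recovers the logical order, which is why the loop above can apply it per word inside `disable_text_sync()`.
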
@@ -171,8 +449,9 @@ class ElementManager:
          # 4. Load other elements (rects, lines)
          rect_elements = [RectangleElement(r, self._page) for r in self._page._page.rects]
          line_elements = [LineElement(l, self._page) for l in self._page._page.lines]
+         image_elements = [ImageElement(i, self._page) for i in self._page._page.images]
          logger.debug(
-             f"Page {self._page.number}: Loaded {len(rect_elements)} rects, {len(line_elements)} lines."
+             f"Page {self._page.number}: Loaded {len(rect_elements)} rects, {len(line_elements)} lines, {len(image_elements)} images."
          )
 
          # 5. Create the final elements dictionary
@@ -183,6 +462,7 @@ class ElementManager:
              "words": generated_words,
              "rects": rect_elements,
              "lines": line_elements,
+             "images": image_elements,
          }
 
          # Add regions if they exist
@@ -201,6 +481,8 @@ class ElementManager:
 
          logger.debug(f"Page {self._page.number}: Element loading complete.")
 
+         # If per-word BiDi was skipped, generated_words already stay in logical order.
+
      def _prepare_char_dicts(self) -> List[Dict[str, Any]]:
          """
          Prepares a list of character dictionaries from native PDF characters,
@@ -238,6 +520,11 @@ class ElementManager:
              augmented_dict.setdefault("upright", True)
              augmented_dict.setdefault("fontname", "Unknown")
              augmented_dict.setdefault("size", 0)
+             augmented_dict.setdefault("highlight_color", None)
+             # Ensure decoration keys exist for safe grouping
+             augmented_dict.setdefault("strike", False)
+             augmented_dict.setdefault("underline", False)
+             augmented_dict.setdefault("highlight", False)
 
              prepared_dicts.append(augmented_dict)
              # Use a unique identifier if available (e.g., tuple of key properties)
@@ -385,12 +672,21 @@ class ElementManager:
              "italic": False,
              "upright": True,
              "doctop": pdf_top + self._page._page.initial_doctop,
+             "strike": False,
+             "underline": False,
+             "highlight": False,
+             "highlight_color": None,
          }
 
          # Create the representative char dict for this OCR word
          ocr_char_dict = word_element_data.copy()
          ocr_char_dict["object_type"] = "char"
          ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
+         # Ensure decoration keys
+         ocr_char_dict.setdefault("strike", False)
+         ocr_char_dict.setdefault("underline", False)
+         ocr_char_dict.setdefault("highlight", False)
+         ocr_char_dict.setdefault("highlight_color", None)
 
          # Add the char dict list to the word data before creating TextElement
          word_element_data["_char_dicts"] = [ocr_char_dict]  # Store itself as its only char
@@ -550,6 +846,12 @@ class ElementManager:
          self.load_elements()
          return self._elements.get("regions", [])
 
+     @property
+     def images(self):
+         """Get all image elements."""
+         self.load_elements()
+         return self._elements.get("images", [])
+
      def remove_ocr_elements(self):
          """
          Remove all elements with source="ocr" from the elements dictionary.
@@ -632,3 +934,252 @@ class ElementManager:
                  return True
 
          return False
+
+     # ------------------------------------------------------------------
+     # Strikethrough detection (horizontal strike-out lines)
+     # ------------------------------------------------------------------
+
+     def _mark_strikethrough_chars(self, char_dicts: List[Dict[str, Any]], *,
+                                   thickness_tol: float = 1.5,
+                                   horiz_tol: float = 1.0,
+                                   coverage_ratio: float = 0.7,
+                                   band_top: float = 0.35,
+                                   band_bottom: float = 0.65) -> None:
+         """Annotate character dictionaries with a boolean ``strike`` flag.
+
+         Args
+         ----
+         char_dicts : list
+             The list that _prepare_char_dicts() returned – *modified in place*.
+         thickness_tol : float
+             Maximum height (in PDF pts) for a path to be considered a strike.
+         horiz_tol : float
+             Vertical tolerance when deciding if a pdfplumber ``line`` object
+             is horizontal (|y0-y1| ≤ horiz_tol).
+         coverage_ratio : float
+             Minimum proportion of the glyph's width that must be overlapped
+             by a candidate line.
+         band_top, band_bottom : float
+             Fractions of the glyph's height that define the central band in
+             which a line must fall to count as a strikethrough. Defaults to
+             35–65 %.
+         """
+
+         # -------------------------------------------------------------
+         # Collect candidate horizontal primitives (lines + skinny rects)
+         # -------------------------------------------------------------
+         raw_lines = list(getattr(self._page._page, "lines", []))
+         raw_rects = list(getattr(self._page._page, "rects", []))
+
+         candidates: List[Tuple[float, float, float, float]] = []  # (x0, y0, x1, y1)
+
+         # pdfplumber line objects – treat those whose angle ≈ 0°
+         for ln in raw_lines:
+             y0 = min(ln.get("y0", 0), ln.get("y1", 0))
+             y1 = max(ln.get("y0", 0), ln.get("y1", 0))
+             if abs(y1 - y0) <= horiz_tol:  # horizontal
+                 candidates.append((ln.get("x0", 0), y0, ln.get("x1", 0), y1))
+
+         # Thin rectangles that act as drawn lines
+         pg_height = self._page.height
+         for rc in raw_rects:
+             rb0 = rc.get("y0", 0)
+             rb1 = rc.get("y1", 0)
+             y0_raw = min(rb0, rb1)
+             y1_raw = max(rb0, rb1)
+             if (y1_raw - y0_raw) <= thickness_tol:
+                 # Convert from PDF (origin bottom-left) to top-based coords used by chars
+                 y0 = pg_height - y1_raw  # upper edge distance from top
+                 y1 = pg_height - y0_raw  # lower edge distance from top
+                 candidates.append((rc.get("x0", 0), y0, rc.get("x1", 0), y1))
+
+         if not candidates:
+             return  # nothing to mark
+
+         # -------------------------------------------------------------
+         # Walk through characters and flag those crossed by a candidate
+         # -------------------------------------------------------------
+         for ch in char_dicts:
+             ch.setdefault("strike", False)  # default value
+             try:
+                 x0, top, x1, bottom = ch["x0"], ch["top"], ch["x1"], ch["bottom"]
+             except KeyError:
+                 continue  # skip malformed char dict
+
+             width = x1 - x0
+             height = bottom - top
+             if width <= 0 or height <= 0:
+                 continue
+
+             mid_y0 = top + band_top * height
+             mid_y1 = top + band_bottom * height
+
+             # Check each candidate line for overlap
+             for lx0, ly0, lx1, ly1 in candidates:
+                 if (ly0 >= (mid_y0 - 1.0)) and (ly1 <= (mid_y1 + 1.0)):  # lies inside central band
+                     overlap = min(x1, lx1) - max(x0, lx0)
+                     if overlap > 0 and (overlap / width) >= coverage_ratio:
+                         ch["strike"] = True
+                         break  # no need to check further lines
+
+         # Done – char_dicts mutated in place
+
+     # ------------------------------------------------------------------
+     # Underline detection
+     # ------------------------------------------------------------------
+
+     def _mark_underline_chars(
+         self,
+         char_dicts: List[Dict[str, Any]],
+         *,
+         thickness_tol: float = None,
+         horiz_tol: float = None,
+         coverage_ratio: float = None,
+         band_frac: float = None,
+         below_pad: float = None,
+     ) -> None:
+         """Annotate character dicts with ``underline`` flag."""
+
+         # Allow user overrides via PDF._config["underline_detection"]
+         pdf_cfg = getattr(self._page._parent, "_config", {}).get("underline_detection", {})
+
+         thickness_tol = thickness_tol if thickness_tol is not None else pdf_cfg.get("thickness_tol", UNDERLINE_DEFAULTS["thickness_tol"])
+         horiz_tol = horiz_tol if horiz_tol is not None else pdf_cfg.get("horiz_tol", UNDERLINE_DEFAULTS["horiz_tol"])
+         coverage_ratio = coverage_ratio if coverage_ratio is not None else pdf_cfg.get("coverage_ratio", UNDERLINE_DEFAULTS["coverage_ratio"])
+         band_frac = band_frac if band_frac is not None else pdf_cfg.get("band_frac", UNDERLINE_DEFAULTS["band_frac"])
+         below_pad = below_pad if below_pad is not None else pdf_cfg.get("below_pad", UNDERLINE_DEFAULTS["below_pad"])
+
+         raw_lines = list(getattr(self._page._page, "lines", []))
+         raw_rects = list(getattr(self._page._page, "rects", []))
+
+         candidates: List[Tuple[float, float, float, float]] = []
+
+         for ln in raw_lines:
+             y0 = min(ln.get("y0", 0), ln.get("y1", 0))
+             y1 = max(ln.get("y0", 0), ln.get("y1", 0))
+             if abs(y1 - y0) <= horiz_tol and (
+                 (ln.get("x1", 0) - ln.get("x0", 0)) < self._page.width * 0.95
+             ):  # ignore full-width rules
+                 candidates.append((ln.get("x0", 0), y0, ln.get("x1", 0), y1))
+
+         pg_height = self._page.height
+         for rc in raw_rects:
+             rb0 = rc.get("y0", 0)
+             rb1 = rc.get("y1", 0)
+             y0_raw = min(rb0, rb1)
+             y1_raw = max(rb0, rb1)
+             if (y1_raw - y0_raw) <= thickness_tol and (
+                 (rc.get("x1", 0) - rc.get("x0", 0)) < self._page.width * 0.95
+             ):
+                 y0 = pg_height - y1_raw
+                 y1 = pg_height - y0_raw
+                 candidates.append((rc.get("x0", 0), y0, rc.get("x1", 0), y1))
+
+         if not candidates:
+             for ch in char_dicts:
+                 ch.setdefault("underline", False)
+             return
+
+         # group candidates by y within tolerance 0.5 to detect repeating table borders
+         y_groups: Dict[int, int] = {}
+         for _, y0, _, y1 in candidates:
+             key = int((y0 + y1) / 2)
+             y_groups[key] = y_groups.get(key, 0) + 1
+
+         table_y = {k for k, v in y_groups.items() if v >= 3}
+
+         # filter out candidates on those y values
+         filtered_candidates = [c for c in candidates if int((c[1] + c[3]) / 2) not in table_y]
+
+         # annotate chars
+         for ch in char_dicts:
+             ch.setdefault("underline", False)
+             try:
+                 x0, top, x1, bottom = ch["x0"], ch["top"], ch["x1"], ch["bottom"]
+             except KeyError:
+                 continue
+
+             width = x1 - x0
+             height = bottom - top
+             if width <= 0 or height <= 0:
+                 continue
+
+             band_top = bottom - band_frac * height
+             band_bottom = bottom + below_pad  # allow some distance below baseline
+
+             for lx0, ly0, lx1, ly1 in filtered_candidates:
+                 if (ly0 >= band_top - 1) and (ly1 <= band_bottom + 1):
+                     overlap = min(x1, lx1) - max(x0, lx0)
+                     if overlap > 0 and (overlap / width) >= coverage_ratio:
+                         ch["underline"] = True
+                         break
+
+     # ------------------------------------------------------------------
+     # Highlight detection
+     # ------------------------------------------------------------------
+
+     def _mark_highlight_chars(self, char_dicts: List[Dict[str, Any]]) -> None:
+         """Detect PDF marker-style highlights and set ``highlight`` on char dicts."""
+
+         cfg = getattr(self._page._parent, "_config", {}).get("highlight_detection", {})
+
+         height_min_ratio = cfg.get("height_min_ratio", HIGHLIGHT_DEFAULTS["height_min_ratio"])
+         height_max_ratio = cfg.get("height_max_ratio", HIGHLIGHT_DEFAULTS["height_max_ratio"])
+         coverage_ratio = cfg.get("coverage_ratio", HIGHLIGHT_DEFAULTS["coverage_ratio"])
+
+         raw_rects = list(getattr(self._page._page, "rects", []))
+         pg_height = self._page.height
+
+         # Build list of candidate highlight rectangles (convert to top-based coords)
+         highlight_rects = []
+         for rc in raw_rects:
+             if rc.get("stroke", False):
+                 continue  # border stroke, not fill-only
+             if not rc.get("fill", False):
+                 continue
+
+             fill_col = rc.get("non_stroking_color")
+             # We keep colour as metadata but no longer filter on it
+             if fill_col is None:
+                 continue
+
+             y0_rect = min(rc.get("y0", 0), rc.get("y1", 0))
+             y1_rect = max(rc.get("y0", 0), rc.get("y1", 0))
+             rheight = y1_rect - y0_rect
+             highlight_rects.append((rc.get("x0", 0), y0_rect, rc.get("x1", 0), y1_rect, rheight, fill_col))
+
+         if not highlight_rects:
+             for ch in char_dicts:
+                 ch.setdefault("highlight", False)
+             return
+
+         for ch in char_dicts:
+             ch.setdefault("highlight", False)
+             try:
+                 x0_raw, y0_raw, x1_raw, y1_raw = ch["x0"], ch["y0"], ch["x1"], ch["y1"]
+             except KeyError:
+                 continue
+
+             width = x1_raw - x0_raw
+             height = y1_raw - y0_raw
+             if width <= 0 or height <= 0:
+                 continue
+
+             for rx0, ry0, rx1, ry1, rheight, rcolor in highlight_rects:
+                 # height ratio check relative to char
+                 ratio = rheight / height if height else 0
+                 if ratio < height_min_ratio or ratio > height_max_ratio:
+                     continue
+
+                 # vertical containment in raw coords
+                 if not (y0_raw + 1 >= ry0 and y1_raw - 1 <= ry1):
+                     continue
+
+                 overlap = min(x1_raw, rx1) - max(x0_raw, rx0)
+                 if overlap > 0 and (overlap / width) >= coverage_ratio:
+                     ch["highlight"] = True
+                     try:
+                         ch["highlight_color"] = tuple(rcolor) if isinstance(rcolor, (list, tuple)) else rcolor
+                     except Exception:
+                         ch["highlight_color"] = rcolor
+                     break
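
All three `_mark_*_chars` helpers above boil down to the same geometric test: a thin horizontal primitive decorates a glyph when it falls in the right vertical band of the glyph's bounding box and covers enough of its width. A standalone sketch of that band-plus-coverage test with made-up coordinates (the numbers mirror STRIKE_DEFAULTS above; the `crosses_central_band` name is hypothetical):

def crosses_central_band(char_bbox, line_bbox,
                         band_top=0.35, band_bottom=0.65, coverage_ratio=0.7):
    """Return True if a horizontal line strikes through a glyph.

    char_bbox / line_bbox are (x0, top, x1, bottom) in top-based coordinates,
    matching the defaults in STRIKE_DEFAULTS above.
    """
    cx0, ctop, cx1, cbottom = char_bbox
    lx0, ltop, lx1, lbottom = line_bbox
    width = cx1 - cx0
    height = cbottom - ctop
    if width <= 0 or height <= 0:
        return False
    mid_y0 = ctop + band_top * height       # central band of the glyph
    mid_y1 = ctop + band_bottom * height
    if not (ltop >= mid_y0 - 1.0 and lbottom <= mid_y1 + 1.0):
        return False                        # line sits too high or too low
    overlap = min(cx1, lx1) - max(cx0, lx0)
    return overlap > 0 and (overlap / width) >= coverage_ratio

# A 10 pt tall glyph with a hairline rule running through its middle:
print(crosses_central_band((100, 700, 106, 710), (98, 704.6, 140, 705.2)))  # -> True
print(crosses_central_band((100, 700, 106, 710), (98, 709.0, 140, 709.5)))  # -> False (baseline area)

# Per-PDF overrides are read from _config under "underline_detection" /
# "highlight_detection"; the keys mirror the *_DEFAULTS dicts, e.g.:
overrides = {"underline_detection": {"coverage_ratio": 0.9, "below_pad": 0.5}}

The underline and highlight variants change only the band (baseline region instead of the glyph's middle, or full vertical containment inside a filled rectangle), while the coverage check stays the same.
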