natural-pdf 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +44 -0
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  9. natural_pdf/classification/manager.py +67 -0
  10. natural_pdf/core/element_manager.py +556 -25
  11. natural_pdf/core/highlighting_service.py +98 -43
  12. natural_pdf/core/page.py +86 -20
  13. natural_pdf/core/pdf.py +0 -2
  14. natural_pdf/describe/base.py +40 -9
  15. natural_pdf/describe/elements.py +11 -6
  16. natural_pdf/elements/base.py +134 -20
  17. natural_pdf/elements/collections.py +43 -11
  18. natural_pdf/elements/image.py +43 -0
  19. natural_pdf/elements/region.py +64 -19
  20. natural_pdf/elements/text.py +89 -11
  21. natural_pdf/flows/collections.py +4 -4
  22. natural_pdf/flows/region.py +17 -2
  23. natural_pdf/ocr/ocr_manager.py +50 -0
  24. natural_pdf/selectors/parser.py +27 -7
  25. natural_pdf/tables/__init__.py +5 -0
  26. natural_pdf/tables/result.py +101 -0
  27. natural_pdf/utils/bidi_mirror.py +36 -0
  28. natural_pdf/utils/visualization.py +15 -1
  29. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
  30. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +48 -26
  31. natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
  32. optimization/memory_comparison.py +172 -0
  33. optimization/pdf_analyzer.py +410 -0
  34. optimization/performance_analysis.py +397 -0
  35. optimization/test_cleanup_methods.py +155 -0
  36. optimization/test_memory_fix.py +162 -0
  37. tools/bad_pdf_eval/__init__.py +1 -0
  38. tools/bad_pdf_eval/analyser.py +302 -0
  39. tools/bad_pdf_eval/collate_summaries.py +130 -0
  40. tools/bad_pdf_eval/eval_suite.py +116 -0
  41. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  42. tools/bad_pdf_eval/llm_enrich.py +273 -0
  43. tools/bad_pdf_eval/reporter.py +17 -0
  44. tools/bad_pdf_eval/utils.py +127 -0
  45. tools/rtl_smoke_test.py +80 -0
  46. natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
  47. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
  48. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
  49. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
@@ -15,9 +15,37 @@ from pdfplumber.utils.text import WordExtractor
15
15
  from natural_pdf.elements.line import LineElement
16
16
  from natural_pdf.elements.rect import RectangleElement
17
17
  from natural_pdf.elements.text import TextElement
18
+ from natural_pdf.elements.image import ImageElement
18
19
 
19
20
  logger = logging.getLogger(__name__)
20
21
 
22
+ # ------------------------------------------------------------------
23
+ # Default decoration-detection parameters (magic numbers centralised)
24
+ # ------------------------------------------------------------------
25
+
26
+ STRIKE_DEFAULTS = {
27
+ "thickness_tol": 1.5, # pt ; max height of line/rect to be considered strike
28
+ "horiz_tol": 1.0, # pt ; vertical tolerance for horizontality
29
+ "coverage_ratio": 0.7, # proportion of glyph width to be overlapped
30
+ "band_top_frac": 0.35, # top edge of the central strike band (fraction of glyph height, measured from the glyph top)
31
+ "band_bottom_frac": 0.65, # bottom edge of the central strike band (fraction of glyph height, measured from the glyph top)
32
+ }
33
+
34
+ UNDERLINE_DEFAULTS = {
35
+ "thickness_tol": 1.5,
36
+ "horiz_tol": 1.0,
37
+ "coverage_ratio": 0.8,
38
+ "band_frac": 0.25, # height fraction above baseline
39
+ "below_pad": 0.7, # pt ; pad below baseline
40
+ }
41
+
42
+ HIGHLIGHT_DEFAULTS = {
43
+ "height_min_ratio": 0.6, # rect height relative to char height lower bound
44
+ "height_max_ratio": 2.0, # upper bound
45
+ "coverage_ratio": 0.6, # horizontal overlap with glyph
46
+ "color_saturation_min": 0.4, # minimum HSV saturation (S)
47
+ "color_value_min": 0.4, # minimum HSV value (V)
48
+ }
21
49
 
22
50
  class NaturalWordExtractor(WordExtractor):
23
51
  """
@@ -125,6 +153,54 @@ class ElementManager:
125
153
  f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
126
154
  )
127
155
 
156
+ # -------------------------------------------------------------
157
+ # Detect strikethrough (horizontal strike-out lines) on raw
158
+ # characters BEFORE we run any word-grouping. This way the
159
+ # NaturalWordExtractor can use the presence/absence of a
160
+ # "strike" attribute to decide whether two neighbouring chars
161
+ # belong to the same word.
162
+ # -------------------------------------------------------------
163
+
164
+ try:
165
+ self._mark_strikethrough_chars(prepared_char_dicts)
166
+ except Exception as strike_err: # pragma: no cover – strike detection must never crash loading
167
+ logger.warning(
168
+ f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
169
+ exc_info=True,
170
+ )
171
+
172
+ # -------------------------------------------------------------
173
+ # Detect underlines on raw characters (must come after strike so
174
+ # both attributes are present before word grouping).
175
+ # -------------------------------------------------------------
176
+
177
+ try:
178
+ self._mark_underline_chars(prepared_char_dicts)
179
+ except Exception as u_err: # pragma: no cover
180
+ logger.warning(
181
+ f"Page {self._page.number}: Underline detection failed – {u_err}",
182
+ exc_info=True,
183
+ )
184
+
185
+ # Detect highlights
186
+ try:
187
+ self._mark_highlight_chars(prepared_char_dicts)
188
+ except Exception as h_err:
189
+ logger.warning(
190
+ f"Page {self._page.number}: Highlight detection failed – {h_err}",
191
+ exc_info=True,
192
+ )
193
+
194
+ # Create a mapping from character dict to index for efficient lookup
195
+ char_to_index = {}
196
+ for idx, char_dict in enumerate(prepared_char_dicts):
197
+ key = (
198
+ char_dict.get("x0", 0),
199
+ char_dict.get("top", 0),
200
+ char_dict.get("text", ""),
201
+ )
202
+ char_to_index[key] = idx
203
+
128
204
  # 2. Instantiate the custom word extractor
129
205
  # Get config settings from the parent PDF or use defaults
130
206
  pdf_config = getattr(self._page._parent, "_config", {})
@@ -134,45 +210,230 @@ class ElementManager:
134
210
 
135
211
  # Define which attributes to preserve on the merged word object
136
212
  # Should include split attributes + any others needed for filtering (like color)
137
- attributes_to_preserve = list(set(self._word_split_attributes + ["non_stroking_color"]))
138
-
139
- # Pass our configured attributes for splitting
140
- extractor = NaturalWordExtractor(
141
- word_split_attributes=self._word_split_attributes,
142
- extra_attrs=attributes_to_preserve,
143
- x_tolerance=xt,
144
- y_tolerance=yt,
145
- keep_blank_chars=True,
146
- use_text_flow=use_flow,
147
- # Assuming default directions are okay, configure if needed
148
- # line_dir=..., char_dir=...
213
+ attributes_to_preserve = list(
214
+ set(
215
+ self._word_split_attributes
216
+ + [
217
+ "non_stroking_color",
218
+ "strike",
219
+ "underline",
220
+ "highlight",
221
+ "highlight_color",
222
+ ]
223
+ )
149
224
  )
150
225
 
151
- # 3. Generate words using the extractor
152
- generated_words = []
153
- if prepared_char_dicts:
154
- # Sort chars primarily by upright status, then page reading order
155
- # Grouping by upright is crucial for WordExtractor's direction logic
156
- sorted_chars_for_extraction = sorted(
157
- prepared_char_dicts,
158
- key=lambda c: (c.get("upright", True), round(c.get("top", 0)), c.get("x0", 0)),
226
+ # -------------------------------------------------------------
227
+ # NEW: Detect direction (LTR vs RTL) per visual line and feed
228
+ # pdfplumber's WordExtractor with the correct settings.
229
+ # -------------------------------------------------------------
230
+ import unicodedata
231
+
232
+ def _is_rtl_char(ch: str) -> bool:
233
+ """Return True if the character has an RTL bidi class."""
234
+ if not ch:
235
+ return False
236
+ # If string has more than one character take first (works for most PDFs)
237
+ first = ch[0]
238
+ try:
239
+ return unicodedata.bidirectional(first) in ("R", "AL", "AN")
240
+ except Exception:
241
+ return False
242
+
243
+ # Helper: group characters into visual lines using y-tolerance
244
+ sorted_chars_for_line_grouping = sorted(
245
+ prepared_char_dicts,
246
+ key=lambda c: (round(c.get("top", 0) / max(yt, 1)) * yt, c.get("x0", 0)),
247
+ )
248
+
249
+ lines: List[List[Dict[str, Any]]] = []
250
+ current_line_key = None
251
+ for char_dict in sorted_chars_for_line_grouping:
252
+ top_val = char_dict.get("top", 0)
253
+ line_key = round(top_val / max(yt, 1)) # bucket index
254
+ if current_line_key is None or line_key != current_line_key:
255
+ # start new line bucket
256
+ lines.append([])
257
+ current_line_key = line_key
258
+ lines[-1].append(char_dict)
259
+
260
+ word_elements: List[TextElement] = []
261
+ # Process each line separately with direction detection
262
+ for line_chars in lines:
263
+ if not line_chars:
264
+ continue
265
+ # Determine RTL ratio
266
+ rtl_count = sum(1 for ch in line_chars if _is_rtl_char(ch.get("text", "")))
267
+ ltr_count = len(line_chars) - rtl_count
268
+ # Consider the line RTL if RTL characters outnumber all remaining characters (the remainder also counts spaces, digits and other neutrals)
269
+ is_rtl_line = rtl_count > ltr_count
270
+
271
+ # Build a WordExtractor tailored for this line's direction
272
+ if is_rtl_line:
273
+ line_dir = "ttb" # horizontal lines stacked top→bottom
274
+ char_dir = "rtl" # characters right→left within the line
275
+ else:
276
+ line_dir = "ttb"
277
+ char_dir = "ltr"
278
+
279
+ extractor = NaturalWordExtractor(
280
+ word_split_attributes=self._word_split_attributes + ["strike", "underline", "highlight"],
281
+ extra_attrs=attributes_to_preserve,
282
+ x_tolerance=xt,
283
+ y_tolerance=yt,
284
+ keep_blank_chars=True,
285
+ use_text_flow=use_flow,
286
+ line_dir=line_dir,
287
+ char_dir=char_dir,
159
288
  )
160
289
 
161
- word_tuples = extractor.iter_extract_tuples(sorted_chars_for_extraction)
290
+ # Prepare character sequence for the extractor:
291
+ # • For LTR lines -> left→right order (x0 ascending)
292
+ # • For RTL lines -> the same x0-ascending order is used here; the
293
+ # right→left direction is passed to the extractor as char_dir="rtl".
294
+ line_chars_for_extractor = sorted(line_chars, key=lambda c: c.get("x0", 0))
295
+
296
+ try:
297
+ word_tuples = extractor.iter_extract_tuples(line_chars_for_extractor)
298
+ except Exception as e: # pragma: no cover
299
+ logger.error(
300
+ f"Word extraction failed on line (rtl={is_rtl_line}) of page {self._page.number}: {e}",
301
+ exc_info=True,
302
+ )
303
+ word_tuples = []
304
+
162
305
  for word_dict, char_list in word_tuples:
163
- # Convert the generated word_dict to a TextElement
164
- word_dict["_char_dicts"] = char_list
306
+ # Memory optimisation for char indices
307
+ char_indices = []
308
+ for char_dict in char_list:
309
+ key = (
310
+ char_dict.get("x0", 0),
311
+ char_dict.get("top", 0),
312
+ char_dict.get("text", ""),
313
+ )
314
+ # char_to_index dict built earlier in load_elements
315
+ if key in char_to_index:
316
+ char_indices.append(char_to_index[key])
317
+ word_dict["_char_indices"] = char_indices
318
+ word_dict["_char_dicts"] = char_list # keep for back-compat
319
+ # Create and append TextElement
165
320
  word_element = self._create_word_element(word_dict)
166
- generated_words.append(word_element)
321
+ word_elements.append(word_element)
322
+
323
+ # Decide if this individual word contains RTL characters; safer than relying
324
+ # on the whole-line heuristic.
325
+ rtl_in_word = any(_is_rtl_char(ch.get("text", "")) for ch in char_list)
326
+ if rtl_in_word:
327
+ try:
328
+ from bidi.algorithm import get_display # type: ignore
329
+ from natural_pdf.utils.bidi_mirror import mirror_brackets
330
+
331
+ word_element.text = mirror_brackets(
332
+ get_display(word_element.text, base_dir="R")
333
+ )
334
+ except Exception:
335
+ # Fallback: keep original text if python-bidi fails
336
+ pass
337
+
338
+ # ------------------------------------------------------------------
339
+ # Propagate per-char decoration flags (strike, underline, highlight) up to word level.
340
+ # ------------------------------------------------------------------
341
+
342
+ if prepared_char_dicts:
343
+ for w in word_elements:
344
+ strike_chars = 0
345
+ total_chars = 0
346
+ if getattr(w, "_char_indices", None):
347
+ for idx in w._char_indices:
348
+ if 0 <= idx < len(prepared_char_dicts):
349
+ total_chars += 1
350
+ if prepared_char_dicts[idx].get("strike"):
351
+ strike_chars += 1
352
+ elif getattr(w, "_char_dicts", None):
353
+ for ch in w._char_dicts:
354
+ total_chars += 1
355
+ if ch.get("strike"):
356
+ strike_chars += 1
357
+
358
+ if total_chars:
359
+ w._obj["strike"] = (strike_chars / total_chars) >= 0.6
360
+ else:
361
+ w._obj["strike"] = False
362
+
363
+ # underline propagation
364
+ ul_chars = 0
365
+ if getattr(w, "_char_indices", None):
366
+ for idx in w._char_indices:
367
+ if 0 <= idx < len(prepared_char_dicts):
368
+ if prepared_char_dicts[idx].get("underline"):
369
+ ul_chars += 1
370
+ elif getattr(w, "_char_dicts", None):
371
+ ul_chars = sum(1 for ch in w._char_dicts if ch.get("underline"))
372
+
373
+ if total_chars:
374
+ w._obj["underline"] = (ul_chars / total_chars) >= 0.6
375
+ else:
376
+ w._obj["underline"] = False
377
+
378
+ # highlight propagation
379
+ hl_chars = 0
380
+ if getattr(w, "_char_indices", None):
381
+ for idx in w._char_indices:
382
+ if 0 <= idx < len(prepared_char_dicts):
383
+ if prepared_char_dicts[idx].get("highlight"):
384
+ hl_chars += 1
385
+ elif getattr(w, "_char_dicts", None):
386
+ hl_chars = sum(1 for ch in w._char_dicts if ch.get("highlight"))
387
+
388
+ if total_chars:
389
+ w._obj["highlight"] = (hl_chars / total_chars) >= 0.6
390
+ else:
391
+ w._obj["highlight"] = False
392
+
393
+ # Determine dominant highlight color among chars
394
+ if w._obj.get("highlight"):
395
+ color_counts = {}
396
+ source_iter = (
397
+ (prepared_char_dicts[idx] for idx in w._char_indices)
398
+ if getattr(w, "_char_indices", None)
399
+ else w._char_dicts if getattr(w, "_char_dicts", None) else []
400
+ )
401
+ for chd in source_iter:
402
+ if chd.get("highlight") and chd.get("highlight_color") is not None:
403
+ col = chd["highlight_color"]
404
+ color_counts[col] = color_counts.get(col, 0) + 1
405
+
406
+ if color_counts:
407
+ dominant_color = max(color_counts.items(), key=lambda t: t[1])[0]
408
+ try:
409
+ w._obj["highlight_color"] = tuple(dominant_color) if isinstance(dominant_color, (list, tuple)) else dominant_color
410
+ except Exception:
411
+ w._obj["highlight_color"] = dominant_color
412
+
413
+ generated_words = word_elements
167
414
  logger.debug(
168
415
  f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
169
416
  )
170
417
 
418
+ # --- Post-processing pass to ensure every word containing RTL characters is
419
+ # stored in logical order and with mirrored brackets. This is a
420
+ # safeguard in case the per-line loop above missed some tokens.
421
+ try:
422
+ from bidi.algorithm import get_display # type: ignore
423
+ from natural_pdf.utils.bidi_mirror import mirror_brackets
424
+
425
+ for w in generated_words:
426
+ if any(_is_rtl_char(ch) for ch in w.text):
427
+ w.text = mirror_brackets(get_display(w.text, base_dir="R"))
428
+ except Exception:
429
+ pass # graceful degradation – keep original text
430
+
171
431
  # 4. Load other elements (rects, lines)
172
432
  rect_elements = [RectangleElement(r, self._page) for r in self._page._page.rects]
173
433
  line_elements = [LineElement(l, self._page) for l in self._page._page.lines]
434
+ image_elements = [ImageElement(i, self._page) for i in self._page._page.images]
174
435
  logger.debug(
175
- f"Page {self._page.number}: Loaded {len(rect_elements)} rects, {len(line_elements)} lines."
436
+ f"Page {self._page.number}: Loaded {len(rect_elements)} rects, {len(line_elements)} lines, {len(image_elements)} images."
176
437
  )
177
438
 
178
439
  # 5. Create the final elements dictionary
@@ -183,6 +444,7 @@ class ElementManager:
183
444
  "words": generated_words,
184
445
  "rects": rect_elements,
185
446
  "lines": line_elements,
447
+ "images": image_elements,
186
448
  }
187
449
 
188
450
  # Add regions if they exist
@@ -238,6 +500,11 @@ class ElementManager:
238
500
  augmented_dict.setdefault("upright", True)
239
501
  augmented_dict.setdefault("fontname", "Unknown")
240
502
  augmented_dict.setdefault("size", 0)
503
+ augmented_dict.setdefault("highlight_color", None)
504
+ # Ensure decoration keys exist for safe grouping
505
+ augmented_dict.setdefault("strike", False)
506
+ augmented_dict.setdefault("underline", False)
507
+ augmented_dict.setdefault("highlight", False)
241
508
 
242
509
  prepared_dicts.append(augmented_dict)
243
510
  # Use a unique identifier if available (e.g., tuple of key properties)
@@ -385,12 +652,21 @@ class ElementManager:
385
652
  "italic": False,
386
653
  "upright": True,
387
654
  "doctop": pdf_top + self._page._page.initial_doctop,
655
+ "strike": False,
656
+ "underline": False,
657
+ "highlight": False,
658
+ "highlight_color": None,
388
659
  }
389
660
 
390
661
  # Create the representative char dict for this OCR word
391
662
  ocr_char_dict = word_element_data.copy()
392
663
  ocr_char_dict["object_type"] = "char"
393
664
  ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
665
+ # Ensure decoration keys
666
+ ocr_char_dict.setdefault("strike", False)
667
+ ocr_char_dict.setdefault("underline", False)
668
+ ocr_char_dict.setdefault("highlight", False)
669
+ ocr_char_dict.setdefault("highlight_color", None)
394
670
 
395
671
  # Add the char dict list to the word data before creating TextElement
396
672
  word_element_data["_char_dicts"] = [ocr_char_dict] # Store itself as its only char
@@ -550,6 +826,12 @@ class ElementManager:
550
826
  self.load_elements()
551
827
  return self._elements.get("regions", [])
552
828
 
829
+ @property
830
+ def images(self):
831
+ """Get all image elements."""
832
+ self.load_elements()
833
+ return self._elements.get("images", [])
834
+
553
835
  def remove_ocr_elements(self):
554
836
  """
555
837
  Remove all elements with source="ocr" from the elements dictionary.
@@ -632,3 +914,252 @@ class ElementManager:
632
914
  return True
633
915
 
634
916
  return False
917
+
918
+ # ------------------------------------------------------------------
919
+ # Strikethrough detection (horizontal strike-out lines)
920
+ # ------------------------------------------------------------------
921
+
922
+ def _mark_strikethrough_chars(self, char_dicts: List[Dict[str, Any]], *,
923
+ thickness_tol: float = 1.5,
924
+ horiz_tol: float = 1.0,
925
+ coverage_ratio: float = 0.7,
926
+ band_top: float = 0.35,
927
+ band_bottom: float = 0.65) -> None:
928
+ """Annotate character dictionaries with a boolean ``strike`` flag.
929
+
930
+ Args
931
+ ----
932
+ char_dicts : list
933
+ The list that _prepare_char_dicts() returned – *modified in place*.
934
+ thickness_tol : float
935
+ Maximum height (in PDF pts) for a path to be considered a strike.
936
+ horiz_tol : float
937
+ Vertical tolerance when deciding if a pdfplumber ``line`` object
938
+ is horizontal (|y0-y1| ≤ horiz_tol).
939
+ coverage_ratio : float
940
+ Minimum proportion of the glyph's width that must be overlapped
941
+ by a candidate line.
942
+ band_top, band_bottom : float
943
+ Fractions of the glyph's height that define the central band in
944
+ which a line must fall to count as a strikethrough. Defaults to
945
+ 35–65 %.
946
+ """
947
+
948
+ # -------------------------------------------------------------
949
+ # Collect candidate horizontal primitives (lines + skinny rects)
950
+ # -------------------------------------------------------------
951
+ raw_lines = list(getattr(self._page._page, "lines", []))
952
+ raw_rects = list(getattr(self._page._page, "rects", []))
953
+
954
+ candidates: List[Tuple[float, float, float, float]] = [] # (x0, y0, x1, y1)
955
+
956
+ # pdfplumber line objects – treat those whose angle ≈ 0°
957
+ for ln in raw_lines:
958
+ y0 = min(ln.get("y0", 0), ln.get("y1", 0))
959
+ y1 = max(ln.get("y0", 0), ln.get("y1", 0))
960
+ if abs(y1 - y0) <= horiz_tol: # horizontal
961
+ candidates.append((ln.get("x0", 0), y0, ln.get("x1", 0), y1))
962
+
963
+ # Thin rectangles that act as drawn lines
964
+ pg_height = self._page.height
965
+ for rc in raw_rects:
966
+ rb0 = rc.get("y0", 0)
967
+ rb1 = rc.get("y1", 0)
968
+ y0_raw = min(rb0, rb1)
969
+ y1_raw = max(rb0, rb1)
970
+ if (y1_raw - y0_raw) <= thickness_tol:
971
+ # Convert from PDF (origin bottom-left) to top-based coords used by chars
972
+ y0 = pg_height - y1_raw # upper edge distance from top
973
+ y1 = pg_height - y0_raw # lower edge distance from top
974
+ candidates.append((rc.get("x0", 0), y0, rc.get("x1", 0), y1))
975
+
976
+ if not candidates:
977
+ return # nothing to mark
978
+
979
+ # -------------------------------------------------------------
980
+ # Walk through characters and flag those crossed by a candidate
981
+ # -------------------------------------------------------------
982
+ for ch in char_dicts:
983
+ ch.setdefault("strike", False) # default value
984
+ try:
985
+ x0, top, x1, bottom = ch["x0"], ch["top"], ch["x1"], ch["bottom"]
986
+ except KeyError:
987
+ continue # skip malformed char dict
988
+
989
+ width = x1 - x0
990
+ height = bottom - top
991
+ if width <= 0 or height <= 0:
992
+ continue
993
+
994
+ mid_y0 = top + band_top * height
995
+ mid_y1 = top + band_bottom * height
996
+
997
+ # Check each candidate line for overlap
998
+ for lx0, ly0, lx1, ly1 in candidates:
999
+ if (ly0 >= (mid_y0 - 1.0)) and (ly1 <= (mid_y1 + 1.0)): # lies inside central band
1000
+ overlap = min(x1, lx1) - max(x0, lx0)
1001
+ if overlap > 0 and (overlap / width) >= coverage_ratio:
1002
+ ch["strike"] = True
1003
+ break # no need to check further lines
1004
+
1005
+ # Done – char_dicts mutated in place
1006
+
1007
+ # ------------------------------------------------------------------
1008
+ # Underline detection
1009
+ # ------------------------------------------------------------------
1010
+
1011
+ def _mark_underline_chars(
1012
+ self,
1013
+ char_dicts: List[Dict[str, Any]],
1014
+ *,
1015
+ thickness_tol: float = None,
1016
+ horiz_tol: float = None,
1017
+ coverage_ratio: float = None,
1018
+ band_frac: float = None,
1019
+ below_pad: float = None,
1020
+ ) -> None:
1021
+ """Annotate character dicts with ``underline`` flag."""
1022
+
1023
+ # Allow user overrides via PDF._config["underline_detection"]
1024
+ pdf_cfg = getattr(self._page._parent, "_config", {}).get("underline_detection", {})
1025
+
1026
+ thickness_tol = thickness_tol if thickness_tol is not None else pdf_cfg.get("thickness_tol", UNDERLINE_DEFAULTS["thickness_tol"])
1027
+ horiz_tol = horiz_tol if horiz_tol is not None else pdf_cfg.get("horiz_tol", UNDERLINE_DEFAULTS["horiz_tol"])
1028
+ coverage_ratio= coverage_ratio if coverage_ratio is not None else pdf_cfg.get("coverage_ratio", UNDERLINE_DEFAULTS["coverage_ratio"])
1029
+ band_frac = band_frac if band_frac is not None else pdf_cfg.get("band_frac", UNDERLINE_DEFAULTS["band_frac"])
1030
+ below_pad = below_pad if below_pad is not None else pdf_cfg.get("below_pad", UNDERLINE_DEFAULTS["below_pad"])
1031
+
1032
+ raw_lines = list(getattr(self._page._page, "lines", []))
1033
+ raw_rects = list(getattr(self._page._page, "rects", []))
1034
+
1035
+ candidates: List[Tuple[float, float, float, float]] = []
1036
+
1037
+ for ln in raw_lines:
1038
+ y0 = min(ln.get("y0", 0), ln.get("y1", 0))
1039
+ y1 = max(ln.get("y0", 0), ln.get("y1", 0))
1040
+ if abs(y1 - y0) <= horiz_tol and (
1041
+ (ln.get("x1", 0) - ln.get("x0", 0)) < self._page.width * 0.95
1042
+ ): # ignore full-width rules
1043
+ candidates.append((ln.get("x0", 0), y0, ln.get("x1", 0), y1))
1044
+
1045
+ pg_height = self._page.height
1046
+ for rc in raw_rects:
1047
+ rb0 = rc.get("y0", 0)
1048
+ rb1 = rc.get("y1", 0)
1049
+ y0_raw = min(rb0, rb1)
1050
+ y1_raw = max(rb0, rb1)
1051
+ if (y1_raw - y0_raw) <= thickness_tol and (
1052
+ (rc.get("x1", 0) - rc.get("x0", 0)) < self._page.width * 0.95
1053
+ ):
1054
+ y0 = pg_height - y1_raw
1055
+ y1 = pg_height - y0_raw
1056
+ candidates.append((rc.get("x0", 0), y0, rc.get("x1", 0), y1))
1057
+
1058
+ if not candidates:
1059
+ for ch in char_dicts:
1060
+ ch.setdefault("underline", False)
1061
+ return
1062
+
1063
+ # bucket candidates by the integer part of their mid-y; a bucket holding 3+ segments is treated as a repeating table border
1064
+ y_groups: Dict[int, int] = {}
1065
+ for _, y0, _, y1 in candidates:
1066
+ key = int((y0 + y1) / 2)
1067
+ y_groups[key] = y_groups.get(key, 0) + 1
1068
+
1069
+ table_y = {k for k, v in y_groups.items() if v >= 3}
1070
+
1071
+ # filter out candidates on those y values
1072
+ filtered_candidates = [c for c in candidates if int((c[1]+c[3])/2) not in table_y]
1073
+
1074
+ # annotate chars
1075
+ for ch in char_dicts:
1076
+ ch.setdefault("underline", False)
1077
+ try:
1078
+ x0, top, x1, bottom = ch["x0"], ch["top"], ch["x1"], ch["bottom"]
1079
+ except KeyError:
1080
+ continue
1081
+
1082
+ width = x1 - x0
1083
+ height = bottom - top
1084
+ if width <= 0 or height <= 0:
1085
+ continue
1086
+
1087
+ band_top = bottom - band_frac * height
1088
+ band_bottom = bottom + below_pad # allow some distance below baseline
1089
+
1090
+ for lx0, ly0, lx1, ly1 in filtered_candidates:
1091
+ if (ly0 >= band_top - 1) and (ly1 <= band_bottom + 1):
1092
+ overlap = min(x1, lx1) - max(x0, lx0)
1093
+ if overlap > 0 and (overlap / width) >= coverage_ratio:
1094
+ ch["underline"] = True
1095
+ break
1096
+
1097
+ # ------------------------------------------------------------------
1098
+ # Highlight detection
1099
+ # ------------------------------------------------------------------
1100
+
1101
+ def _mark_highlight_chars(self, char_dicts: List[Dict[str, Any]]) -> None:
1102
+ """Detect PDF marker-style highlights and set ``highlight`` on char dicts."""
1103
+
1104
+ cfg = getattr(self._page._parent, "_config", {}).get("highlight_detection", {})
1105
+
1106
+ height_min_ratio = cfg.get("height_min_ratio", HIGHLIGHT_DEFAULTS["height_min_ratio"])
1107
+ height_max_ratio = cfg.get("height_max_ratio", HIGHLIGHT_DEFAULTS["height_max_ratio"])
1108
+ coverage_ratio = cfg.get("coverage_ratio", HIGHLIGHT_DEFAULTS["coverage_ratio"])
1109
+
1110
+ raw_rects = list(getattr(self._page._page, "rects", []))
1111
+ pg_height = self._page.height
1112
+
1113
+ # Build list of candidate highlight rectangles (convert to top-based coords)
1114
+ highlight_rects = []
1115
+ for rc in raw_rects:
1116
+ if rc.get("stroke", False):
1117
+ continue # border stroke, not fill-only
1118
+ if not rc.get("fill", False):
1119
+ continue
1120
+
1121
+ fill_col = rc.get("non_stroking_color")
1122
+ # We keep colour as metadata but no longer filter on it
1123
+ if fill_col is None:
1124
+ continue
1125
+
1126
+ y0_rect = min(rc.get("y0", 0), rc.get("y1", 0))
1127
+ y1_rect = max(rc.get("y0", 0), rc.get("y1", 0))
1128
+ rheight = y1_rect - y0_rect
1129
+ highlight_rects.append((rc.get("x0", 0), y0_rect, rc.get("x1", 0), y1_rect, rheight, fill_col))
1130
+
1131
+ if not highlight_rects:
1132
+ for ch in char_dicts:
1133
+ ch.setdefault("highlight", False)
1134
+ return
1135
+
1136
+ for ch in char_dicts:
1137
+ ch.setdefault("highlight", False)
1138
+ try:
1139
+ x0_raw, y0_raw, x1_raw, y1_raw = ch["x0"], ch["y0"], ch["x1"], ch["y1"]
1140
+ except KeyError:
1141
+ continue
1142
+
1143
+ width = x1_raw - x0_raw
1144
+ height = y1_raw - y0_raw
1145
+ if width <= 0 or height <= 0:
1146
+ continue
1147
+
1148
+ for rx0, ry0, rx1, ry1, rheight, rcolor in highlight_rects:
1149
+ # height ratio check relative to char
1150
+ ratio = rheight / height if height else 0
1151
+ if ratio < height_min_ratio or ratio > height_max_ratio:
1152
+ continue
1153
+
1154
+ # vertical containment in raw coords
1155
+ if not (y0_raw + 1 >= ry0 and y1_raw - 1 <= ry1):
1156
+ continue
1157
+
1158
+ overlap = min(x1_raw, rx1) - max(x0_raw, rx0)
1159
+ if overlap > 0 and (overlap / width) >= coverage_ratio:
1160
+ ch["highlight"] = True
1161
+ try:
1162
+ ch["highlight_color"] = tuple(rcolor) if isinstance(rcolor, (list, tuple)) else rcolor
1163
+ except Exception:
1164
+ ch["highlight_color"] = rcolor
1165
+ break