natural_pdf-0.1.32-py3-none-any.whl → natural_pdf-0.1.34-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +188 -82
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +132 -16
  19. natural_pdf/core/pdf.py +486 -71
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +238 -111
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.32.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,16 @@
1
- """
2
- Element Manager for natural-pdf.
3
-
4
- This class handles the loading, creation, and management of PDF elements like
5
- characters, words, rectangles, and lines extracted from a page.
1
+ """Element Manager for natural-pdf.
2
+
3
+ This module handles the loading, creation, and management of PDF elements like
4
+ characters, words, rectangles, lines, and images extracted from a page. The
5
+ ElementManager class serves as the central coordinator for element lifecycle
6
+ management and provides enhanced word extraction capabilities.
7
+
8
+ The module includes:
9
+ - Element creation and caching for performance
10
+ - Custom word extraction that respects font boundaries
11
+ - OCR coordinate transformation and integration
12
+ - Text decoration detection (underline, strikethrough, highlights)
13
+ - Performance optimizations for bulk text processing
6
14
  """
7
15
 
8
16
  import logging
@@ -13,10 +21,10 @@ from typing import Any, Dict, List, Optional, Tuple, Union
13
21
 
14
22
  from pdfplumber.utils.text import WordExtractor
15
23
 
24
+ from natural_pdf.elements.image import ImageElement
16
25
  from natural_pdf.elements.line import LineElement
17
26
  from natural_pdf.elements.rect import RectangleElement
18
27
  from natural_pdf.elements.text import TextElement
19
- from natural_pdf.elements.image import ImageElement
20
28
 
21
29
  logger = logging.getLogger(__name__)
22
30
 
@@ -25,8 +33,8 @@ logger = logging.getLogger(__name__)
25
33
  # ------------------------------------------------------------------
26
34
 
27
35
  STRIKE_DEFAULTS = {
28
- "thickness_tol": 1.5, # pt ; max height of line/rect to be considered strike
29
- "horiz_tol": 1.0, # pt ; vertical tolerance for horizontality
36
+ "thickness_tol": 1.5, # pt ; max height of line/rect to be considered strike
37
+ "horiz_tol": 1.0, # pt ; vertical tolerance for horizontality
30
38
  "coverage_ratio": 0.7, # proportion of glyph width to be overlapped
31
39
  "band_top_frac": 0.35, # fraction of glyph height above top baseline band
32
40
  "band_bottom_frac": 0.65, # fraction below top (same used internally)
@@ -36,48 +44,90 @@ UNDERLINE_DEFAULTS = {
36
44
  "thickness_tol": 1.5,
37
45
  "horiz_tol": 1.0,
38
46
  "coverage_ratio": 0.8,
39
- "band_frac": 0.25, # height fraction above baseline
40
- "below_pad": 0.7, # pt ; pad below baseline
47
+ "band_frac": 0.25, # height fraction above baseline
48
+ "below_pad": 0.7, # pt ; pad below baseline
41
49
  }
42
50
 
43
51
  HIGHLIGHT_DEFAULTS = {
44
52
  "height_min_ratio": 0.6, # rect height relative to char height lower bound
45
53
  "height_max_ratio": 2.0, # upper bound
46
- "coverage_ratio": 0.6, # horizontal overlap with glyph
54
+ "coverage_ratio": 0.6, # horizontal overlap with glyph
47
55
  "color_saturation_min": 0.4, # HSV S >
48
- "color_value_min": 0.4, # HSV V >
56
+ "color_value_min": 0.4, # HSV V >
49
57
  }
50
58
 
51
59
 
52
60
  @contextmanager
53
61
  def disable_text_sync():
54
- """
55
- Temporarily disable text synchronization for performance.
56
-
57
- This is used when bulk-updating text content where character-level
58
- synchronization is not needed, such as during bidi processing.
59
- Fixes exponential recursion issue with Arabic/RTL text processing.
62
+ """Temporarily disable text synchronization for performance.
63
+
64
+ This context manager is used when bulk-updating text content where character-level
65
+ synchronization is not needed, such as during bidi processing or large-scale
66
+ text transformations. It prevents exponential recursion issues with Arabic/RTL
67
+ text processing by bypassing the normal text property setter.
68
+
69
+ Yields:
70
+ None: The context where text synchronization is disabled.
71
+
72
+ Example:
73
+ ```python
74
+ with disable_text_sync():
75
+ for element in text_elements:
76
+ element.text = process_arabic_text(element.text)
77
+ # Text sync automatically restored after the block
78
+ ```
79
+
80
+ Note:
81
+ This optimization is critical for performance when processing documents
82
+ with complex text layouts or right-to-left scripts that would otherwise
83
+ trigger expensive character synchronization operations.
60
84
  """
61
85
  # Save original setter
62
86
  original_setter = TextElement.text.fset
63
-
87
+
64
88
  # Create a fast setter that skips sync
65
89
  def fast_setter(self, value):
66
90
  self._obj["text"] = value
67
91
  # Skip character synchronization for performance
68
-
92
+
69
93
  # Apply fast setter
70
94
  TextElement.text = property(TextElement.text.fget, fast_setter)
71
-
95
+
72
96
  try:
73
97
  yield
74
98
  finally:
75
99
  # Restore original setter
76
100
  TextElement.text = property(TextElement.text.fget, original_setter)
77
101
 
102
+
78
103
  class NaturalWordExtractor(WordExtractor):
79
- """
80
- Custom WordExtractor that splits words based on specified character attributes
104
+ """Custom WordExtractor that splits words based on specified character attributes.
105
+
106
+ This class extends pdfplumber's WordExtractor to provide more intelligent word
107
+ segmentation that respects font boundaries and other character attributes.
108
+ It prevents words from spanning across different fonts, sizes, or styles,
109
+ which is essential for maintaining semantic meaning in document analysis.
110
+
111
+ The extractor considers multiple character attributes when determining word
112
+ boundaries, ensuring that visually distinct text elements (like bold headers
113
+ mixed with regular text) are properly separated into distinct words.
114
+
115
+ Attributes:
116
+ font_attrs: List of character attributes to consider for word boundaries.
117
+ Common attributes include 'fontname', 'size', 'flags', etc.
118
+
119
+ Example:
120
+ ```python
121
+ # Create extractor that splits on font and size changes
122
+ extractor = NaturalWordExtractor(['fontname', 'size'])
123
+
124
+ # Extract words with font-aware boundaries
125
+ words = extractor.extract_words(page_chars)
126
+
127
+ # Each word will have consistent font properties
128
+ for word in words:
129
+ print(f"'{word['text']}' in {word['fontname']} size {word['size']}")
130
+ ```
81
131
  in addition to pdfplumber's default spatial logic.
82
132
  """
83
133
 
@@ -146,7 +196,7 @@ class ElementManager:
146
196
  contained in the Page class, providing better separation of concerns.
147
197
  """
148
198
 
149
- def __init__(self, page, font_attrs=None):
199
+ def __init__(self, page, font_attrs=None, load_text: bool = True):
150
200
  """
151
201
  Initialize the ElementManager.
152
202
 
@@ -156,9 +206,11 @@ class ElementManager:
156
206
  Default: ['fontname', 'size', 'bold', 'italic']
157
207
  None: Only consider spatial relationships
158
208
  List: Custom attributes to consider
209
+ load_text: Whether to load text elements from the PDF (default: True).
159
210
  """
160
211
  self._page = page
161
212
  self._elements = None # Lazy-loaded
213
+ self._load_text = load_text
162
214
  # Default to splitting by fontname, size, bold, italic if not specified
163
215
  # Renamed internal variable for clarity
164
216
  self._word_split_attributes = (
@@ -175,11 +227,15 @@ class ElementManager:
175
227
 
176
228
  logger.debug(f"Page {self._page.number}: Loading elements...")
177
229
 
178
- # 1. Prepare character dictionaries (native + OCR) with necessary attributes
179
- prepared_char_dicts = self._prepare_char_dicts()
180
- logger.debug(
181
- f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
182
- )
230
+ # 1. Prepare character dictionaries only if loading text
231
+ if self._load_text:
232
+ prepared_char_dicts = self._prepare_char_dicts()
233
+ logger.debug(
234
+ f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
235
+ )
236
+ else:
237
+ prepared_char_dicts = []
238
+ logger.debug(f"Page {self._page.number}: Skipping text loading (load_text=False)")
183
239
 
184
240
  # -------------------------------------------------------------
185
241
  # Detect strikethrough (horizontal strike-out lines) on raw
@@ -189,61 +245,77 @@ class ElementManager:
189
245
  # belong to the same word.
190
246
  # -------------------------------------------------------------
191
247
 
192
- try:
193
- self._mark_strikethrough_chars(prepared_char_dicts)
194
- except Exception as strike_err: # pragma: no cover – strike detection must never crash loading
195
- logger.warning(
196
- f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
197
- exc_info=True,
198
- )
248
+ if self._load_text and prepared_char_dicts:
249
+ try:
250
+ self._mark_strikethrough_chars(prepared_char_dicts)
251
+ except (
252
+ Exception
253
+ ) as strike_err: # pragma: no cover – strike detection must never crash loading
254
+ logger.warning(
255
+ f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
256
+ exc_info=True,
257
+ )
199
258
 
200
259
  # -------------------------------------------------------------
201
260
  # Detect underlines on raw characters (must come after strike so
202
261
  # both attributes are present before word grouping).
203
262
  # -------------------------------------------------------------
204
263
 
205
- try:
206
- self._mark_underline_chars(prepared_char_dicts)
207
- except Exception as u_err: # pragma: no cover
208
- logger.warning(
209
- f"Page {self._page.number}: Underline detection failed – {u_err}",
210
- exc_info=True,
211
- )
264
+ if self._load_text and prepared_char_dicts:
265
+ try:
266
+ self._mark_underline_chars(prepared_char_dicts)
267
+ except Exception as u_err: # pragma: no cover
268
+ logger.warning(
269
+ f"Page {self._page.number}: Underline detection failed – {u_err}",
270
+ exc_info=True,
271
+ )
212
272
 
213
273
  # Detect highlights
214
- try:
215
- self._mark_highlight_chars(prepared_char_dicts)
216
- except Exception as h_err:
217
- logger.warning(
218
- f"Page {self._page.number}: Highlight detection failed – {h_err}",
219
- exc_info=True,
220
- )
274
+ if self._load_text and prepared_char_dicts:
275
+ try:
276
+ self._mark_highlight_chars(prepared_char_dicts)
277
+ except Exception as h_err:
278
+ logger.warning(
279
+ f"Page {self._page.number}: Highlight detection failed – {h_err}",
280
+ exc_info=True,
281
+ )
221
282
 
222
283
  # Create a mapping from character dict to index for efficient lookup
223
- char_to_index = {}
224
- for idx, char_dict in enumerate(prepared_char_dicts):
225
- key = (
226
- char_dict.get("x0", 0),
227
- char_dict.get("top", 0),
228
- char_dict.get("text", ""),
229
- )
230
- char_to_index[key] = idx
284
+ if self._load_text:
285
+ char_to_index = {}
286
+ for idx, char_dict in enumerate(prepared_char_dicts):
287
+ key = (
288
+ char_dict.get("x0", 0),
289
+ char_dict.get("top", 0),
290
+ char_dict.get("text", ""),
291
+ )
292
+ char_to_index[key] = idx
293
+ else:
294
+ char_to_index = {}
231
295
 
232
296
  # 2. Instantiate the custom word extractor
233
297
  # Prefer page-level config over PDF-level for tolerance lookup
298
+ word_elements: List[TextElement] = []
299
+
300
+ # Get config objects (needed for auto_text_tolerance check)
234
301
  page_config = getattr(self._page, "_config", {})
235
302
  pdf_config = getattr(self._page._parent, "_config", {})
236
303
 
237
- # Start with any explicitly supplied tolerances (may be None)
238
- xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
239
- yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
304
+ # Initialize tolerance variables
305
+ xt = None
306
+ yt = None
240
307
  use_flow = pdf_config.get("use_text_flow", False)
241
308
 
309
+ if self._load_text and prepared_char_dicts:
310
+ # Start with any explicitly supplied tolerances (may be None)
311
+ xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
312
+ yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
313
+
242
314
  # ------------------------------------------------------------------
243
315
  # Auto-adaptive tolerance: scale based on median character size when
244
316
  # requested and explicit values are absent.
245
317
  # ------------------------------------------------------------------
246
- if pdf_config.get("auto_text_tolerance", True):
318
+ if self._load_text and pdf_config.get("auto_text_tolerance", True):
247
319
  import statistics
248
320
 
249
321
  sizes = [c.get("size", 0) for c in prepared_char_dicts if c.get("size")]
@@ -255,7 +327,7 @@ class ElementManager:
255
327
  # Record back to page config for downstream users
256
328
  page_config["x_tolerance"] = xt
257
329
  if yt is None:
258
- yt = 0.6 * median_size # ~line spacing fraction
330
+ yt = 0.6 * median_size # ~line spacing fraction
259
331
  page_config["y_tolerance"] = yt
260
332
 
261
333
  # Warn users when the page's font size is extremely small –
@@ -323,7 +395,6 @@ class ElementManager:
323
395
  current_line_key = line_key
324
396
  lines[-1].append(char_dict)
325
397
 
326
- word_elements: List[TextElement] = []
327
398
  # Process each line separately with direction detection
328
399
  for line_chars in lines:
329
400
  if not line_chars:
@@ -345,7 +416,8 @@ class ElementManager:
345
416
  char_dir = "ltr"
346
417
 
347
418
  extractor = NaturalWordExtractor(
348
- word_split_attributes=self._word_split_attributes + ["strike", "underline", "highlight"],
419
+ word_split_attributes=self._word_split_attributes
420
+ + ["strike", "underline", "highlight"],
349
421
  extra_attrs=attributes_to_preserve,
350
422
  x_tolerance=xt,
351
423
  y_tolerance=yt,
@@ -394,12 +466,13 @@ class ElementManager:
394
466
  # Convert from visual order (from PDF) to logical order using bidi
395
467
  try:
396
468
  from bidi.algorithm import get_display # type: ignore
469
+
397
470
  from natural_pdf.utils.bidi_mirror import mirror_brackets
398
471
 
399
472
  with disable_text_sync():
400
473
  # word_element.text is currently in visual order (from PDF)
401
474
  # Convert to logical order using bidi with auto direction detection
402
- logical_text = get_display(word_element.text, base_dir='L')
475
+ logical_text = get_display(word_element.text, base_dir="L")
403
476
  # Apply bracket mirroring for logical order
404
477
  word_element.text = mirror_brackets(logical_text)
405
478
  except Exception:
@@ -476,11 +549,16 @@ class ElementManager:
476
549
  if color_counts:
477
550
  dominant_color = max(color_counts.items(), key=lambda t: t[1])[0]
478
551
  try:
479
- w._obj["highlight_color"] = tuple(dominant_color) if isinstance(dominant_color, (list, tuple)) else dominant_color
552
+ w._obj["highlight_color"] = (
553
+ tuple(dominant_color)
554
+ if isinstance(dominant_color, (list, tuple))
555
+ else dominant_color
556
+ )
480
557
  except Exception:
481
558
  w._obj["highlight_color"] = dominant_color
482
559
 
483
- generated_words = word_elements
560
+ # generated_words defaults to empty list if text loading is disabled
561
+ generated_words = word_elements if self._load_text else []
484
562
  logger.debug(
485
563
  f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
486
564
  )
@@ -978,12 +1056,16 @@ class ElementManager:
978
1056
  # Strikethrough detection (horizontal strike-out lines)
979
1057
  # ------------------------------------------------------------------
980
1058
 
981
- def _mark_strikethrough_chars(self, char_dicts: List[Dict[str, Any]], *,
982
- thickness_tol: float = 1.5,
983
- horiz_tol: float = 1.0,
984
- coverage_ratio: float = 0.7,
985
- band_top: float = 0.35,
986
- band_bottom: float = 0.65) -> None:
1059
+ def _mark_strikethrough_chars(
1060
+ self,
1061
+ char_dicts: List[Dict[str, Any]],
1062
+ *,
1063
+ thickness_tol: float = 1.5,
1064
+ horiz_tol: float = 1.0,
1065
+ coverage_ratio: float = 0.7,
1066
+ band_top: float = 0.35,
1067
+ band_bottom: float = 0.65,
1068
+ ) -> None:
987
1069
  """Annotate character dictionaries with a boolean ``strike`` flag.
988
1070
 
989
1071
  Args
@@ -1082,11 +1164,31 @@ class ElementManager:
1082
1164
  # Allow user overrides via PDF._config["underline_detection"]
1083
1165
  pdf_cfg = getattr(self._page._parent, "_config", {}).get("underline_detection", {})
1084
1166
 
1085
- thickness_tol = thickness_tol if thickness_tol is not None else pdf_cfg.get("thickness_tol", UNDERLINE_DEFAULTS["thickness_tol"])
1086
- horiz_tol = horiz_tol if horiz_tol is not None else pdf_cfg.get("horiz_tol", UNDERLINE_DEFAULTS["horiz_tol"])
1087
- coverage_ratio= coverage_ratio if coverage_ratio is not None else pdf_cfg.get("coverage_ratio", UNDERLINE_DEFAULTS["coverage_ratio"])
1088
- band_frac = band_frac if band_frac is not None else pdf_cfg.get("band_frac", UNDERLINE_DEFAULTS["band_frac"])
1089
- below_pad = below_pad if below_pad is not None else pdf_cfg.get("below_pad", UNDERLINE_DEFAULTS["below_pad"])
1167
+ thickness_tol = (
1168
+ thickness_tol
1169
+ if thickness_tol is not None
1170
+ else pdf_cfg.get("thickness_tol", UNDERLINE_DEFAULTS["thickness_tol"])
1171
+ )
1172
+ horiz_tol = (
1173
+ horiz_tol
1174
+ if horiz_tol is not None
1175
+ else pdf_cfg.get("horiz_tol", UNDERLINE_DEFAULTS["horiz_tol"])
1176
+ )
1177
+ coverage_ratio = (
1178
+ coverage_ratio
1179
+ if coverage_ratio is not None
1180
+ else pdf_cfg.get("coverage_ratio", UNDERLINE_DEFAULTS["coverage_ratio"])
1181
+ )
1182
+ band_frac = (
1183
+ band_frac
1184
+ if band_frac is not None
1185
+ else pdf_cfg.get("band_frac", UNDERLINE_DEFAULTS["band_frac"])
1186
+ )
1187
+ below_pad = (
1188
+ below_pad
1189
+ if below_pad is not None
1190
+ else pdf_cfg.get("below_pad", UNDERLINE_DEFAULTS["below_pad"])
1191
+ )
1090
1192
 
1091
1193
  raw_lines = list(getattr(self._page._page, "lines", []))
1092
1194
  raw_rects = list(getattr(self._page._page, "rects", []))
@@ -1128,7 +1230,7 @@ class ElementManager:
1128
1230
  table_y = {k for k, v in y_groups.items() if v >= 3}
1129
1231
 
1130
1232
  # filter out candidates on those y values
1131
- filtered_candidates = [c for c in candidates if int((c[1]+c[3])/2) not in table_y]
1233
+ filtered_candidates = [c for c in candidates if int((c[1] + c[3]) / 2) not in table_y]
1132
1234
 
1133
1235
  # annotate chars
1134
1236
  for ch in char_dicts:
@@ -1185,7 +1287,9 @@ class ElementManager:
1185
1287
  y0_rect = min(rc.get("y0", 0), rc.get("y1", 0))
1186
1288
  y1_rect = max(rc.get("y0", 0), rc.get("y1", 0))
1187
1289
  rheight = y1_rect - y0_rect
1188
- highlight_rects.append((rc.get("x0", 0), y0_rect, rc.get("x1", 0), y1_rect, rheight, fill_col))
1290
+ highlight_rects.append(
1291
+ (rc.get("x0", 0), y0_rect, rc.get("x1", 0), y1_rect, rheight, fill_col)
1292
+ )
1189
1293
 
1190
1294
  if not highlight_rects:
1191
1295
  for ch in char_dicts:
@@ -1218,7 +1322,9 @@ class ElementManager:
1218
1322
  if overlap > 0 and (overlap / width) >= coverage_ratio:
1219
1323
  ch["highlight"] = True
1220
1324
  try:
1221
- ch["highlight_color"] = tuple(rcolor) if isinstance(rcolor, (list, tuple)) else rcolor
1325
+ ch["highlight_color"] = (
1326
+ tuple(rcolor) if isinstance(rcolor, (list, tuple)) else rcolor
1327
+ )
1222
1328
  except Exception:
1223
1329
  ch["highlight_color"] = rcolor
1224
1330
  break
@@ -98,7 +98,9 @@ class HighlightRenderer:
98
98
  scaled_bbox = None
99
99
 
100
100
  if highlight.is_polygon:
101
- scaled_polygon = [(p[0] * self.scale_factor, p[1] * self.scale_factor) for p in highlight.polygon]
101
+ scaled_polygon = [
102
+ (p[0] * self.scale_factor, p[1] * self.scale_factor) for p in highlight.polygon
103
+ ]
102
104
  # Draw polygon fill and border
103
105
  draw.polygon(
104
106
  scaled_polygon, fill=highlight.color, outline=highlight.border_color, width=2
@@ -597,7 +599,7 @@ class HighlightingService:
597
599
  if page_index in self._highlights_by_page:
598
600
  del self._highlights_by_page[page_index]
599
601
  logger.debug(f"Cleared highlights for page {page_index}.")
600
-
602
+
601
603
  # Also clear any cached rendered images for this page so the next render
602
604
  # reflects the removal of highlights.
603
605
  try:
@@ -683,7 +685,6 @@ class HighlightingService:
683
685
  )
684
686
 
685
687
  try:
686
- # base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
687
688
  img_object = page_obj._page.to_image(**to_image_args)
688
689
  base_image_pil = (
689
690
  img_object.annotated
@@ -929,9 +930,7 @@ class HighlightingService:
929
930
  right_px = max(left_px + 1, min(right_px, rendered_image.width))
930
931
  bottom_px = max(top_px + 1, min(bottom_px, rendered_image.height))
931
932
 
932
- rendered_image = rendered_image.crop(
933
- (left_px, top_px, right_px, bottom_px)
934
- )
933
+ rendered_image = rendered_image.crop((left_px, top_px, right_px, bottom_px))
935
934
 
936
935
  legend = None
937
936
  if labels: