natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +670 -595
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +188 -82
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +132 -16
- natural_pdf/core/pdf.py +486 -71
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +238 -111
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
- natural_pdf-0.1.34.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.32.dist-info/RECORD +0 -118
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,16 @@
|
|
1
|
-
"""
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
1
|
+
"""Element Manager for natural-pdf.
|
2
|
+
|
3
|
+
This module handles the loading, creation, and management of PDF elements like
|
4
|
+
characters, words, rectangles, lines, and images extracted from a page. The
|
5
|
+
ElementManager class serves as the central coordinator for element lifecycle
|
6
|
+
management and provides enhanced word extraction capabilities.
|
7
|
+
|
8
|
+
The module includes:
|
9
|
+
- Element creation and caching for performance
|
10
|
+
- Custom word extraction that respects font boundaries
|
11
|
+
- OCR coordinate transformation and integration
|
12
|
+
- Text decoration detection (underline, strikethrough, highlights)
|
13
|
+
- Performance optimizations for bulk text processing
|
6
14
|
"""
|
7
15
|
|
8
16
|
import logging
|
@@ -13,10 +21,10 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
|
13
21
|
|
14
22
|
from pdfplumber.utils.text import WordExtractor
|
15
23
|
|
24
|
+
from natural_pdf.elements.image import ImageElement
|
16
25
|
from natural_pdf.elements.line import LineElement
|
17
26
|
from natural_pdf.elements.rect import RectangleElement
|
18
27
|
from natural_pdf.elements.text import TextElement
|
19
|
-
from natural_pdf.elements.image import ImageElement
|
20
28
|
|
21
29
|
logger = logging.getLogger(__name__)
|
22
30
|
|
@@ -25,8 +33,8 @@ logger = logging.getLogger(__name__)
|
|
25
33
|
# ------------------------------------------------------------------
|
26
34
|
|
27
35
|
STRIKE_DEFAULTS = {
|
28
|
-
"thickness_tol": 1.5,
|
29
|
-
"horiz_tol": 1.0,
|
36
|
+
"thickness_tol": 1.5, # pt ; max height of line/rect to be considered strike
|
37
|
+
"horiz_tol": 1.0, # pt ; vertical tolerance for horizontality
|
30
38
|
"coverage_ratio": 0.7, # proportion of glyph width to be overlapped
|
31
39
|
"band_top_frac": 0.35, # fraction of glyph height above top baseline band
|
32
40
|
"band_bottom_frac": 0.65, # fraction below top (same used internally)
|
@@ -36,48 +44,90 @@ UNDERLINE_DEFAULTS = {
|
|
36
44
|
"thickness_tol": 1.5,
|
37
45
|
"horiz_tol": 1.0,
|
38
46
|
"coverage_ratio": 0.8,
|
39
|
-
"band_frac": 0.25,
|
40
|
-
"below_pad": 0.7,
|
47
|
+
"band_frac": 0.25, # height fraction above baseline
|
48
|
+
"below_pad": 0.7, # pt ; pad below baseline
|
41
49
|
}
|
42
50
|
|
43
51
|
HIGHLIGHT_DEFAULTS = {
|
44
52
|
"height_min_ratio": 0.6, # rect height relative to char height lower bound
|
45
53
|
"height_max_ratio": 2.0, # upper bound
|
46
|
-
"coverage_ratio": 0.6,
|
54
|
+
"coverage_ratio": 0.6, # horizontal overlap with glyph
|
47
55
|
"color_saturation_min": 0.4, # HSV S >
|
48
|
-
"color_value_min": 0.4,
|
56
|
+
"color_value_min": 0.4, # HSV V >
|
49
57
|
}
|
50
58
|
|
51
59
|
|
52
60
|
@contextmanager
|
53
61
|
def disable_text_sync():
|
54
|
-
"""
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
62
|
+
"""Temporarily disable text synchronization for performance.
|
63
|
+
|
64
|
+
This context manager is used when bulk-updating text content where character-level
|
65
|
+
synchronization is not needed, such as during bidi processing or large-scale
|
66
|
+
text transformations. It prevents exponential recursion issues with Arabic/RTL
|
67
|
+
text processing by bypassing the normal text property setter.
|
68
|
+
|
69
|
+
Yields:
|
70
|
+
None: The context where text synchronization is disabled.
|
71
|
+
|
72
|
+
Example:
|
73
|
+
```python
|
74
|
+
with disable_text_sync():
|
75
|
+
for element in text_elements:
|
76
|
+
element.text = process_arabic_text(element.text)
|
77
|
+
# Text sync automatically restored after the block
|
78
|
+
```
|
79
|
+
|
80
|
+
Note:
|
81
|
+
This optimization is critical for performance when processing documents
|
82
|
+
with complex text layouts or right-to-left scripts that would otherwise
|
83
|
+
trigger expensive character synchronization operations.
|
60
84
|
"""
|
61
85
|
# Save original setter
|
62
86
|
original_setter = TextElement.text.fset
|
63
|
-
|
87
|
+
|
64
88
|
# Create a fast setter that skips sync
|
65
89
|
def fast_setter(self, value):
|
66
90
|
self._obj["text"] = value
|
67
91
|
# Skip character synchronization for performance
|
68
|
-
|
92
|
+
|
69
93
|
# Apply fast setter
|
70
94
|
TextElement.text = property(TextElement.text.fget, fast_setter)
|
71
|
-
|
95
|
+
|
72
96
|
try:
|
73
97
|
yield
|
74
98
|
finally:
|
75
99
|
# Restore original setter
|
76
100
|
TextElement.text = property(TextElement.text.fget, original_setter)
|
77
101
|
|
102
|
+
|
78
103
|
class NaturalWordExtractor(WordExtractor):
|
79
|
-
"""
|
80
|
-
|
104
|
+
"""Custom WordExtractor that splits words based on specified character attributes.
|
105
|
+
|
106
|
+
This class extends pdfplumber's WordExtractor to provide more intelligent word
|
107
|
+
segmentation that respects font boundaries and other character attributes.
|
108
|
+
It prevents words from spanning across different fonts, sizes, or styles,
|
109
|
+
which is essential for maintaining semantic meaning in document analysis.
|
110
|
+
|
111
|
+
The extractor considers multiple character attributes when determining word
|
112
|
+
boundaries, ensuring that visually distinct text elements (like bold headers
|
113
|
+
mixed with regular text) are properly separated into distinct words.
|
114
|
+
|
115
|
+
Attributes:
|
116
|
+
font_attrs: List of character attributes to consider for word boundaries.
|
117
|
+
Common attributes include 'fontname', 'size', 'flags', etc.
|
118
|
+
|
119
|
+
Example:
|
120
|
+
```python
|
121
|
+
# Create extractor that splits on font and size changes
|
122
|
+
extractor = NaturalWordExtractor(['fontname', 'size'])
|
123
|
+
|
124
|
+
# Extract words with font-aware boundaries
|
125
|
+
words = extractor.extract_words(page_chars)
|
126
|
+
|
127
|
+
# Each word will have consistent font properties
|
128
|
+
for word in words:
|
129
|
+
print(f"'{word['text']}' in {word['fontname']} size {word['size']}")
|
130
|
+
```
|
81
131
|
The attribute-based splitting is applied in addition to pdfplumber's default spatial logic.
|
82
132
|
"""
|
83
133
|
|
@@ -146,7 +196,7 @@ class ElementManager:
|
|
146
196
|
contained in the Page class, providing better separation of concerns.
|
147
197
|
"""
|
148
198
|
|
149
|
-
def __init__(self, page, font_attrs=None):
|
199
|
+
def __init__(self, page, font_attrs=None, load_text: bool = True):
|
150
200
|
"""
|
151
201
|
Initialize the ElementManager.
|
152
202
|
|
@@ -156,9 +206,11 @@ class ElementManager:
|
|
156
206
|
Default: ['fontname', 'size', 'bold', 'italic']
|
157
207
|
None: Only consider spatial relationships
|
158
208
|
List: Custom attributes to consider
|
209
|
+
load_text: Whether to load text elements from the PDF (default: True).
|
159
210
|
"""
|
160
211
|
self._page = page
|
161
212
|
self._elements = None # Lazy-loaded
|
213
|
+
self._load_text = load_text
|
162
214
|
# Default to splitting by fontname, size, bold, italic if not specified
|
163
215
|
# Renamed internal variable for clarity
|
164
216
|
self._word_split_attributes = (
|
@@ -175,11 +227,15 @@ class ElementManager:
|
|
175
227
|
|
176
228
|
logger.debug(f"Page {self._page.number}: Loading elements...")
|
177
229
|
|
178
|
-
# 1. Prepare character dictionaries
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
230
|
+
# 1. Prepare character dictionaries only if loading text
|
231
|
+
if self._load_text:
|
232
|
+
prepared_char_dicts = self._prepare_char_dicts()
|
233
|
+
logger.debug(
|
234
|
+
f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
|
235
|
+
)
|
236
|
+
else:
|
237
|
+
prepared_char_dicts = []
|
238
|
+
logger.debug(f"Page {self._page.number}: Skipping text loading (load_text=False)")
|
183
239
|
|
184
240
|
# -------------------------------------------------------------
|
185
241
|
# Detect strikethrough (horizontal strike-out lines) on raw
|
@@ -189,61 +245,77 @@ class ElementManager:
|
|
189
245
|
# belong to the same word.
|
190
246
|
# -------------------------------------------------------------
|
191
247
|
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
248
|
+
if self._load_text and prepared_char_dicts:
|
249
|
+
try:
|
250
|
+
self._mark_strikethrough_chars(prepared_char_dicts)
|
251
|
+
except (
|
252
|
+
Exception
|
253
|
+
) as strike_err: # pragma: no cover – strike detection must never crash loading
|
254
|
+
logger.warning(
|
255
|
+
f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
|
256
|
+
exc_info=True,
|
257
|
+
)
|
199
258
|
|
200
259
|
# -------------------------------------------------------------
|
201
260
|
# Detect underlines on raw characters (must come after strike so
|
202
261
|
# both attributes are present before word grouping).
|
203
262
|
# -------------------------------------------------------------
|
204
263
|
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
264
|
+
if self._load_text and prepared_char_dicts:
|
265
|
+
try:
|
266
|
+
self._mark_underline_chars(prepared_char_dicts)
|
267
|
+
except Exception as u_err: # pragma: no cover
|
268
|
+
logger.warning(
|
269
|
+
f"Page {self._page.number}: Underline detection failed – {u_err}",
|
270
|
+
exc_info=True,
|
271
|
+
)
|
212
272
|
|
213
273
|
# Detect highlights
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
274
|
+
if self._load_text and prepared_char_dicts:
|
275
|
+
try:
|
276
|
+
self._mark_highlight_chars(prepared_char_dicts)
|
277
|
+
except Exception as h_err:
|
278
|
+
logger.warning(
|
279
|
+
f"Page {self._page.number}: Highlight detection failed – {h_err}",
|
280
|
+
exc_info=True,
|
281
|
+
)
|
221
282
|
|
222
283
|
# Create a mapping from character dict to index for efficient lookup
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
284
|
+
if self._load_text:
|
285
|
+
char_to_index = {}
|
286
|
+
for idx, char_dict in enumerate(prepared_char_dicts):
|
287
|
+
key = (
|
288
|
+
char_dict.get("x0", 0),
|
289
|
+
char_dict.get("top", 0),
|
290
|
+
char_dict.get("text", ""),
|
291
|
+
)
|
292
|
+
char_to_index[key] = idx
|
293
|
+
else:
|
294
|
+
char_to_index = {}
|
231
295
|
|
232
296
|
# 2. Instantiate the custom word extractor
|
233
297
|
# Prefer page-level config over PDF-level for tolerance lookup
|
298
|
+
word_elements: List[TextElement] = []
|
299
|
+
|
300
|
+
# Get config objects (needed for auto_text_tolerance check)
|
234
301
|
page_config = getattr(self._page, "_config", {})
|
235
302
|
pdf_config = getattr(self._page._parent, "_config", {})
|
236
303
|
|
237
|
-
#
|
238
|
-
xt =
|
239
|
-
yt =
|
304
|
+
# Initialize tolerance variables
|
305
|
+
xt = None
|
306
|
+
yt = None
|
240
307
|
use_flow = pdf_config.get("use_text_flow", False)
|
241
308
|
|
309
|
+
if self._load_text and prepared_char_dicts:
|
310
|
+
# Start with any explicitly supplied tolerances (may be None)
|
311
|
+
xt = page_config.get("x_tolerance", pdf_config.get("x_tolerance"))
|
312
|
+
yt = page_config.get("y_tolerance", pdf_config.get("y_tolerance"))
|
313
|
+
|
242
314
|
# ------------------------------------------------------------------
|
243
315
|
# Auto-adaptive tolerance: scale based on median character size when
|
244
316
|
# requested and explicit values are absent.
|
245
317
|
# ------------------------------------------------------------------
|
246
|
-
if pdf_config.get("auto_text_tolerance", True):
|
318
|
+
if self._load_text and pdf_config.get("auto_text_tolerance", True):
|
247
319
|
import statistics
|
248
320
|
|
249
321
|
sizes = [c.get("size", 0) for c in prepared_char_dicts if c.get("size")]
|
@@ -255,7 +327,7 @@ class ElementManager:
|
|
255
327
|
# Record back to page config for downstream users
|
256
328
|
page_config["x_tolerance"] = xt
|
257
329
|
if yt is None:
|
258
|
-
yt = 0.6 * median_size
|
330
|
+
yt = 0.6 * median_size # ~line spacing fraction
|
259
331
|
page_config["y_tolerance"] = yt
|
260
332
|
|
261
333
|
# Warn users when the page's font size is extremely small –
|
@@ -323,7 +395,6 @@ class ElementManager:
|
|
323
395
|
current_line_key = line_key
|
324
396
|
lines[-1].append(char_dict)
|
325
397
|
|
326
|
-
word_elements: List[TextElement] = []
|
327
398
|
# Process each line separately with direction detection
|
328
399
|
for line_chars in lines:
|
329
400
|
if not line_chars:
|
@@ -345,7 +416,8 @@ class ElementManager:
|
|
345
416
|
char_dir = "ltr"
|
346
417
|
|
347
418
|
extractor = NaturalWordExtractor(
|
348
|
-
word_split_attributes=self._word_split_attributes
|
419
|
+
word_split_attributes=self._word_split_attributes
|
420
|
+
+ ["strike", "underline", "highlight"],
|
349
421
|
extra_attrs=attributes_to_preserve,
|
350
422
|
x_tolerance=xt,
|
351
423
|
y_tolerance=yt,
|
@@ -394,12 +466,13 @@ class ElementManager:
|
|
394
466
|
# Convert from visual order (from PDF) to logical order using bidi
|
395
467
|
try:
|
396
468
|
from bidi.algorithm import get_display # type: ignore
|
469
|
+
|
397
470
|
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
398
471
|
|
399
472
|
with disable_text_sync():
|
400
473
|
# word_element.text is currently in visual order (from PDF)
|
401
474
|
# Convert to logical order using bidi with an explicit LTR base direction (base_dir="L")
|
402
|
-
logical_text = get_display(word_element.text, base_dir=
|
475
|
+
logical_text = get_display(word_element.text, base_dir="L")
|
403
476
|
# Apply bracket mirroring for logical order
|
404
477
|
word_element.text = mirror_brackets(logical_text)
|
405
478
|
except Exception:
|
@@ -476,11 +549,16 @@ class ElementManager:
|
|
476
549
|
if color_counts:
|
477
550
|
dominant_color = max(color_counts.items(), key=lambda t: t[1])[0]
|
478
551
|
try:
|
479
|
-
w._obj["highlight_color"] =
|
552
|
+
w._obj["highlight_color"] = (
|
553
|
+
tuple(dominant_color)
|
554
|
+
if isinstance(dominant_color, (list, tuple))
|
555
|
+
else dominant_color
|
556
|
+
)
|
480
557
|
except Exception:
|
481
558
|
w._obj["highlight_color"] = dominant_color
|
482
559
|
|
483
|
-
generated_words
|
560
|
+
# generated_words defaults to empty list if text loading is disabled
|
561
|
+
generated_words = word_elements if self._load_text else []
|
484
562
|
logger.debug(
|
485
563
|
f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
|
486
564
|
)
|
@@ -978,12 +1056,16 @@ class ElementManager:
|
|
978
1056
|
# Strikethrough detection (horizontal strike-out lines)
|
979
1057
|
# ------------------------------------------------------------------
|
980
1058
|
|
981
|
-
def _mark_strikethrough_chars(
|
982
|
-
|
983
|
-
|
984
|
-
|
985
|
-
|
986
|
-
|
1059
|
+
def _mark_strikethrough_chars(
|
1060
|
+
self,
|
1061
|
+
char_dicts: List[Dict[str, Any]],
|
1062
|
+
*,
|
1063
|
+
thickness_tol: float = 1.5,
|
1064
|
+
horiz_tol: float = 1.0,
|
1065
|
+
coverage_ratio: float = 0.7,
|
1066
|
+
band_top: float = 0.35,
|
1067
|
+
band_bottom: float = 0.65,
|
1068
|
+
) -> None:
|
987
1069
|
"""Annotate character dictionaries with a boolean ``strike`` flag.
|
988
1070
|
|
989
1071
|
Args
|
@@ -1082,11 +1164,31 @@ class ElementManager:
|
|
1082
1164
|
# Allow user overrides via PDF._config["underline_detection"]
|
1083
1165
|
pdf_cfg = getattr(self._page._parent, "_config", {}).get("underline_detection", {})
|
1084
1166
|
|
1085
|
-
thickness_tol =
|
1086
|
-
|
1087
|
-
|
1088
|
-
|
1089
|
-
|
1167
|
+
thickness_tol = (
|
1168
|
+
thickness_tol
|
1169
|
+
if thickness_tol is not None
|
1170
|
+
else pdf_cfg.get("thickness_tol", UNDERLINE_DEFAULTS["thickness_tol"])
|
1171
|
+
)
|
1172
|
+
horiz_tol = (
|
1173
|
+
horiz_tol
|
1174
|
+
if horiz_tol is not None
|
1175
|
+
else pdf_cfg.get("horiz_tol", UNDERLINE_DEFAULTS["horiz_tol"])
|
1176
|
+
)
|
1177
|
+
coverage_ratio = (
|
1178
|
+
coverage_ratio
|
1179
|
+
if coverage_ratio is not None
|
1180
|
+
else pdf_cfg.get("coverage_ratio", UNDERLINE_DEFAULTS["coverage_ratio"])
|
1181
|
+
)
|
1182
|
+
band_frac = (
|
1183
|
+
band_frac
|
1184
|
+
if band_frac is not None
|
1185
|
+
else pdf_cfg.get("band_frac", UNDERLINE_DEFAULTS["band_frac"])
|
1186
|
+
)
|
1187
|
+
below_pad = (
|
1188
|
+
below_pad
|
1189
|
+
if below_pad is not None
|
1190
|
+
else pdf_cfg.get("below_pad", UNDERLINE_DEFAULTS["below_pad"])
|
1191
|
+
)
|
1090
1192
|
|
1091
1193
|
raw_lines = list(getattr(self._page._page, "lines", []))
|
1092
1194
|
raw_rects = list(getattr(self._page._page, "rects", []))
|
@@ -1128,7 +1230,7 @@ class ElementManager:
|
|
1128
1230
|
table_y = {k for k, v in y_groups.items() if v >= 3}
|
1129
1231
|
|
1130
1232
|
# filter out candidates on those y values
|
1131
|
-
filtered_candidates = [c for c in candidates if int((c[1]+c[3])/2) not in table_y]
|
1233
|
+
filtered_candidates = [c for c in candidates if int((c[1] + c[3]) / 2) not in table_y]
|
1132
1234
|
|
1133
1235
|
# annotate chars
|
1134
1236
|
for ch in char_dicts:
|
@@ -1185,7 +1287,9 @@ class ElementManager:
|
|
1185
1287
|
y0_rect = min(rc.get("y0", 0), rc.get("y1", 0))
|
1186
1288
|
y1_rect = max(rc.get("y0", 0), rc.get("y1", 0))
|
1187
1289
|
rheight = y1_rect - y0_rect
|
1188
|
-
highlight_rects.append(
|
1290
|
+
highlight_rects.append(
|
1291
|
+
(rc.get("x0", 0), y0_rect, rc.get("x1", 0), y1_rect, rheight, fill_col)
|
1292
|
+
)
|
1189
1293
|
|
1190
1294
|
if not highlight_rects:
|
1191
1295
|
for ch in char_dicts:
|
@@ -1218,7 +1322,9 @@ class ElementManager:
|
|
1218
1322
|
if overlap > 0 and (overlap / width) >= coverage_ratio:
|
1219
1323
|
ch["highlight"] = True
|
1220
1324
|
try:
|
1221
|
-
ch["highlight_color"] =
|
1325
|
+
ch["highlight_color"] = (
|
1326
|
+
tuple(rcolor) if isinstance(rcolor, (list, tuple)) else rcolor
|
1327
|
+
)
|
1222
1328
|
except Exception:
|
1223
1329
|
ch["highlight_color"] = rcolor
|
1224
1330
|
break
|
@@ -98,7 +98,9 @@ class HighlightRenderer:
|
|
98
98
|
scaled_bbox = None
|
99
99
|
|
100
100
|
if highlight.is_polygon:
|
101
|
-
scaled_polygon = [
|
101
|
+
scaled_polygon = [
|
102
|
+
(p[0] * self.scale_factor, p[1] * self.scale_factor) for p in highlight.polygon
|
103
|
+
]
|
102
104
|
# Draw polygon fill and border
|
103
105
|
draw.polygon(
|
104
106
|
scaled_polygon, fill=highlight.color, outline=highlight.border_color, width=2
|
@@ -597,7 +599,7 @@ class HighlightingService:
|
|
597
599
|
if page_index in self._highlights_by_page:
|
598
600
|
del self._highlights_by_page[page_index]
|
599
601
|
logger.debug(f"Cleared highlights for page {page_index}.")
|
600
|
-
|
602
|
+
|
601
603
|
# Also clear any cached rendered images for this page so the next render
|
602
604
|
# reflects the removal of highlights.
|
603
605
|
try:
|
@@ -683,7 +685,6 @@ class HighlightingService:
|
|
683
685
|
)
|
684
686
|
|
685
687
|
try:
|
686
|
-
# base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
|
687
688
|
img_object = page_obj._page.to_image(**to_image_args)
|
688
689
|
base_image_pil = (
|
689
690
|
img_object.annotated
|
@@ -929,9 +930,7 @@ class HighlightingService:
|
|
929
930
|
right_px = max(left_px + 1, min(right_px, rendered_image.width))
|
930
931
|
bottom_px = max(top_px + 1, min(bottom_px, rendered_image.height))
|
931
932
|
|
932
|
-
rendered_image = rendered_image.crop(
|
933
|
-
(left_px, top_px, right_px, bottom_px)
|
934
|
-
)
|
933
|
+
rendered_image = rendered_image.crop((left_px, top_px, right_px, bottom_px))
|
935
934
|
|
936
935
|
legend = None
|
937
936
|
if labels:
|