natural-pdf 0.1.28__py3-none-any.whl → 0.1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bad_pdf_analysis/analyze_10_more.py +300 -0
- bad_pdf_analysis/analyze_final_10.py +552 -0
- bad_pdf_analysis/analyze_specific_pages.py +394 -0
- bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +44 -0
- natural_pdf/analyzers/layout/surya.py +1 -1
- natural_pdf/analyzers/shape_detection_mixin.py +228 -0
- natural_pdf/classification/manager.py +67 -0
- natural_pdf/core/element_manager.py +578 -27
- natural_pdf/core/highlighting_service.py +98 -43
- natural_pdf/core/page.py +86 -20
- natural_pdf/core/pdf.py +0 -2
- natural_pdf/describe/base.py +40 -9
- natural_pdf/describe/elements.py +11 -6
- natural_pdf/elements/base.py +134 -20
- natural_pdf/elements/collections.py +43 -11
- natural_pdf/elements/image.py +43 -0
- natural_pdf/elements/region.py +64 -19
- natural_pdf/elements/text.py +118 -11
- natural_pdf/flows/collections.py +4 -4
- natural_pdf/flows/region.py +17 -2
- natural_pdf/ocr/ocr_manager.py +50 -0
- natural_pdf/selectors/parser.py +27 -7
- natural_pdf/tables/__init__.py +5 -0
- natural_pdf/tables/result.py +101 -0
- natural_pdf/utils/bidi_mirror.py +36 -0
- natural_pdf/utils/visualization.py +15 -1
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/RECORD +48 -26
- natural_pdf-0.1.31.dist-info/top_level.txt +6 -0
- optimization/memory_comparison.py +172 -0
- optimization/pdf_analyzer.py +410 -0
- optimization/performance_analysis.py +397 -0
- optimization/test_cleanup_methods.py +155 -0
- optimization/test_memory_fix.py +162 -0
- tools/bad_pdf_eval/__init__.py +1 -0
- tools/bad_pdf_eval/analyser.py +302 -0
- tools/bad_pdf_eval/collate_summaries.py +130 -0
- tools/bad_pdf_eval/eval_suite.py +116 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
- tools/bad_pdf_eval/llm_enrich.py +273 -0
- tools/bad_pdf_eval/reporter.py +17 -0
- tools/bad_pdf_eval/utils.py +127 -0
- tools/rtl_smoke_test.py +80 -0
- natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/licenses/LICENSE +0 -0
@@ -7,6 +7,7 @@ characters, words, rectangles, and lines extracted from a page.
|
|
7
7
|
|
8
8
|
import logging
|
9
9
|
import re
|
10
|
+
from contextlib import contextmanager
|
10
11
|
from itertools import groupby
|
11
12
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
12
13
|
|
@@ -15,9 +16,64 @@ from pdfplumber.utils.text import WordExtractor
|
|
15
16
|
from natural_pdf.elements.line import LineElement
|
16
17
|
from natural_pdf.elements.rect import RectangleElement
|
17
18
|
from natural_pdf.elements.text import TextElement
|
19
|
+
from natural_pdf.elements.image import ImageElement
|
18
20
|
|
19
21
|
logger = logging.getLogger(__name__)
|
20
22
|
|
23
|
+
# ------------------------------------------------------------------
|
24
|
+
# Default decoration-detection parameters (magic numbers centralised)
|
25
|
+
# ------------------------------------------------------------------
|
26
|
+
|
27
|
+
STRIKE_DEFAULTS = {
|
28
|
+
"thickness_tol": 1.5, # pt ; max height of line/rect to be considered strike
|
29
|
+
"horiz_tol": 1.0, # pt ; vertical tolerance for horizontality
|
30
|
+
"coverage_ratio": 0.7, # proportion of glyph width to be overlapped
|
31
|
+
"band_top_frac": 0.35, # fraction of glyph height above top baseline band
|
32
|
+
"band_bottom_frac": 0.65, # fraction below top (same used internally)
|
33
|
+
}
|
34
|
+
|
35
|
+
UNDERLINE_DEFAULTS = {
|
36
|
+
"thickness_tol": 1.5,
|
37
|
+
"horiz_tol": 1.0,
|
38
|
+
"coverage_ratio": 0.8,
|
39
|
+
"band_frac": 0.25, # height fraction above baseline
|
40
|
+
"below_pad": 0.7, # pt ; pad below baseline
|
41
|
+
}
|
42
|
+
|
43
|
+
HIGHLIGHT_DEFAULTS = {
|
44
|
+
"height_min_ratio": 0.6, # rect height relative to char height lower bound
|
45
|
+
"height_max_ratio": 2.0, # upper bound
|
46
|
+
"coverage_ratio": 0.6, # horizontal overlap with glyph
|
47
|
+
"color_saturation_min": 0.4, # HSV S >
|
48
|
+
"color_value_min": 0.4, # HSV V >
|
49
|
+
}
|
50
|
+
|
51
|
+
|
52
|
+
@contextmanager
def disable_text_sync():
    """
    Temporarily replace the ``TextElement.text`` setter with a fast version.

    While the context is active, assigning ``element.text = value`` only
    writes ``element._obj["text"]`` and skips whatever extra work the regular
    setter performs (character-level synchronization, per the original
    author's note). It is intended for bulk text rewrites such as bidi
    processing, where that synchronization is not needed.

    The swap is done on the *class*, so it affects every ``TextElement``
    instance for the duration of the ``with`` block. On exit (including on
    error) the original property object is put back unchanged, so its getter,
    setter, deleter and docstring are all preserved.
    """
    # Keep the exact descriptor so it can be restored as-is afterwards.
    original_property = TextElement.text

    def fast_setter(self, value):
        # Write straight to the backing dict; no synchronization.
        self._obj["text"] = value

    TextElement.text = property(
        original_property.fget,
        fast_setter,
        original_property.fdel,
        original_property.__doc__,
    )

    try:
        yield
    finally:
        # Restore the saved property rather than building a new one, so
        # nothing (e.g. a deleter) is lost across repeated use.
        TextElement.text = original_property
|
21
77
|
|
22
78
|
class NaturalWordExtractor(WordExtractor):
|
23
79
|
"""
|
@@ -125,6 +181,54 @@ class ElementManager:
|
|
125
181
|
f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
|
126
182
|
)
|
127
183
|
|
184
|
+
# -------------------------------------------------------------
|
185
|
+
# Detect strikethrough (horizontal strike-out lines) on raw
|
186
|
+
# characters BEFORE we run any word-grouping. This way the
|
187
|
+
# NaturalWordExtractor can use the presence/absence of a
|
188
|
+
# "strike" attribute to decide whether two neighbouring chars
|
189
|
+
# belong to the same word.
|
190
|
+
# -------------------------------------------------------------
|
191
|
+
|
192
|
+
try:
|
193
|
+
self._mark_strikethrough_chars(prepared_char_dicts)
|
194
|
+
except Exception as strike_err: # pragma: no cover – strike detection must never crash loading
|
195
|
+
logger.warning(
|
196
|
+
f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
|
197
|
+
exc_info=True,
|
198
|
+
)
|
199
|
+
|
200
|
+
# -------------------------------------------------------------
|
201
|
+
# Detect underlines on raw characters (must come after strike so
|
202
|
+
# both attributes are present before word grouping).
|
203
|
+
# -------------------------------------------------------------
|
204
|
+
|
205
|
+
try:
|
206
|
+
self._mark_underline_chars(prepared_char_dicts)
|
207
|
+
except Exception as u_err: # pragma: no cover
|
208
|
+
logger.warning(
|
209
|
+
f"Page {self._page.number}: Underline detection failed – {u_err}",
|
210
|
+
exc_info=True,
|
211
|
+
)
|
212
|
+
|
213
|
+
# Detect highlights
|
214
|
+
try:
|
215
|
+
self._mark_highlight_chars(prepared_char_dicts)
|
216
|
+
except Exception as h_err:
|
217
|
+
logger.warning(
|
218
|
+
f"Page {self._page.number}: Highlight detection failed – {h_err}",
|
219
|
+
exc_info=True,
|
220
|
+
)
|
221
|
+
|
222
|
+
# Create a mapping from character dict to index for efficient lookup
|
223
|
+
char_to_index = {}
|
224
|
+
for idx, char_dict in enumerate(prepared_char_dicts):
|
225
|
+
key = (
|
226
|
+
char_dict.get("x0", 0),
|
227
|
+
char_dict.get("top", 0),
|
228
|
+
char_dict.get("text", ""),
|
229
|
+
)
|
230
|
+
char_to_index[key] = idx
|
231
|
+
|
128
232
|
# 2. Instantiate the custom word extractor
|
129
233
|
# Get config settings from the parent PDF or use defaults
|
130
234
|
pdf_config = getattr(self._page._parent, "_config", {})
|
@@ -132,38 +236,212 @@ class ElementManager:
|
|
132
236
|
yt = pdf_config.get("y_tolerance", 3)
|
133
237
|
use_flow = pdf_config.get("use_text_flow", False)
|
134
238
|
|
135
|
-
#
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
# Assuming default directions are okay, configure if needed
|
148
|
-
# line_dir=..., char_dir=...
|
239
|
+
# List of attributes to preserve on word objects
|
240
|
+
attributes_to_preserve = list(
|
241
|
+
set(
|
242
|
+
self._word_split_attributes
|
243
|
+
+ [
|
244
|
+
"non_stroking_color",
|
245
|
+
"strike",
|
246
|
+
"underline",
|
247
|
+
"highlight",
|
248
|
+
"highlight_color",
|
249
|
+
]
|
250
|
+
)
|
149
251
|
)
|
150
252
|
|
151
|
-
#
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
253
|
+
# ------------------------------------------------------------------
|
254
|
+
# NEW: Detect direction (LTR vs RTL) per visual line and feed
|
255
|
+
# pdfplumber's WordExtractor with the correct settings.
|
256
|
+
# -------------------------------------------------------------
|
257
|
+
import unicodedata
|
258
|
+
|
259
|
+
def _is_rtl_char(ch: str) -> bool:
|
260
|
+
"""Return True if the character has an RTL bidi class."""
|
261
|
+
if not ch:
|
262
|
+
return False
|
263
|
+
# If string has more than one character take first (works for most PDFs)
|
264
|
+
first = ch[0]
|
265
|
+
try:
|
266
|
+
return unicodedata.bidirectional(first) in ("R", "AL", "AN")
|
267
|
+
except Exception:
|
268
|
+
return False
|
269
|
+
|
270
|
+
# Helper: group characters into visual lines using y-tolerance
|
271
|
+
sorted_chars_for_line_grouping = sorted(
|
272
|
+
prepared_char_dicts,
|
273
|
+
key=lambda c: (round(c.get("top", 0) / max(yt, 1)) * yt, c.get("x0", 0)),
|
274
|
+
)
|
275
|
+
|
276
|
+
lines: List[List[Dict[str, Any]]] = []
|
277
|
+
current_line_key = None
|
278
|
+
for char_dict in sorted_chars_for_line_grouping:
|
279
|
+
top_val = char_dict.get("top", 0)
|
280
|
+
line_key = round(top_val / max(yt, 1)) # bucket index
|
281
|
+
if current_line_key is None or line_key != current_line_key:
|
282
|
+
# start new line bucket
|
283
|
+
lines.append([])
|
284
|
+
current_line_key = line_key
|
285
|
+
lines[-1].append(char_dict)
|
286
|
+
|
287
|
+
word_elements: List[TextElement] = []
|
288
|
+
# Process each line separately with direction detection
|
289
|
+
for line_chars in lines:
|
290
|
+
if not line_chars:
|
291
|
+
continue
|
292
|
+
# Determine RTL ratio
|
293
|
+
rtl_count = sum(1 for ch in line_chars if _is_rtl_char(ch.get("text", "")))
|
294
|
+
ltr_count = len(line_chars) - rtl_count
|
295
|
+
# Consider the line RTL if RTL characters outnumber all other characters (ltr_count includes neutrals such as spaces and digits)
|
296
|
+
is_rtl_line = rtl_count > ltr_count
|
297
|
+
|
298
|
+
# Build a WordExtractor tailored for this line's direction
|
299
|
+
if is_rtl_line:
|
300
|
+
line_dir = "ttb" # horizontal lines stacked top→bottom
|
301
|
+
# Feed characters in right→left x-order; extractor can then treat
|
302
|
+
# them as left-to-right so that resulting text stays logical.
|
303
|
+
char_dir = "ltr"
|
304
|
+
else:
|
305
|
+
line_dir = "ttb"
|
306
|
+
char_dir = "ltr"
|
307
|
+
|
308
|
+
extractor = NaturalWordExtractor(
|
309
|
+
word_split_attributes=self._word_split_attributes + ["strike", "underline", "highlight"],
|
310
|
+
extra_attrs=attributes_to_preserve,
|
311
|
+
x_tolerance=xt,
|
312
|
+
y_tolerance=yt,
|
313
|
+
keep_blank_chars=True,
|
314
|
+
use_text_flow=use_flow,
|
315
|
+
line_dir=line_dir,
|
316
|
+
char_dir=char_dir,
|
159
317
|
)
|
160
318
|
|
161
|
-
|
319
|
+
# Prepare character sequence for the extractor:
|
320
|
+
# Always feed characters in spatial order (x0 ascending)
|
321
|
+
# PDF stores glyphs in visual order, so this gives us the visual sequence
|
322
|
+
line_chars_for_extractor = sorted(line_chars, key=lambda c: c.get("x0", 0))
|
323
|
+
|
324
|
+
try:
|
325
|
+
word_tuples = extractor.iter_extract_tuples(line_chars_for_extractor)
|
326
|
+
except Exception as e: # pragma: no cover
|
327
|
+
logger.error(
|
328
|
+
f"Word extraction failed on line (rtl={is_rtl_line}) of page {self._page.number}: {e}",
|
329
|
+
exc_info=True,
|
330
|
+
)
|
331
|
+
word_tuples = []
|
332
|
+
|
162
333
|
for word_dict, char_list in word_tuples:
|
163
|
-
#
|
164
|
-
|
334
|
+
# Memory optimisation for char indices
|
335
|
+
char_indices = []
|
336
|
+
for char_dict in char_list:
|
337
|
+
key = (
|
338
|
+
char_dict.get("x0", 0),
|
339
|
+
char_dict.get("top", 0),
|
340
|
+
char_dict.get("text", ""),
|
341
|
+
)
|
342
|
+
# char_to_index dict built earlier in load_elements
|
343
|
+
if key in char_to_index:
|
344
|
+
char_indices.append(char_to_index[key])
|
345
|
+
word_dict["_char_indices"] = char_indices
|
346
|
+
word_dict["_char_dicts"] = char_list # keep for back-compat
|
347
|
+
# Create and append TextElement
|
165
348
|
word_element = self._create_word_element(word_dict)
|
166
|
-
|
349
|
+
word_elements.append(word_element)
|
350
|
+
|
351
|
+
# Decide if this individual word contains RTL characters; safer than relying
|
352
|
+
# on the whole-line heuristic.
|
353
|
+
rtl_in_word = any(_is_rtl_char(ch.get("text", "")) for ch in char_list)
|
354
|
+
if rtl_in_word:
|
355
|
+
# Convert from visual order (from PDF) to logical order using bidi
|
356
|
+
try:
|
357
|
+
from bidi.algorithm import get_display # type: ignore
|
358
|
+
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
359
|
+
|
360
|
+
with disable_text_sync():
|
361
|
+
# word_element.text is currently in visual order (from PDF)
|
362
|
+
# Convert to logical order using bidi with a fixed left-to-right base direction (base_dir='L')
|
363
|
+
logical_text = get_display(word_element.text, base_dir='L')
|
364
|
+
# Apply bracket mirroring for logical order
|
365
|
+
word_element.text = mirror_brackets(logical_text)
|
366
|
+
except Exception:
|
367
|
+
pass
|
368
|
+
|
369
|
+
# ------------------------------------------------------------------
|
370
|
+
# Propagate per-char strikethrough info up to word level.
|
371
|
+
# ------------------------------------------------------------------
|
372
|
+
|
373
|
+
if prepared_char_dicts:
|
374
|
+
for w in word_elements:
|
375
|
+
strike_chars = 0
|
376
|
+
total_chars = 0
|
377
|
+
if getattr(w, "_char_indices", None):
|
378
|
+
for idx in w._char_indices:
|
379
|
+
if 0 <= idx < len(prepared_char_dicts):
|
380
|
+
total_chars += 1
|
381
|
+
if prepared_char_dicts[idx].get("strike"):
|
382
|
+
strike_chars += 1
|
383
|
+
elif getattr(w, "_char_dicts", None):
|
384
|
+
for ch in w._char_dicts:
|
385
|
+
total_chars += 1
|
386
|
+
if ch.get("strike"):
|
387
|
+
strike_chars += 1
|
388
|
+
|
389
|
+
if total_chars:
|
390
|
+
w._obj["strike"] = (strike_chars / total_chars) >= 0.6
|
391
|
+
else:
|
392
|
+
w._obj["strike"] = False
|
393
|
+
|
394
|
+
# underline propagation
|
395
|
+
ul_chars = 0
|
396
|
+
if getattr(w, "_char_indices", None):
|
397
|
+
for idx in w._char_indices:
|
398
|
+
if 0 <= idx < len(prepared_char_dicts):
|
399
|
+
if prepared_char_dicts[idx].get("underline"):
|
400
|
+
ul_chars += 1
|
401
|
+
elif getattr(w, "_char_dicts", None):
|
402
|
+
ul_chars = sum(1 for ch in w._char_dicts if ch.get("underline"))
|
403
|
+
|
404
|
+
if total_chars:
|
405
|
+
w._obj["underline"] = (ul_chars / total_chars) >= 0.6
|
406
|
+
else:
|
407
|
+
w._obj["underline"] = False
|
408
|
+
|
409
|
+
# highlight propagation
|
410
|
+
hl_chars = 0
|
411
|
+
if getattr(w, "_char_indices", None):
|
412
|
+
for idx in w._char_indices:
|
413
|
+
if 0 <= idx < len(prepared_char_dicts):
|
414
|
+
if prepared_char_dicts[idx].get("highlight"):
|
415
|
+
hl_chars += 1
|
416
|
+
elif getattr(w, "_char_dicts", None):
|
417
|
+
hl_chars = sum(1 for ch in w._char_dicts if ch.get("highlight"))
|
418
|
+
|
419
|
+
if total_chars:
|
420
|
+
w._obj["highlight"] = (hl_chars / total_chars) >= 0.6
|
421
|
+
else:
|
422
|
+
w._obj["highlight"] = False
|
423
|
+
|
424
|
+
# Determine dominant highlight color among chars
|
425
|
+
if w._obj.get("highlight"):
|
426
|
+
color_counts = {}
|
427
|
+
source_iter = (
|
428
|
+
(prepared_char_dicts[idx] for idx in w._char_indices)
|
429
|
+
if getattr(w, "_char_indices", None)
|
430
|
+
else w._char_dicts if getattr(w, "_char_dicts", None) else []
|
431
|
+
)
|
432
|
+
for chd in source_iter:
|
433
|
+
if chd.get("highlight") and chd.get("highlight_color") is not None:
|
434
|
+
col = chd["highlight_color"]
|
435
|
+
color_counts[col] = color_counts.get(col, 0) + 1
|
436
|
+
|
437
|
+
if color_counts:
|
438
|
+
dominant_color = max(color_counts.items(), key=lambda t: t[1])[0]
|
439
|
+
try:
|
440
|
+
w._obj["highlight_color"] = tuple(dominant_color) if isinstance(dominant_color, (list, tuple)) else dominant_color
|
441
|
+
except Exception:
|
442
|
+
w._obj["highlight_color"] = dominant_color
|
443
|
+
|
444
|
+
generated_words = word_elements
|
167
445
|
logger.debug(
|
168
446
|
f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
|
169
447
|
)
|
@@ -171,8 +449,9 @@ class ElementManager:
|
|
171
449
|
# 4. Load other elements (rects, lines)
|
172
450
|
rect_elements = [RectangleElement(r, self._page) for r in self._page._page.rects]
|
173
451
|
line_elements = [LineElement(l, self._page) for l in self._page._page.lines]
|
452
|
+
image_elements = [ImageElement(i, self._page) for i in self._page._page.images]
|
174
453
|
logger.debug(
|
175
|
-
f"Page {self._page.number}: Loaded {len(rect_elements)} rects, {len(line_elements)} lines."
|
454
|
+
f"Page {self._page.number}: Loaded {len(rect_elements)} rects, {len(line_elements)} lines, {len(image_elements)} images."
|
176
455
|
)
|
177
456
|
|
178
457
|
# 5. Create the final elements dictionary
|
@@ -183,6 +462,7 @@ class ElementManager:
|
|
183
462
|
"words": generated_words,
|
184
463
|
"rects": rect_elements,
|
185
464
|
"lines": line_elements,
|
465
|
+
"images": image_elements,
|
186
466
|
}
|
187
467
|
|
188
468
|
# Add regions if they exist
|
@@ -201,6 +481,8 @@ class ElementManager:
|
|
201
481
|
|
202
482
|
logger.debug(f"Page {self._page.number}: Element loading complete.")
|
203
483
|
|
484
|
+
# If per-word BiDi was skipped, generated_words already stay in logical order.
|
485
|
+
|
204
486
|
def _prepare_char_dicts(self) -> List[Dict[str, Any]]:
|
205
487
|
"""
|
206
488
|
Prepares a list of character dictionaries from native PDF characters,
|
@@ -238,6 +520,11 @@ class ElementManager:
|
|
238
520
|
augmented_dict.setdefault("upright", True)
|
239
521
|
augmented_dict.setdefault("fontname", "Unknown")
|
240
522
|
augmented_dict.setdefault("size", 0)
|
523
|
+
augmented_dict.setdefault("highlight_color", None)
|
524
|
+
# Ensure decoration keys exist for safe grouping
|
525
|
+
augmented_dict.setdefault("strike", False)
|
526
|
+
augmented_dict.setdefault("underline", False)
|
527
|
+
augmented_dict.setdefault("highlight", False)
|
241
528
|
|
242
529
|
prepared_dicts.append(augmented_dict)
|
243
530
|
# Use a unique identifier if available (e.g., tuple of key properties)
|
@@ -385,12 +672,21 @@ class ElementManager:
|
|
385
672
|
"italic": False,
|
386
673
|
"upright": True,
|
387
674
|
"doctop": pdf_top + self._page._page.initial_doctop,
|
675
|
+
"strike": False,
|
676
|
+
"underline": False,
|
677
|
+
"highlight": False,
|
678
|
+
"highlight_color": None,
|
388
679
|
}
|
389
680
|
|
390
681
|
# Create the representative char dict for this OCR word
|
391
682
|
ocr_char_dict = word_element_data.copy()
|
392
683
|
ocr_char_dict["object_type"] = "char"
|
393
684
|
ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
|
685
|
+
# Ensure decoration keys
|
686
|
+
ocr_char_dict.setdefault("strike", False)
|
687
|
+
ocr_char_dict.setdefault("underline", False)
|
688
|
+
ocr_char_dict.setdefault("highlight", False)
|
689
|
+
ocr_char_dict.setdefault("highlight_color", None)
|
394
690
|
|
395
691
|
# Add the char dict list to the word data before creating TextElement
|
396
692
|
word_element_data["_char_dicts"] = [ocr_char_dict] # Store itself as its only char
|
@@ -550,6 +846,12 @@ class ElementManager:
|
|
550
846
|
self.load_elements()
|
551
847
|
return self._elements.get("regions", [])
|
552
848
|
|
849
|
+
@property
def images(self):
    """Return the loaded image elements.

    Calls ``load_elements()`` first, then returns the ``"images"`` entry of
    the element dictionary, or an empty list when that entry is absent.
    """
    self.load_elements()
    loaded = self._elements
    return loaded.get("images", [])
|
854
|
+
|
553
855
|
def remove_ocr_elements(self):
|
554
856
|
"""
|
555
857
|
Remove all elements with source="ocr" from the elements dictionary.
|
@@ -632,3 +934,252 @@ class ElementManager:
|
|
632
934
|
return True
|
633
935
|
|
634
936
|
return False
|
937
|
+
|
938
|
+
# ------------------------------------------------------------------
|
939
|
+
# Strikethrough detection (horizontal strike-out lines)
|
940
|
+
# ------------------------------------------------------------------
|
941
|
+
|
942
|
+
def _mark_strikethrough_chars(self, char_dicts: List[Dict[str, Any]], *,
                              thickness_tol: float = 1.5,
                              horiz_tol: float = 1.0,
                              coverage_ratio: float = 0.7,
                              band_top: float = 0.35,
                              band_bottom: float = 0.65) -> None:
    """Annotate character dictionaries with a boolean ``strike`` flag.

    A character is flagged when a thin horizontal primitive (a pdfplumber
    ``line`` or a skinny ``rect``) lies inside the central band of the glyph
    and overlaps enough of its width. Every dict in ``char_dicts`` ends up
    with a ``strike`` key, even when the page has no candidate primitives.

    Args
    ----
    char_dicts : list
        Character dictionaries – *modified in place*. Dicts missing any of
        ``x0``/``x1``/``top``/``bottom`` keep ``strike=False``.
    thickness_tol : float
        Maximum height (in PDF pts) for a rect to be considered a strike.
    horiz_tol : float
        Maximum vertical extent for a ``line`` object to count as horizontal.
    coverage_ratio : float
        Minimum proportion of the glyph's width that must be overlapped
        by a candidate line.
    band_top, band_bottom : float
        Fractions of the glyph's height, measured down from the glyph top,
        that delimit the band in which a line must fall. Defaults to
        35–65 %. A fixed 1 pt slack is applied on both sides.
    """
    pg_height = self._page.height

    def _vertical_span(obj: Dict[str, Any]) -> Tuple[float, float]:
        """Return ``(top, bottom)`` of *obj* measured from the page top.

        Glyphs are compared via ``top``/``bottom`` (top-origin), so the
        primitives must use the same system. Prefer the primitive's own
        ``top``/``bottom``; otherwise flip the bottom-origin ``y0``/``y1``.
        """
        if obj.get("top") is not None and obj.get("bottom") is not None:
            a, b = obj["top"], obj["bottom"]
        else:
            a = pg_height - obj.get("y1", 0)
            b = pg_height - obj.get("y0", 0)
        return (a, b) if a <= b else (b, a)

    # -------------------------------------------------------------
    # Collect candidate horizontal primitives (lines + skinny rects)
    # as (x0, top, x1, bottom) in top-based coordinates.
    # -------------------------------------------------------------
    candidates: List[Tuple[float, float, float, float]] = []

    for ln in getattr(self._page._page, "lines", None) or []:
        ln_top, ln_bottom = _vertical_span(ln)
        if (ln_bottom - ln_top) <= horiz_tol:  # horizontal
            candidates.append((ln.get("x0", 0), ln_top, ln.get("x1", 0), ln_bottom))

    for rc in getattr(self._page._page, "rects", None) or []:
        rc_top, rc_bottom = _vertical_span(rc)
        if (rc_bottom - rc_top) <= thickness_tol:  # thin rect drawn as a line
            candidates.append((rc.get("x0", 0), rc_top, rc.get("x1", 0), rc_bottom))

    # Guarantee the key exists before any early exit so downstream grouping
    # can rely on it.
    for ch in char_dicts:
        ch.setdefault("strike", False)

    if not candidates:
        return  # nothing to mark

    # -------------------------------------------------------------
    # Walk through characters and flag those crossed by a candidate
    # -------------------------------------------------------------
    for ch in char_dicts:
        try:
            x0, top, x1, bottom = ch["x0"], ch["top"], ch["x1"], ch["bottom"]
        except KeyError:
            continue  # skip malformed char dict

        width = x1 - x0
        height = bottom - top
        if width <= 0 or height <= 0:
            continue

        mid_y0 = top + band_top * height
        mid_y1 = top + band_bottom * height

        for lx0, ly0, lx1, ly1 in candidates:
            # Candidate must lie inside the central band (±1 pt slack).
            if ly0 < (mid_y0 - 1.0) or ly1 > (mid_y1 + 1.0):
                continue
            overlap = min(x1, lx1) - max(x0, lx0)
            if overlap > 0 and (overlap / width) >= coverage_ratio:
                ch["strike"] = True
                break  # one crossing line is enough
|
1026
|
+
|
1027
|
+
# ------------------------------------------------------------------
|
1028
|
+
# Underline detection
|
1029
|
+
# ------------------------------------------------------------------
|
1030
|
+
|
1031
|
+
def _mark_underline_chars(
    self,
    char_dicts: List[Dict[str, Any]],
    *,
    thickness_tol: float = None,
    horiz_tol: float = None,
    coverage_ratio: float = None,
    band_frac: float = None,
    below_pad: float = None,
) -> None:
    """Annotate character dicts with a boolean ``underline`` flag.

    A character is flagged when a thin horizontal primitive (a ``line`` or a
    skinny ``rect`` narrower than 95 % of the page width) sits in a band
    around the glyph's bottom edge and overlaps enough of its width.
    Primitives that share a vertical position with at least two others are
    treated as table borders and ignored. Every dict in ``char_dicts`` ends
    up with an ``underline`` key.

    Each setting is resolved in this order: explicit keyword argument, then
    ``PDF._config["underline_detection"]``, then ``UNDERLINE_DEFAULTS``.

    Args:
        char_dicts: Character dictionaries, modified in place.
        thickness_tol: Maximum height (pt) of a rect candidate.
        horiz_tol: Maximum vertical extent (pt) of a line candidate.
        coverage_ratio: Minimum fraction of glyph width that must be covered.
        band_frac: Fraction of glyph height above the glyph bottom that
            still counts as the underline band.
        below_pad: Distance (pt) below the glyph bottom that still counts.
    """
    # Allow user overrides via PDF._config["underline_detection"]
    pdf_cfg = getattr(self._page._parent, "_config", {}).get("underline_detection", {})

    def _setting(name: str, explicit):
        """Resolve one setting; defaults are only consulted when needed."""
        if explicit is not None:
            return explicit
        return pdf_cfg.get(name, UNDERLINE_DEFAULTS[name])

    thickness_tol = _setting("thickness_tol", thickness_tol)
    horiz_tol = _setting("horiz_tol", horiz_tol)
    coverage_ratio = _setting("coverage_ratio", coverage_ratio)
    band_frac = _setting("band_frac", band_frac)
    below_pad = _setting("below_pad", below_pad)

    pg_height = self._page.height
    max_width = self._page.width * 0.95  # wider primitives are page rules

    def _vertical_span(obj: Dict[str, Any]) -> Tuple[float, float]:
        """Return ``(top, bottom)`` of *obj* measured from the page top.

        Glyphs are compared via ``top``/``bottom`` (top-origin), so the
        primitives must use the same system. Prefer the primitive's own
        ``top``/``bottom``; otherwise flip the bottom-origin ``y0``/``y1``.
        """
        if obj.get("top") is not None and obj.get("bottom") is not None:
            a, b = obj["top"], obj["bottom"]
        else:
            a = pg_height - obj.get("y1", 0)
            b = pg_height - obj.get("y0", 0)
        return (a, b) if a <= b else (b, a)

    # Candidates as (x0, top, x1, bottom) in top-based coordinates.
    candidates: List[Tuple[float, float, float, float]] = []
    for primitives, max_extent in (
        (getattr(self._page._page, "lines", None) or [], horiz_tol),
        (getattr(self._page._page, "rects", None) or [], thickness_tol),
    ):
        for obj in primitives:
            obj_top, obj_bottom = _vertical_span(obj)
            obj_x0, obj_x1 = obj.get("x0", 0), obj.get("x1", 0)
            if (obj_bottom - obj_top) <= max_extent and (obj_x1 - obj_x0) < max_width:
                candidates.append((obj_x0, obj_top, obj_x1, obj_bottom))

    for ch in char_dicts:
        ch.setdefault("underline", False)

    if not candidates:
        return

    # Bucket candidates by the integer part of their mid-y; three or more in
    # one bucket are taken to be repeating table borders.
    y_groups: Dict[int, int] = {}
    for _, c_top, _, c_bottom in candidates:
        bucket = int((c_top + c_bottom) / 2)
        y_groups[bucket] = y_groups.get(bucket, 0) + 1

    table_y = {bucket for bucket, count in y_groups.items() if count >= 3}
    filtered_candidates = [
        c for c in candidates if int((c[1] + c[3]) / 2) not in table_y
    ]

    for ch in char_dicts:
        try:
            x0, top, x1, bottom = ch["x0"], ch["top"], ch["x1"], ch["bottom"]
        except KeyError:
            continue  # malformed char dict keeps underline=False

        width = x1 - x0
        height = bottom - top
        if width <= 0 or height <= 0:
            continue

        band_top = bottom - band_frac * height
        band_bottom = bottom + below_pad  # allow some distance below the glyph

        for lx0, ly0, lx1, ly1 in filtered_candidates:
            # Candidate must lie inside the underline band (±1 pt slack).
            if ly0 < band_top - 1 or ly1 > band_bottom + 1:
                continue
            overlap = min(x1, lx1) - max(x0, lx0)
            if overlap > 0 and (overlap / width) >= coverage_ratio:
                ch["underline"] = True
                break
|
1116
|
+
|
1117
|
+
# ------------------------------------------------------------------
|
1118
|
+
# Highlight detection
|
1119
|
+
# ------------------------------------------------------------------
|
1120
|
+
|
1121
|
+
def _mark_highlight_chars(self, char_dicts: List[Dict[str, Any]]) -> None:
    """Detect marker-style highlights and set ``highlight`` on char dicts.

    A rect is a highlight candidate when it is filled, not stroked, and has
    a ``non_stroking_color``. A character is flagged when a candidate:

    * has a height between ``height_min_ratio`` and ``height_max_ratio``
      times the glyph height,
    * vertically contains the glyph (1 pt tolerance), and
    * covers at least ``coverage_ratio`` of the glyph width.

    Flagged characters also receive ``highlight_color`` (list/tuple colours
    are stored as tuples). Thresholds come from
    ``PDF._config["highlight_detection"]`` with ``HIGHLIGHT_DEFAULTS`` as
    fallback. Every dict in ``char_dicts`` ends up with a ``highlight`` key.

    Args:
        char_dicts: Character dictionaries, modified in place.
    """
    cfg = getattr(self._page._parent, "_config", {}).get("highlight_detection", {})

    height_min_ratio = cfg.get("height_min_ratio", HIGHLIGHT_DEFAULTS["height_min_ratio"])
    height_max_ratio = cfg.get("height_max_ratio", HIGHLIGHT_DEFAULTS["height_max_ratio"])
    coverage_ratio = cfg.get("coverage_ratio", HIGHLIGHT_DEFAULTS["coverage_ratio"])

    # Candidate rects as (x0, y_low, x1, y_high, height, colour). Both rects
    # and glyphs are compared in their raw y0/y1 values, so no coordinate
    # conversion is performed here.
    highlight_rects = []
    for rc in getattr(self._page._page, "rects", []):
        if rc.get("stroke", False):
            continue  # border stroke, not fill-only
        if not rc.get("fill", False):
            continue

        # Colour is kept as metadata only; it is not used for filtering.
        fill_col = rc.get("non_stroking_color")
        if fill_col is None:
            continue
        if isinstance(fill_col, (list, tuple)):
            fill_col = tuple(fill_col)  # hashable form for later counting

        y_low = min(rc.get("y0", 0), rc.get("y1", 0))
        y_high = max(rc.get("y0", 0), rc.get("y1", 0))
        highlight_rects.append(
            (rc.get("x0", 0), y_low, rc.get("x1", 0), y_high, y_high - y_low, fill_col)
        )

    for ch in char_dicts:
        ch.setdefault("highlight", False)

    if not highlight_rects:
        return

    for ch in char_dicts:
        try:
            x0_raw, y0_raw, x1_raw, y1_raw = ch["x0"], ch["y0"], ch["x1"], ch["y1"]
        except KeyError:
            continue  # chars without raw y0/y1 cannot be matched

        width = x1_raw - x0_raw
        height = y1_raw - y0_raw
        if width <= 0 or height <= 0:
            continue

        for rx0, ry0, rx1, ry1, rheight, rcolor in highlight_rects:
            # Rect height must be comparable to the glyph height.
            ratio = rheight / height
            if ratio < height_min_ratio or ratio > height_max_ratio:
                continue

            # Glyph must sit vertically inside the rect (1 pt tolerance).
            if not (y0_raw + 1 >= ry0 and y1_raw - 1 <= ry1):
                continue

            overlap = min(x1_raw, rx1) - max(x0_raw, rx0)
            if overlap > 0 and (overlap / width) >= coverage_ratio:
                ch["highlight"] = True
                ch["highlight_color"] = rcolor
                break
|