natural-pdf 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl
- bad_pdf_analysis/analyze_10_more.py +300 -0
- bad_pdf_analysis/analyze_final_10.py +552 -0
- bad_pdf_analysis/analyze_specific_pages.py +394 -0
- bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +45 -1
- natural_pdf/analyzers/layout/surya.py +1 -1
- natural_pdf/analyzers/layout/yolo.py +2 -2
- natural_pdf/analyzers/shape_detection_mixin.py +228 -0
- natural_pdf/classification/manager.py +67 -0
- natural_pdf/core/element_manager.py +556 -25
- natural_pdf/core/highlighting_service.py +98 -43
- natural_pdf/core/page.py +86 -20
- natural_pdf/core/pdf.py +0 -2
- natural_pdf/describe/base.py +40 -9
- natural_pdf/describe/elements.py +11 -6
- natural_pdf/elements/base.py +134 -20
- natural_pdf/elements/collections.py +43 -11
- natural_pdf/elements/image.py +43 -0
- natural_pdf/elements/region.py +64 -19
- natural_pdf/elements/text.py +89 -11
- natural_pdf/flows/collections.py +4 -4
- natural_pdf/flows/region.py +17 -2
- natural_pdf/ocr/engine_paddle.py +1 -1
- natural_pdf/ocr/ocr_factory.py +8 -8
- natural_pdf/ocr/ocr_manager.py +51 -1
- natural_pdf/selectors/parser.py +27 -7
- natural_pdf/tables/__init__.py +5 -0
- natural_pdf/tables/result.py +101 -0
- natural_pdf/utils/bidi_mirror.py +36 -0
- natural_pdf/utils/visualization.py +15 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +51 -29
- natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
- optimization/memory_comparison.py +172 -0
- optimization/pdf_analyzer.py +410 -0
- optimization/performance_analysis.py +397 -0
- optimization/test_cleanup_methods.py +155 -0
- optimization/test_memory_fix.py +162 -0
- tools/bad_pdf_eval/__init__.py +1 -0
- tools/bad_pdf_eval/analyser.py +302 -0
- tools/bad_pdf_eval/collate_summaries.py +130 -0
- tools/bad_pdf_eval/eval_suite.py +116 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
- tools/bad_pdf_eval/llm_enrich.py +273 -0
- tools/bad_pdf_eval/reporter.py +17 -0
- tools/bad_pdf_eval/utils.py +127 -0
- tools/rtl_smoke_test.py +80 -0
- natural_pdf-0.1.27.dist-info/top_level.txt +0 -2
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/element_manager.py

@@ -15,9 +15,37 @@ from pdfplumber.utils.text import WordExtractor
 from natural_pdf.elements.line import LineElement
 from natural_pdf.elements.rect import RectangleElement
 from natural_pdf.elements.text import TextElement
+from natural_pdf.elements.image import ImageElement

 logger = logging.getLogger(__name__)

+# ------------------------------------------------------------------
+# Default decoration-detection parameters (magic numbers centralised)
+# ------------------------------------------------------------------
+
+STRIKE_DEFAULTS = {
+    "thickness_tol": 1.5,  # pt ; max height of line/rect to be considered strike
+    "horiz_tol": 1.0,  # pt ; vertical tolerance for horizontality
+    "coverage_ratio": 0.7,  # proportion of glyph width to be overlapped
+    "band_top_frac": 0.35,  # fraction of glyph height above top baseline band
+    "band_bottom_frac": 0.65,  # fraction below top (same used internally)
+}
+
+UNDERLINE_DEFAULTS = {
+    "thickness_tol": 1.5,
+    "horiz_tol": 1.0,
+    "coverage_ratio": 0.8,
+    "band_frac": 0.25,  # height fraction above baseline
+    "below_pad": 0.7,  # pt ; pad below baseline
+}
+
+HIGHLIGHT_DEFAULTS = {
+    "height_min_ratio": 0.6,  # rect height relative to char height lower bound
+    "height_max_ratio": 2.0,  # upper bound
+    "coverage_ratio": 0.6,  # horizontal overlap with glyph
+    "color_saturation_min": 0.4,  # HSV S >
+    "color_value_min": 0.4,  # HSV V >
+}

 class NaturalWordExtractor(WordExtractor):
     """
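The three `*_DEFAULTS` dicts above centralise the geometric thresholds that the new detection passes read. As a rough standalone illustration of how such thresholds combine (a sketch, not code from the package), a strikethrough test is essentially a central-band check plus a horizontal-coverage check:

```python
# Standalone sketch (not natural-pdf code): a strike test driven by thresholds
# shaped like STRIKE_DEFAULTS above. Boxes are (x0, top, x1, bottom), top-based.
strike_cfg = {"thickness_tol": 1.5, "coverage_ratio": 0.7,
              "band_top_frac": 0.35, "band_bottom_frac": 0.65}

def looks_like_strike(glyph, rule, cfg=strike_cfg):
    gx0, gtop, gx1, gbottom = glyph
    rx0, rtop, rx1, rbottom = rule
    if rbottom - rtop > cfg["thickness_tol"]:        # too thick to be a strike line
        return False
    height, width = gbottom - gtop, gx1 - gx0
    band_top = gtop + cfg["band_top_frac"] * height      # central band of the glyph
    band_bottom = gtop + cfg["band_bottom_frac"] * height
    if not (band_top <= rtop and rbottom <= band_bottom):
        return False
    overlap = min(gx1, rx1) - max(gx0, rx0)          # horizontal overlap with the glyph
    return overlap > 0 and overlap / width >= cfg["coverage_ratio"]

# A 1 pt rule crossing ~85% of a 10 pt glyph, inside its middle band -> strike
print(looks_like_strike((10, 100, 20, 110), (11, 104.5, 19.5, 105.5)))  # True
```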
@@ -125,6 +153,54 @@ class ElementManager:
             f"Page {self._page.number}: Prepared {len(prepared_char_dicts)} character dictionaries."
         )

+        # -------------------------------------------------------------
+        # Detect strikethrough (horizontal strike-out lines) on raw
+        # characters BEFORE we run any word-grouping. This way the
+        # NaturalWordExtractor can use the presence/absence of a
+        # "strike" attribute to decide whether two neighbouring chars
+        # belong to the same word.
+        # -------------------------------------------------------------
+
+        try:
+            self._mark_strikethrough_chars(prepared_char_dicts)
+        except Exception as strike_err:  # pragma: no cover – strike detection must never crash loading
+            logger.warning(
+                f"Page {self._page.number}: Strikethrough detection failed – {strike_err}",
+                exc_info=True,
+            )
+
+        # -------------------------------------------------------------
+        # Detect underlines on raw characters (must come after strike so
+        # both attributes are present before word grouping).
+        # -------------------------------------------------------------
+
+        try:
+            self._mark_underline_chars(prepared_char_dicts)
+        except Exception as u_err:  # pragma: no cover
+            logger.warning(
+                f"Page {self._page.number}: Underline detection failed – {u_err}",
+                exc_info=True,
+            )
+
+        # Detect highlights
+        try:
+            self._mark_highlight_chars(prepared_char_dicts)
+        except Exception as h_err:
+            logger.warning(
+                f"Page {self._page.number}: Highlight detection failed – {h_err}",
+                exc_info=True,
+            )
+
+        # Create a mapping from character dict to index for efficient lookup
+        char_to_index = {}
+        for idx, char_dict in enumerate(prepared_char_dicts):
+            key = (
+                char_dict.get("x0", 0),
+                char_dict.get("top", 0),
+                char_dict.get("text", ""),
+            )
+            char_to_index[key] = idx
+
         # 2. Instantiate the custom word extractor
         # Get config settings from the parent PDF or use defaults
         pdf_config = getattr(self._page._parent, "_config", {})
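The `char_to_index` mapping keys each prepared character dict by `(x0, top, text)` so that, after word grouping, a word's characters can be traced back to their index in `prepared_char_dicts` without storing full copies. A toy sketch of that round trip (plain dicts, not natural-pdf element objects):

```python
# Toy sketch of the (x0, top, text) lookup used above; hypothetical data.
prepared_char_dicts = [
    {"x0": 10.0, "top": 100.0, "text": "H", "strike": False},
    {"x0": 16.2, "top": 100.0, "text": "i", "strike": True},
]

char_to_index = {
    (c.get("x0", 0), c.get("top", 0), c.get("text", "")): idx
    for idx, c in enumerate(prepared_char_dicts)
}

# A grouped word carries copies of its char dicts; the same key recovers each
# char's index, and with it the strike/underline/highlight flags set earlier.
word_chars = [{"x0": 16.2, "top": 100.0, "text": "i"}]
indices = [
    char_to_index[key]
    for key in ((c["x0"], c["top"], c["text"]) for c in word_chars)
    if key in char_to_index
]
print(indices)  # [1]
```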
@@ -134,45 +210,230 @@

         # Define which attributes to preserve on the merged word object
         # Should include split attributes + any others needed for filtering (like color)
-        attributes_to_preserve = list(
-
-
-
-
-
-
-
-
-
-        # line_dir=..., char_dir=...
+        attributes_to_preserve = list(
+            set(
+                self._word_split_attributes
+                + [
+                    "non_stroking_color",
+                    "strike",
+                    "underline",
+                    "highlight",
+                    "highlight_color",
+                ]
+            )
         )

-        #
-
-
-
-
-
-
-
+        # -------------------------------------------------------------
+        # NEW: Detect direction (LTR vs RTL) per visual line and feed
+        # pdfplumber's WordExtractor with the correct settings.
+        # -------------------------------------------------------------
+        import unicodedata
+
+        def _is_rtl_char(ch: str) -> bool:
+            """Return True if the character has an RTL bidi class."""
+            if not ch:
+                return False
+            # If string has more than one character take first (works for most PDFs)
+            first = ch[0]
+            try:
+                return unicodedata.bidirectional(first) in ("R", "AL", "AN")
+            except Exception:
+                return False
+
+        # Helper: group characters into visual lines using y-tolerance
+        sorted_chars_for_line_grouping = sorted(
+            prepared_char_dicts,
+            key=lambda c: (round(c.get("top", 0) / max(yt, 1)) * yt, c.get("x0", 0)),
+        )
+
+        lines: List[List[Dict[str, Any]]] = []
+        current_line_key = None
+        for char_dict in sorted_chars_for_line_grouping:
+            top_val = char_dict.get("top", 0)
+            line_key = round(top_val / max(yt, 1))  # bucket index
+            if current_line_key is None or line_key != current_line_key:
+                # start new line bucket
+                lines.append([])
+                current_line_key = line_key
+            lines[-1].append(char_dict)
+
+        word_elements: List[TextElement] = []
+        # Process each line separately with direction detection
+        for line_chars in lines:
+            if not line_chars:
+                continue
+            # Determine RTL ratio
+            rtl_count = sum(1 for ch in line_chars if _is_rtl_char(ch.get("text", "")))
+            ltr_count = len(line_chars) - rtl_count
+            # Consider RTL if it has strictly more RTL than LTR strong characters
+            is_rtl_line = rtl_count > ltr_count
+
+            # Build a WordExtractor tailored for this line's direction
+            if is_rtl_line:
+                line_dir = "ttb"  # horizontal lines stacked top→bottom
+                char_dir = "rtl"  # characters right→left within the line
+            else:
+                line_dir = "ttb"
+                char_dir = "ltr"
+
+            extractor = NaturalWordExtractor(
+                word_split_attributes=self._word_split_attributes + ["strike", "underline", "highlight"],
+                extra_attrs=attributes_to_preserve,
+                x_tolerance=xt,
+                y_tolerance=yt,
+                keep_blank_chars=True,
+                use_text_flow=use_flow,
+                line_dir=line_dir,
+                char_dir=char_dir,
             )

-
+            # Prepare character sequence for the extractor:
+            #   • For LTR lines -> left→right order (x0 ascending)
+            #   • For RTL lines -> feed **reversed** list so that neighbouring
+            #     characters appear adjacent when the extractor walks right→left.
+            line_chars_for_extractor = sorted(line_chars, key=lambda c: c.get("x0", 0))
+
+            try:
+                word_tuples = extractor.iter_extract_tuples(line_chars_for_extractor)
+            except Exception as e:  # pragma: no cover
+                logger.error(
+                    f"Word extraction failed on line (rtl={is_rtl_line}) of page {self._page.number}: {e}",
+                    exc_info=True,
+                )
+                word_tuples = []
+
             for word_dict, char_list in word_tuples:
-                #
-
+                # Memory optimisation for char indices
+                char_indices = []
+                for char_dict in char_list:
+                    key = (
+                        char_dict.get("x0", 0),
+                        char_dict.get("top", 0),
+                        char_dict.get("text", ""),
+                    )
+                    # char_to_index dict built earlier in load_elements
+                    if key in char_to_index:
+                        char_indices.append(char_to_index[key])
+                word_dict["_char_indices"] = char_indices
+                word_dict["_char_dicts"] = char_list  # keep for back-compat
+                # Create and append TextElement
                 word_element = self._create_word_element(word_dict)
-
+                word_elements.append(word_element)
+
+                # Decide if this individual word contains RTL characters; safer than relying
+                # on the whole-line heuristic.
+                rtl_in_word = any(_is_rtl_char(ch.get("text", "")) for ch in char_list)
+                if rtl_in_word:
+                    try:
+                        from bidi.algorithm import get_display  # type: ignore
+                        from natural_pdf.utils.bidi_mirror import mirror_brackets
+
+                        word_element.text = mirror_brackets(
+                            get_display(word_element.text, base_dir="R")
+                        )
+                    except Exception:
+                        # Fallback: keep original text if python-bidi fails
+                        pass
+
+        # ------------------------------------------------------------------
+        # Propagate per-char strikethrough info up to word level.
+        # ------------------------------------------------------------------
+
+        if prepared_char_dicts:
+            for w in word_elements:
+                strike_chars = 0
+                total_chars = 0
+                if getattr(w, "_char_indices", None):
+                    for idx in w._char_indices:
+                        if 0 <= idx < len(prepared_char_dicts):
+                            total_chars += 1
+                            if prepared_char_dicts[idx].get("strike"):
+                                strike_chars += 1
+                elif getattr(w, "_char_dicts", None):
+                    for ch in w._char_dicts:
+                        total_chars += 1
+                        if ch.get("strike"):
+                            strike_chars += 1
+
+                if total_chars:
+                    w._obj["strike"] = (strike_chars / total_chars) >= 0.6
+                else:
+                    w._obj["strike"] = False
+
+                # underline propagation
+                ul_chars = 0
+                if getattr(w, "_char_indices", None):
+                    for idx in w._char_indices:
+                        if 0 <= idx < len(prepared_char_dicts):
+                            if prepared_char_dicts[idx].get("underline"):
+                                ul_chars += 1
+                elif getattr(w, "_char_dicts", None):
+                    ul_chars = sum(1 for ch in w._char_dicts if ch.get("underline"))
+
+                if total_chars:
+                    w._obj["underline"] = (ul_chars / total_chars) >= 0.6
+                else:
+                    w._obj["underline"] = False
+
+                # highlight propagation
+                hl_chars = 0
+                if getattr(w, "_char_indices", None):
+                    for idx in w._char_indices:
+                        if 0 <= idx < len(prepared_char_dicts):
+                            if prepared_char_dicts[idx].get("highlight"):
+                                hl_chars += 1
+                elif getattr(w, "_char_dicts", None):
+                    hl_chars = sum(1 for ch in w._char_dicts if ch.get("highlight"))
+
+                if total_chars:
+                    w._obj["highlight"] = (hl_chars / total_chars) >= 0.6
+                else:
+                    w._obj["highlight"] = False
+
+                # Determine dominant highlight color among chars
+                if w._obj.get("highlight"):
+                    color_counts = {}
+                    source_iter = (
+                        (prepared_char_dicts[idx] for idx in w._char_indices)
+                        if getattr(w, "_char_indices", None)
+                        else w._char_dicts if getattr(w, "_char_dicts", None) else []
+                    )
+                    for chd in source_iter:
+                        if chd.get("highlight") and chd.get("highlight_color") is not None:
+                            col = chd["highlight_color"]
+                            color_counts[col] = color_counts.get(col, 0) + 1
+
+                    if color_counts:
+                        dominant_color = max(color_counts.items(), key=lambda t: t[1])[0]
+                        try:
+                            w._obj["highlight_color"] = tuple(dominant_color) if isinstance(dominant_color, (list, tuple)) else dominant_color
+                        except Exception:
+                            w._obj["highlight_color"] = dominant_color
+
+        generated_words = word_elements
         logger.debug(
             f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
         )

+        # --- Post-processing pass to ensure every word containing RTL characters is
+        #     stored in logical order and with mirrored brackets. This is a
+        #     safeguard in case the per-line loop above missed some tokens.
+        try:
+            from bidi.algorithm import get_display  # type: ignore
+            from natural_pdf.utils.bidi_mirror import mirror_brackets
+
+            for w in generated_words:
+                if any(_is_rtl_char(ch) for ch in w.text):
+                    w.text = mirror_brackets(get_display(w.text, base_dir="R"))
+        except Exception:
+            pass  # graceful degradation – keep original text
+
         # 4. Load other elements (rects, lines)
         rect_elements = [RectangleElement(r, self._page) for r in self._page._page.rects]
         line_elements = [LineElement(l, self._page) for l in self._page._page.lines]
+        image_elements = [ImageElement(i, self._page) for i in self._page._page.images]
         logger.debug(
-            f"Page {self._page.number}: Loaded {len(rect_elements)} rects, {len(line_elements)} lines."
+            f"Page {self._page.number}: Loaded {len(rect_elements)} rects, {len(line_elements)} lines, {len(image_elements)} images."
         )

         # 5. Create the final elements dictionary
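The per-line direction vote above counts characters whose Unicode bidi class is strongly right-to-left (`R`, `AL`, `AN`) and treats the line as RTL only when those outnumber everything else; words containing RTL characters are then reordered with python-bidi's `get_display` and bracket-mirrored. A minimal standalone version of the voting heuristic (standard library only, not the package's code):

```python
import unicodedata

def is_rtl_char(ch: str) -> bool:
    # Strong RTL bidi categories: R (Hebrew etc.), AL (Arabic letter), AN (Arabic number)
    return bool(ch) and unicodedata.bidirectional(ch[0]) in ("R", "AL", "AN")

def line_is_rtl(chars) -> bool:
    rtl = sum(1 for c in chars if is_rtl_char(c))
    return rtl > len(chars) - rtl  # strictly more RTL than LTR/neutral characters

print(line_is_rtl("שלום"))           # True  – Hebrew line
print(line_is_rtl("hello (שלום)"))   # False – Latin majority
```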
@@ -183,6 +444,7 @@ class ElementManager:
             "words": generated_words,
             "rects": rect_elements,
             "lines": line_elements,
+            "images": image_elements,
         }

         # Add regions if they exist
@@ -238,6 +500,11 @@ class ElementManager:
             augmented_dict.setdefault("upright", True)
             augmented_dict.setdefault("fontname", "Unknown")
             augmented_dict.setdefault("size", 0)
+            augmented_dict.setdefault("highlight_color", None)
+            # Ensure decoration keys exist for safe grouping
+            augmented_dict.setdefault("strike", False)
+            augmented_dict.setdefault("underline", False)
+            augmented_dict.setdefault("highlight", False)

             prepared_dicts.append(augmented_dict)
             # Use a unique identifier if available (e.g., tuple of key properties)
@@ -385,12 +652,21 @@ class ElementManager:
                 "italic": False,
                 "upright": True,
                 "doctop": pdf_top + self._page._page.initial_doctop,
+                "strike": False,
+                "underline": False,
+                "highlight": False,
+                "highlight_color": None,
             }

             # Create the representative char dict for this OCR word
             ocr_char_dict = word_element_data.copy()
             ocr_char_dict["object_type"] = "char"
             ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
+            # Ensure decoration keys
+            ocr_char_dict.setdefault("strike", False)
+            ocr_char_dict.setdefault("underline", False)
+            ocr_char_dict.setdefault("highlight", False)
+            ocr_char_dict.setdefault("highlight_color", None)

             # Add the char dict list to the word data before creating TextElement
             word_element_data["_char_dicts"] = [ocr_char_dict]  # Store itself as its only char
@@ -550,6 +826,12 @@ class ElementManager:
         self.load_elements()
         return self._elements.get("regions", [])

+    @property
+    def images(self):
+        """Get all image elements."""
+        self.load_elements()
+        return self._elements.get("images", [])
+
     def remove_ocr_elements(self):
         """
         Remove all elements with source="ocr" from the elements dictionary.
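The underline pass in the hunk below deliberately ignores horizontal rules that repeat at the same vertical position three or more times, on the assumption that they are table borders rather than underlines. A standalone sketch of that bucketing filter, with toy coordinates (not the package's code):

```python
# Candidate rules as (x0, y0, x1, y1); y levels hit by 3+ candidates are treated
# as table borders and dropped before the underline check.
candidates = [
    (50, 200.1, 150, 200.4), (200, 200.0, 300, 200.3), (350, 200.2, 450, 200.5),  # table row rule
    (80, 412.0, 140, 412.4),                                                       # genuine underline
]

y_groups = {}
for _, y0, _, y1 in candidates:
    key = int((y0 + y1) / 2)          # bucket by rounded mid-y
    y_groups[key] = y_groups.get(key, 0) + 1

table_y = {k for k, v in y_groups.items() if v >= 3}
filtered = [c for c in candidates if int((c[1] + c[3]) / 2) not in table_y]
print(filtered)  # only the 412 pt rule survives as an underline candidate
```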
@@ -632,3 +914,252 @@ class ElementManager:
                 return True

         return False
+
+    # ------------------------------------------------------------------
+    # Strikethrough detection (horizontal strike-out lines)
+    # ------------------------------------------------------------------
+
+    def _mark_strikethrough_chars(self, char_dicts: List[Dict[str, Any]], *,
+                                  thickness_tol: float = 1.5,
+                                  horiz_tol: float = 1.0,
+                                  coverage_ratio: float = 0.7,
+                                  band_top: float = 0.35,
+                                  band_bottom: float = 0.65) -> None:
+        """Annotate character dictionaries with a boolean ``strike`` flag.
+
+        Args
+        ----
+        char_dicts : list
+            The list that _prepare_char_dicts() returned – *modified in place*.
+        thickness_tol : float
+            Maximum height (in PDF pts) for a path to be considered a strike.
+        horiz_tol : float
+            Vertical tolerance when deciding if a pdfplumber ``line`` object
+            is horizontal (|y0-y1| ≤ horiz_tol).
+        coverage_ratio : float
+            Minimum proportion of the glyph's width that must be overlapped
+            by a candidate line.
+        band_top, band_bottom : float
+            Fractions of the glyph's height that define the central band in
+            which a line must fall to count as a strikethrough. Defaults to
+            35–65 %.
+        """
+
+        # -------------------------------------------------------------
+        # Collect candidate horizontal primitives (lines + skinny rects)
+        # -------------------------------------------------------------
+        raw_lines = list(getattr(self._page._page, "lines", []))
+        raw_rects = list(getattr(self._page._page, "rects", []))
+
+        candidates: List[Tuple[float, float, float, float]] = []  # (x0, y0, x1, y1)
+
+        # pdfplumber line objects – treat those whose angle ≈ 0°
+        for ln in raw_lines:
+            y0 = min(ln.get("y0", 0), ln.get("y1", 0))
+            y1 = max(ln.get("y0", 0), ln.get("y1", 0))
+            if abs(y1 - y0) <= horiz_tol:  # horizontal
+                candidates.append((ln.get("x0", 0), y0, ln.get("x1", 0), y1))
+
+        # Thin rectangles that act as drawn lines
+        pg_height = self._page.height
+        for rc in raw_rects:
+            rb0 = rc.get("y0", 0)
+            rb1 = rc.get("y1", 0)
+            y0_raw = min(rb0, rb1)
+            y1_raw = max(rb0, rb1)
+            if (y1_raw - y0_raw) <= thickness_tol:
+                # Convert from PDF (origin bottom-left) to top-based coords used by chars
+                y0 = pg_height - y1_raw  # upper edge distance from top
+                y1 = pg_height - y0_raw  # lower edge distance from top
+                candidates.append((rc.get("x0", 0), y0, rc.get("x1", 0), y1))
+
+        if not candidates:
+            return  # nothing to mark
+
+        # -------------------------------------------------------------
+        # Walk through characters and flag those crossed by a candidate
+        # -------------------------------------------------------------
+        for ch in char_dicts:
+            ch.setdefault("strike", False)  # default value
+            try:
+                x0, top, x1, bottom = ch["x0"], ch["top"], ch["x1"], ch["bottom"]
+            except KeyError:
+                continue  # skip malformed char dict
+
+            width = x1 - x0
+            height = bottom - top
+            if width <= 0 or height <= 0:
+                continue
+
+            mid_y0 = top + band_top * height
+            mid_y1 = top + band_bottom * height
+
+            # Check each candidate line for overlap
+            for lx0, ly0, lx1, ly1 in candidates:
+                if (ly0 >= (mid_y0 - 1.0)) and (ly1 <= (mid_y1 + 1.0)):  # lies inside central band
+                    overlap = min(x1, lx1) - max(x0, lx0)
+                    if overlap > 0 and (overlap / width) >= coverage_ratio:
+                        ch["strike"] = True
+                        break  # no need to check further lines
+
+        # Done – char_dicts mutated in place
+
+    # ------------------------------------------------------------------
+    # Underline detection
+    # ------------------------------------------------------------------
+
+    def _mark_underline_chars(
+        self,
+        char_dicts: List[Dict[str, Any]],
+        *,
+        thickness_tol: float = None,
+        horiz_tol: float = None,
+        coverage_ratio: float = None,
+        band_frac: float = None,
+        below_pad: float = None,
+    ) -> None:
+        """Annotate character dicts with ``underline`` flag."""
+
+        # Allow user overrides via PDF._config["underline_detection"]
+        pdf_cfg = getattr(self._page._parent, "_config", {}).get("underline_detection", {})
+
+        thickness_tol = thickness_tol if thickness_tol is not None else pdf_cfg.get("thickness_tol", UNDERLINE_DEFAULTS["thickness_tol"])
+        horiz_tol = horiz_tol if horiz_tol is not None else pdf_cfg.get("horiz_tol", UNDERLINE_DEFAULTS["horiz_tol"])
+        coverage_ratio = coverage_ratio if coverage_ratio is not None else pdf_cfg.get("coverage_ratio", UNDERLINE_DEFAULTS["coverage_ratio"])
+        band_frac = band_frac if band_frac is not None else pdf_cfg.get("band_frac", UNDERLINE_DEFAULTS["band_frac"])
+        below_pad = below_pad if below_pad is not None else pdf_cfg.get("below_pad", UNDERLINE_DEFAULTS["below_pad"])
+
+        raw_lines = list(getattr(self._page._page, "lines", []))
+        raw_rects = list(getattr(self._page._page, "rects", []))
+
+        candidates: List[Tuple[float, float, float, float]] = []
+
+        for ln in raw_lines:
+            y0 = min(ln.get("y0", 0), ln.get("y1", 0))
+            y1 = max(ln.get("y0", 0), ln.get("y1", 0))
+            if abs(y1 - y0) <= horiz_tol and (
+                (ln.get("x1", 0) - ln.get("x0", 0)) < self._page.width * 0.95
+            ):  # ignore full-width rules
+                candidates.append((ln.get("x0", 0), y0, ln.get("x1", 0), y1))
+
+        pg_height = self._page.height
+        for rc in raw_rects:
+            rb0 = rc.get("y0", 0)
+            rb1 = rc.get("y1", 0)
+            y0_raw = min(rb0, rb1)
+            y1_raw = max(rb0, rb1)
+            if (y1_raw - y0_raw) <= thickness_tol and (
+                (rc.get("x1", 0) - rc.get("x0", 0)) < self._page.width * 0.95
+            ):
+                y0 = pg_height - y1_raw
+                y1 = pg_height - y0_raw
+                candidates.append((rc.get("x0", 0), y0, rc.get("x1", 0), y1))
+
+        if not candidates:
+            for ch in char_dicts:
+                ch.setdefault("underline", False)
+            return
+
+        # group candidates by y within tolerance 0.5 to detect repeating table borders
+        y_groups: Dict[int, int] = {}
+        for _, y0, _, y1 in candidates:
+            key = int((y0 + y1) / 2)
+            y_groups[key] = y_groups.get(key, 0) + 1
+
+        table_y = {k for k, v in y_groups.items() if v >= 3}
+
+        # filter out candidates on those y values
+        filtered_candidates = [c for c in candidates if int((c[1]+c[3])/2) not in table_y]
+
+        # annotate chars
+        for ch in char_dicts:
+            ch.setdefault("underline", False)
+            try:
+                x0, top, x1, bottom = ch["x0"], ch["top"], ch["x1"], ch["bottom"]
+            except KeyError:
+                continue
+
+            width = x1 - x0
+            height = bottom - top
+            if width <= 0 or height <= 0:
+                continue
+
+            band_top = bottom - band_frac * height
+            band_bottom = bottom + below_pad  # allow some distance below baseline
+
+            for lx0, ly0, lx1, ly1 in filtered_candidates:
+                if (ly0 >= band_top - 1) and (ly1 <= band_bottom + 1):
+                    overlap = min(x1, lx1) - max(x0, lx0)
+                    if overlap > 0 and (overlap / width) >= coverage_ratio:
+                        ch["underline"] = True
+                        break
+
+    # ------------------------------------------------------------------
+    # Highlight detection
+    # ------------------------------------------------------------------
+
+    def _mark_highlight_chars(self, char_dicts: List[Dict[str, Any]]) -> None:
+        """Detect PDF marker-style highlights and set ``highlight`` on char dicts."""
+
+        cfg = getattr(self._page._parent, "_config", {}).get("highlight_detection", {})
+
+        height_min_ratio = cfg.get("height_min_ratio", HIGHLIGHT_DEFAULTS["height_min_ratio"])
+        height_max_ratio = cfg.get("height_max_ratio", HIGHLIGHT_DEFAULTS["height_max_ratio"])
+        coverage_ratio = cfg.get("coverage_ratio", HIGHLIGHT_DEFAULTS["coverage_ratio"])
+
+        raw_rects = list(getattr(self._page._page, "rects", []))
+        pg_height = self._page.height
+
+        # Build list of candidate highlight rectangles (convert to top-based coords)
+        highlight_rects = []
+        for rc in raw_rects:
+            if rc.get("stroke", False):
+                continue  # border stroke, not fill-only
+            if not rc.get("fill", False):
+                continue
+
+            fill_col = rc.get("non_stroking_color")
+            # We keep colour as metadata but no longer filter on it
+            if fill_col is None:
+                continue
+
+            y0_rect = min(rc.get("y0", 0), rc.get("y1", 0))
+            y1_rect = max(rc.get("y0", 0), rc.get("y1", 0))
+            rheight = y1_rect - y0_rect
+            highlight_rects.append((rc.get("x0", 0), y0_rect, rc.get("x1", 0), y1_rect, rheight, fill_col))
+
+        if not highlight_rects:
+            for ch in char_dicts:
+                ch.setdefault("highlight", False)
+            return
+
+        for ch in char_dicts:
+            ch.setdefault("highlight", False)
+            try:
+                x0_raw, y0_raw, x1_raw, y1_raw = ch["x0"], ch["y0"], ch["x1"], ch["y1"]
+            except KeyError:
+                continue
+
+            width = x1_raw - x0_raw
+            height = y1_raw - y0_raw
+            if width <= 0 or height <= 0:
+                continue
+
+            for rx0, ry0, rx1, ry1, rheight, rcolor in highlight_rects:
+                # height ratio check relative to char
+                ratio = rheight / height if height else 0
+                if ratio < height_min_ratio or ratio > height_max_ratio:
+                    continue
+
+                # vertical containment in raw coords
+                if not (y0_raw + 1 >= ry0 and y1_raw - 1 <= ry1):
+                    continue
+
+                overlap = min(x1_raw, rx1) - max(x0_raw, rx0)
+                if overlap > 0 and (overlap / width) >= coverage_ratio:
+                    ch["highlight"] = True
+                    try:
+                        ch["highlight_color"] = tuple(rcolor) if isinstance(rcolor, (list, tuple)) else rcolor
+                    except Exception:
+                        ch["highlight_color"] = rcolor
+                    break