natural-pdf 0.1.30__py3-none-any.whl → 0.1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/core/element_manager.py +44 -24
- natural_pdf/elements/text.py +29 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.31.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.31.dist-info}/RECORD +8 -8
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.31.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.31.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.31.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.30.dist-info → natural_pdf-0.1.31.dist-info}/top_level.txt +0 -0
@@ -7,6 +7,7 @@ characters, words, rectangles, and lines extracted from a page.
|
|
7
7
|
|
8
8
|
import logging
|
9
9
|
import re
|
10
|
+
from contextlib import contextmanager
|
10
11
|
from itertools import groupby
|
11
12
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
12
13
|
|
@@ -47,6 +48,33 @@ HIGHLIGHT_DEFAULTS = {
|
|
47
48
|
"color_value_min": 0.4, # HSV V >
|
48
49
|
}
|
49
50
|
|
51
|
+
|
52
|
+
@contextmanager
def disable_text_sync():
    """
    Temporarily swap the ``TextElement.text`` setter for one that skips sync.

    While the ``with`` block is active, assigning to ``element.text`` only
    writes ``element._obj["text"]``; the regular setter (which, per the
    original notes, performs character-level synchronization) is bypassed.
    This is intended for bulk text updates such as bidi processing, where
    the regular setter was reported to cause exponential recursion on
    Arabic/RTL text.

    The swap is made on the ``TextElement`` class itself, so it applies to
    every instance for as long as the block is active. The original property
    object is always put back on exit, including when the block raises.

    Yields:
        None. The context manager is used only for its side effect.
    """
    # Keep the original descriptor so the exact same object can be restored,
    # including any deleter or explicit doc it carries.
    original_property = TextElement.text

    def fast_setter(self, value):
        # Write straight to the backing dict; no character synchronization.
        self._obj["text"] = value

    # property.setter() returns a copy that keeps the original getter,
    # deleter and doc, replacing only the setter.
    TextElement.text = original_property.setter(fast_setter)

    try:
        yield
    finally:
        # Restore the original descriptor rather than rebuilding a new one.
        TextElement.text = original_property
|
77
|
+
|
50
78
|
class NaturalWordExtractor(WordExtractor):
|
51
79
|
"""
|
52
80
|
Custom WordExtractor that splits words based on specified character attributes
|
@@ -208,8 +236,7 @@ class ElementManager:
|
|
208
236
|
yt = pdf_config.get("y_tolerance", 3)
|
209
237
|
use_flow = pdf_config.get("use_text_flow", False)
|
210
238
|
|
211
|
-
#
|
212
|
-
# Should include split attributes + any others needed for filtering (like color)
|
239
|
+
# List of attributes to preserve on word objects
|
213
240
|
attributes_to_preserve = list(
|
214
241
|
set(
|
215
242
|
self._word_split_attributes
|
@@ -223,7 +250,7 @@ class ElementManager:
|
|
223
250
|
)
|
224
251
|
)
|
225
252
|
|
226
|
-
#
|
253
|
+
# ------------------------------------------------------------------
|
227
254
|
# NEW: Detect direction (LTR vs RTL) per visual line and feed
|
228
255
|
# pdfplumber's WordExtractor with the correct settings.
|
229
256
|
# -------------------------------------------------------------
|
@@ -271,7 +298,9 @@ class ElementManager:
|
|
271
298
|
# Build a WordExtractor tailored for this line's direction
|
272
299
|
if is_rtl_line:
|
273
300
|
line_dir = "ttb" # horizontal lines stacked top→bottom
|
274
|
-
|
301
|
+
# Characters are fed in ascending x0 order (left→right), so the extractor
|
302
|
+
# can treat them as left-to-right even on an RTL line.
|
303
|
+
char_dir = "ltr"
|
275
304
|
else:
|
276
305
|
line_dir = "ttb"
|
277
306
|
char_dir = "ltr"
|
@@ -288,9 +317,8 @@ class ElementManager:
|
|
288
317
|
)
|
289
318
|
|
290
319
|
# Prepare character sequence for the extractor:
|
291
|
-
#
|
292
|
-
#
|
293
|
-
# characters appear adjacent when the extractor walks right→left.
|
320
|
+
# Always feed characters in spatial order (x0 ascending)
|
321
|
+
# PDF stores glyphs in visual order, so this gives us the visual sequence
|
294
322
|
line_chars_for_extractor = sorted(line_chars, key=lambda c: c.get("x0", 0))
|
295
323
|
|
296
324
|
try:
|
@@ -324,15 +352,18 @@ class ElementManager:
|
|
324
352
|
# on the whole-line heuristic.
|
325
353
|
rtl_in_word = any(_is_rtl_char(ch.get("text", "")) for ch in char_list)
|
326
354
|
if rtl_in_word:
|
355
|
+
# Convert from visual order (from PDF) to logical order using bidi
|
327
356
|
try:
|
328
357
|
from bidi.algorithm import get_display # type: ignore
|
329
358
|
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
330
359
|
|
331
|
-
|
332
|
-
|
333
|
-
|
360
|
+
with disable_text_sync():
|
361
|
+
# word_element.text is currently in visual order (from PDF)
|
362
|
+
# Convert to logical order using bidi with an explicit LTR base direction (base_dir='L')
|
363
|
+
logical_text = get_display(word_element.text, base_dir='L')
|
364
|
+
# Apply bracket mirroring for logical order
|
365
|
+
word_element.text = mirror_brackets(logical_text)
|
334
366
|
except Exception:
|
335
|
-
# Fallback: keep original text if python-bidi fails
|
336
367
|
pass
|
337
368
|
|
338
369
|
# ------------------------------------------------------------------
|
@@ -415,19 +446,6 @@ class ElementManager:
|
|
415
446
|
f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
|
416
447
|
)
|
417
448
|
|
418
|
-
# --- Post-processing pass to ensure every word containing RTL characters is
|
419
|
-
# stored in logical order and with mirrored brackets. This is a
|
420
|
-
# safeguard in case the per-line loop above missed some tokens.
|
421
|
-
try:
|
422
|
-
from bidi.algorithm import get_display # type: ignore
|
423
|
-
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
424
|
-
|
425
|
-
for w in generated_words:
|
426
|
-
if any(_is_rtl_char(ch) for ch in w.text):
|
427
|
-
w.text = mirror_brackets(get_display(w.text, base_dir="R"))
|
428
|
-
except Exception:
|
429
|
-
pass # graceful degradation – keep original text
|
430
|
-
|
431
449
|
# 4. Load other elements (rects, lines)
|
432
450
|
rect_elements = [RectangleElement(r, self._page) for r in self._page._page.rects]
|
433
451
|
line_elements = [LineElement(l, self._page) for l in self._page._page.lines]
|
@@ -463,6 +481,8 @@ class ElementManager:
|
|
463
481
|
|
464
482
|
logger.debug(f"Page {self._page.number}: Element loading complete.")
|
465
483
|
|
484
|
+
# If per-word BiDi was skipped, generated_words already stay in logical order.
|
485
|
+
|
466
486
|
def _prepare_char_dicts(self) -> List[Dict[str, Any]]:
|
467
487
|
"""
|
468
488
|
Prepares a list of character dictionaries from native PDF characters,
|
natural_pdf/elements/text.py
CHANGED
@@ -468,3 +468,32 @@ class TextElement(Element):
|
|
468
468
|
info[f"raw_{prop}"] = self._obj[prop]
|
469
469
|
|
470
470
|
return info
|
471
|
+
|
472
|
+
@property
def visual_text(self) -> str:
    """Text of this element reordered into Unicode BiDi *visual* order.

    Only ``self.text`` is read; nothing is assigned, so neither the element's
    text nor its character dictionaries are modified. Intended for UI and
    rendering code that needs human-readable mixed RTL/LTR output.

    Returns:
        ``self.text`` unchanged when it is empty, when it contains no
        character of bidi class ``R``, ``AL`` or ``AN``, or when the bidi
        conversion cannot be imported or raises. Otherwise the result of
        ``get_display(..., base_dir="R")`` passed through ``mirror_brackets``.
    """
    source_text = self.text
    if not source_text:
        return source_text

    import unicodedata

    # Cheap pre-check: skip the bidi import and conversion entirely for text
    # that contains no right-to-left characters.
    rtl_classes = ("R", "AL", "AN")
    if all(unicodedata.bidirectional(ch) not in rtl_classes for ch in source_text):
        return source_text

    try:
        from bidi.algorithm import get_display  # type: ignore
        from natural_pdf.utils.bidi_mirror import mirror_brackets

        # Logical order -> visual order, then bracket mirroring.
        reordered = get_display(source_text, base_dir="R")
        return mirror_brackets(reordered)
    except Exception:
        # python-bidi missing or failing: fall back to the logical text.
        return source_text
|
@@ -28,7 +28,7 @@ natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZH
|
|
28
28
|
natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
|
29
29
|
natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
|
30
30
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
31
|
-
natural_pdf/core/element_manager.py,sha256=
|
31
|
+
natural_pdf/core/element_manager.py,sha256=Mn4cYqPL_2LD_GK9lf2duExaJF1qhASCKsOdAZdQb00,49821
|
32
32
|
natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
|
33
33
|
natural_pdf/core/page.py,sha256=kQKKqsbOaNeLhW3ark6mueDS-4tsopJcGcoMmKPK6B8,125624
|
34
34
|
natural_pdf/core/pdf.py,sha256=YfniZp54AyptzMyr7ZP8n617n4wlV28SPrajt32nNBk,74233
|
@@ -44,7 +44,7 @@ natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI
|
|
44
44
|
natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
|
45
45
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
46
46
|
natural_pdf/elements/region.py,sha256=v1PzWvQoGHGdn7SQiPf4Oq3hIGueIfYGwcZ05ZU6XPE,127692
|
47
|
-
natural_pdf/elements/text.py,sha256=
|
47
|
+
natural_pdf/elements/text.py,sha256=kw7u2KfHtDB905YawP7Hs89kcR8XnbtpkYQGEk6LNyk,18860
|
48
48
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
49
49
|
natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
|
50
50
|
natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
|
@@ -100,7 +100,7 @@ natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6
|
|
100
100
|
natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
|
101
101
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
102
102
|
natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
|
103
|
-
natural_pdf-0.1.
|
103
|
+
natural_pdf-0.1.31.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
104
104
|
optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
|
105
105
|
optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
|
106
106
|
optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
|
@@ -115,8 +115,8 @@ tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrO
|
|
115
115
|
tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
|
116
116
|
tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
|
117
117
|
tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
|
118
|
-
natural_pdf-0.1.
|
119
|
-
natural_pdf-0.1.
|
120
|
-
natural_pdf-0.1.
|
121
|
-
natural_pdf-0.1.
|
122
|
-
natural_pdf-0.1.
|
118
|
+
natural_pdf-0.1.31.dist-info/METADATA,sha256=tqimu2ZReyYu5pS0PsbCo-Z9fIzkpMj1ljGPNbaOFss,6711
|
119
|
+
natural_pdf-0.1.31.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
120
|
+
natural_pdf-0.1.31.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
121
|
+
natural_pdf-0.1.31.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
|
122
|
+
natural_pdf-0.1.31.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|