natural-pdf 0.1.30__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@ characters, words, rectangles, and lines extracted from a page.
7
7
 
8
8
  import logging
9
9
  import re
10
+ from contextlib import contextmanager
10
11
  from itertools import groupby
11
12
  from typing import Any, Dict, List, Optional, Tuple, Union
12
13
 
@@ -47,6 +48,33 @@ HIGHLIGHT_DEFAULTS = {
47
48
  "color_value_min": 0.4, # HSV V >
48
49
  }
49
50
 
51
+
52
+ @contextmanager
53
+ def disable_text_sync():
54
+ """
55
+ Temporarily replace the ``TextElement.text`` setter with one that skips text synchronization.
56
+
57
+ The fast setter only stores the new value under the ``text`` key of ``self._obj``. The property is
58
+ swapped on the ``TextElement`` class itself, and the saved setter is put back when the ``with`` block exits, even if the body raises.
59
+ Meant for bulk text updates where character-level synchronization is not needed, such as bidi processing; fixes an exponential recursion issue with Arabic/RTL text processing.
60
+ """
61
+ # Keep the current setter (fset) so the finally block can reinstate it
62
+ original_setter = TextElement.text.fset
63
+
64
+ # Replacement setter: writes straight to the underlying _obj dict and does nothing else
65
+ def fast_setter(self, value):
66
+ self._obj["text"] = value
67
+ # Character synchronization is deliberately skipped here for performance
68
+
69
+ # Rebuild the property with the existing getter and the fast setter (this patches the class, not one instance)
70
+ TextElement.text = property(TextElement.text.fget, fast_setter)
71
+
72
+ try:
73
+ yield
74
+ finally:
75
+ # Reinstate the saved setter, pairing it with the current getter
76
+ TextElement.text = property(TextElement.text.fget, original_setter)
77
+
50
78
  class NaturalWordExtractor(WordExtractor):
51
79
  """
52
80
  Custom WordExtractor that splits words based on specified character attributes
@@ -208,8 +236,7 @@ class ElementManager:
208
236
  yt = pdf_config.get("y_tolerance", 3)
209
237
  use_flow = pdf_config.get("use_text_flow", False)
210
238
 
211
- # Define which attributes to preserve on the merged word object
212
- # Should include split attributes + any others needed for filtering (like color)
239
+ # List of attributes to preserve on word objects
213
240
  attributes_to_preserve = list(
214
241
  set(
215
242
  self._word_split_attributes
@@ -223,7 +250,7 @@ class ElementManager:
223
250
  )
224
251
  )
225
252
 
226
- # -------------------------------------------------------------
253
+ # ------------------------------------------------------------------
227
254
  # NEW: Detect direction (LTR vs RTL) per visual line and feed
228
255
  # pdfplumber's WordExtractor with the correct settings.
229
256
  # -------------------------------------------------------------
@@ -271,7 +298,9 @@ class ElementManager:
271
298
  # Build a WordExtractor tailored for this line's direction
272
299
  if is_rtl_line:
273
300
  line_dir = "ttb" # horizontal lines stacked top→bottom
274
- char_dir = "rtl" # characters right→left within the line
301
+ # Characters are fed in ascending-x0 (visual) order, so the extractor
302
+ # walks them left-to-right; words containing RTL characters are reordered with bidi afterwards.
303
+ char_dir = "ltr"
275
304
  else:
276
305
  line_dir = "ttb"
277
306
  char_dir = "ltr"
@@ -288,9 +317,8 @@ class ElementManager:
288
317
  )
289
318
 
290
319
  # Prepare character sequence for the extractor:
291
- # For LTR lines -> left→right order (x0 ascending)
292
- # For RTL lines -> feed **reversed** list so that neighbouring
293
- # characters appear adjacent when the extractor walks right→left.
320
+ # Always feed characters in spatial order (x0 ascending)
321
+ # PDF stores glyphs in visual order, so this gives us the visual sequence
294
322
  line_chars_for_extractor = sorted(line_chars, key=lambda c: c.get("x0", 0))
295
323
 
296
324
  try:
@@ -324,15 +352,18 @@ class ElementManager:
324
352
  # on the whole-line heuristic.
325
353
  rtl_in_word = any(_is_rtl_char(ch.get("text", "")) for ch in char_list)
326
354
  if rtl_in_word:
355
+ # Convert from visual order (from PDF) to logical order using bidi
327
356
  try:
328
357
  from bidi.algorithm import get_display # type: ignore
329
358
  from natural_pdf.utils.bidi_mirror import mirror_brackets
330
359
 
331
- word_element.text = mirror_brackets(
332
- get_display(word_element.text, base_dir="R")
333
- )
360
+ with disable_text_sync():
361
+ # word_element.text is currently in visual order (from PDF)
362
+ # Convert to logical order using bidi with an explicit left-to-right base direction (base_dir='L')
363
+ logical_text = get_display(word_element.text, base_dir='L')
364
+ # Apply bracket mirroring for logical order
365
+ word_element.text = mirror_brackets(logical_text)
334
366
  except Exception:
335
- # Fallback: keep original text if python-bidi fails
336
367
  pass
337
368
 
338
369
  # ------------------------------------------------------------------
@@ -415,19 +446,6 @@ class ElementManager:
415
446
  f"Page {self._page.number}: Generated {len(generated_words)} words using NaturalWordExtractor."
416
447
  )
417
448
 
418
- # --- Post-processing pass to ensure every word containing RTL characters is
419
- # stored in logical order and with mirrored brackets. This is a
420
- # safeguard in case the per-line loop above missed some tokens.
421
- try:
422
- from bidi.algorithm import get_display # type: ignore
423
- from natural_pdf.utils.bidi_mirror import mirror_brackets
424
-
425
- for w in generated_words:
426
- if any(_is_rtl_char(ch) for ch in w.text):
427
- w.text = mirror_brackets(get_display(w.text, base_dir="R"))
428
- except Exception:
429
- pass # graceful degradation – keep original text
430
-
431
449
  # 4. Load other elements (rects, lines)
432
450
  rect_elements = [RectangleElement(r, self._page) for r in self._page._page.rects]
433
451
  line_elements = [LineElement(l, self._page) for l in self._page._page.lines]
@@ -463,6 +481,8 @@ class ElementManager:
463
481
 
464
482
  logger.debug(f"Page {self._page.number}: Element loading complete.")
465
483
 
484
+ # If per-word BiDi was skipped, generated_words are already in logical order.
485
+
466
486
  def _prepare_char_dicts(self) -> List[Dict[str, Any]]:
467
487
  """
468
488
  Prepares a list of character dictionaries from native PDF characters,
@@ -468,3 +468,32 @@ class TextElement(Element):
468
468
  info[f"raw_{prop}"] = self._obj[prop]
469
469
 
470
470
  return info
471
+
472
+ @property
473
+ def visual_text(self) -> str:
474
+ """Return the text converted to *visual* order using the Unicode BiDi algorithm.
475
+
476
+ This helper is intentionally side-effect–free: it only reads ``self.text`` and does **not** mutate
477
+ it or the underlying character dictionaries. It is meant for UI / rendering code that needs
478
+ human-readable RTL/LTR mixing. Empty text, text with no characters of bidi class R/AL/AN, and any failure in the bidi step all return ``self.text`` unchanged.
479
+ """
480
+ logical = self.text
481
+ if not logical:
482
+ return logical
483
+
484
+ # Quick check – skip the python-bidi import and reordering when no character has bidi class R, AL or AN.
485
+ import unicodedata
486
+
487
+ if not any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in logical):
488
+ return logical
489
+
490
+ try:
491
+ from bidi.algorithm import get_display # type: ignore
492
+ from natural_pdf.utils.bidi_mirror import mirror_brackets
493
+
494
+ # Convert from logical order to visual order with a forced right-to-left base direction, then pass the result through mirror_brackets
495
+ visual = get_display(logical, base_dir="R")
496
+ return mirror_brackets(visual)
497
+ except Exception:
498
+ # If python-bidi or the mirror helper cannot be imported, or either call raises, fall back to logical order
499
+ return logical
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.30
3
+ Version: 0.1.31
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -28,7 +28,7 @@ natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZH
28
28
  natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
29
29
  natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
30
30
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
31
- natural_pdf/core/element_manager.py,sha256=96v_w3kXhSUqRsJlX5Bl6O6hJzpYRqDn4xoyRsdqZ7o,49260
31
+ natural_pdf/core/element_manager.py,sha256=Mn4cYqPL_2LD_GK9lf2duExaJF1qhASCKsOdAZdQb00,49821
32
32
  natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
33
33
  natural_pdf/core/page.py,sha256=kQKKqsbOaNeLhW3ark6mueDS-4tsopJcGcoMmKPK6B8,125624
34
34
  natural_pdf/core/pdf.py,sha256=YfniZp54AyptzMyr7ZP8n617n4wlV28SPrajt32nNBk,74233
@@ -44,7 +44,7 @@ natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI
44
44
  natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
45
45
  natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
46
46
  natural_pdf/elements/region.py,sha256=v1PzWvQoGHGdn7SQiPf4Oq3hIGueIfYGwcZ05ZU6XPE,127692
47
- natural_pdf/elements/text.py,sha256=2neapKplef0FsAMYWr4OdICt-TmrZ3z9z0YBrX8FrSk,17738
47
+ natural_pdf/elements/text.py,sha256=kw7u2KfHtDB905YawP7Hs89kcR8XnbtpkYQGEk6LNyk,18860
48
48
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
49
49
  natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
50
50
  natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
@@ -100,7 +100,7 @@ natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6
100
100
  natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
101
101
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
102
102
  natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
103
- natural_pdf-0.1.30.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
103
+ natural_pdf-0.1.31.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
104
104
  optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
105
105
  optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
106
106
  optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
@@ -115,8 +115,8 @@ tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrO
115
115
  tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
116
116
  tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
117
117
  tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
118
- natural_pdf-0.1.30.dist-info/METADATA,sha256=4Jg-iXXt6zGNE4gSYE_nMF395JDzv1Dierh93x1Lklo,6711
119
- natural_pdf-0.1.30.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
120
- natural_pdf-0.1.30.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
121
- natural_pdf-0.1.30.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
122
- natural_pdf-0.1.30.dist-info/RECORD,,
118
+ natural_pdf-0.1.31.dist-info/METADATA,sha256=tqimu2ZReyYu5pS0PsbCo-Z9fIzkpMj1ljGPNbaOFss,6711
119
+ natural_pdf-0.1.31.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
120
+ natural_pdf-0.1.31.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
121
+ natural_pdf-0.1.31.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
122
+ natural_pdf-0.1.31.dist-info/RECORD,,