natural-pdf 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bad_pdf_analysis/analyze_10_more.py +300 -0
- bad_pdf_analysis/analyze_final_10.py +552 -0
- bad_pdf_analysis/analyze_specific_pages.py +394 -0
- bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +45 -1
- natural_pdf/analyzers/layout/surya.py +1 -1
- natural_pdf/analyzers/layout/yolo.py +2 -2
- natural_pdf/analyzers/shape_detection_mixin.py +228 -0
- natural_pdf/classification/manager.py +67 -0
- natural_pdf/core/element_manager.py +556 -25
- natural_pdf/core/highlighting_service.py +98 -43
- natural_pdf/core/page.py +86 -20
- natural_pdf/core/pdf.py +0 -2
- natural_pdf/describe/base.py +40 -9
- natural_pdf/describe/elements.py +11 -6
- natural_pdf/elements/base.py +134 -20
- natural_pdf/elements/collections.py +43 -11
- natural_pdf/elements/image.py +43 -0
- natural_pdf/elements/region.py +64 -19
- natural_pdf/elements/text.py +89 -11
- natural_pdf/flows/collections.py +4 -4
- natural_pdf/flows/region.py +17 -2
- natural_pdf/ocr/engine_paddle.py +1 -1
- natural_pdf/ocr/ocr_factory.py +8 -8
- natural_pdf/ocr/ocr_manager.py +51 -1
- natural_pdf/selectors/parser.py +27 -7
- natural_pdf/tables/__init__.py +5 -0
- natural_pdf/tables/result.py +101 -0
- natural_pdf/utils/bidi_mirror.py +36 -0
- natural_pdf/utils/visualization.py +15 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +51 -29
- natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
- optimization/memory_comparison.py +172 -0
- optimization/pdf_analyzer.py +410 -0
- optimization/performance_analysis.py +397 -0
- optimization/test_cleanup_methods.py +155 -0
- optimization/test_memory_fix.py +162 -0
- tools/bad_pdf_eval/__init__.py +1 -0
- tools/bad_pdf_eval/analyser.py +302 -0
- tools/bad_pdf_eval/collate_summaries.py +130 -0
- tools/bad_pdf_eval/eval_suite.py +116 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
- tools/bad_pdf_eval/llm_enrich.py +273 -0
- tools/bad_pdf_eval/reporter.py +17 -0
- tools/bad_pdf_eval/utils.py +127 -0
- tools/rtl_smoke_test.py +80 -0
- natural_pdf-0.1.27.dist-info/top_level.txt +0 -2
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/text.py
CHANGED
@@ -32,10 +32,34 @@ class TextElement(Element):
         obj["object_type"] = "text"

         super().__init__(obj, page)
-
-        #
+
+        # Memory optimization: Store character indices instead of full dictionaries
+        # This reduces memory usage by ~50% by avoiding character data duplication
+        self._char_indices = obj.pop("_char_indices", [])
+
+        # Backward compatibility: Keep _char_dicts for existing code
+        # But prefer _char_indices when available to save memory
         self._char_dicts = obj.pop("_char_dicts", [])

+    @property
+    def chars(self):
+        """Get constituent character elements efficiently.
+
+        Uses character indices when available to avoid memory duplication,
+        falls back to _char_dicts for backward compatibility.
+        """
+        if self._char_indices:
+            # Memory-efficient approach: access characters by index
+            if hasattr(self.page, '_element_mgr'):
+                char_elements = self.page._element_mgr.get_elements('chars')
+                return [char_elements[i] for i in self._char_indices if i < len(char_elements)]
+
+        # Backward compatibility: convert _char_dicts to TextElement objects
+        if self._char_dicts:
+            return [TextElement(char_dict, self.page) for char_dict in self._char_dicts]
+
+        return []
+
     @property
     def text(self) -> str:
         """Get the text content."""
@@ -43,17 +67,22 @@ class TextElement(Element):

     @text.setter
     def text(self, value: str):
-        """Set the text content and synchronise underlying char dictionaries (if any)."""
+        """Set the text content and synchronise underlying char dictionaries/indices (if any)."""
         # Update the primary text value stored on the object itself
         self._obj["text"] = value

-        # ---
-        # that rely on the raw character dictionaries see the corrected text.
-        # For OCR-generated words we usually have a single representative char
-        # dict; for native words there may be one per character.
-        # ---------------------------------------------------------------------
+        # --- Sync character data for both memory-efficient and legacy approaches
         try:
-
+            # If using memory-efficient character indices, update the referenced chars
+            if hasattr(self, "_char_indices") and self._char_indices:
+                if hasattr(self.page, '_element_mgr'):
+                    char_elements = self.page._element_mgr.get_elements('chars')
+                    for idx, char_idx in enumerate(self._char_indices):
+                        if char_idx < len(char_elements) and idx < len(value):
+                            char_elements[char_idx].text = value[idx]
+
+            # Legacy _char_dicts synchronization for backward compatibility
+            elif hasattr(self, "_char_dicts") and isinstance(self._char_dicts, list):
                 if not self._char_dicts:
                     return  # Nothing to update

@@ -93,7 +122,7 @@ class TextElement(Element):
             # Keep failures silent but logged; better to have outdated chars than crash.
             import logging
             logger = logging.getLogger(__name__)
-            logger.debug(f"TextElement: Failed to sync
+            logger.debug(f"TextElement: Failed to sync char data after text update: {sync_err}")

     @property
     def source(self) -> str:
@@ -331,6 +360,45 @@ class TextElement(Element):

         return False

+    @property
+    def strike(self) -> bool:  # alias: struck
+        """True if this element (word/char) is marked as strikethrough."""
+        # Two possible storage places: raw object dict (comes from extractor
+        # via extra_attrs) or metadata (if later pipeline stages mutate).
+        return bool(self._obj.get("strike") or self.metadata.get("decoration", {}).get("strike"))
+
+    # Back-compat alias
+    @property
+    def struck(self) -> bool:  # noqa: D401
+        return self.strike
+
+    # -----------------------------
+    # Underline decoration
+    # -----------------------------
+
+    @property
+    def underline(self) -> bool:
+        """True if element is underlined."""
+        return bool(self._obj.get("underline") or self.metadata.get("decoration", {}).get("underline"))
+
+    # -----------------------------
+    # Highlight decoration
+    # -----------------------------
+
+    @property
+    def is_highlighted(self) -> bool:
+        """True if element (char/word) is marked as highlighted in the original PDF."""
+        return bool(
+            self._obj.get("highlight")
+            or self._obj.get("is_highlighted")
+            or self.metadata.get("decoration", {}).get("highlight")
+        )
+
+    @property
+    def highlight_color(self):
+        """Return RGB(A) tuple of highlight colour if stored."""
+        return self._obj.get("highlight_color") or self.metadata.get("decoration", {}).get("highlight_color")
+
     def __repr__(self) -> str:
         """String representation of the text element."""
         if self.text:
@@ -342,6 +410,12 @@ class TextElement(Element):
             font_style.append("bold")
         if self.italic:
             font_style.append("italic")
+        if self.strike:
+            font_style.append("strike")
+        if self.underline:
+            font_style.append("underline")
+        if self.is_highlighted:
+            font_style.append("highlight")
         style_str = f", style={font_style}" if font_style else ""

         # Use font_family for display but include raw fontname and variant
@@ -353,7 +427,11 @@ class TextElement(Element):
             base_font = self.fontname.split("+", 1)[1]
             font_display = f"{font_display} ({base_font})"

-
+        color_info = ""
+        if self.is_highlighted and self.highlight_color is not None:
+            color_info = f", highlight_color={self.highlight_color}"
+
+        return f"<TextElement text='{preview}' font='{font_display}'{variant_str} size={self.size}{style_str}{color_info} bbox={self.bbox}>"

     def font_info(self) -> dict:
         """
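The new decoration flags can be read straight off word/char elements. A minimal sketch, assuming the usual natural_pdf.PDF entry point and a find_all("text")-style lookup (the file name is illustrative, not part of this diff):

    from natural_pdf import PDF

    pdf = PDF("example.pdf")  # hypothetical input file
    for word in pdf.pages[0].find_all("text"):
        # strike / underline / is_highlighted / highlight_color are the new properties above
        if word.strike or word.underline or word.is_highlighted:
            print(word.text, word.highlight_color)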
natural_pdf/flows/collections.py
CHANGED
@@ -164,7 +164,7 @@ class FlowElementCollection(MutableSequence[T_FEC]):

     def show(
         self,
-
+        resolution: Optional[float] = None,
         labels: bool = True,
         legend_position: str = "right",
         default_color: Optional[Union[Tuple, str]] = "orange",  # A distinct color for FEC show
@@ -273,7 +273,7 @@ class FlowElementCollection(MutableSequence[T_FEC]):
                     else getattr(page_obj, "page_number", 1) - 1
                 ),
                 temporary_highlights=temp_highlights_for_page,
-
+                resolution=resolution,
                 width=width,
                 labels=labels,
                 legend_position=legend_position,
@@ -480,7 +480,7 @@ class FlowRegionCollection(MutableSequence[T_FRC]):

     def show(
         self,
-
+        resolution: Optional[float] = None,
         labels: bool = True,
         legend_position: str = "right",
         default_color: Optional[Union[Tuple, str]] = "darkviolet",  # A distinct color for FRC show
@@ -565,7 +565,7 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
                     else getattr(page_obj, "page_number", 1) - 1
                 ),
                 temporary_highlights=temp_highlights_for_page,
-
+                resolution=resolution,
                 width=width,
                 labels=labels,
                 legend_position=legend_position,
natural_pdf/flows/region.py
CHANGED
@@ -244,7 +244,7 @@ class FlowRegion:

     def show(
         self,
-
+        resolution: Optional[float] = None,
         labels: bool = True,
         legend_position: str = "right",
         color: Optional[Union[Tuple, str]] = "fuchsia",
@@ -258,6 +258,21 @@
         """
         Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
         If multiple pages are involved, they are stacked into a single image.
+
+        Args:
+            resolution: Resolution in DPI for page rendering. If None, uses global setting or defaults to 144 DPI.
+            labels: Whether to include a legend for highlights.
+            legend_position: Position of the legend ('right', 'bottom', 'top', 'left').
+            color: Color for highlighting the constituent regions.
+            label_prefix: Prefix for region labels (e.g., 'FlowPart').
+            width: Optional width for the output image (overrides resolution).
+            stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
+            stack_gap: Gap in pixels between stacked pages.
+            stack_background_color: RGB background color for the stacked image.
+            **kwargs: Additional arguments passed to the underlying rendering methods.
+
+        Returns:
+            PIL Image of the rendered pages with highlighted regions, or None if rendering fails.
         """
         if not self.constituent_regions:
             logger.info("FlowRegion.show() called with no constituent regions.")
@@ -350,7 +365,7 @@
                     else getattr(page_obj, "page_number", 1) - 1
                 ),
                 temporary_highlights=temp_highlights_for_page,
-
+                resolution=resolution,
                 width=width,
                 labels=labels,  # Pass through labels
                 legend_position=legend_position,
natural_pdf/ocr/engine_paddle.py
CHANGED
@@ -127,7 +127,7 @@ class PaddleOCREngine(OCREngine):
         except ImportError as e:
             self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
             raise RuntimeError(
-                "paddleocr is not available. Install via:
+                "paddleocr is not available. Install via: npdf install paddle"
             ) from e

         paddle_options = options if isinstance(options, PaddleOCROptions) else PaddleOCROptions()
natural_pdf/ocr/ocr_factory.py
CHANGED
@@ -32,7 +32,7 @@ class OCRFactory:
                 return SuryaOCREngine(**kwargs)
             except ImportError:
                 raise ImportError(
-                    "Surya engine requires additional dependencies. " "Install with:
+                    "Surya engine requires additional dependencies. " "Install with: npdf install surya"
                 )
         elif engine_type == "easyocr":
             try:
@@ -42,7 +42,7 @@
             except ImportError:
                 raise ImportError(
                     "EasyOCR engine requires the 'easyocr' package. "
-                    "Install with: pip install easyocr (or
+                    "Install with: pip install easyocr (or npdf install easyocr when available)"
                 )
         elif engine_type == "paddle":
             try:
@@ -52,7 +52,7 @@
             except ImportError:
                 raise ImportError(
                     "PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
-                    "Install with:
+                    "Install with: npdf install paddle"
                 )
         elif engine_type == "doctr":
             try:
@@ -137,9 +137,9 @@
 
         # If we get here, no engines are available
         raise ImportError(
-            "No OCR engines are installed. You can add one via the
-            "
-            "
-            "
-            "
+            "No OCR engines are installed. You can add one via the npdf installer, e.g.:\n"
+            "  npdf install easyocr  # fastest to set up\n"
+            "  npdf install paddle   # best Asian-language accuracy\n"
+            "  npdf install surya    # Surya OCR engine\n"
+            "  npdf install yolo     # Layout detection (YOLO)\n"
         )
natural_pdf/ocr/ocr_manager.py
CHANGED
@@ -94,7 +94,7 @@ class OCRManager:
                 engine_instance = engine_class()  # Instantiate first
                 if not engine_instance.is_available():
                     # Check availability before storing
-                    install_hint = f"
+                    install_hint = f"npdf install {engine_name}"

                     raise RuntimeError(
                         f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
@@ -295,3 +295,53 @@
                 )  # Log check failures at debug level
                 pass  # Ignore engines that fail to instantiate or check
         return available
+
+    def cleanup_engine(self, engine_name: Optional[str] = None) -> int:
+        """
+        Cleanup OCR engine instances to free memory.
+
+        Args:
+            engine_name: Specific engine to cleanup, or None to cleanup all engines
+
+        Returns:
+            Number of engines cleaned up
+        """
+        cleaned_count = 0
+
+        if engine_name:
+            # Cleanup specific engine
+            engine_name = engine_name.lower()
+            if engine_name in self._engine_instances:
+                engine = self._engine_instances.pop(engine_name)
+                if hasattr(engine, 'cleanup'):
+                    try:
+                        engine.cleanup()
+                    except Exception as e:
+                        logger.debug(f"Engine {engine_name} cleanup method failed: {e}")
+
+                # Clear associated locks
+                self._engine_locks.pop(engine_name, None)
+                self._engine_inference_locks.pop(engine_name, None)
+
+                logger.info(f"Cleaned up OCR engine: {engine_name}")
+                cleaned_count = 1
+        else:
+            # Cleanup all engines
+            for name, engine in list(self._engine_instances.items()):
+                if hasattr(engine, 'cleanup'):
+                    try:
+                        engine.cleanup()
+                    except Exception as e:
+                        logger.debug(f"Engine {name} cleanup method failed: {e}")
+
+            # Clear all caches
+            engine_count = len(self._engine_instances)
+            self._engine_instances.clear()
+            self._engine_locks.clear()
+            self._engine_inference_locks.clear()
+
+            if engine_count > 0:
+                logger.info(f"Cleaned up {engine_count} OCR engines")
+                cleaned_count = engine_count
+
+        return cleaned_count
natural_pdf/selectors/parser.py
CHANGED
@@ -224,6 +224,18 @@ def parse_selector(selector: str) -> Dict[str, Any]:

     selector = selector.strip()

+    # ------------------------------------------------------------------
+    # Handle wildcard selector (leading "*")
+    # ------------------------------------------------------------------
+    # A selector can start with "*" to denote "any element type", optionally
+    # followed by attribute blocks or pseudo-classes – e.g. *[width>100].
+    # We strip the asterisk but keep the remainder so the normal attribute
+    # / pseudo-class parsing logic can proceed.
+
+    if selector.startswith("*"):
+        # Keep everything *after* the asterisk (attributes, pseudos, etc.).
+        selector = selector[1:].strip()
+
     # --- Handle OR operators first (| or ,) ---
     # Check if selector contains OR operators at the top level only
     # (not inside quotes, parentheses, or brackets)
@@ -253,13 +265,6 @@

     # --- Continue with single selector parsing (existing logic) ---

-    # --- Handle wildcard selector explicitly ---
-    if selector == "*":
-        # Wildcard matches any type, already the default.
-        # Clear selector so the loop doesn't run and error out.
-        selector = ""
-    # --- END NEW ---
-
     # 1. Extract type (optional, at the beginning)
     # Only run if selector wasn't '*'
     if selector:
@@ -741,6 +746,21 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
         elif name == "vertical":
             filter_lambda = lambda el: hasattr(el, "is_vertical") and el.is_vertical

+        # --- New: :strike / :strikethrough / :strikeout pseudo-classes --- #
+        elif name in ("strike", "strikethrough", "strikeout"):
+            filter_lambda = lambda el: hasattr(el, "strike") and bool(getattr(el, "strike"))
+            filter_name = f"pseudo-class :{name}"
+        elif name in ("underline", "underlined"):
+            filter_lambda = lambda el: hasattr(el, "underline") and bool(getattr(el, "underline"))
+            filter_name = f"pseudo-class :{name}"
+        elif name in ("highlight", "highlighted"):
+            # Match only if the element exposes an `is_highlighted` boolean flag.
+            # We deliberately avoid looking at the generic `.highlight()` method on
+            # Element, because it is a callable present on every element and would
+            # incorrectly mark everything as highlighted.
+            filter_lambda = lambda el: bool(getattr(el, "is_highlighted", False))
+            filter_name = f"pseudo-class :{name}"
+
         # Check predefined lambda functions (e.g., :first-child, :empty)
         elif name in PSEUDO_CLASS_FUNCTIONS:
             filter_lambda = PSEUDO_CLASS_FUNCTIONS[name]
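Illustrative selector usage for the wildcard handling and the new decoration pseudo-classes; the page.find_all(...) call is how selectors are consumed elsewhere in natural-pdf and is assumed here:

    struck = page.find_all("text:strike")    # strikethrough words
    marked = page.find_all(":highlighted")   # anything flagged as highlighted in the PDF
    wide   = page.find_all("*[width>100]")   # wildcard type plus an attribute filter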
natural_pdf/tables/result.py
ADDED
@@ -0,0 +1,101 @@
+"""Sequence wrapper for table data with convenient DataFrame helpers."""
+from __future__ import annotations
+
+from collections.abc import Sequence
+from typing import Any, List, Iterator, Optional, Union
+
+
+class TableResult(Sequence):
+    """List-of-rows plus `.df` / `.to_df()` helpers.
+
+    The object behaves like an immutable sequence of rows (each row is a
+    list of cell values) but offers an easy hand-off to *pandas*.
+    """
+
+    _IMMUTABLE_MESSAGE = (
+        "TableResult is read-only; convert to list(result) if you need to mutate"
+    )
+
+    def __init__(self, rows: Optional[List[List[Any]]] = None) -> None:
+        # Normalise to list of list so that Sequence operations work as expected
+        self._rows: List[List[Any]] = list(rows or [])
+
+    # ---------------------------------------------------------------------
+    # Sequence API
+    # ---------------------------------------------------------------------
+    def __getitem__(self, index):  # type: ignore[override]
+        return self._rows[index]
+
+    def __len__(self) -> int:  # type: ignore[override]
+        return len(self._rows)
+
+    def __iter__(self) -> Iterator[List[Any]]:  # type: ignore[override]
+        return iter(self._rows)
+
+    # ------------------------------------------------------------------
+    # Convenience helpers
+    # ------------------------------------------------------------------
+    @property
+    def df(self):
+        """Quick property alias → calls :py:meth:`to_df` with default args."""
+        return self.to_df()
+
+    def to_df(self, header: Union[str, int, List[int], None] = "first", index_col=None, **kwargs):
+        """Convert to *pandas* DataFrame.
+
+        Parameters
+        ----------
+        header : "first" | int | list[int] | None, default "first"
+            • "first" – use row 0 as column names.\n            • int – use that row index.\n            • list[int] – multi-row header.\n            • None/False – no header.
+        index_col : same semantics as pandas, forwarded.
+        **kwargs : forwarded to :pyclass:`pandas.DataFrame`.
+        """
+        try:
+            import pandas as pd  # type: ignore
+        except ModuleNotFoundError as exc:
+            raise ImportError(
+                "pandas is required for TableResult.to_df(); install via `pip install pandas`."
+            ) from exc
+
+        rows = self._rows
+        if not rows:
+            return pd.DataFrame()
+
+        # Determine header rows and body rows
+        body = rows
+        hdr = None
+        if header == "first":
+            hdr = rows[0]
+            body = rows[1:]
+        elif header is None or header is False:
+            hdr = None
+        elif isinstance(header, int):
+            hdr = rows[header]
+            body = rows[:header] + rows[header + 1 :]
+        elif isinstance(header, (list, tuple)):
+            hdr_rows = [rows[i] for i in header]
+            body = [r for idx, r in enumerate(rows) if idx not in header]
+            hdr = hdr_rows
+        else:
+            raise ValueError("Invalid value for header parameter")
+
+        df = pd.DataFrame(body, columns=hdr)
+        if index_col is not None and not df.empty:
+            df.set_index(df.columns[index_col] if isinstance(index_col, int) else index_col, inplace=True)
+
+        if kwargs:
+            df = pd.DataFrame(df, **kwargs)
+        return df
+
+    # ------------------------------------------------------------------
+    # Block mutating operations to keep result read-only
+    # ------------------------------------------------------------------
+    def _readonly(self, *args, **kwargs):
+        raise TypeError(self._IMMUTABLE_MESSAGE)
+
+    append = extend = insert = __setitem__ = __delitem__ = clear = pop = remove = _readonly  # type: ignore
+
+    # Nice repr in notebooks
+    def __repr__(self) -> str:  # noqa: D401 (simple)
+        preview = "…" if len(self._rows) > 5 else ""
+        return f"TableResult(rows={len(self._rows)}{preview})"
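A short sketch of the TableResult hand-off to pandas, using only the API defined above (the row values are made up):

    from natural_pdf.tables.result import TableResult

    result = TableResult([["name", "amount"], ["alice", "3"], ["bob", "5"]])
    df = result.df                   # header="first": row 0 becomes the columns
    raw = result.to_df(header=None)  # keep every row as data, no column names
    print(len(result), list(df.columns))  # 3 ['name', 'amount']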
natural_pdf/utils/bidi_mirror.py
ADDED
@@ -0,0 +1,36 @@
+"""Light-weight bracket mirroring for RTL text.
+
+This module provides `mirror_brackets`, a fast pure-python helper that
+replaces each bracket/parenthesis character with its Unicode-defined pair.
+
+For everyday PDFs the six ASCII pairs are enough, but the mapping can be
+extended easily from Unicode's BidiBrackets.txt.
+"""
+from typing import Dict
+
+# Minimal mapping – ( ) [ ] { }
+_ASCII_MIRROR: Dict[int, str] = {
+    0x0028: ")",  # ( -> )
+    0x0029: "(",  # ) -> (
+    0x005B: "]",  # [ -> ]
+    0x005D: "[",  # ] -> [
+    0x007B: "}",  # { -> }
+    0x007D: "{",  # } -> {
+}
+
+
+def mirror_brackets(text: str) -> str:  # pragma: no cover
+    """Return *text* with each bracket replaced by its mirror partner.
+
+    The function is context-free: it blindly flips every character found in
+    the mapping, which is sufficient once the string is already in visual
+    order (e.g., after `bidi.algorithm.get_display`).
+    """
+    if not text:
+        return text
+    # Fast path: only allocate when needed
+    out_chars = []
+    append = out_chars.append
+    for ch in text:
+        append(_ASCII_MIRROR.get(ord(ch), ch))
+    return "".join(out_chars)
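Usage sketch for the helper, which is meant to run after a bidi reordering pass; the outputs follow directly from the mapping above:

    from natural_pdf.utils.bidi_mirror import mirror_brackets

    print(mirror_brackets("(test)"))  # ")test(" – every mapped bracket is flipped
    print(mirror_brackets("[a{b}]"))  # "]a}b{["
    print(mirror_brackets(""))        # "" – empty input is returned unchanged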
natural_pdf/utils/visualization.py
CHANGED
@@ -235,12 +235,26 @@ def merge_images_with_legend(


 def render_plain_page(page, resolution):
+    """
+    Render a page to PIL Image using the specified resolution.
+
+    Args:
+        page: Page object to render
+        resolution: DPI resolution for rendering
+
+    Returns:
+        PIL Image of the rendered page
+    """
     doc = pypdfium2.PdfDocument(page._page.pdf.stream)

     pdf_page = doc[page.index]

+    # Convert resolution (DPI) to scale factor for pypdfium2
+    # PDF standard is 72 DPI, so scale = resolution / 72
+    scale_factor = resolution / 72.0
+
     bitmap = pdf_page.render(
-        scale=
+        scale=scale_factor,
     )
     image = bitmap.to_pil().convert("RGB")

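The DPI-to-scale conversion above follows the 72-points-per-inch PDF convention; a quick worked check (the page size is just an example):

    scale = 144 / 72.0               # 2.0 – twice the native 72 DPI
    print(612 * scale, 792 * scale)  # a US-Letter page renders to a 1224 x 1584 px bitmap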
{natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.27
+Version: 0.1.30
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -24,6 +24,7 @@ Requires-Dist: pydantic
 Requires-Dist: jenkspy
 Requires-Dist: scipy
 Requires-Dist: ipywidgets>=7.0.0
+Requires-Dist: python-bidi
 Provides-Extra: test
 Requires-Dist: pytest; extra == "test"
 Requires-Dist: pytest-xdist; extra == "test"