natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +751 -607
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +131 -45
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +120 -23
- natural_pdf/core/pdf.py +477 -75
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +222 -108
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
- natural_pdf-0.1.35.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.33.dist-info/RECORD +0 -118
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
natural_pdf/ocr/ocr_options.py
CHANGED
@@ -1,12 +1,7 @@
|
|
1
1
|
# ocr_options.py
|
2
|
-
import logging
|
3
2
|
from dataclasses import dataclass, field
|
4
3
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
5
4
|
|
6
|
-
# Configure logging
|
7
|
-
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
8
|
-
# logger = logging.getLogger(__name__)
|
9
|
-
# Assume logger is configured elsewhere or remove if not needed globally
|
10
5
|
|
11
6
|
|
12
7
|
# --- Base Options ---
|
@@ -58,8 +53,6 @@ class EasyOCROptions(BaseOCROptions):
|
|
58
53
|
add_margin: float = 0.1
|
59
54
|
output_format: str = "standard"
|
60
55
|
|
61
|
-
# def __post_init__(self):
|
62
|
-
# logger.debug(f"Initialized EasyOCROptions: {self}")
|
63
56
|
|
64
57
|
|
65
58
|
# --- PaddleOCR Specific Options ---
|
@@ -95,8 +88,8 @@ class PaddleOCROptions(BaseOCROptions):
|
|
95
88
|
|
96
89
|
# Detection parameters (can be overridden at predict time)
|
97
90
|
# https://github.com/PaddlePaddle/PaddleOCR/issues/15424
|
98
|
-
text_det_limit_side_len: Optional[int] = 736
|
99
|
-
text_det_limit_type: Optional[str] =
|
91
|
+
text_det_limit_side_len: Optional[int] = 736 # WAITING FOR FIX
|
92
|
+
text_det_limit_type: Optional[str] = "max" # WAITING FOR FIX
|
100
93
|
text_det_thresh: Optional[float] = None
|
101
94
|
text_det_box_thresh: Optional[float] = None
|
102
95
|
text_det_unclip_ratio: Optional[float] = None
|
@@ -113,7 +106,7 @@ class PaddleOCROptions(BaseOCROptions):
|
|
113
106
|
enable_hpi: Optional[bool] = None
|
114
107
|
use_tensorrt: Optional[bool] = None
|
115
108
|
precision: Optional[str] = None
|
116
|
-
enable_mkldnn: Optional[bool] = False
|
109
|
+
enable_mkldnn: Optional[bool] = False # https://github.com/PaddlePaddle/PaddleOCR/issues/15294
|
117
110
|
# mkldnn_cache_capacity: Optional[int] = None
|
118
111
|
cpu_threads: Optional[int] = None
|
119
112
|
paddlex_config: Optional[str] = None
|
natural_pdf/qa/document_qa.py
CHANGED
@@ -9,6 +9,7 @@ import numpy as np
|
|
9
9
|
from PIL import Image, ImageDraw
|
10
10
|
|
11
11
|
from natural_pdf.elements.collections import ElementCollection
|
12
|
+
|
12
13
|
from .qa_result import QAResult
|
13
14
|
|
14
15
|
logger = logging.getLogger("natural_pdf.qa.document_qa")
|
@@ -252,13 +253,17 @@ class DocumentQA:
|
|
252
253
|
# Save per-question result in debug mode
|
253
254
|
if debug:
|
254
255
|
# File names: debug_qa_result_0.json, …
|
255
|
-
result_path = os.path.join(
|
256
|
+
result_path = os.path.join(
|
257
|
+
debug_output_dir, f"debug_qa_result_{q[:30].replace(' ', '_')}.json"
|
258
|
+
)
|
256
259
|
try:
|
257
260
|
with open(result_path, "w") as f:
|
258
261
|
serializable = {
|
259
262
|
k: (
|
260
263
|
str(v)
|
261
|
-
if not isinstance(
|
264
|
+
if not isinstance(
|
265
|
+
v, (str, int, float, bool, list, dict, type(None))
|
266
|
+
)
|
262
267
|
else v
|
263
268
|
)
|
264
269
|
for k, v in top_res.items()
|
@@ -317,9 +322,9 @@ class DocumentQA:
|
|
317
322
|
warnings.warn(
|
318
323
|
f"No text elements found on page {page.index}. "
|
319
324
|
"Consider applying OCR first using page.apply_ocr() to extract text from images.",
|
320
|
-
UserWarning
|
325
|
+
UserWarning,
|
321
326
|
)
|
322
|
-
|
327
|
+
|
323
328
|
# Return appropriate "not found" result(s)
|
324
329
|
if isinstance(question, (list, tuple)):
|
325
330
|
return [
|
@@ -376,7 +381,11 @@ class DocumentQA:
|
|
376
381
|
start_idx = res.start
|
377
382
|
end_idx = res.end
|
378
383
|
|
379
|
-
if
|
384
|
+
if (
|
385
|
+
elements
|
386
|
+
and 0 <= start_idx < len(word_boxes)
|
387
|
+
and 0 <= end_idx < len(word_boxes)
|
388
|
+
):
|
380
389
|
matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
|
381
390
|
|
382
391
|
source_elements = []
|
@@ -426,9 +435,9 @@ class DocumentQA:
|
|
426
435
|
warnings.warn(
|
427
436
|
f"No text elements found in region on page {region.page.index}. "
|
428
437
|
"Consider applying OCR first using region.apply_ocr() to extract text from images.",
|
429
|
-
UserWarning
|
438
|
+
UserWarning,
|
430
439
|
)
|
431
|
-
|
440
|
+
|
432
441
|
# Return appropriate "not found" result(s)
|
433
442
|
if isinstance(question, (list, tuple)):
|
434
443
|
return [
|
@@ -488,7 +497,11 @@ class DocumentQA:
|
|
488
497
|
start_idx = res.start
|
489
498
|
end_idx = res.end
|
490
499
|
|
491
|
-
if
|
500
|
+
if (
|
501
|
+
elements
|
502
|
+
and 0 <= start_idx < len(word_boxes)
|
503
|
+
and 0 <= end_idx < len(word_boxes)
|
504
|
+
):
|
492
505
|
matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
|
493
506
|
|
494
507
|
source_elements = []
|
natural_pdf/qa/qa_result.py
CHANGED
@@ -24,13 +24,9 @@ class QAResult(dict):
|
|
24
24
|
"""
|
25
25
|
source = self.get("source_elements")
|
26
26
|
if source is None:
|
27
|
-
raise AttributeError(
|
28
|
-
"QAResult does not contain 'source_elements'; nothing to show()."
|
29
|
-
)
|
27
|
+
raise AttributeError("QAResult does not contain 'source_elements'; nothing to show().")
|
30
28
|
if not hasattr(source, "show"):
|
31
|
-
raise AttributeError(
|
32
|
-
"'source_elements' object has no 'show' method; cannot visualise."
|
33
|
-
)
|
29
|
+
raise AttributeError("'source_elements' object has no 'show' method; cannot visualise.")
|
34
30
|
return source.show(*args, **kwargs)
|
35
31
|
|
36
32
|
# ------------------------------------------------------------------
|
@@ -52,4 +48,4 @@ class QAResult(dict):
|
|
52
48
|
|
53
49
|
# Ensure ``copy`` keeps the subclass type
|
54
50
|
def copy(self):
|
55
|
-
return QAResult(self)
|
51
|
+
return QAResult(self)
|
natural_pdf/search/__init__.py
CHANGED
@@ -18,9 +18,9 @@ SEARCH_DEPENDENCIES_AVAILABLE = False
|
|
18
18
|
|
19
19
|
try:
|
20
20
|
import numpy as np
|
21
|
+
|
21
22
|
# Lazy import for sentence_transformers to avoid heavy loading at module level
|
22
23
|
# import sentence_transformers
|
23
|
-
|
24
24
|
# Basic search dependencies are available
|
25
25
|
SEARCH_DEPENDENCIES_AVAILABLE = True
|
26
26
|
|
@@ -51,6 +51,7 @@ def _check_sentence_transformers():
|
|
51
51
|
"""Lazy check for sentence_transformers availability."""
|
52
52
|
try:
|
53
53
|
import sentence_transformers
|
54
|
+
|
54
55
|
return True
|
55
56
|
except ImportError:
|
56
57
|
return False
|
@@ -63,7 +64,7 @@ def check_search_availability():
|
|
63
64
|
"Search functionality requires 'lancedb' and pyarrow. "
|
64
65
|
"Install with: pip install natural-pdf[search] (or pip install lancedb pyarrow)"
|
65
66
|
)
|
66
|
-
|
67
|
+
|
67
68
|
# Lazy check for sentence_transformers when actually needed
|
68
69
|
if not _check_sentence_transformers():
|
69
70
|
raise ImportError(
|
@@ -7,15 +7,13 @@ from typing import Any, Dict, Iterable, List, Optional, Union
|
|
7
7
|
|
8
8
|
import lancedb
|
9
9
|
import pyarrow as pa
|
10
|
+
|
11
|
+
from .search_options import BaseSearchOptions
|
12
|
+
from .search_service_protocol import Indexable, IndexConfigurationError, SearchServiceProtocol
|
13
|
+
|
10
14
|
# Lazy import for SentenceTransformer to avoid heavy loading at module level
|
11
15
|
# from sentence_transformers import SentenceTransformer
|
12
16
|
|
13
|
-
from .search_options import BaseSearchOptions
|
14
|
-
from .search_service_protocol import (
|
15
|
-
Indexable,
|
16
|
-
IndexConfigurationError,
|
17
|
-
SearchServiceProtocol,
|
18
|
-
)
|
19
17
|
|
20
18
|
logger = logging.getLogger(__name__)
|
21
19
|
|
@@ -26,6 +24,7 @@ DEFAULT_LANCEDB_PERSIST_PATH = "./lancedb_data"
|
|
26
24
|
def _get_sentence_transformer(model_name: str):
|
27
25
|
"""Lazy import and instantiation of SentenceTransformer."""
|
28
26
|
from sentence_transformers import SentenceTransformer
|
27
|
+
|
29
28
|
return SentenceTransformer(model_name)
|
30
29
|
|
31
30
|
|
@@ -6,8 +6,6 @@ from pathlib import Path
|
|
6
6
|
from typing import Any, Dict, Iterable, List, Optional, Union
|
7
7
|
|
8
8
|
import numpy as np
|
9
|
-
# Lazy import for SentenceTransformer to avoid heavy loading at module level
|
10
|
-
# from sentence_transformers import SentenceTransformer
|
11
9
|
|
12
10
|
from .search_options import BaseSearchOptions
|
13
11
|
from .search_service_protocol import (
|
@@ -17,6 +15,10 @@ from .search_service_protocol import (
|
|
17
15
|
SearchServiceProtocol,
|
18
16
|
)
|
19
17
|
|
18
|
+
# Lazy import for SentenceTransformer to avoid heavy loading at module level
|
19
|
+
# from sentence_transformers import SentenceTransformer
|
20
|
+
|
21
|
+
|
20
22
|
logger = logging.getLogger(__name__)
|
21
23
|
|
22
24
|
DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
|
@@ -25,6 +27,7 @@ DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
|
|
25
27
|
def _get_sentence_transformer(model_name: str):
|
26
28
|
"""Lazy import and instantiation of SentenceTransformer."""
|
27
29
|
from sentence_transformers import SentenceTransformer
|
30
|
+
|
28
31
|
return SentenceTransformer(model_name)
|
29
32
|
|
30
33
|
|
natural_pdf/selectors/parser.py
CHANGED
@@ -1,5 +1,29 @@
|
|
1
|
-
"""
|
2
|
-
|
1
|
+
"""CSS-like selector parser for natural-pdf.
|
2
|
+
|
3
|
+
This module implements a sophisticated selector parsing system that enables
|
4
|
+
jQuery-style element selection in PDF documents. It supports complex CSS-like
|
5
|
+
selectors with extensions for PDF-specific attributes and spatial relationships.
|
6
|
+
|
7
|
+
The parser handles:
|
8
|
+
- Basic element selectors (text, rect, line, image)
|
9
|
+
- Attribute selectors with comparisons ([size>12], [color="red"])
|
10
|
+
- Pseudo-selectors for text content (:contains(), :regex())
|
11
|
+
- Spatial relationship selectors (:above(), :below(), :near())
|
12
|
+
- Color matching with Delta E distance calculations
|
13
|
+
- Logical operators (AND, OR) and grouping
|
14
|
+
- Complex nested expressions with proper precedence
|
15
|
+
|
16
|
+
Key features:
|
17
|
+
- Safe value parsing without eval() for security
|
18
|
+
- Color parsing from multiple formats (hex, RGB, names, CSS functions)
|
19
|
+
- Font and style attribute matching
|
20
|
+
- Coordinate and dimension-based selections
|
21
|
+
- Performance-optimized filtering functions
|
22
|
+
|
23
|
+
This enables powerful document navigation like:
|
24
|
+
- page.find('text[size>12]:bold:contains("Summary")')
|
25
|
+
- page.find_all('rect[color~="red"]:above(text:contains("Total"))')
|
26
|
+
- page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
|
3
27
|
"""
|
4
28
|
|
5
29
|
import ast
|
@@ -16,14 +40,35 @@ logger = logging.getLogger(__name__)
|
|
16
40
|
|
17
41
|
|
18
42
|
def safe_parse_value(value_str: str) -> Any:
|
19
|
-
"""
|
20
|
-
|
43
|
+
"""Safely parse a value string without using eval().
|
44
|
+
|
45
|
+
Parses various value formats commonly found in PDF attributes while maintaining
|
46
|
+
security by avoiding eval(). Supports numbers, tuples, lists, booleans, and
|
47
|
+
quoted strings with proper type conversion.
|
21
48
|
|
22
49
|
Args:
|
23
|
-
value_str: String representation of a value
|
50
|
+
value_str: String representation of a value. Can be a number ("12"),
|
51
|
+
tuple ("(1.0, 0.5, 0.2)"), list ("[1, 2, 3]"), quoted string
|
52
|
+
('"Arial"'), boolean ("True"), or plain string ("Arial").
|
24
53
|
|
25
54
|
Returns:
|
26
|
-
Parsed value
|
55
|
+
Parsed value with appropriate Python type. Numbers become int/float,
|
56
|
+
tuples/lists maintain structure, quoted strings are unquoted, and
|
57
|
+
unrecognized values are returned as strings.
|
58
|
+
|
59
|
+
Example:
|
60
|
+
```python
|
61
|
+
safe_parse_value("12") # -> 12
|
62
|
+
safe_parse_value("12.5") # -> 12.5
|
63
|
+
safe_parse_value("(1,0,0)") # -> (1, 0, 0)
|
64
|
+
safe_parse_value('"Arial"') # -> "Arial"
|
65
|
+
safe_parse_value("True") # -> True
|
66
|
+
safe_parse_value("plain_text") # -> "plain_text"
|
67
|
+
```
|
68
|
+
|
69
|
+
Note:
|
70
|
+
This function deliberately avoids eval() for security reasons and uses
|
71
|
+
ast.literal_eval() for safe parsing of Python literals.
|
27
72
|
"""
|
28
73
|
# Strip quotes first if it's a quoted string
|
29
74
|
value_str = value_str.strip()
|
natural_pdf/tables/__init__.py
CHANGED
natural_pdf/tables/result.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
"""Sequence wrapper for table data with convenient DataFrame helpers."""
|
2
|
+
|
2
3
|
from __future__ import annotations
|
3
4
|
|
4
5
|
from collections.abc import Sequence
|
5
|
-
from typing import Any,
|
6
|
+
from typing import Any, Iterator, List, Optional, Union
|
6
7
|
|
7
8
|
|
8
9
|
class TableResult(Sequence):
|
@@ -12,9 +13,7 @@ class TableResult(Sequence):
|
|
12
13
|
list of cell values) but offers an easy hand-off to *pandas*.
|
13
14
|
"""
|
14
15
|
|
15
|
-
_IMMUTABLE_MESSAGE = (
|
16
|
-
"TableResult is read-only; convert to list(result) if you need to mutate"
|
17
|
-
)
|
16
|
+
_IMMUTABLE_MESSAGE = "TableResult is read-only; convert to list(result) if you need to mutate"
|
18
17
|
|
19
18
|
def __init__(self, rows: Optional[List[List[Any]]] = None) -> None:
|
20
19
|
# Normalise to list of list so that Sequence operations work as expected
|
@@ -81,7 +80,9 @@ class TableResult(Sequence):
|
|
81
80
|
|
82
81
|
df = pd.DataFrame(body, columns=hdr)
|
83
82
|
if index_col is not None and not df.empty:
|
84
|
-
df.set_index(
|
83
|
+
df.set_index(
|
84
|
+
df.columns[index_col] if isinstance(index_col, int) else index_col, inplace=True
|
85
|
+
)
|
85
86
|
|
86
87
|
if kwargs:
|
87
88
|
df = pd.DataFrame(df, **kwargs)
|
@@ -98,4 +99,4 @@ class TableResult(Sequence):
|
|
98
99
|
# Nice repr in notebooks
|
99
100
|
def __repr__(self) -> str: # noqa: D401 (simple)
|
100
101
|
preview = "…" if len(self._rows) > 5 else ""
|
101
|
-
return f"TableResult(rows={len(self._rows)}{preview})"
|
102
|
+
return f"TableResult(rows={len(self._rows)}{preview})"
|
natural_pdf/utils/bidi_mirror.py
CHANGED
@@ -6,6 +6,7 @@ replaces each bracket/parenthesis character with its Unicode-defined pair.
|
|
6
6
|
For everyday PDFs the six ASCII pairs are enough, but the mapping can be
|
7
7
|
extended easily from Unicode's BidiBrackets.txt.
|
8
8
|
"""
|
9
|
+
|
9
10
|
from typing import Dict
|
10
11
|
|
11
12
|
# Minimal mapping – ( ) [ ] { }
|
@@ -33,4 +34,4 @@ def mirror_brackets(text: str) -> str: # pragma: no cover
|
|
33
34
|
append = out_chars.append
|
34
35
|
for ch in text:
|
35
36
|
append(_ASCII_MIRROR.get(ord(ch), ch))
|
36
|
-
return "".join(out_chars)
|
37
|
+
return "".join(out_chars)
|
@@ -178,8 +178,9 @@ def _complex_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any
|
|
178
178
|
Returns:
|
179
179
|
List of elements in reading order
|
180
180
|
"""
|
181
|
-
# TODO: Implement complex layout analysis
|
182
|
-
#
|
181
|
+
# TODO: Implement complex layout analysis for sophisticated document structures
|
182
|
+
# Would include: multi-column detection, figure/caption relationships, sidebars
|
183
|
+
# For now, fall back to column-aware reading order which handles most cases
|
183
184
|
return _column_reading_order(elements)
|
184
185
|
|
185
186
|
|
@@ -237,11 +237,11 @@ def merge_images_with_legend(
|
|
237
237
|
def render_plain_page(page, resolution):
|
238
238
|
"""
|
239
239
|
Render a page to PIL Image using the specified resolution.
|
240
|
-
|
240
|
+
|
241
241
|
Args:
|
242
242
|
page: Page object to render
|
243
243
|
resolution: DPI resolution for rendering
|
244
|
-
|
244
|
+
|
245
245
|
Returns:
|
246
246
|
PIL Image of the rendered page
|
247
247
|
"""
|
@@ -252,7 +252,7 @@ def render_plain_page(page, resolution):
|
|
252
252
|
# Convert resolution (DPI) to scale factor for pypdfium2
|
253
253
|
# PDF standard is 72 DPI, so scale = resolution / 72
|
254
254
|
scale_factor = resolution / 72.0
|
255
|
-
|
255
|
+
|
256
256
|
bitmap = pdf_page.render(
|
257
257
|
scale=scale_factor,
|
258
258
|
)
|
natural_pdf/widgets/viewer.py
CHANGED
@@ -0,0 +1,121 @@
|
|
1
|
+
natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
|
2
|
+
natural_pdf/cli.py,sha256=SkPwhhMM-GhLsj3O1n1Agxz4KOxcZ08sj8hVQSFJB5c,4064
|
3
|
+
natural_pdf/analyzers/__init__.py,sha256=IPu_PMKFviDeEIeiC8_2KdeqH7z8OQ6q2v980hkByFY,672
|
4
|
+
natural_pdf/analyzers/guides.py,sha256=5Lqc51trtqmLvjxLjDS__mgeyviRrjV-CIIT69RmEt4,92327
|
5
|
+
natural_pdf/analyzers/shape_detection_mixin.py,sha256=Ef1o73QYVXQ2QcQMM_W9XRwY6vaIQHgxzD7etJ6LbiM,62820
|
6
|
+
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
7
|
+
natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
|
8
|
+
natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
|
9
|
+
natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
|
10
|
+
natural_pdf/analyzers/layout/base.py,sha256=F5xPOJcI65N4nxwm0szvhtbDD6lVMqWDut8PSkTCobU,8349
|
11
|
+
natural_pdf/analyzers/layout/docling.py,sha256=4BJYyNVR6VegZGxyisvNIBBRvVk6YKPyDVs7ZdVfzEU,12676
|
12
|
+
natural_pdf/analyzers/layout/gemini.py,sha256=ldECVCQ5HNQA3Omjg2NOsTrJXslyYb0vErDncmLIiuE,10510
|
13
|
+
natural_pdf/analyzers/layout/layout_analyzer.py,sha256=Ff7OfjMyGIWkfPKl6dHgkFyb-iru83_VDk0gmvyHbbg,15549
|
14
|
+
natural_pdf/analyzers/layout/layout_manager.py,sha256=ivMfTPjS14dyISu2o2Q6K48jkftvAOD04aCOtInkZGo,10267
|
15
|
+
natural_pdf/analyzers/layout/layout_options.py,sha256=2JENtBMHhP3hP0zpFI5-UP3-t1y49E7oLZnjd9d1eB0,7704
|
16
|
+
natural_pdf/analyzers/layout/paddle.py,sha256=44GG1sbaYTgvmtnrckNaCbDaNyw_D7FLLiSvzKP2cbk,23048
|
17
|
+
natural_pdf/analyzers/layout/pdfplumber_table_finder.py,sha256=Tk0Q7wv7nGYPo69lh6RoezjdepTnMl90SaNIrP29Pwc,5902
|
18
|
+
natural_pdf/analyzers/layout/surya.py,sha256=ugRXPIHiLoh65lfbbiXO317TbgdtQ-5kVN1nonEf4ws,9778
|
19
|
+
natural_pdf/analyzers/layout/table_structure_utils.py,sha256=_sugFWvVpRK3EimOCrikTDAalGnSaWqiqFbtJw8t-lg,2770
|
20
|
+
natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
|
21
|
+
natural_pdf/analyzers/layout/yolo.py,sha256=2Iz2-WsMy--ftkZQ8j5PGqp_1fTD7Mskl2kNnMUuwCU,8286
|
22
|
+
natural_pdf/classification/manager.py,sha256=wyENltPSeWpJNjqzU91-ydJTnACZ_LC1q-ox_tRhMIM,22172
|
23
|
+
natural_pdf/classification/mixin.py,sha256=CXygXXhe_qx1563SmIjiu4uSnZkxCkuRR4fGvLokS2w,9416
|
24
|
+
natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZDK9nO4j8I,3239
|
25
|
+
natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
|
26
|
+
natural_pdf/collections/pdf_collection.py,sha256=sDVEbFMNME_2OaHIsCoR_W7V1cAATNw4ZRqKWa6nbqA,30131
|
27
|
+
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
|
+
natural_pdf/core/element_manager.py,sha256=DRZvntd99wjXy6KeDjCq5uRhjMftZop9QklOZqlUH8M,55349
|
29
|
+
natural_pdf/core/highlighting_service.py,sha256=2tBrrEq6d6hz5f6Yf7z5TysJdlTyuHTURBnQxokJnDM,40645
|
30
|
+
natural_pdf/core/page.py,sha256=Jw5SDshnHesqoC4yhtKEokeV08wMHuWZyWs5kDMOAjo,133204
|
31
|
+
natural_pdf/core/pdf.py,sha256=9t8Ks-AZp3yjH_lRkFZAyIkjUQoCTRbmXK7vSi1e4UE,92415
|
32
|
+
natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
|
33
|
+
natural_pdf/describe/base.py,sha256=CLhZXYQO6SOPUVWLt6VwZ7MK48t_6wgPMyFMLtTCKRc,18166
|
34
|
+
natural_pdf/describe/elements.py,sha256=JicXC9SJmmasqxalpCXA47-kVwv-6JnR3Xiu778aNHM,12634
|
35
|
+
natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
|
36
|
+
natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
|
37
|
+
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
38
|
+
natural_pdf/elements/base.py,sha256=-ZAcc8lb2aSWTKcprwKTvnR6hsDGDm7T8a1Y9V38E_A,52042
|
39
|
+
natural_pdf/elements/collections.py,sha256=7i279l8kpgzRyvjRr13n1BeqbC5ufwYx7lu_WmfXWTE,131199
|
40
|
+
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
41
|
+
natural_pdf/elements/line.py,sha256=mHSeV-ZABY-Cc_K_NpFL53OGtTWlexYDlMvZc8_Vrx8,3845
|
42
|
+
natural_pdf/elements/rect.py,sha256=QuQg0Qo7XYQKBac-3Ss0n0ELV6icdPcrygWM2VWzeX8,3325
|
43
|
+
natural_pdf/elements/region.py,sha256=EqwtZJ2qgMyykuLVv2zO51oKJoSU4Hl7UA_mqTqRzmQ,143419
|
44
|
+
natural_pdf/elements/text.py,sha256=409RqADe0FYG_i99n6Dy0hl_fWTtBHRCzCq7BP0eAL8,18854
|
45
|
+
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
46
|
+
natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
|
47
|
+
natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
|
48
|
+
natural_pdf/exporters/hocr.py,sha256=wksvJvWLSxuAfhYzg_0T2_W8eqDoMgAVC-gwZ9FoO_k,19969
|
49
|
+
natural_pdf/exporters/hocr_font.py,sha256=1wsGOMj6zoaRN2rxCwrv4MMLGawpNz984WgXpmWekgw,4574
|
50
|
+
natural_pdf/exporters/original_pdf.py,sha256=KYW0f9_zdouZq_ZwNGvYnu6WHqv7JWrrEAdPCVmhRV4,6782
|
51
|
+
natural_pdf/exporters/paddleocr.py,sha256=RBP03GCk0mLeC7tWtuti8AIUHlpOrtvbWkE2n7Ja7k8,19484
|
52
|
+
natural_pdf/exporters/searchable_pdf.py,sha256=G2Tc4tpDXSYIufXJlkA8ppW_3DuzHAaweYKae33pI_c,16290
|
53
|
+
natural_pdf/exporters/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
54
|
+
natural_pdf/exporters/data/pdf.ttf,sha256=x4RUIJJaI9iO2DCmOVe4r4Wmao2vjZ_JDoQ2c7LvGlk,572
|
55
|
+
natural_pdf/exporters/data/sRGB.icc,sha256=KpLUuuRQt22LCqQhk9-XTXX2Jzjs6_dPAcXnWxKpV5Y,6922
|
56
|
+
natural_pdf/extraction/manager.py,sha256=sASPJZ5cWFsl8A4PyTjg2yqkyC00tRl6glfoFA6HcsM,4979
|
57
|
+
natural_pdf/extraction/mixin.py,sha256=z0HNRs4x4RoioNjzg3slDeqoHbiPug0HB37bUHehqMY,25066
|
58
|
+
natural_pdf/extraction/result.py,sha256=PDaCCN2LQBbHsZy0_lrQ0ROeMsnmH1WRoXWOjk9M2o4,1825
|
59
|
+
natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU,277
|
60
|
+
natural_pdf/flows/collections.py,sha256=iF8SsfKKb-YVIGi3m-yMRnfKgo_0n_EGhojnYK24h-Q,28493
|
61
|
+
natural_pdf/flows/element.py,sha256=mKzk3B7A7sWNvu4CDvAjLr3_ZFLt--ktrSNoLfLpFxU,23940
|
62
|
+
natural_pdf/flows/flow.py,sha256=ukkUqXsZmEw-QJEiVqEBLC8ktfBG2Bw56_RR1OEsd24,12802
|
63
|
+
natural_pdf/flows/region.py,sha256=nB634NCuC2BzBHuXAn8Ynf5lwZnR5mWb3RD36iEaPYY,27659
|
64
|
+
natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
|
65
|
+
natural_pdf/ocr/engine.py,sha256=SwNlWydtHbrIghV5JD_j5B4-rnjCMYIWUIEARag-zHw,11839
|
66
|
+
natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
|
67
|
+
natural_pdf/ocr/engine_easyocr.py,sha256=bWz6kHUgAJfe3rqdnZBAF-IPvw3B35DlvX5KDdFUtzo,9888
|
68
|
+
natural_pdf/ocr/engine_paddle.py,sha256=OmZlXVh2SSgNePqb6sMo2Mg5boX7REA4MUY25O7hKgU,16144
|
69
|
+
natural_pdf/ocr/engine_surya.py,sha256=lOvSbZk53VKFVxRmqcQzM_0dHVdwTkRGiDZ9AWCgL1Q,5951
|
70
|
+
natural_pdf/ocr/ocr_factory.py,sha256=Ix-p1SrV6dchq6YcbbCTf2BPBHSGwu9KBnwnZ_ohOuw,5282
|
71
|
+
natural_pdf/ocr/ocr_manager.py,sha256=U8EVzNgeRQxxAbMpCEZhkF7nr_R8Fcvtp28oeV_D-Ms,16229
|
72
|
+
natural_pdf/ocr/ocr_options.py,sha256=_BgLjIih6mY3k-AgkdXu9UDD8bykmQX2fpf37tAOhYQ,5146
|
73
|
+
natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
|
74
|
+
natural_pdf/qa/__init__.py,sha256=2u2KJcA71g1I0HnLD-j6yvDw1moAjo9kkLhhfoYRURM,166
|
75
|
+
natural_pdf/qa/document_qa.py,sha256=EduwpmUs8Oz35GrCfLw3312F_ngxIpWZLM8KNvasdrM,19887
|
76
|
+
natural_pdf/qa/qa_result.py,sha256=8_jL5MJAHR4LcjGVe5lVsFizxWieF6VI86DWaqetYxs,2167
|
77
|
+
natural_pdf/search/__init__.py,sha256=araouqM-l_m0VlluKf6i9BybAsHnfCuh39M0-xEI3jA,4273
|
78
|
+
natural_pdf/search/lancedb_search_service.py,sha256=dfz5IiMIcAc3KFzkBDF6Ab_JDLpLHqW6DO1JDkPPu1k,14458
|
79
|
+
natural_pdf/search/numpy_search_service.py,sha256=GwPwnX_wxBPFHe-bKS5upMRZLHj8PjLQ2d84lZygzHg,10331
|
80
|
+
natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzPkK0a8QA,3566
|
81
|
+
natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
|
82
|
+
natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
|
83
|
+
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
84
|
+
natural_pdf/selectors/parser.py,sha256=Flxjo_ZODBLQM8DQlQGqZTTQDyea3zUTzO9L2dtVabM,36402
|
85
|
+
natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
|
86
|
+
natural_pdf/tables/result.py,sha256=hrGIWDkImpdxsGzugcQKU-qrTgHwwfOigJDFdYl8aUc,3994
|
87
|
+
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
88
|
+
natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
|
89
|
+
natural_pdf/utils/bidi_mirror.py,sha256=jJEES0xDrMfo5Me8kHMxHv4COS51PitnYi2EvKv3HCE,1151
|
90
|
+
natural_pdf/utils/debug.py,sha256=RN7H3E6ph-GtxubCW6psW7TO8o2BxcNLiEzByTVR9fk,995
|
91
|
+
natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
|
92
|
+
natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2bWXo,1117
|
93
|
+
natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,215
|
94
|
+
natural_pdf/utils/packaging.py,sha256=e7U2wWvpunlAWpPFexNkD_c4dYbPp5LcKo7og4bNGvk,22411
|
95
|
+
natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
|
96
|
+
natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6lSjBaOk,10854
|
97
|
+
natural_pdf/utils/visualization.py,sha256=olDkWtuVzP0NxRg0CP0DL-eXNCY7Bs-SH-2Xn-cjbo0,9370
|
98
|
+
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
99
|
+
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
100
|
+
natural_pdf-0.1.35.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
101
|
+
optimization/memory_comparison.py,sha256=F90D_5WhliSGAct_lyx93xd4q4F-jeo8QpGyDr8tmNw,6543
|
102
|
+
optimization/pdf_analyzer.py,sha256=xf6h-FNlqCpsm8NriXcs_bQZOB8eQkxgGGKVRL_jgCM,19347
|
103
|
+
optimization/performance_analysis.py,sha256=RjAqeE3YS1r_7qTWkY6Ng5YMbb6MXJXfXX6LoVjg_xQ,13035
|
104
|
+
optimization/test_cleanup_methods.py,sha256=PmLOL4MRgvV0j_DW9W1TS8MsGGgu57QCuq6_5y7zK3s,6209
|
105
|
+
optimization/test_memory_fix.py,sha256=A3knK74fNhvHknDbLhbTmA276x1ifl-3ivJ_7BhVSTI,6170
|
106
|
+
tools/bad_pdf_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
107
|
+
tools/bad_pdf_eval/analyser.py,sha256=bKUT3muP3ESE5i1D8sGyAS5tMzFMcq-i-xD_ZeUxYhY,13692
|
108
|
+
tools/bad_pdf_eval/collate_summaries.py,sha256=L_YsdiqmwGIHYWTVJqo6gyazyn3GIQgpfGGKk8uwckk,5159
|
109
|
+
tools/bad_pdf_eval/compile_attempts_markdown.py,sha256=ArFDZaSa9dz0ez0lsNlbUSK4hbvB3___DlfwqPEAZpY,4359
|
110
|
+
tools/bad_pdf_eval/eval_suite.py,sha256=zcapsGwO-VJ2OupJnPYKbrkzvzdGdoh2DZPK19bfkQg,4450
|
111
|
+
tools/bad_pdf_eval/evaluate_quality.py,sha256=-LR_shgxPVbaEZyWSVYKXTp2LNNVSdIwrlN5rllqntg,7149
|
112
|
+
tools/bad_pdf_eval/export_enrichment_csv.py,sha256=1hd1iaTinmT8K1rlaHFV_ZvvbyuLEAnIbmKZUtRWv8o,1958
|
113
|
+
tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1tP5Q,13313
|
114
|
+
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
115
|
+
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
116
|
+
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
117
|
+
natural_pdf-0.1.35.dist-info/METADATA,sha256=SVdCwYrjweXrrmU8m2korCIMJENbN9zDasRCi2pkb8E,6711
|
118
|
+
natural_pdf-0.1.35.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
119
|
+
natural_pdf-0.1.35.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
120
|
+
natural_pdf-0.1.35.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
|
121
|
+
natural_pdf-0.1.35.dist-info/RECORD,,
|