natural-pdf 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/classification/manager.py +38 -13
- natural_pdf/core/page.py +2 -1
- natural_pdf/core/pdf.py +141 -32
- natural_pdf/describe/__init__.py +21 -0
- natural_pdf/describe/base.py +457 -0
- natural_pdf/describe/elements.py +411 -0
- natural_pdf/describe/mixin.py +84 -0
- natural_pdf/describe/summary.py +186 -0
- natural_pdf/elements/base.py +2 -1
- natural_pdf/elements/collections.py +11 -1
- natural_pdf/elements/region.py +4 -1
- natural_pdf/exporters/__init__.py +12 -1
- natural_pdf/exporters/hocr.py +9 -8
- natural_pdf/exporters/original_pdf.py +31 -2
- natural_pdf/ocr/engine_surya.py +1 -2
- natural_pdf/ocr/ocr_manager.py +21 -4
- natural_pdf/search/__init__.py +20 -3
- natural_pdf/search/lancedb_search_service.py +13 -5
- natural_pdf/search/numpy_search_service.py +13 -3
- {natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/METADATA +16 -16
- {natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/RECORD +24 -19
- {natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.16.dist-info → natural_pdf-0.1.18.dist-info}/top_level.txt +0 -0
@@ -30,6 +30,7 @@ from tqdm.auto import tqdm
|
|
30
30
|
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
31
31
|
from natural_pdf.classification.manager import ClassificationManager
|
32
32
|
from natural_pdf.classification.mixin import ClassificationMixin
|
33
|
+
from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
|
33
34
|
from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
|
34
35
|
from natural_pdf.core.pdf import PDF
|
35
36
|
from natural_pdf.elements.base import Element
|
@@ -71,7 +72,14 @@ P = TypeVar("P", bound="Page")
|
|
71
72
|
|
72
73
|
|
73
74
|
class ElementCollection(
|
74
|
-
Generic[T],
|
75
|
+
Generic[T],
|
76
|
+
ApplyMixin,
|
77
|
+
ExportMixin,
|
78
|
+
ClassificationMixin,
|
79
|
+
DirectionalCollectionMixin,
|
80
|
+
DescribeMixin,
|
81
|
+
InspectMixin,
|
82
|
+
MutableSequence,
|
75
83
|
):
|
76
84
|
"""
|
77
85
|
Collection of PDF elements with batch operations.
|
@@ -1795,6 +1803,8 @@ class ElementCollection(
|
|
1795
1803
|
)
|
1796
1804
|
|
1797
1805
|
|
1806
|
+
|
1807
|
+
|
1798
1808
|
class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
1799
1809
|
"""
|
1800
1810
|
Represents a collection of Page objects, often from a single PDF document.
|
natural_pdf/elements/region.py
CHANGED
@@ -15,6 +15,7 @@ from natural_pdf.classification.manager import ClassificationManager # Keep for
|
|
15
15
|
|
16
16
|
# --- Classification Imports --- #
|
17
17
|
from natural_pdf.classification.mixin import ClassificationMixin
|
18
|
+
from natural_pdf.describe.mixin import DescribeMixin
|
18
19
|
from natural_pdf.elements.base import DirectionalMixin
|
19
20
|
from natural_pdf.elements.text import TextElement # ADDED IMPORT
|
20
21
|
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
@@ -49,7 +50,7 @@ except ImportError:
|
|
49
50
|
logger = logging.getLogger(__name__)
|
50
51
|
|
51
52
|
|
52
|
-
class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
53
|
+
class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
|
53
54
|
"""
|
54
55
|
Represents a rectangular region on a page.
|
55
56
|
"""
|
@@ -2962,3 +2963,5 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
|
|
2962
2963
|
)
|
2963
2964
|
|
2964
2965
|
return text_element
|
2966
|
+
|
2967
|
+
|
@@ -1,4 +1,15 @@
|
|
1
1
|
from .base import FinetuneExporter
|
2
|
-
|
2
|
+
|
3
|
+
# Lazy import for PaddleOCRRecognitionExporter to avoid heavy paddle dependencies at module level
|
4
|
+
def _get_paddleocr_exporter():
|
5
|
+
"""Lazy import for PaddleOCRRecognitionExporter."""
|
6
|
+
from .paddleocr import PaddleOCRRecognitionExporter
|
7
|
+
return PaddleOCRRecognitionExporter
|
8
|
+
|
9
|
+
# Make PaddleOCRRecognitionExporter available through attribute access
|
10
|
+
def __getattr__(name):
|
11
|
+
if name == "PaddleOCRRecognitionExporter":
|
12
|
+
return _get_paddleocr_exporter()
|
13
|
+
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
|
3
14
|
|
4
15
|
__all__ = ["FinetuneExporter", "PaddleOCRRecognitionExporter"]
|
natural_pdf/exporters/hocr.py
CHANGED
@@ -16,6 +16,7 @@ from dataclasses import dataclass
|
|
16
16
|
from itertools import pairwise
|
17
17
|
from math import atan, pi
|
18
18
|
from pathlib import Path
|
19
|
+
from typing import Optional, Union
|
19
20
|
from xml.etree import ElementTree
|
20
21
|
|
21
22
|
from pikepdf import Matrix, Name, Rectangle
|
@@ -94,12 +95,12 @@ class HocrTransform:
|
|
94
95
|
def __init__(
|
95
96
|
self,
|
96
97
|
*,
|
97
|
-
hocr_filename: str
|
98
|
+
hocr_filename: Union[str, Path],
|
98
99
|
dpi: float,
|
99
100
|
debug: bool = False,
|
100
101
|
fontname: Name = Name("/f-0-0"),
|
101
102
|
font: Font = GlyphlessFont(),
|
102
|
-
debug_render_options: DebugRenderOptions
|
103
|
+
debug_render_options: Optional[DebugRenderOptions] = None,
|
103
104
|
):
|
104
105
|
"""Initialize the HocrTransform object."""
|
105
106
|
if debug:
|
@@ -144,7 +145,7 @@ class HocrTransform:
|
|
144
145
|
return text
|
145
146
|
|
146
147
|
@classmethod
|
147
|
-
def element_coordinates(cls, element: Element) -> Rectangle
|
148
|
+
def element_coordinates(cls, element: Element) -> Optional[Rectangle]:
|
148
149
|
"""Get coordinates of the bounding box around an element."""
|
149
150
|
matches = cls.box_pattern.search(element.attrib.get("title", ""))
|
150
151
|
if not matches:
|
@@ -172,7 +173,7 @@ class HocrTransform:
|
|
172
173
|
return 0.0
|
173
174
|
return float(matches.group(1))
|
174
175
|
|
175
|
-
def _child_xpath(self, html_tag: str, html_class: str
|
176
|
+
def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str:
|
176
177
|
xpath = f".//{self.xmlns}{html_tag}"
|
177
178
|
if html_class:
|
178
179
|
xpath += f"[@class='{html_class}']"
|
@@ -187,7 +188,7 @@ class HocrTransform:
|
|
187
188
|
self,
|
188
189
|
*,
|
189
190
|
out_filename: Path,
|
190
|
-
image_filename: Path
|
191
|
+
image_filename: Optional[Path] = None,
|
191
192
|
invisible_text: bool = True,
|
192
193
|
) -> None:
|
193
194
|
"""Creates a PDF file with an image superimposed on top of the text.
|
@@ -291,7 +292,7 @@ class HocrTransform:
|
|
291
292
|
def _do_line(
|
292
293
|
self,
|
293
294
|
canvas: Canvas,
|
294
|
-
line: Element
|
295
|
+
line: Optional[Element],
|
295
296
|
elemclass: str,
|
296
297
|
invisible_text: bool,
|
297
298
|
text_direction: TextDirection,
|
@@ -387,8 +388,8 @@ class HocrTransform:
|
|
387
388
|
line_matrix: Matrix,
|
388
389
|
text: Text,
|
389
390
|
fontsize: float,
|
390
|
-
elem: Element
|
391
|
-
next_elem: Element
|
391
|
+
elem: Optional[Element],
|
392
|
+
next_elem: Optional[Element],
|
392
393
|
text_direction: TextDirection,
|
393
394
|
inject_word_breaks: bool,
|
394
395
|
):
|
@@ -4,6 +4,8 @@ Module for exporting original PDF pages without modification.
|
|
4
4
|
|
5
5
|
import logging
|
6
6
|
import os
|
7
|
+
import io
|
8
|
+
import urllib.request
|
7
9
|
from pathlib import Path
|
8
10
|
from typing import TYPE_CHECKING, List, Set, Union
|
9
11
|
|
@@ -69,8 +71,11 @@ def create_original_pdf(
|
|
69
71
|
|
70
72
|
# Verify all pages come from the same PDF and get path
|
71
73
|
first_page_pdf_path = None
|
74
|
+
first_page_pdf_obj = None
|
72
75
|
if hasattr(pages_to_extract[0], "pdf") and pages_to_extract[0].pdf:
|
73
|
-
|
76
|
+
src_pdf = pages_to_extract[0].pdf
|
77
|
+
first_page_pdf_path = getattr(src_pdf, "path", None)
|
78
|
+
first_page_pdf_obj = src_pdf
|
74
79
|
|
75
80
|
if not first_page_pdf_path:
|
76
81
|
raise ValueError(
|
@@ -93,7 +98,28 @@ def create_original_pdf(
|
|
93
98
|
)
|
94
99
|
|
95
100
|
try:
|
96
|
-
|
101
|
+
# Prefer opening via filesystem path when it exists locally
|
102
|
+
if first_page_pdf_path and os.path.exists(first_page_pdf_path):
|
103
|
+
source_handle = pikepdf.Pdf.open(first_page_pdf_path)
|
104
|
+
else:
|
105
|
+
# Fallback: attempt to open from in-memory bytes stored on PDF object
|
106
|
+
if first_page_pdf_obj is not None and hasattr(first_page_pdf_obj, "_original_bytes") and first_page_pdf_obj._original_bytes:
|
107
|
+
source_handle = pikepdf.Pdf.open(io.BytesIO(first_page_pdf_obj._original_bytes))
|
108
|
+
else:
|
109
|
+
# Attempt to download bytes directly if path looks like URL
|
110
|
+
if isinstance(first_page_pdf_path, str) and first_page_pdf_path.startswith(("http://", "https://")):
|
111
|
+
try:
|
112
|
+
with urllib.request.urlopen(first_page_pdf_path) as resp:
|
113
|
+
data = resp.read()
|
114
|
+
source_handle = pikepdf.Pdf.open(io.BytesIO(data))
|
115
|
+
except Exception as dl_err:
|
116
|
+
raise FileNotFoundError(
|
117
|
+
f"Source PDF bytes not available and download failed for {first_page_pdf_path}: {dl_err}"
|
118
|
+
)
|
119
|
+
else:
|
120
|
+
raise FileNotFoundError(f"Source PDF bytes not available for {first_page_pdf_path}")
|
121
|
+
|
122
|
+
with source_handle as source_pikepdf_doc:
|
97
123
|
target_pikepdf_doc = pikepdf.Pdf.new()
|
98
124
|
|
99
125
|
for page_index in sorted_indices:
|
@@ -113,6 +139,9 @@ def create_original_pdf(
|
|
113
139
|
f"Successfully saved original pages PDF ({len(target_pikepdf_doc.pages)} pages) to: {output_path_str}"
|
114
140
|
)
|
115
141
|
|
142
|
+
except FileNotFoundError as e:
|
143
|
+
logger.error(str(e))
|
144
|
+
raise RuntimeError(f"Failed to save original pages PDF: {e}")
|
116
145
|
except pikepdf.PasswordError:
|
117
146
|
logger.error(f"Failed to open password-protected source PDF: {first_page_pdf_path}")
|
118
147
|
raise RuntimeError(
|
natural_pdf/ocr/engine_surya.py
CHANGED
@@ -27,7 +27,6 @@ class SuryaOCREngine(OCREngine):
|
|
27
27
|
if not self.is_available():
|
28
28
|
raise ImportError("Surya OCR library is not installed or available.")
|
29
29
|
|
30
|
-
# Store languages for use in _process_single_image
|
31
30
|
self._langs = languages
|
32
31
|
|
33
32
|
from surya.detection import DetectionPredictor
|
@@ -63,7 +62,6 @@ class SuryaOCREngine(OCREngine):
|
|
63
62
|
if not self._recognition_predictor or not self._detection_predictor:
|
64
63
|
raise RuntimeError("Surya predictors are not initialized.")
|
65
64
|
|
66
|
-
# Store languages instance variable during initialization to use here
|
67
65
|
langs = (
|
68
66
|
[self._langs] # Send all languages together in one list per image
|
69
67
|
if hasattr(self, "_langs")
|
@@ -75,6 +73,7 @@ class SuryaOCREngine(OCREngine):
|
|
75
73
|
results = self._detection_predictor(images=[image])
|
76
74
|
else:
|
77
75
|
results = self._recognition_predictor(
|
76
|
+
langs=langs,
|
78
77
|
images=[image],
|
79
78
|
det_predictor=self._detection_predictor,
|
80
79
|
)
|
natural_pdf/ocr/ocr_manager.py
CHANGED
@@ -11,7 +11,8 @@ from PIL import Image
|
|
11
11
|
from .engine import OCREngine
|
12
12
|
from .engine_doctr import DoctrOCREngine
|
13
13
|
from .engine_easyocr import EasyOCREngine
|
14
|
-
|
14
|
+
# Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level
|
15
|
+
# from .engine_paddle import PaddleOCREngine
|
15
16
|
from .engine_surya import SuryaOCREngine
|
16
17
|
from .ocr_options import (
|
17
18
|
BaseOCROptions,
|
@@ -28,10 +29,16 @@ logger = logging.getLogger(__name__)
|
|
28
29
|
class OCRManager:
|
29
30
|
"""Manages OCR engine selection, configuration, and execution."""
|
30
31
|
|
32
|
+
@staticmethod
|
33
|
+
def _get_paddle_engine_class():
|
34
|
+
"""Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level."""
|
35
|
+
from .engine_paddle import PaddleOCREngine
|
36
|
+
return PaddleOCREngine
|
37
|
+
|
31
38
|
# Registry mapping engine names to classes and default options
|
32
39
|
ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
|
33
40
|
"easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
|
34
|
-
"paddle": {"class":
|
41
|
+
"paddle": {"class": lambda: OCRManager._get_paddle_engine_class(), "options_class": PaddleOCROptions},
|
35
42
|
"surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions},
|
36
43
|
"doctr": {"class": DoctrOCREngine, "options_class": DoctrOCROptions},
|
37
44
|
# Add other engines here
|
@@ -76,7 +83,12 @@ class OCRManager:
|
|
76
83
|
logger.info(
|
77
84
|
f"[{threading.current_thread().name}] Creating shared instance of engine: {engine_name}"
|
78
85
|
)
|
79
|
-
|
86
|
+
engine_class_or_factory = self.ENGINE_REGISTRY[engine_name]["class"]
|
87
|
+
# Handle lazy loading - if it's a lambda function, call it to get the actual class
|
88
|
+
if callable(engine_class_or_factory) and getattr(engine_class_or_factory, '__name__', '') == '<lambda>':
|
89
|
+
engine_class = engine_class_or_factory()
|
90
|
+
else:
|
91
|
+
engine_class = engine_class_or_factory
|
80
92
|
start_time = time.monotonic() # Optional: time initialization
|
81
93
|
try:
|
82
94
|
engine_instance = engine_class() # Instantiate first
|
@@ -277,7 +289,12 @@ class OCRManager:
|
|
277
289
|
for name, registry_entry in self.ENGINE_REGISTRY.items():
|
278
290
|
try:
|
279
291
|
# Temporarily instantiate to check availability without caching
|
280
|
-
|
292
|
+
engine_class_or_factory = registry_entry["class"]
|
293
|
+
# Handle lazy loading - if it's a lambda function, call it to get the actual class
|
294
|
+
if callable(engine_class_or_factory) and getattr(engine_class_or_factory, '__name__', '') == '<lambda>':
|
295
|
+
engine_class = engine_class_or_factory()
|
296
|
+
else:
|
297
|
+
engine_class = engine_class_or_factory
|
281
298
|
if engine_class().is_available():
|
282
299
|
available.append(name)
|
283
300
|
except Exception as e:
|
natural_pdf/search/__init__.py
CHANGED
@@ -18,7 +18,8 @@ SEARCH_DEPENDENCIES_AVAILABLE = False
|
|
18
18
|
|
19
19
|
try:
|
20
20
|
import numpy as np
|
21
|
-
import sentence_transformers
|
21
|
+
# Lazy import for sentence_transformers to avoid heavy loading at module level
|
22
|
+
# import sentence_transformers
|
22
23
|
|
23
24
|
# Basic search dependencies are available
|
24
25
|
SEARCH_DEPENDENCIES_AVAILABLE = True
|
@@ -46,12 +47,28 @@ except ImportError:
|
|
46
47
|
logger = logging.getLogger(__name__)
|
47
48
|
|
48
49
|
|
50
|
+
def _check_sentence_transformers():
|
51
|
+
"""Lazy check for sentence_transformers availability."""
|
52
|
+
try:
|
53
|
+
import sentence_transformers
|
54
|
+
return True
|
55
|
+
except ImportError:
|
56
|
+
return False
|
57
|
+
|
58
|
+
|
49
59
|
def check_search_availability():
|
50
60
|
"""Check if required search dependencies are available."""
|
51
61
|
if not SEARCH_DEPENDENCIES_AVAILABLE:
|
52
62
|
raise ImportError(
|
53
|
-
"Search functionality requires '
|
54
|
-
"Install with: pip install natural-pdf[search] (or pip install
|
63
|
+
"Search functionality requires 'lancedb' and pyarrow. "
|
64
|
+
"Install with: pip install natural-pdf[search] (or pip install lancedb pyarrow)"
|
65
|
+
)
|
66
|
+
|
67
|
+
# Lazy check for sentence_transformers when actually needed
|
68
|
+
if not _check_sentence_transformers():
|
69
|
+
raise ImportError(
|
70
|
+
"Search functionality requires 'sentence-transformers'. "
|
71
|
+
"Install with: pip install sentence-transformers"
|
55
72
|
)
|
56
73
|
|
57
74
|
|
@@ -1,12 +1,14 @@
|
|
1
1
|
import logging
|
2
|
+
import os
|
2
3
|
import shutil
|
3
4
|
import tempfile
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import Any, Dict, Iterable, List, Optional
|
6
|
+
from typing import Any, Dict, Iterable, List, Optional, Union
|
6
7
|
|
7
8
|
import lancedb
|
8
9
|
import pyarrow as pa
|
9
|
-
|
10
|
+
# Lazy import for SentenceTransformer to avoid heavy loading at module level
|
11
|
+
# from sentence_transformers import SentenceTransformer
|
10
12
|
|
11
13
|
from .search_options import BaseSearchOptions
|
12
14
|
from .search_service_protocol import (
|
@@ -17,8 +19,14 @@ from .search_service_protocol import (
|
|
17
19
|
|
18
20
|
logger = logging.getLogger(__name__)
|
19
21
|
|
20
|
-
DEFAULT_EMBEDDING_MODEL = "
|
21
|
-
DEFAULT_LANCEDB_PERSIST_PATH = "./
|
22
|
+
DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
|
23
|
+
DEFAULT_LANCEDB_PERSIST_PATH = "./lancedb_data"
|
24
|
+
|
25
|
+
|
26
|
+
def _get_sentence_transformer(model_name: str):
|
27
|
+
"""Lazy import and instantiation of SentenceTransformer."""
|
28
|
+
from sentence_transformers import SentenceTransformer
|
29
|
+
return SentenceTransformer(model_name)
|
22
30
|
|
23
31
|
|
24
32
|
class LanceDBSearchService(SearchServiceProtocol):
|
@@ -41,7 +49,7 @@ class LanceDBSearchService(SearchServiceProtocol):
|
|
41
49
|
self._db = None
|
42
50
|
self._table = None
|
43
51
|
|
44
|
-
self.embedding_model =
|
52
|
+
self.embedding_model = _get_sentence_transformer(self._embedding_model_name)
|
45
53
|
test_embedding = self.embedding_model.encode("test")
|
46
54
|
self._embedding_dims = len(test_embedding)
|
47
55
|
|
@@ -1,21 +1,31 @@
|
|
1
1
|
import json
|
2
2
|
import logging
|
3
|
+
import os
|
4
|
+
import tempfile
|
3
5
|
from pathlib import Path
|
4
6
|
from typing import Any, Dict, Iterable, List, Optional, Union
|
5
7
|
|
6
8
|
import numpy as np
|
7
|
-
|
9
|
+
# Lazy import for SentenceTransformer to avoid heavy loading at module level
|
10
|
+
# from sentence_transformers import SentenceTransformer
|
8
11
|
|
9
12
|
from .search_options import BaseSearchOptions
|
10
13
|
from .search_service_protocol import (
|
11
14
|
Indexable,
|
12
15
|
IndexConfigurationError,
|
16
|
+
SearchResult,
|
13
17
|
SearchServiceProtocol,
|
14
18
|
)
|
15
19
|
|
16
20
|
logger = logging.getLogger(__name__)
|
17
21
|
|
18
|
-
DEFAULT_EMBEDDING_MODEL = "
|
22
|
+
DEFAULT_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
|
23
|
+
|
24
|
+
|
25
|
+
def _get_sentence_transformer(model_name: str):
|
26
|
+
"""Lazy import and instantiation of SentenceTransformer."""
|
27
|
+
from sentence_transformers import SentenceTransformer
|
28
|
+
return SentenceTransformer(model_name)
|
19
29
|
|
20
30
|
|
21
31
|
class NumpySearchService(SearchServiceProtocol):
|
@@ -38,7 +48,7 @@ class NumpySearchService(SearchServiceProtocol):
|
|
38
48
|
|
39
49
|
self.collection_name = collection_name
|
40
50
|
self._embedding_model_name = embedding_model_name
|
41
|
-
self.embedding_model =
|
51
|
+
self.embedding_model = _get_sentence_transformer(self._embedding_model_name)
|
42
52
|
self._embedding_dims = len(self.embedding_model.encode("test"))
|
43
53
|
|
44
54
|
# Simple in-memory storage
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.18
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
|
|
11
11
|
Requires-Python: >=3.9
|
12
12
|
Description-Content-Type: text/markdown
|
13
13
|
License-File: LICENSE
|
14
|
+
Requires-Dist: pandas
|
14
15
|
Requires-Dist: pdfplumber
|
15
16
|
Requires-Dist: colormath2
|
16
17
|
Requires-Dist: pillow
|
@@ -20,14 +21,15 @@ Requires-Dist: urllib3
|
|
20
21
|
Requires-Dist: tqdm
|
21
22
|
Requires-Dist: pydantic
|
22
23
|
Requires-Dist: jenkspy
|
23
|
-
Requires-Dist: pikepdf
|
24
|
+
Requires-Dist: pikepdf
|
24
25
|
Requires-Dist: scipy
|
25
26
|
Requires-Dist: torch
|
26
27
|
Requires-Dist: torchvision
|
27
|
-
Requires-Dist: transformers[sentencepiece]
|
28
|
+
Requires-Dist: transformers[sentencepiece]
|
28
29
|
Requires-Dist: huggingface_hub>=0.29.3
|
29
30
|
Requires-Dist: sentence-transformers
|
30
31
|
Requires-Dist: timm
|
32
|
+
Requires-Dist: ipywidgets>=7.0.0
|
31
33
|
Provides-Extra: test
|
32
34
|
Requires-Dist: pytest; extra == "test"
|
33
35
|
Requires-Dist: pytest-xdist; extra == "test"
|
@@ -39,7 +41,6 @@ Provides-Extra: favorites
|
|
39
41
|
Requires-Dist: natural-pdf[deskew]; extra == "favorites"
|
40
42
|
Requires-Dist: natural-pdf[ocr-export]; extra == "favorites"
|
41
43
|
Requires-Dist: natural-pdf[search]; extra == "favorites"
|
42
|
-
Requires-Dist: ipywidgets; extra == "favorites"
|
43
44
|
Requires-Dist: surya-ocr; extra == "favorites"
|
44
45
|
Provides-Extra: dev
|
45
46
|
Requires-Dist: black; extra == "dev"
|
@@ -61,23 +62,22 @@ Requires-Dist: setuptools; extra == "dev"
|
|
61
62
|
Provides-Extra: deskew
|
62
63
|
Requires-Dist: deskew>=1.5; extra == "deskew"
|
63
64
|
Requires-Dist: img2pdf; extra == "deskew"
|
64
|
-
Provides-Extra: addons
|
65
|
-
Requires-Dist: surya-ocr; extra == "addons"
|
66
|
-
Requires-Dist: doclayout_yolo; extra == "addons"
|
67
|
-
Requires-Dist: paddlepaddle>=3.0.0; extra == "addons"
|
68
|
-
Requires-Dist: paddleocr>=3.0.0; extra == "addons"
|
69
|
-
Requires-Dist: ipywidgets>=7.0.0; extra == "addons"
|
70
|
-
Requires-Dist: easyocr; extra == "addons"
|
71
|
-
Requires-Dist: surya-ocr; extra == "addons"
|
72
|
-
Requires-Dist: doclayout_yolo; extra == "addons"
|
73
|
-
Requires-Dist: python-doctr[torch]; extra == "addons"
|
74
|
-
Requires-Dist: docling; extra == "addons"
|
75
65
|
Provides-Extra: all
|
76
66
|
Requires-Dist: natural-pdf[ocr-export]; extra == "all"
|
77
67
|
Requires-Dist: natural-pdf[deskew]; extra == "all"
|
78
68
|
Requires-Dist: natural-pdf[test]; extra == "all"
|
79
69
|
Requires-Dist: natural-pdf[search]; extra == "all"
|
80
|
-
Requires-Dist: natural-pdf[
|
70
|
+
Requires-Dist: natural-pdf[extras]; extra == "all"
|
71
|
+
Requires-Dist: natural-pdf[favorites]; extra == "all"
|
72
|
+
Provides-Extra: paddle
|
73
|
+
Requires-Dist: paddlepaddle>=3.0.0; extra == "paddle"
|
74
|
+
Requires-Dist: paddleocr>=3.0.1; extra == "paddle"
|
75
|
+
Requires-Dist: paddlex>=3.0.1; extra == "paddle"
|
76
|
+
Provides-Extra: extras
|
77
|
+
Requires-Dist: surya-ocr; extra == "extras"
|
78
|
+
Requires-Dist: doclayout_yolo; extra == "extras"
|
79
|
+
Requires-Dist: easyocr; extra == "extras"
|
80
|
+
Requires-Dist: natural-pdf[paddle]; extra == "extras"
|
81
81
|
Provides-Extra: ocr-export
|
82
82
|
Requires-Dist: pikepdf; extra == "ocr-export"
|
83
83
|
Provides-Extra: export-extras
|
@@ -17,7 +17,7 @@ natural_pdf/analyzers/layout/surya.py,sha256=4RdnhRxSS3i3Ns5mFhOA9-P0xd7Ms19uZuK
|
|
17
17
|
natural_pdf/analyzers/layout/table_structure_utils.py,sha256=nISZDBd46RPYkFHxbQyIHwg9WweG4DslpoYJ31OMJYA,2768
|
18
18
|
natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
|
19
19
|
natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
|
20
|
-
natural_pdf/classification/manager.py,sha256
|
20
|
+
natural_pdf/classification/manager.py,sha256=pzuTP-34W9N3im1ZFhCfQpOu37VSHEx4JHoHNxyy6o0,18894
|
21
21
|
natural_pdf/classification/mixin.py,sha256=_XtoqCMqj1nxZYskIV2RbVYiVVcEWzFwae4s5vpzC74,6566
|
22
22
|
natural_pdf/classification/results.py,sha256=El1dY7cBQVOB5lP-uj52dWgH6Y7TeQgJOVcZD-OLjes,2778
|
23
23
|
natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
|
@@ -25,21 +25,26 @@ natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm
|
|
25
25
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
26
26
|
natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
|
27
27
|
natural_pdf/core/highlighting_service.py,sha256=_kQUS6_BBvsLBuSZloFrVag6jN90KzHa0ULyGBjufSs,36955
|
28
|
-
natural_pdf/core/page.py,sha256=
|
29
|
-
natural_pdf/core/pdf.py,sha256=
|
28
|
+
natural_pdf/core/page.py,sha256=i3DriIQwoO4RuSrkrCXv44Dz8OL9KXPa2y4GhsD1y18,118324
|
29
|
+
natural_pdf/core/pdf.py,sha256=yBvb1iGw9gwVPJ3Rm1EBaZ8_g60TuW_Elhg2EOcJMzc,73871
|
30
|
+
natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
|
31
|
+
natural_pdf/describe/base.py,sha256=7USCFIl4mI5b15LTVkwvhAn_mngMwhwxCnVYaZz5Vdc,16842
|
32
|
+
natural_pdf/describe/elements.py,sha256=BOkz2wDhGh6P8NOm6pSNxitgmVokLTISztaFhrxMcdw,12717
|
33
|
+
natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo,3116
|
34
|
+
natural_pdf/describe/summary.py,sha256=dPtjrn6fQ8nL0F74RITX2vXlDX7ZgaX9JQPnJB-S_XQ,6735
|
30
35
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
31
|
-
natural_pdf/elements/base.py,sha256=
|
32
|
-
natural_pdf/elements/collections.py,sha256=
|
36
|
+
natural_pdf/elements/base.py,sha256=IlAeyzV66xMrxVx9U3ocGPekzGUBJgKkAiJ5kpvCSAg,39675
|
37
|
+
natural_pdf/elements/collections.py,sha256=vgVZsVC3xxRF2S5KW7L0JKa-NSUFnqURk50NtvlwbcM,122113
|
33
38
|
natural_pdf/elements/line.py,sha256=300kSFBDUBIudfeQtH_tzW9gTYRgRKUDPiTABw6J-BE,4782
|
34
39
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
35
|
-
natural_pdf/elements/region.py,sha256=
|
40
|
+
natural_pdf/elements/region.py,sha256=hBklYKcXJWyxayu9todYQOZ-d9KVDtqeV-CIt9IcSn8,123400
|
36
41
|
natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
|
37
42
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
38
|
-
natural_pdf/exporters/__init__.py,sha256=
|
43
|
+
natural_pdf/exporters/__init__.py,sha256=XG0ckcKHgG7IVma75syORUme6wEItUvDA46aCZzGqrU,639
|
39
44
|
natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
|
40
|
-
natural_pdf/exporters/hocr.py,sha256=
|
45
|
+
natural_pdf/exporters/hocr.py,sha256=wksvJvWLSxuAfhYzg_0T2_W8eqDoMgAVC-gwZ9FoO_k,19969
|
41
46
|
natural_pdf/exporters/hocr_font.py,sha256=1wsGOMj6zoaRN2rxCwrv4MMLGawpNz984WgXpmWekgw,4574
|
42
|
-
natural_pdf/exporters/original_pdf.py,sha256=
|
47
|
+
natural_pdf/exporters/original_pdf.py,sha256=dtvC4er6TWOfqq-n24Pejw3mlAuPd8IVyihggJtcf0s,6634
|
43
48
|
natural_pdf/exporters/paddleocr.py,sha256=IAG2p9YeImYcsIvb6a_L5mMrKarvaMaDvRrvdlY6bX4,19489
|
44
49
|
natural_pdf/exporters/searchable_pdf.py,sha256=G2Tc4tpDXSYIufXJlkA8ppW_3DuzHAaweYKae33pI_c,16290
|
45
50
|
natural_pdf/exporters/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -58,16 +63,16 @@ natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,874
|
|
58
63
|
natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
|
59
64
|
natural_pdf/ocr/engine_easyocr.py,sha256=bWz6kHUgAJfe3rqdnZBAF-IPvw3B35DlvX5KDdFUtzo,9888
|
60
65
|
natural_pdf/ocr/engine_paddle.py,sha256=ZUtyjso_UjjAPnJt5ac-AtOpR6PfOhO76iOyjngGzr0,16198
|
61
|
-
natural_pdf/ocr/engine_surya.py,sha256=
|
66
|
+
natural_pdf/ocr/engine_surya.py,sha256=PNjvpsHnBghAoa-df52HEyvXzfNI-gTFgKvs2LxHgKo,5051
|
62
67
|
natural_pdf/ocr/ocr_factory.py,sha256=gBFXdFs7E4aCynHz06sQsAhaO3s8yhgoFgN5nyxtg9c,5221
|
63
|
-
natural_pdf/ocr/ocr_manager.py,sha256=
|
68
|
+
natural_pdf/ocr/ocr_manager.py,sha256=M1GRAThzWl5iMkQJ41j84G6cJ7XruQD_HoPPzWf7nUk,14742
|
64
69
|
natural_pdf/ocr/ocr_options.py,sha256=l33QKu_93r-uwi3t_v8UH8pEgHo6HTVzP4tfmQFRF1w,5488
|
65
70
|
natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
|
66
71
|
natural_pdf/qa/__init__.py,sha256=Pjo62JTnUNEjGNsC437mvsS5KQ5m7X_BibGvavR9AW0,108
|
67
72
|
natural_pdf/qa/document_qa.py,sha256=Jw4yyq3Vifn57D0ANmOfUlZeG8CJjBkItZBV-8ZAmos,15111
|
68
|
-
natural_pdf/search/__init__.py,sha256=
|
69
|
-
natural_pdf/search/lancedb_search_service.py,sha256=
|
70
|
-
natural_pdf/search/numpy_search_service.py,sha256=
|
73
|
+
natural_pdf/search/__init__.py,sha256=0Xa7tT_2q57wHObFMQLQLd4gd9AV0oyS-svV6BmmdMI,4276
|
74
|
+
natural_pdf/search/lancedb_search_service.py,sha256=6dz2IEZUWk3hFW28C-LF_85pWohd7Sr5k44bM0pBdm4,14472
|
75
|
+
natural_pdf/search/numpy_search_service.py,sha256=MoPBlyHTDqah1IrwBzyglEyiXlF4wqaU_5mml_ngvGc,10328
|
71
76
|
natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzPkK0a8QA,3566
|
72
77
|
natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
|
73
78
|
natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
|
@@ -85,8 +90,8 @@ natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9Y
|
|
85
90
|
natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
|
86
91
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
87
92
|
natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
|
88
|
-
natural_pdf-0.1.
|
89
|
-
natural_pdf-0.1.
|
90
|
-
natural_pdf-0.1.
|
91
|
-
natural_pdf-0.1.
|
92
|
-
natural_pdf-0.1.
|
93
|
+
natural_pdf-0.1.18.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
94
|
+
natural_pdf-0.1.18.dist-info/METADATA,sha256=aU8IC02yZuy1aUrHhtDCHEp5igjwaUGP1NDnFDsOTL8,6684
|
95
|
+
natural_pdf-0.1.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
96
|
+
natural_pdf-0.1.18.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
|
97
|
+
natural_pdf-0.1.18.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|