deepdoctection 0.34__py3-none-any.whl → 0.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +6 -10
- deepdoctection/analyzer/__init__.py +1 -0
- deepdoctection/analyzer/_config.py +150 -0
- deepdoctection/analyzer/dd.py +11 -335
- deepdoctection/analyzer/factory.py +522 -0
- deepdoctection/configs/conf_dd_one.yaml +1 -0
- deepdoctection/datapoint/annotation.py +1 -1
- deepdoctection/datapoint/convert.py +6 -4
- deepdoctection/datapoint/image.py +16 -6
- deepdoctection/datapoint/view.py +1 -0
- deepdoctection/extern/pdftext.py +96 -5
- deepdoctection/extern/tessocr.py +1 -0
- deepdoctection/utils/env_info.py +30 -1
- deepdoctection/utils/file_utils.py +19 -0
- deepdoctection/utils/metacfg.py +12 -0
- deepdoctection/utils/pdf_utils.py +86 -3
- {deepdoctection-0.34.dist-info → deepdoctection-0.35.dist-info}/METADATA +17 -11
- {deepdoctection-0.34.dist-info → deepdoctection-0.35.dist-info}/RECORD +21 -19
- {deepdoctection-0.34.dist-info → deepdoctection-0.35.dist-info}/WHEEL +1 -1
- {deepdoctection-0.34.dist-info → deepdoctection-0.35.dist-info}/LICENSE +0 -0
- {deepdoctection-0.34.dist-info → deepdoctection-0.35.dist-info}/top_level.txt +0 -0
deepdoctection/extern/pdftext.py
CHANGED
|
@@ -24,21 +24,25 @@ from typing import Optional
|
|
|
24
24
|
from lazy_imports import try_import
|
|
25
25
|
|
|
26
26
|
from ..utils.context import save_tmp_file
|
|
27
|
-
from ..utils.file_utils import get_pdfplumber_requirement
|
|
27
|
+
from ..utils.file_utils import get_pdfplumber_requirement, get_pypdfium2_requirement
|
|
28
28
|
from ..utils.settings import LayoutType, ObjectTypes
|
|
29
29
|
from ..utils.types import Requirement
|
|
30
30
|
from .base import DetectionResult, ModelCategories, PdfMiner
|
|
31
31
|
|
|
32
|
-
with try_import() as
|
|
32
|
+
with try_import() as pdfplumber_import_guard:
|
|
33
33
|
from pdfplumber.pdf import PDF, Page
|
|
34
34
|
|
|
35
|
+
with try_import() as pypdfmium_import_guard:
|
|
36
|
+
import pypdfium2.raw as pypdfium_c
|
|
37
|
+
from pypdfium2 import PdfDocument
|
|
35
38
|
|
|
36
|
-
|
|
39
|
+
|
|
40
|
+
def _to_detect_result(word: dict[str, str], class_name: ObjectTypes) -> DetectionResult:
|
|
37
41
|
return DetectionResult(
|
|
38
42
|
box=[float(word["x0"]), float(word["top"]), float(word["x1"]), float(word["bottom"])],
|
|
39
43
|
class_id=1,
|
|
40
44
|
text=word["text"],
|
|
41
|
-
class_name=
|
|
45
|
+
class_name=class_name,
|
|
42
46
|
)
|
|
43
47
|
|
|
44
48
|
|
|
@@ -49,6 +53,7 @@ class PdfPlumberTextDetector(PdfMiner):
|
|
|
49
53
|
|
|
50
54
|
pdf_plumber = PdfPlumberTextDetector()
|
|
51
55
|
df = SerializerPdfDoc.load("path/to/document.pdf")
|
|
56
|
+
df.reset_state()
|
|
52
57
|
|
|
53
58
|
for dp in df:
|
|
54
59
|
detection_results = pdf_plumber.predict(dp["pdf_bytes"])
|
|
@@ -61,6 +66,8 @@ class PdfPlumberTextDetector(PdfMiner):
|
|
|
61
66
|
pipe = DoctectionPipe([text_extract])
|
|
62
67
|
|
|
63
68
|
df = pipe.analyze(path="path/to/document.pdf")
|
|
69
|
+
df.reset_state()
|
|
70
|
+
|
|
64
71
|
for dp in df:
|
|
65
72
|
...
|
|
66
73
|
|
|
@@ -87,7 +94,7 @@ class PdfPlumberTextDetector(PdfMiner):
|
|
|
87
94
|
self._page = PDF(fin).pages[0]
|
|
88
95
|
self._pdf_bytes = pdf_bytes
|
|
89
96
|
words = self._page.extract_words(x_tolerance=self.x_tolerance, y_tolerance=self.y_tolerance)
|
|
90
|
-
detect_results =
|
|
97
|
+
detect_results = [_to_detect_result(word, self.get_category_names()[0]) for word in words]
|
|
91
98
|
return detect_results
|
|
92
99
|
|
|
93
100
|
@classmethod
|
|
@@ -113,3 +120,87 @@ class PdfPlumberTextDetector(PdfMiner):
|
|
|
113
120
|
|
|
114
121
|
def get_category_names(self) -> tuple[ObjectTypes, ...]:
|
|
115
122
|
return self.categories.get_categories(as_dict=False)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class Pdfmium2TextDetector(PdfMiner):
|
|
126
|
+
"""
|
|
127
|
+
Text miner based on the pypdfium2 engine. It will return text on text line level and not on word level
|
|
128
|
+
|
|
129
|
+
pdfmium2 = Pdfmium2TextDetector()
|
|
130
|
+
df = SerializerPdfDoc.load("path/to/document.pdf")
|
|
131
|
+
df.reset_state()
|
|
132
|
+
|
|
133
|
+
for dp in df:
|
|
134
|
+
detection_results = pdfmium2.predict(dp["pdf_bytes"])
|
|
135
|
+
|
|
136
|
+
To use it in a more integrated way:
|
|
137
|
+
|
|
138
|
+
pdfmium2 = Pdfmium2TextDetector()
|
|
139
|
+
text_extract = TextExtractionService(pdfmium2)
|
|
140
|
+
|
|
141
|
+
pipe = DoctectionPipe([text_extract])
|
|
142
|
+
|
|
143
|
+
df = pipe.analyze(path="path/to/document.pdf")
|
|
144
|
+
df.reset_state()
|
|
145
|
+
for dp in df:
|
|
146
|
+
...
|
|
147
|
+
|
|
148
|
+
"""
|
|
149
|
+
|
|
150
|
+
def __init__(self) -> None:
|
|
151
|
+
self.name = "Pdfmium"
|
|
152
|
+
self.model_id = self.get_model_id()
|
|
153
|
+
self.categories = ModelCategories(init_categories={1: LayoutType.LINE})
|
|
154
|
+
self._page: Optional[Page] = None
|
|
155
|
+
|
|
156
|
+
def predict(self, pdf_bytes: bytes) -> list[DetectionResult]:
|
|
157
|
+
"""
|
|
158
|
+
Calls pypdfium2 and returns the detected text as detection results
|
|
159
|
+
|
|
160
|
+
:param pdf_bytes: bytes of a single pdf page
|
|
161
|
+
:return: A list of DetectionResult
|
|
162
|
+
"""
|
|
163
|
+
|
|
164
|
+
pdf = PdfDocument(pdf_bytes)
|
|
165
|
+
page = pdf.get_page(0)
|
|
166
|
+
text = page.get_textpage()
|
|
167
|
+
words = []
|
|
168
|
+
height = page.get_height()
|
|
169
|
+
for obj in page.get_objects((pypdfium_c.FPDF_PAGEOBJ_TEXT,)):
|
|
170
|
+
box = obj.get_pos()
|
|
171
|
+
if all(x > 0 for x in box):
|
|
172
|
+
words.append(
|
|
173
|
+
{
|
|
174
|
+
"text": text.get_text_bounded(*box),
|
|
175
|
+
"x0": box[0],
|
|
176
|
+
"x1": box[2],
|
|
177
|
+
"top": height - box[3],
|
|
178
|
+
"bottom": height - box[1],
|
|
179
|
+
}
|
|
180
|
+
)
|
|
181
|
+
detect_results = [_to_detect_result(word, self.get_category_names()[0]) for word in words]
|
|
182
|
+
return detect_results
|
|
183
|
+
|
|
184
|
+
@classmethod
|
|
185
|
+
def get_requirements(cls) -> list[Requirement]:
|
|
186
|
+
return [get_pypdfium2_requirement()]
|
|
187
|
+
|
|
188
|
+
def get_width_height(self, pdf_bytes: bytes) -> tuple[float, float]:
|
|
189
|
+
"""
|
|
190
|
+
Get the width and height of the full page
|
|
191
|
+
:param pdf_bytes: pdf_bytes generating the pdf
|
|
192
|
+
:return: width and height
|
|
193
|
+
"""
|
|
194
|
+
|
|
195
|
+
if self._pdf_bytes == pdf_bytes and self._page is not None:
|
|
196
|
+
return self._page.bbox[2], self._page.bbox[3] # pylint: disable=E1101
|
|
197
|
+
# if the pdf bytes is not equal to the cached pdf, will recalculate values
|
|
198
|
+
pdf = PdfDocument(pdf_bytes)
|
|
199
|
+
self._page = pdf.get_page(0)
|
|
200
|
+
self._pdf_bytes = pdf_bytes
|
|
201
|
+
if self._page is not None:
|
|
202
|
+
return self._page.get_width(), self._page.get_height() # type: ignore
|
|
203
|
+
raise ValueError("Page not found")
|
|
204
|
+
|
|
205
|
+
def get_category_names(self) -> tuple[ObjectTypes, ...]:
|
|
206
|
+
return self.categories.get_categories(as_dict=False)
|
deepdoctection/extern/tessocr.py
CHANGED
|
@@ -421,6 +421,7 @@ class TesseractRotationTransformer(ImageTransformer):
|
|
|
421
421
|
def __init__(self) -> None:
|
|
422
422
|
self.name = fspath(_TESS_PATH) + "-rotation"
|
|
423
423
|
self.categories = ModelCategories(init_categories={1: PageType.ANGLE})
|
|
424
|
+
self.model_id = self.get_model_id()
|
|
424
425
|
|
|
425
426
|
def transform(self, np_img: PixelValues, specification: DetectionResult) -> PixelValues:
|
|
426
427
|
"""
|
deepdoctection/utils/env_info.py
CHANGED
|
@@ -20,6 +20,10 @@ Some useful function for collecting environment information.
|
|
|
20
20
|
|
|
21
21
|
This is also the place where we give an overview of the important environment variables.
|
|
22
22
|
|
|
23
|
+
For boolean environment variables, any of the following values is interpreted as true:
|
|
24
|
+
|
|
25
|
+
{"1", "True", "TRUE", "true", "yes"}
|
|
26
|
+
|
|
23
27
|
`USE_TENSORFLOW
|
|
24
28
|
USE_PYTORCH
|
|
25
29
|
USE_CUDA
|
|
@@ -35,6 +39,12 @@ decide what image processing library the `viz_handler` should use. The default l
|
|
|
35
39
|
to be installed separately. However, if both libraries have been detected `viz_handler` will opt for OpenCV.
|
|
36
40
|
Use the variables to let choose `viz_handler` according to your preferences.
|
|
37
41
|
|
|
42
|
+
`USE_DD_POPPLER
|
|
43
|
+
USE_DD_PDFIUM`
|
|
44
|
+
|
|
45
|
+
For PDF rendering we use PyPDFium2 as default but for legacy reasons, we also support Poppler. If you want to enforce
|
|
46
|
+
Poppler, set `USE_DD_POPPLER=True` and `USE_DD_PDFIUM=False`.
|
|
47
|
+
|
|
38
48
|
`HF_CREDENTIALS`
|
|
39
49
|
|
|
40
50
|
will be used by the `ModelDownloadManager` to pass your credentials if you have a model registered that resides in a
|
|
@@ -56,6 +66,7 @@ from typing import Optional
|
|
|
56
66
|
|
|
57
67
|
import numpy as np
|
|
58
68
|
from packaging import version
|
|
69
|
+
from pypdf.errors import DependencyError
|
|
59
70
|
from tabulate import tabulate
|
|
60
71
|
|
|
61
72
|
from .file_utils import (
|
|
@@ -75,6 +86,7 @@ from .file_utils import (
|
|
|
75
86
|
pdf_to_cairo_available,
|
|
76
87
|
pdf_to_ppm_available,
|
|
77
88
|
pdfplumber_available,
|
|
89
|
+
pypdfium2_available,
|
|
78
90
|
pytorch_available,
|
|
79
91
|
qpdf_available,
|
|
80
92
|
scipy_available,
|
|
@@ -88,7 +100,7 @@ from .file_utils import (
|
|
|
88
100
|
from .logger import LoggingRecord, logger
|
|
89
101
|
from .types import KeyValEnvInfos, PathLikeOrStr
|
|
90
102
|
|
|
91
|
-
__all__ = ["collect_env_info", "auto_select_viz_library", "ENV_VARS_TRUE"]
|
|
103
|
+
__all__ = ["collect_env_info", "auto_select_viz_library", "auto_select_pdf_render_framework", "ENV_VARS_TRUE"]
|
|
92
104
|
|
|
93
105
|
# pylint: disable=import-outside-toplevel
|
|
94
106
|
|
|
@@ -532,4 +544,21 @@ def auto_select_viz_library() -> None:
|
|
|
532
544
|
os.environ["USE_DD_OPENCV"] = "False"
|
|
533
545
|
|
|
534
546
|
|
|
547
|
+
def auto_select_pdf_render_framework() -> None:
|
|
548
|
+
"""Setting pdf2image as default pdf rendering library if pdfium is not installed"""
|
|
549
|
+
|
|
550
|
+
# if env variables are already set, don't change them
|
|
551
|
+
if os.environ.get("USE_DD_POPPLER") or os.environ.get("USE_DD_PDFIUM"):
|
|
552
|
+
return
|
|
553
|
+
if pypdfium2_available():
|
|
554
|
+
os.environ["USE_DD_POPPLER"] = "False"
|
|
555
|
+
os.environ["USE_DD_PDFIUM"] = "True"
|
|
556
|
+
return
|
|
557
|
+
if pdf_to_cairo_available() or pdf_to_ppm_available():
|
|
558
|
+
os.environ["USE_DD_POPPLER"] = "True"
|
|
559
|
+
os.environ["USE_DD_PDFIUM"] = "False"
|
|
560
|
+
return
|
|
561
|
+
raise DependencyError("No pdf rendering library found. Please install Poppler or pdfium.")
|
|
562
|
+
|
|
563
|
+
|
|
535
564
|
# pylint: enable=import-outside-toplevel
|
|
@@ -616,6 +616,25 @@ def get_pillow_requirement() -> Requirement:
|
|
|
616
616
|
return "pillow", pillow_available(), _PILLOW_ERR_MSG
|
|
617
617
|
|
|
618
618
|
|
|
619
|
+
# Pypdfium2
|
|
620
|
+
_PYPDFIUM2_AVAILABLE = importlib.util.find_spec("pypdfium2") is not None
|
|
621
|
+
_PYPDFIUM2_ERR_MSG = f"pypdfium2 must be installed. {_GENERIC_ERR_MSG}"
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def pypdfium2_available() -> bool:
|
|
625
|
+
"""
|
|
626
|
+
Returns True if pypdfium2 is installed
|
|
627
|
+
"""
|
|
628
|
+
return bool(_PYPDFIUM2_AVAILABLE)
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
def get_pypdfium2_requirement() -> Requirement:
|
|
632
|
+
"""
|
|
633
|
+
Return pypdfium2 requirement
|
|
634
|
+
"""
|
|
635
|
+
return "pypdfium2", pypdfium2_available(), _PYPDFIUM2_ERR_MSG
|
|
636
|
+
|
|
637
|
+
|
|
619
638
|
# SpaCy
|
|
620
639
|
_SPACY_AVAILABLE = importlib.util.find_spec("spacy") is not None
|
|
621
640
|
_SPACY_ERR_MSG = f"SpaCy must be installed. {_GENERIC_ERR_MSG}"
|
deepdoctection/utils/metacfg.py
CHANGED
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
"""
|
|
19
19
|
Class AttrDict for maintaining configs and some functions for generating and saving AttrDict instances to .yaml files
|
|
20
20
|
"""
|
|
21
|
+
from __future__ import annotations
|
|
21
22
|
|
|
22
23
|
import pprint
|
|
23
24
|
from typing import Any
|
|
@@ -105,6 +106,17 @@ class AttrDict:
|
|
|
105
106
|
v = eval(v) # pylint: disable=C0103, W0123
|
|
106
107
|
setattr(dic, key, v)
|
|
107
108
|
|
|
109
|
+
def overwrite_config(self, other_config: AttrDict) -> None:
|
|
110
|
+
"""
|
|
111
|
+
Overwrite the current config with values from another config.
|
|
112
|
+
|
|
113
|
+
:param other_config: The other AttrDict instance to copy values from.
|
|
114
|
+
:raises AttributeError: If a key from other_config is not an attribute of self.
|
|
115
|
+
"""
|
|
116
|
+
if self._freezed:
|
|
117
|
+
raise AttributeError("Config was freezed! Cannot overwrite config.")
|
|
118
|
+
self.from_dict(other_config.to_dict())
|
|
119
|
+
|
|
108
120
|
def freeze(self, freezed: bool = True) -> None:
|
|
109
121
|
"""
|
|
110
122
|
:param freezed: freeze the instance, so that no attributes can be added or changed
|
|
@@ -24,13 +24,16 @@ import subprocess
|
|
|
24
24
|
import sys
|
|
25
25
|
from errno import ENOENT
|
|
26
26
|
from io import BytesIO
|
|
27
|
+
from pathlib import Path
|
|
27
28
|
from shutil import copyfile
|
|
28
|
-
from typing import Generator, Optional
|
|
29
|
+
from typing import Generator, Literal, Optional
|
|
29
30
|
|
|
31
|
+
from lazy_imports import try_import
|
|
30
32
|
from numpy import uint8
|
|
31
33
|
from pypdf import PdfReader, PdfWriter, errors
|
|
32
34
|
|
|
33
35
|
from .context import save_tmp_file, timeout_manager
|
|
36
|
+
from .env_info import ENV_VARS_TRUE
|
|
34
37
|
from .error import DependencyError, FileExtensionError
|
|
35
38
|
from .file_utils import pdf_to_cairo_available, pdf_to_ppm_available, qpdf_available
|
|
36
39
|
from .logger import LoggingRecord, logger
|
|
@@ -38,7 +41,17 @@ from .types import PathLikeOrStr, PixelValues
|
|
|
38
41
|
from .utils import is_file_extension
|
|
39
42
|
from .viz import viz_handler
|
|
40
43
|
|
|
41
|
-
|
|
44
|
+
with try_import() as pt_import_guard:
|
|
45
|
+
import pypdfium2
|
|
46
|
+
|
|
47
|
+
__all__ = [
|
|
48
|
+
"decrypt_pdf_document",
|
|
49
|
+
"get_pdf_file_reader",
|
|
50
|
+
"get_pdf_file_writer",
|
|
51
|
+
"PDFStreamer",
|
|
52
|
+
"pdf_to_np_array",
|
|
53
|
+
"split_pdf",
|
|
54
|
+
]
|
|
42
55
|
|
|
43
56
|
|
|
44
57
|
def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
|
|
@@ -234,7 +247,7 @@ def _run_poppler(poppler_args: list[str]) -> None:
|
|
|
234
247
|
raise PopplerError(status=proc.returncode, message="Syntax Error: PDF cannot be read with Poppler")
|
|
235
248
|
|
|
236
249
|
|
|
237
|
-
def
|
|
250
|
+
def pdf_to_np_array_poppler(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
|
|
238
251
|
"""
|
|
239
252
|
Convert a single pdf page from its byte representation to a numpy array. This function will save the pdf as to a tmp
|
|
240
253
|
file and then call poppler via `pdftoppm` resp. `pdftocairo` if the former is not available.
|
|
@@ -250,3 +263,73 @@ def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dp
|
|
|
250
263
|
image = viz_handler.read_image(tmp_name + "-1.png")
|
|
251
264
|
|
|
252
265
|
return image.astype(uint8)
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: int = 200) -> PixelValues:
|
|
269
|
+
"""
|
|
270
|
+
Convert a single pdf page from its byte representation to a numpy array using pdfium.
|
|
271
|
+
|
|
272
|
+
:param pdf_bytes: Bytes representing the PDF file
|
|
273
|
+
:param dpi: Image quality in DPI/dots-per-inch (default 200)
|
|
274
|
+
:return: numpy array
|
|
275
|
+
"""
|
|
276
|
+
|
|
277
|
+
page = pypdfium2.PdfDocument(pdf_bytes)[0]
|
|
278
|
+
return page.render(scale=dpi * 1 / 72).to_numpy().astype(uint8)
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
|
|
282
|
+
"""
|
|
283
|
+
Convert a single pdf page from its byte representation to a numpy array. This function will either use Poppler or
|
|
284
|
+
pdfium to render the pdf.
|
|
285
|
+
|
|
286
|
+
:param pdf_bytes: Bytes representing the PDF file
|
|
287
|
+
:param size: Size of the resulting image(s), uses (width, height) standard
|
|
288
|
+
:param dpi: Image quality in DPI/dots-per-inch (default 200)
|
|
289
|
+
:return: numpy array
|
|
290
|
+
"""
|
|
291
|
+
if os.environ.get("USE_DD_PDFIUM", "False") in ENV_VARS_TRUE:
|
|
292
|
+
if size is not None:
|
|
293
|
+
logger.warning(
|
|
294
|
+
LoggingRecord(
|
|
295
|
+
f"pdf_to_np_array_pdfmium does not support the size parameter. Will use dpi = {dpi} instead."
|
|
296
|
+
)
|
|
297
|
+
)
|
|
298
|
+
return pdf_to_np_array_pdfmium(pdf_bytes, dpi)
|
|
299
|
+
return pdf_to_np_array_poppler(pdf_bytes, size, dpi)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def split_pdf(
|
|
303
|
+
pdf_path: PathLikeOrStr, output_dir: PathLikeOrStr, file_type: Literal["image", "pdf"], dpi: int = 200
|
|
304
|
+
) -> None:
|
|
305
|
+
"""
|
|
306
|
+
Split a pdf into single pages. The pages are saved as single pdf/png files in a subfolder of the output directory.
|
|
307
|
+
|
|
308
|
+
:param pdf_path: Path to the pdf file
|
|
309
|
+
:param output_dir: Path to the output directory
|
|
310
|
+
:param file_type: Type of the output file. Either "image" or "pdf"
|
|
311
|
+
:param dpi: Image quality in DPI/dots-per-inch (default 200)
|
|
312
|
+
"""
|
|
313
|
+
pdf_path = Path(pdf_path)
|
|
314
|
+
filename = pdf_path.stem
|
|
315
|
+
output_dir = Path(output_dir)
|
|
316
|
+
file_dir = output_dir / filename
|
|
317
|
+
if not file_dir.exists():
|
|
318
|
+
os.makedirs(file_dir)
|
|
319
|
+
|
|
320
|
+
with open(pdf_path, "rb") as file:
|
|
321
|
+
pdf = PdfReader(file)
|
|
322
|
+
for i, page in enumerate(pdf.pages):
|
|
323
|
+
writer = PdfWriter()
|
|
324
|
+
writer.add_page(page)
|
|
325
|
+
if file_type == ".pdf":
|
|
326
|
+
with open(file_dir / f"{filename}_{i}.pdf", "wb") as out:
|
|
327
|
+
writer.write(out)
|
|
328
|
+
writer.close()
|
|
329
|
+
else:
|
|
330
|
+
with BytesIO() as buffer:
|
|
331
|
+
writer.write(buffer)
|
|
332
|
+
buffer.seek(0)
|
|
333
|
+
np_image = pdf_to_np_array(buffer.getvalue(), dpi=dpi)
|
|
334
|
+
viz_handler.write_image(file_dir / f"{filename}_{i}.png", np_image)
|
|
335
|
+
writer.close()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.35
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -17,7 +17,7 @@ Requires-Python: >=3.9
|
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
19
|
Requires-Dist: catalogue ==2.0.10
|
|
20
|
-
Requires-Dist: huggingface-hub
|
|
20
|
+
Requires-Dist: huggingface-hub <0.26,>=0.12.0
|
|
21
21
|
Requires-Dist: importlib-metadata >=5.0.0
|
|
22
22
|
Requires-Dist: jsonlines ==3.1.0
|
|
23
23
|
Requires-Dist: lazy-imports ==0.3.1
|
|
@@ -27,6 +27,7 @@ Requires-Dist: numpy <2.0,>=1.21
|
|
|
27
27
|
Requires-Dist: packaging >=20.0
|
|
28
28
|
Requires-Dist: Pillow >=10.0.0
|
|
29
29
|
Requires-Dist: pypdf >=3.16.0
|
|
30
|
+
Requires-Dist: pypdfium2 >=4.30.0
|
|
30
31
|
Requires-Dist: pyyaml >=6.0.1
|
|
31
32
|
Requires-Dist: pyzmq >=16
|
|
32
33
|
Requires-Dist: scipy >=1.13.1
|
|
@@ -63,7 +64,7 @@ Requires-Dist: mkdocstrings-python ; extra == 'docs'
|
|
|
63
64
|
Requires-Dist: griffe ==0.25.0 ; extra == 'docs'
|
|
64
65
|
Provides-Extra: pt
|
|
65
66
|
Requires-Dist: catalogue ==2.0.10 ; extra == 'pt'
|
|
66
|
-
Requires-Dist: huggingface-hub
|
|
67
|
+
Requires-Dist: huggingface-hub <0.26,>=0.12.0 ; extra == 'pt'
|
|
67
68
|
Requires-Dist: importlib-metadata >=5.0.0 ; extra == 'pt'
|
|
68
69
|
Requires-Dist: jsonlines ==3.1.0 ; extra == 'pt'
|
|
69
70
|
Requires-Dist: lazy-imports ==0.3.1 ; extra == 'pt'
|
|
@@ -73,6 +74,7 @@ Requires-Dist: numpy <2.0,>=1.21 ; extra == 'pt'
|
|
|
73
74
|
Requires-Dist: packaging >=20.0 ; extra == 'pt'
|
|
74
75
|
Requires-Dist: Pillow >=10.0.0 ; extra == 'pt'
|
|
75
76
|
Requires-Dist: pypdf >=3.16.0 ; extra == 'pt'
|
|
77
|
+
Requires-Dist: pypdfium2 >=4.30.0 ; extra == 'pt'
|
|
76
78
|
Requires-Dist: pyyaml >=6.0.1 ; extra == 'pt'
|
|
77
79
|
Requires-Dist: pyzmq >=16 ; extra == 'pt'
|
|
78
80
|
Requires-Dist: scipy >=1.13.1 ; extra == 'pt'
|
|
@@ -95,7 +97,7 @@ Requires-Dist: pytest ==8.0.2 ; extra == 'test'
|
|
|
95
97
|
Requires-Dist: pytest-cov ; extra == 'test'
|
|
96
98
|
Provides-Extra: tf
|
|
97
99
|
Requires-Dist: catalogue ==2.0.10 ; extra == 'tf'
|
|
98
|
-
Requires-Dist: huggingface-hub
|
|
100
|
+
Requires-Dist: huggingface-hub <0.26,>=0.12.0 ; extra == 'tf'
|
|
99
101
|
Requires-Dist: importlib-metadata >=5.0.0 ; extra == 'tf'
|
|
100
102
|
Requires-Dist: jsonlines ==3.1.0 ; extra == 'tf'
|
|
101
103
|
Requires-Dist: lazy-imports ==0.3.1 ; extra == 'tf'
|
|
@@ -105,6 +107,7 @@ Requires-Dist: numpy <2.0,>=1.21 ; extra == 'tf'
|
|
|
105
107
|
Requires-Dist: packaging >=20.0 ; extra == 'tf'
|
|
106
108
|
Requires-Dist: Pillow >=10.0.0 ; extra == 'tf'
|
|
107
109
|
Requires-Dist: pypdf >=3.16.0 ; extra == 'tf'
|
|
110
|
+
Requires-Dist: pypdfium2 >=4.30.0 ; extra == 'tf'
|
|
108
111
|
Requires-Dist: pyyaml >=6.0.1 ; extra == 'tf'
|
|
109
112
|
Requires-Dist: pyzmq >=16 ; extra == 'tf'
|
|
110
113
|
Requires-Dist: scipy >=1.13.1 ; extra == 'tf'
|
|
@@ -172,9 +175,9 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
172
175
|
- Document layout analysis and table recognition now runs with
|
|
173
176
|
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
174
177
|
anymore for basic inference.
|
|
175
|
-
-
|
|
178
|
+
- More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
176
179
|
(not contained in the built-in Analyzer).
|
|
177
|
-
-
|
|
180
|
+
- Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
178
181
|
[**transformers**](https://github.com/huggingface/transformers).
|
|
179
182
|
We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
|
|
180
183
|
that seem to look promising, especially if you want to train a model on non-english data. The training script for
|
|
@@ -263,7 +266,7 @@ documentation.
|
|
|
263
266
|
|
|
264
267
|
## Requirements
|
|
265
268
|
|
|
266
|
-

|
|
267
270
|
|
|
268
271
|
Everything in the overview listed below the **deep**doctection layer are necessary requirements and have to be installed
|
|
269
272
|
separately.
|
|
@@ -272,13 +275,16 @@ separately.
|
|
|
272
275
|
- Python >= 3.9
|
|
273
276
|
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
274
277
|
In general, if you want to train or fine-tune models, a GPU is required.
|
|
275
|
-
|
|
276
|
-
images.
|
|
278
|
+
|
|
277
279
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
278
280
|
and [PyTorch](https://pytorch.org/get-started/locally/).
|
|
279
281
|
- [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
|
|
280
282
|
engine has to be installed separately.
|
|
281
283
|
|
|
284
|
+
|
|
285
|
+
- For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
|
|
286
|
+
documents into images. From release `v.0.35.0` onwards this dependency is optional.
|
|
287
|
+
|
|
282
288
|
The following overview shows the availability of the models in conjunction with the DL framework.
|
|
283
289
|
|
|
284
290
|
| Task | PyTorch | Torchscript | Tensorflow |
|
|
@@ -396,8 +402,8 @@ to develop this framework.
|
|
|
396
402
|
## Problems
|
|
397
403
|
|
|
398
404
|
We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
|
|
399
|
-
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every
|
|
400
|
-
to
|
|
405
|
+
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
|
|
406
|
+
to 12 weeks.
|
|
401
407
|
|
|
402
408
|
## If you like **deep**doctection ...
|
|
403
409
|
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
deepdoctection/__init__.py,sha256=
|
|
1
|
+
deepdoctection/__init__.py,sha256=RZpawNRTJPKNPFuONawVOsYWdr-rI8PPNXZhlPtOKtc,12580
|
|
2
2
|
deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
deepdoctection/analyzer/__init__.py,sha256=
|
|
4
|
-
deepdoctection/analyzer/
|
|
3
|
+
deepdoctection/analyzer/__init__.py,sha256=icClxrd20XutD6LxLgEPIWceSs4j_QfI3szCE-9BL2w,729
|
|
4
|
+
deepdoctection/analyzer/_config.py,sha256=0cWtaI2e3jHNhufHZAqMje0YTTDAogKAHVl4VpYojAo,4874
|
|
5
|
+
deepdoctection/analyzer/dd.py,sha256=DUOhOtwipHw5nabYqn3WGR9aZcgP0ma_bi_tjf9xscw,5973
|
|
6
|
+
deepdoctection/analyzer/factory.py,sha256=T9jxtVLNFhocbsfWIGLPfFrEv21zQJzM6VdFt0yxMyg,23849
|
|
5
7
|
deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
|
|
6
|
-
deepdoctection/configs/conf_dd_one.yaml,sha256=
|
|
8
|
+
deepdoctection/configs/conf_dd_one.yaml,sha256=orP-oeqtWbz5S9FJZJKxy1UqMwOYjL9g0DOX-wbamqU,2239
|
|
7
9
|
deepdoctection/configs/conf_tesseract.yaml,sha256=oF6szDyoi15FHvq7yFUNIEjfA_jNLhGxoowiRsz_zY4,35
|
|
8
10
|
deepdoctection/dataflow/__init__.py,sha256=CWRHMpmJaPk4xY_oIIFubCt-z11SguWrMWxHZ7rdrvY,845
|
|
9
11
|
deepdoctection/dataflow/base.py,sha256=z4DCComSj5wStEPjtk0093cNNGfUMiDqx8dqz36nS_o,6221
|
|
@@ -14,11 +16,11 @@ deepdoctection/dataflow/parallel_map.py,sha256=8FhxJBWV-kjJrJ27jQtP3yYF6Ev6rz98w
|
|
|
14
16
|
deepdoctection/dataflow/serialize.py,sha256=4pYC7m9h53JCu99waVeKpHDpsCDDdYCrSZpP2QYSsgs,4555
|
|
15
17
|
deepdoctection/dataflow/stats.py,sha256=Bsr6v7lcesKXUYtO9wjqlzx_Yq_uyIF3Lel-tQ0i4wI,9619
|
|
16
18
|
deepdoctection/datapoint/__init__.py,sha256=3K406GbOPhoEp8koVaSbMocmSsmWifnSZ1SPb7C1lOY,1643
|
|
17
|
-
deepdoctection/datapoint/annotation.py,sha256=
|
|
19
|
+
deepdoctection/datapoint/annotation.py,sha256=FEgz4COxVDfjic0gG7kS6iHnWLBIgFnquQ63Cbj2a4Y,22531
|
|
18
20
|
deepdoctection/datapoint/box.py,sha256=tkFuVM6xfx2jL7W4UED4qHXV572LSRdIsVJbrEiyIxI,23524
|
|
19
|
-
deepdoctection/datapoint/convert.py,sha256=
|
|
20
|
-
deepdoctection/datapoint/image.py,sha256=
|
|
21
|
-
deepdoctection/datapoint/view.py,sha256=
|
|
21
|
+
deepdoctection/datapoint/convert.py,sha256=Gw2IjNiEotPu1yuMZqrIYB0mCAwafKt-VgMnrHj6S7U,6808
|
|
22
|
+
deepdoctection/datapoint/image.py,sha256=EvZlVwJjMAcL1z8RNPBvZ8fwdJvkGuGpcFxCP1y26Go,33045
|
|
23
|
+
deepdoctection/datapoint/view.py,sha256=7qSX4DQw9OPQQSKfSjV8e5i6jLyu6hOMceSKJAob2N8,42154
|
|
22
24
|
deepdoctection/datasets/__init__.py,sha256=-A3aR90aDsHPmVM35JavfnQ2itYSCn3ujl4krRni1QU,1076
|
|
23
25
|
deepdoctection/datasets/adapter.py,sha256=Ly_vbOAgVI73V41FUccnSX1ECTOyesW_qsuvQuvOZbw,7796
|
|
24
26
|
deepdoctection/datasets/base.py,sha256=DT4i-d74sIEiUNC6UspIHNJuHSK0t1dBv7qwadg4rLw,22341
|
|
@@ -57,8 +59,8 @@ deepdoctection/extern/hfdetr.py,sha256=1NPW_u5eH2tP3ixZ91l4WR-O-wLVcrFsLWA7BqID0
|
|
|
57
59
|
deepdoctection/extern/hflayoutlm.py,sha256=KfoWx9_Rpa1Y2L51HLrYvenfWaTB4SVTmVJH00Cqb-s,56510
|
|
58
60
|
deepdoctection/extern/hflm.py,sha256=kwS6kcSlY_2m9u0RzBLTRq-UMM7c1PhyUaDTvSdejus,9217
|
|
59
61
|
deepdoctection/extern/model.py,sha256=ViHHKPvbGmLCPw7ZESv_rmjlkA90UiBU6oZiHOMqNSw,59869
|
|
60
|
-
deepdoctection/extern/pdftext.py,sha256=
|
|
61
|
-
deepdoctection/extern/tessocr.py,sha256=
|
|
62
|
+
deepdoctection/extern/pdftext.py,sha256=KS_t27SUiYn_IOS_J2lF9lSSo22vLagxmxvYCY3CqXA,7228
|
|
63
|
+
deepdoctection/extern/tessocr.py,sha256=tG7etMvZ-jHFdq-jJAHYMJii3ujDjMfAFYUsjBp3nKI,17444
|
|
62
64
|
deepdoctection/extern/texocr.py,sha256=yMt5ZzKtsjd7ogrcNXba7zccGGGF9LXK194EtER6YNQ,5804
|
|
63
65
|
deepdoctection/extern/tpdetect.py,sha256=yAk1duQdoX-_pHLHgvhU7OOSiDy863q6XUMpjpYR734,8477
|
|
64
66
|
deepdoctection/extern/pt/__init__.py,sha256=3Cu0ZHjbYsJomru7-RQXEHihEQLegZrmLetlHiqS58I,742
|
|
@@ -124,23 +126,23 @@ deepdoctection/utils/__init__.py,sha256=brBceRWeov9WXMiJTjyJOF2rHMP8trGGRRjhMdZ6
|
|
|
124
126
|
deepdoctection/utils/concurrency.py,sha256=nIhpkSncmv0LBB8PtcOLY-BsRGlfcDpz7foVdgzZd20,4598
|
|
125
127
|
deepdoctection/utils/context.py,sha256=VSnJnTtRGuq3w-0-syTf9DXOhR7WsPvWLLWTxKIBYec,4186
|
|
126
128
|
deepdoctection/utils/develop.py,sha256=4HyTarkFbJwctL-Hgu1TU_LSJppHvaroDbcyHsxhIA8,3444
|
|
127
|
-
deepdoctection/utils/env_info.py,sha256=
|
|
129
|
+
deepdoctection/utils/env_info.py,sha256=TnCA-LOTj4WIHd9yvn1AaoPWsLmPgc42l-BJmGV6zmM,19147
|
|
128
130
|
deepdoctection/utils/error.py,sha256=_3q9VepKfEhsM3H033_Fu0hwBzMSjsWALsjyJbGAZr8,2367
|
|
129
|
-
deepdoctection/utils/file_utils.py,sha256=
|
|
131
|
+
deepdoctection/utils/file_utils.py,sha256=IRElrcND0YEiU1QELw5hfXeNA39uE2_nyzh9-X7YcxI,19477
|
|
130
132
|
deepdoctection/utils/fs.py,sha256=C4ktrzjoVtX9kgycv5YrEigDI9byi65b6_D0aKsGM4Y,10161
|
|
131
133
|
deepdoctection/utils/identifier.py,sha256=QkNaGGqPynHwDPnd3_m8iur4Cv64rcQa7qolCE7Qphk,2159
|
|
132
134
|
deepdoctection/utils/logger.py,sha256=J0OVKiXP_2A82MWbbJoOeMEJ-75aZu5npgaS_yI6mVA,10003
|
|
133
|
-
deepdoctection/utils/metacfg.py,sha256=
|
|
135
|
+
deepdoctection/utils/metacfg.py,sha256=hD76KQ_RnD_5B02qLI2Zxf3WfnsnXhEI_KUTKpw91RI,5711
|
|
134
136
|
deepdoctection/utils/mocks.py,sha256=IkN3-IzAl4eX0ibgKIHg8IY7ykVw6BnpF6XnxKnKaZI,2389
|
|
135
|
-
deepdoctection/utils/pdf_utils.py,sha256=
|
|
137
|
+
deepdoctection/utils/pdf_utils.py,sha256=OAQjE9xHVNcDsFqAvX47Lu-mgmoMpVXqIf5pOK8AwxY,11595
|
|
136
138
|
deepdoctection/utils/settings.py,sha256=k6OyuWbj-IPeaO9zT9RZ-5Yad1wNhWGYqGLZdtgXAZY,12464
|
|
137
139
|
deepdoctection/utils/tqdm.py,sha256=cBUtR0L1x0KMeYrLP2rrzyzCamCjpQAKroHXLv81_pk,1820
|
|
138
140
|
deepdoctection/utils/transform.py,sha256=3kCgsEeRkG1efCdkfvj7tUFMs-e2jbjbflq826F2GPU,8502
|
|
139
141
|
deepdoctection/utils/types.py,sha256=_3dmPdCIZNLbgU5QP5k_c5phDf18xLe1kYL6t2nM45s,2953
|
|
140
142
|
deepdoctection/utils/utils.py,sha256=ANzyIX6AY1yc-4gcn6yxksV84sPrJDaUurUNVatAFu8,5168
|
|
141
143
|
deepdoctection/utils/viz.py,sha256=Xm6pKlhM29UWBBGZHlWFl9XYFDAqaYDdwHXwe26Hvqo,25728
|
|
142
|
-
deepdoctection-0.
|
|
143
|
-
deepdoctection-0.
|
|
144
|
-
deepdoctection-0.
|
|
145
|
-
deepdoctection-0.
|
|
146
|
-
deepdoctection-0.
|
|
144
|
+
deepdoctection-0.35.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
|
|
145
|
+
deepdoctection-0.35.dist-info/METADATA,sha256=B6pPQjRYWcqd1p-3ul3PhflYOcKq2ZpP5D-i8kr7qgk,19403
|
|
146
|
+
deepdoctection-0.35.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
|
147
|
+
deepdoctection-0.35.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
|
|
148
|
+
deepdoctection-0.35.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|