deepdoctection 0.34__py3-none-any.whl → 0.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +7 -14
- deepdoctection/analyzer/__init__.py +1 -0
- deepdoctection/analyzer/_config.py +142 -0
- deepdoctection/analyzer/dd.py +11 -335
- deepdoctection/analyzer/factory.py +718 -0
- deepdoctection/configs/conf_dd_one.yaml +5 -0
- deepdoctection/datapoint/annotation.py +1 -1
- deepdoctection/datapoint/convert.py +6 -4
- deepdoctection/datapoint/image.py +16 -6
- deepdoctection/datapoint/view.py +91 -15
- deepdoctection/eval/cocometric.py +59 -13
- deepdoctection/extern/pdftext.py +96 -5
- deepdoctection/extern/tessocr.py +1 -0
- deepdoctection/mapper/match.py +4 -2
- deepdoctection/utils/env_info.py +30 -1
- deepdoctection/utils/file_utils.py +19 -0
- deepdoctection/utils/metacfg.py +12 -0
- deepdoctection/utils/pdf_utils.py +86 -3
- deepdoctection/utils/utils.py +39 -0
- deepdoctection/utils/viz.py +16 -13
- {deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/METADATA +126 -116
- {deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/RECORD +25 -23
- {deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/WHEEL +1 -1
- {deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/LICENSE +0 -0
- {deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/top_level.txt +0 -0
deepdoctection/utils/env_info.py
CHANGED
|
@@ -20,6 +20,10 @@ Some useful function for collecting environment information.
|
|
|
20
20
|
|
|
21
21
|
This is also the place where we give an overview of the important environment variables.
|
|
22
22
|
|
|
23
|
+
For env variables with boolean character, use one of the following values:
|
|
24
|
+
|
|
25
|
+
{"1", "True", "TRUE", "true", "yes"}
|
|
26
|
+
|
|
23
27
|
`USE_TENSORFLOW
|
|
24
28
|
USE_PYTORCH
|
|
25
29
|
USE_CUDA
|
|
@@ -35,6 +39,12 @@ decide what image processing library the `viz_handler` should use. The default l
|
|
|
35
39
|
to be installed separately. However, if both libraries have been detected `viz_handler` will opt for OpenCV.
|
|
36
40
|
Use the variables to let `viz_handler` choose according to your preferences.
|
|
37
41
|
|
|
42
|
+
`USE_DD_POPPLER
|
|
43
|
+
USE_DD_PDFIUM`
|
|
44
|
+
|
|
45
|
+
For PDF rendering we use PyPDFium2 as default but for legacy reasons, we also support Poppler. If you want to enforce
|
|
46
|
+
Poppler, set `USE_DD_POPPLER=True` and `USE_DD_PDFIUM=False`.
|
|
47
|
+
|
|
38
48
|
`HF_CREDENTIALS`
|
|
39
49
|
|
|
40
50
|
will be used by the `ModelDownloadManager` to pass your credentials if you have a model registered that resides in a
|
|
@@ -56,6 +66,7 @@ from typing import Optional
|
|
|
56
66
|
|
|
57
67
|
import numpy as np
|
|
58
68
|
from packaging import version
|
|
69
|
+
from pypdf.errors import DependencyError
|
|
59
70
|
from tabulate import tabulate
|
|
60
71
|
|
|
61
72
|
from .file_utils import (
|
|
@@ -75,6 +86,7 @@ from .file_utils import (
|
|
|
75
86
|
pdf_to_cairo_available,
|
|
76
87
|
pdf_to_ppm_available,
|
|
77
88
|
pdfplumber_available,
|
|
89
|
+
pypdfium2_available,
|
|
78
90
|
pytorch_available,
|
|
79
91
|
qpdf_available,
|
|
80
92
|
scipy_available,
|
|
@@ -88,7 +100,7 @@ from .file_utils import (
|
|
|
88
100
|
from .logger import LoggingRecord, logger
|
|
89
101
|
from .types import KeyValEnvInfos, PathLikeOrStr
|
|
90
102
|
|
|
91
|
-
__all__ = ["collect_env_info", "auto_select_viz_library", "ENV_VARS_TRUE"]
|
|
103
|
+
__all__ = ["collect_env_info", "auto_select_viz_library", "auto_select_pdf_render_framework", "ENV_VARS_TRUE"]
|
|
92
104
|
|
|
93
105
|
# pylint: disable=import-outside-toplevel
|
|
94
106
|
|
|
@@ -532,4 +544,21 @@ def auto_select_viz_library() -> None:
|
|
|
532
544
|
os.environ["USE_DD_OPENCV"] = "False"
|
|
533
545
|
|
|
534
546
|
|
|
547
|
+
def auto_select_pdf_render_framework() -> None:
    """
    Select the PDF rendering backend by setting the environment variables `USE_DD_PDFIUM` and `USE_DD_POPPLER`.

    If at least one of the two variables already holds a non-empty value, nothing is changed. Otherwise pdfium is
    selected when `pypdfium2` is available, and Poppler is selected when `pdftocairo` or `pdftoppm` is available.

    :raises DependencyError: If neither pypdfium2 nor a Poppler executable has been found.
    """
    env = os.environ

    # A non-empty value in either variable counts as an explicit user choice and is left untouched
    if env.get("USE_DD_POPPLER") or env.get("USE_DD_PDFIUM"):
        return

    # pdfium takes precedence; the Poppler executables are only probed when pypdfium2 is missing
    pdfium_found = pypdfium2_available()
    if not pdfium_found and not (pdf_to_cairo_available() or pdf_to_ppm_available()):
        raise DependencyError("No pdf rendering library found. Please install Poppler or pdfium.")

    poppler_flag, pdfium_flag = ("False", "True") if pdfium_found else ("True", "False")
    env["USE_DD_POPPLER"] = poppler_flag
    env["USE_DD_PDFIUM"] = pdfium_flag
|
|
562
|
+
|
|
563
|
+
|
|
535
564
|
# pylint: enable=import-outside-toplevel
|
|
@@ -616,6 +616,25 @@ def get_pillow_requirement() -> Requirement:
|
|
|
616
616
|
return "pillow", pillow_available(), _PILLOW_ERR_MSG
|
|
617
617
|
|
|
618
618
|
|
|
619
|
+
# Pypdfium2
|
|
620
|
+
_PYPDFIUM2_AVAILABLE = importlib.util.find_spec("pypdfium2") is not None
|
|
621
|
+
_PYPDFIUM2_ERR_MSG = f"pypdfium2 must be installed. {_GENERIC_ERR_MSG}"
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def pypdfium2_available() -> bool:
    """
    Report whether pypdfium2 has been detected.

    :return: True if the module level flag `_PYPDFIUM2_AVAILABLE` is truthy, False otherwise
    """
    return True if _PYPDFIUM2_AVAILABLE else False
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
def get_pypdfium2_requirement() -> Requirement:
    """
    Build the requirement entry for pypdfium2.

    :return: A tuple with the package name, the result of `pypdfium2_available()` and the error message to show
             when the package is missing
    """
    is_installed = pypdfium2_available()
    return "pypdfium2", is_installed, _PYPDFIUM2_ERR_MSG
|
|
636
|
+
|
|
637
|
+
|
|
619
638
|
# SpaCy
|
|
620
639
|
_SPACY_AVAILABLE = importlib.util.find_spec("spacy") is not None
|
|
621
640
|
_SPACY_ERR_MSG = f"SpaCy must be installed. {_GENERIC_ERR_MSG}"
|
deepdoctection/utils/metacfg.py
CHANGED
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
"""
|
|
19
19
|
Class AttrDict for maintaining configs and some functions for generating and saving AttrDict instances to .yaml files
|
|
20
20
|
"""
|
|
21
|
+
from __future__ import annotations
|
|
21
22
|
|
|
22
23
|
import pprint
|
|
23
24
|
from typing import Any
|
|
@@ -105,6 +106,17 @@ class AttrDict:
|
|
|
105
106
|
v = eval(v) # pylint: disable=C0103, W0123
|
|
106
107
|
setattr(dic, key, v)
|
|
107
108
|
|
|
109
|
+
def overwrite_config(self, other_config: AttrDict) -> None:
    """
    Take over the values of another config.

    The other config is converted with `to_dict()` and the result is handed to `from_dict()` of this instance.
    NOTE(review): how keys that do not yet exist on this instance are treated depends on `from_dict` -- confirm.

    :param other_config: The other AttrDict instance to copy values from.
    :raises AttributeError: If this config has been frozen.
    """
    if not self._freezed:
        self.from_dict(other_config.to_dict())
        return
    raise AttributeError("Config was freezed! Cannot overwrite config.")
|
|
119
|
+
|
|
108
120
|
def freeze(self, freezed: bool = True) -> None:
|
|
109
121
|
"""
|
|
110
122
|
:param freezed: freeze the instance, so that no attributes can be added or changed
|
|
@@ -24,13 +24,16 @@ import subprocess
|
|
|
24
24
|
import sys
|
|
25
25
|
from errno import ENOENT
|
|
26
26
|
from io import BytesIO
|
|
27
|
+
from pathlib import Path
|
|
27
28
|
from shutil import copyfile
|
|
28
|
-
from typing import Generator, Optional
|
|
29
|
+
from typing import Generator, Literal, Optional
|
|
29
30
|
|
|
31
|
+
from lazy_imports import try_import
|
|
30
32
|
from numpy import uint8
|
|
31
33
|
from pypdf import PdfReader, PdfWriter, errors
|
|
32
34
|
|
|
33
35
|
from .context import save_tmp_file, timeout_manager
|
|
36
|
+
from .env_info import ENV_VARS_TRUE
|
|
34
37
|
from .error import DependencyError, FileExtensionError
|
|
35
38
|
from .file_utils import pdf_to_cairo_available, pdf_to_ppm_available, qpdf_available
|
|
36
39
|
from .logger import LoggingRecord, logger
|
|
@@ -38,7 +41,17 @@ from .types import PathLikeOrStr, PixelValues
|
|
|
38
41
|
from .utils import is_file_extension
|
|
39
42
|
from .viz import viz_handler
|
|
40
43
|
|
|
41
|
-
|
|
44
|
+
with try_import() as pt_import_guard:
|
|
45
|
+
import pypdfium2
|
|
46
|
+
|
|
47
|
+
__all__ = [
|
|
48
|
+
"decrypt_pdf_document",
|
|
49
|
+
"get_pdf_file_reader",
|
|
50
|
+
"get_pdf_file_writer",
|
|
51
|
+
"PDFStreamer",
|
|
52
|
+
"pdf_to_np_array",
|
|
53
|
+
"split_pdf",
|
|
54
|
+
]
|
|
42
55
|
|
|
43
56
|
|
|
44
57
|
def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
|
|
@@ -234,7 +247,7 @@ def _run_poppler(poppler_args: list[str]) -> None:
|
|
|
234
247
|
raise PopplerError(status=proc.returncode, message="Syntax Error: PDF cannot be read with Poppler")
|
|
235
248
|
|
|
236
249
|
|
|
237
|
-
def
|
|
250
|
+
def pdf_to_np_array_poppler(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
|
|
238
251
|
"""
|
|
239
252
|
Convert a single pdf page from its byte representation to a numpy array. This function will save the pdf as to a tmp
|
|
240
253
|
file and then call poppler via `pdftoppm` resp. `pdftocairo` if the former is not available.
|
|
@@ -250,3 +263,73 @@ def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dp
|
|
|
250
263
|
image = viz_handler.read_image(tmp_name + "-1.png")
|
|
251
264
|
|
|
252
265
|
return image.astype(uint8)
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: int = 200) -> PixelValues:
    """
    Render the first page of a pdf, given as bytes, into a numpy array using pdfium.

    :param pdf_bytes: Bytes representing the PDF file. Only the page at index 0 is rendered.
    :param dpi: Image quality in DPI/dots-per-inch (default 200)
    :return: numpy array of dtype uint8
    """

    document = pypdfium2.PdfDocument(pdf_bytes)
    first_page = document[0]
    # pypdfium2 expects a scale factor rather than a DPI value; dpi / 72 is the documented conversion
    bitmap = first_page.render(scale=dpi / 72)
    return bitmap.to_numpy().astype(uint8)
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
    """
    Convert a single pdf page from its byte representation to a numpy array. The renderer is chosen through the
    environment variable `USE_DD_PDFIUM`: if its value is one of `ENV_VARS_TRUE`, pdfium is used, otherwise Poppler.

    :param pdf_bytes: Bytes representing the PDF file
    :param size: Size of the resulting image(s), uses (width, height) standard. Only passed on to the Poppler
                 renderer; with pdfium a warning is logged and `dpi` determines the resolution.
    :param dpi: Image quality in DPI/dots-per-inch (default 200)
    :return: numpy array
    """
    pdfium_selected = os.environ.get("USE_DD_PDFIUM", "False") in ENV_VARS_TRUE
    if not pdfium_selected:
        return pdf_to_np_array_poppler(pdf_bytes, size, dpi)

    if size is not None:
        # The pdfium renderer has no size argument, so tell the caller that the value is dropped
        message = f"pdf_to_np_array_pdfmium does not support the size parameter. Will use dpi = {dpi} instead."
        logger.warning(LoggingRecord(message))
    return pdf_to_np_array_pdfmium(pdf_bytes, dpi)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def split_pdf(
    pdf_path: PathLikeOrStr, output_dir: PathLikeOrStr, file_type: Literal["image", "pdf"], dpi: int = 200
) -> None:
    """
    Split a pdf into single pages. The pages are saved as single pdf/png files in a subfolder of the output directory.
    The subfolder is named after the stem of the pdf file; pages are written as `<stem>_<i>.pdf` resp.
    `<stem>_<i>.png`, where `i` is the zero-based page index.

    :param pdf_path: Path to the pdf file
    :param output_dir: Path to the output directory
    :param file_type: Type of the output file. Either "image" or "pdf". The legacy spelling ".pdf" is accepted as
                      well; every other value produces png images.
    :param dpi: Image quality in DPI/dots-per-inch (default 200). Only used when pages are saved as images.
    """
    pdf_path = Path(pdf_path)
    filename = pdf_path.stem
    file_dir = Path(output_dir) / filename
    # Creates missing parents as well and does not fail if the folder already exists
    file_dir.mkdir(parents=True, exist_ok=True)

    # The signature documents "pdf", but earlier code only matched ".pdf", so "pdf" silently produced images.
    # Both spellings are accepted so that callers who relied on ".pdf" keep working.
    write_pdf = file_type in ("pdf", ".pdf")

    with open(pdf_path, "rb") as file:
        pdf = PdfReader(file)
        for i, page in enumerate(pdf.pages):
            writer = PdfWriter()
            try:
                writer.add_page(page)
                if write_pdf:
                    with open(file_dir / f"{filename}_{i}.pdf", "wb") as out:
                        writer.write(out)
                else:
                    # Serialize the single page in memory and render it with the configured pdf backend
                    with BytesIO() as buffer:
                        writer.write(buffer)
                        np_image = pdf_to_np_array(buffer.getvalue(), dpi=dpi)
                    viz_handler.write_image(file_dir / f"{filename}_{i}.png", np_image)
            finally:
                # Release the writer even if writing or rendering the page failed
                writer.close()
|
deepdoctection/utils/utils.py
CHANGED
|
@@ -155,3 +155,42 @@ def is_file_extension(file_name: PathLikeOrStr, extension: Union[str, Sequence[s
|
|
|
155
155
|
if isinstance(extension, str):
|
|
156
156
|
return os.path.splitext(file_name)[-1].lower() == extension
|
|
157
157
|
return os.path.splitext(file_name)[-1].lower() in extension
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def partition_list(base_list: list[str], stop_value: str) -> list[list[str]]:
    """
    Split a list of strings into consecutive sublists. A new sublist is opened whenever the stop value is met after
    an element that is not the stop value, so a run of consecutive stop values stays in one sublist. Elements in
    front of the first stop value form a sublist of their own.

    :param base_list: The list of strings to be partitioned.
    :param stop_value: The string value that indicates the start of a new partition.
    :return: A list of lists, where each sublist is a partition of the original list.

    ** Example:**

        strings = ['a', 'a', 'c', 'c', 'b', 'd', 'c', 'c', 'a', 'b', 'a', 'b', 'a', 'a']
        stop_string = 'a'
        partition_list(strings, stop_string)

        # Output [['a', 'a', 'c', 'c', 'b', 'd', 'c', 'c'], ['a', 'b'], ['a', 'b'], ['a', 'a']]
    """

    chunks: list[list[str]] = []
    previous_was_stop = False

    for item in base_list:
        is_stop = item == stop_value
        # The very first element always opens a chunk; afterwards only the head of a run of stop values does
        if not chunks or (is_stop and not previous_was_stop):
            chunks.append([item])
        else:
            chunks[-1].append(item)
        previous_was_stop = is_stop

    return chunks
|
deepdoctection/utils/viz.py
CHANGED
|
@@ -205,6 +205,7 @@ def draw_boxes(
|
|
|
205
205
|
font_scale: float = 1.0,
|
|
206
206
|
rectangle_thickness: int = 4,
|
|
207
207
|
box_color_by_category: bool = True,
|
|
208
|
+
show_palette: bool = True,
|
|
208
209
|
) -> PixelValues:
|
|
209
210
|
"""
|
|
210
211
|
Draw bounding boxes with category names into image.
|
|
@@ -216,6 +217,7 @@ def draw_boxes(
|
|
|
216
217
|
:param font_scale: Font scale of text box
|
|
217
218
|
:param rectangle_thickness: Thickness of bounding box
|
|
218
219
|
:param box_color_by_category:
|
|
220
|
+
:param show_palette: Whether to show a color palette of the categories
|
|
219
221
|
:return: A new image np.ndarray
|
|
220
222
|
"""
|
|
221
223
|
if color is not None:
|
|
@@ -261,19 +263,20 @@ def draw_boxes(
|
|
|
261
263
|
)
|
|
262
264
|
|
|
263
265
|
# draw a (very ugly) color palette
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
np_image
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
266
|
+
if show_palette:
|
|
267
|
+
y_0 = np_image.shape[0]
|
|
268
|
+
for category, col in category_to_color.items():
|
|
269
|
+
if category is not None:
|
|
270
|
+
np_image = viz_handler.draw_text(
|
|
271
|
+
np_image,
|
|
272
|
+
(np_image.shape[1], y_0),
|
|
273
|
+
category,
|
|
274
|
+
color=col,
|
|
275
|
+
font_scale=font_scale,
|
|
276
|
+
rectangle_thickness=rectangle_thickness,
|
|
277
|
+
)
|
|
278
|
+
_, text_h = viz_handler.get_text_size(category, font_scale * 2)
|
|
279
|
+
y_0 = y_0 - int(1 * text_h)
|
|
277
280
|
|
|
278
281
|
return np_image
|
|
279
282
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.36
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -16,114 +16,117 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
16
16
|
Requires-Python: >=3.9
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
|
-
Requires-Dist: catalogue
|
|
20
|
-
Requires-Dist:
|
|
21
|
-
Requires-Dist: importlib-metadata
|
|
22
|
-
Requires-Dist: jsonlines
|
|
23
|
-
Requires-Dist: lazy-imports
|
|
24
|
-
Requires-Dist: mock
|
|
25
|
-
Requires-Dist: networkx
|
|
26
|
-
Requires-Dist: numpy
|
|
27
|
-
Requires-Dist: packaging
|
|
28
|
-
Requires-Dist: Pillow
|
|
29
|
-
Requires-Dist: pypdf
|
|
30
|
-
Requires-Dist:
|
|
31
|
-
Requires-Dist:
|
|
32
|
-
Requires-Dist:
|
|
33
|
-
Requires-Dist:
|
|
34
|
-
Requires-Dist:
|
|
35
|
-
Requires-Dist:
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
Requires-Dist:
|
|
39
|
-
Requires-Dist:
|
|
40
|
-
Requires-Dist:
|
|
41
|
-
Requires-Dist:
|
|
42
|
-
Requires-Dist:
|
|
43
|
-
Requires-Dist:
|
|
44
|
-
Requires-Dist:
|
|
45
|
-
Requires-Dist:
|
|
46
|
-
Requires-Dist:
|
|
47
|
-
Requires-Dist:
|
|
48
|
-
Requires-Dist:
|
|
49
|
-
Requires-Dist:
|
|
50
|
-
Requires-Dist:
|
|
51
|
-
|
|
52
|
-
Requires-Dist:
|
|
53
|
-
Requires-Dist:
|
|
54
|
-
Requires-Dist:
|
|
55
|
-
Requires-Dist:
|
|
56
|
-
Requires-Dist:
|
|
57
|
-
Requires-Dist:
|
|
58
|
-
Requires-Dist:
|
|
59
|
-
Requires-Dist:
|
|
60
|
-
Requires-Dist:
|
|
61
|
-
Requires-Dist:
|
|
62
|
-
Requires-Dist:
|
|
63
|
-
Requires-Dist:
|
|
19
|
+
Requires-Dist: catalogue==2.0.10
|
|
20
|
+
Requires-Dist: huggingface_hub<0.26,>=0.12.0
|
|
21
|
+
Requires-Dist: importlib-metadata>=5.0.0
|
|
22
|
+
Requires-Dist: jsonlines==3.1.0
|
|
23
|
+
Requires-Dist: lazy-imports==0.3.1
|
|
24
|
+
Requires-Dist: mock==4.0.3
|
|
25
|
+
Requires-Dist: networkx>=2.7.1
|
|
26
|
+
Requires-Dist: numpy<2.0,>=1.21
|
|
27
|
+
Requires-Dist: packaging>=20.0
|
|
28
|
+
Requires-Dist: Pillow>=10.0.0
|
|
29
|
+
Requires-Dist: pypdf>=3.16.0
|
|
30
|
+
Requires-Dist: pypdfium2>=4.30.0
|
|
31
|
+
Requires-Dist: pyyaml>=6.0.1
|
|
32
|
+
Requires-Dist: pyzmq>=16
|
|
33
|
+
Requires-Dist: scipy>=1.13.1
|
|
34
|
+
Requires-Dist: termcolor>=1.1
|
|
35
|
+
Requires-Dist: tabulate>=0.7.7
|
|
36
|
+
Requires-Dist: tqdm==4.64.0
|
|
37
|
+
Provides-Extra: tf
|
|
38
|
+
Requires-Dist: catalogue==2.0.10; extra == "tf"
|
|
39
|
+
Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "tf"
|
|
40
|
+
Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
|
|
41
|
+
Requires-Dist: jsonlines==3.1.0; extra == "tf"
|
|
42
|
+
Requires-Dist: lazy-imports==0.3.1; extra == "tf"
|
|
43
|
+
Requires-Dist: mock==4.0.3; extra == "tf"
|
|
44
|
+
Requires-Dist: networkx>=2.7.1; extra == "tf"
|
|
45
|
+
Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
|
|
46
|
+
Requires-Dist: packaging>=20.0; extra == "tf"
|
|
47
|
+
Requires-Dist: Pillow>=10.0.0; extra == "tf"
|
|
48
|
+
Requires-Dist: pypdf>=3.16.0; extra == "tf"
|
|
49
|
+
Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
|
|
50
|
+
Requires-Dist: pyyaml>=6.0.1; extra == "tf"
|
|
51
|
+
Requires-Dist: pyzmq>=16; extra == "tf"
|
|
52
|
+
Requires-Dist: scipy>=1.13.1; extra == "tf"
|
|
53
|
+
Requires-Dist: termcolor>=1.1; extra == "tf"
|
|
54
|
+
Requires-Dist: tabulate>=0.7.7; extra == "tf"
|
|
55
|
+
Requires-Dist: tqdm==4.64.0; extra == "tf"
|
|
56
|
+
Requires-Dist: tensorpack==0.11; extra == "tf"
|
|
57
|
+
Requires-Dist: protobuf==3.20.1; extra == "tf"
|
|
58
|
+
Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
|
|
59
|
+
Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
|
|
60
|
+
Requires-Dist: python-doctr==0.8.1; extra == "tf"
|
|
61
|
+
Requires-Dist: pycocotools>=2.0.2; extra == "tf"
|
|
62
|
+
Requires-Dist: boto3==1.34.102; extra == "tf"
|
|
63
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
|
|
64
|
+
Requires-Dist: fasttext==0.9.2; extra == "tf"
|
|
65
|
+
Requires-Dist: jdeskew>=0.2.2; extra == "tf"
|
|
66
|
+
Requires-Dist: apted==1.0.3; extra == "tf"
|
|
67
|
+
Requires-Dist: distance==0.1.3; extra == "tf"
|
|
68
|
+
Requires-Dist: lxml>=4.9.1; extra == "tf"
|
|
64
69
|
Provides-Extra: pt
|
|
65
|
-
Requires-Dist: catalogue
|
|
66
|
-
Requires-Dist:
|
|
67
|
-
Requires-Dist: importlib-metadata
|
|
68
|
-
Requires-Dist: jsonlines
|
|
69
|
-
Requires-Dist: lazy-imports
|
|
70
|
-
Requires-Dist: mock
|
|
71
|
-
Requires-Dist: networkx
|
|
72
|
-
Requires-Dist: numpy
|
|
73
|
-
Requires-Dist: packaging
|
|
74
|
-
Requires-Dist: Pillow
|
|
75
|
-
Requires-Dist: pypdf
|
|
76
|
-
Requires-Dist:
|
|
77
|
-
Requires-Dist:
|
|
78
|
-
Requires-Dist:
|
|
79
|
-
Requires-Dist:
|
|
80
|
-
Requires-Dist:
|
|
81
|
-
Requires-Dist:
|
|
82
|
-
Requires-Dist:
|
|
83
|
-
Requires-Dist:
|
|
84
|
-
Requires-Dist:
|
|
85
|
-
Requires-Dist:
|
|
86
|
-
Requires-Dist:
|
|
87
|
-
Requires-Dist:
|
|
88
|
-
Requires-Dist:
|
|
89
|
-
Requires-Dist:
|
|
90
|
-
Requires-Dist:
|
|
91
|
-
Requires-Dist:
|
|
92
|
-
Requires-Dist:
|
|
70
|
+
Requires-Dist: catalogue==2.0.10; extra == "pt"
|
|
71
|
+
Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "pt"
|
|
72
|
+
Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
|
|
73
|
+
Requires-Dist: jsonlines==3.1.0; extra == "pt"
|
|
74
|
+
Requires-Dist: lazy-imports==0.3.1; extra == "pt"
|
|
75
|
+
Requires-Dist: mock==4.0.3; extra == "pt"
|
|
76
|
+
Requires-Dist: networkx>=2.7.1; extra == "pt"
|
|
77
|
+
Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
|
|
78
|
+
Requires-Dist: packaging>=20.0; extra == "pt"
|
|
79
|
+
Requires-Dist: Pillow>=10.0.0; extra == "pt"
|
|
80
|
+
Requires-Dist: pypdf>=3.16.0; extra == "pt"
|
|
81
|
+
Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
|
|
82
|
+
Requires-Dist: pyyaml>=6.0.1; extra == "pt"
|
|
83
|
+
Requires-Dist: pyzmq>=16; extra == "pt"
|
|
84
|
+
Requires-Dist: scipy>=1.13.1; extra == "pt"
|
|
85
|
+
Requires-Dist: termcolor>=1.1; extra == "pt"
|
|
86
|
+
Requires-Dist: tabulate>=0.7.7; extra == "pt"
|
|
87
|
+
Requires-Dist: tqdm==4.64.0; extra == "pt"
|
|
88
|
+
Requires-Dist: timm>=0.9.16; extra == "pt"
|
|
89
|
+
Requires-Dist: transformers>=4.36.0; extra == "pt"
|
|
90
|
+
Requires-Dist: accelerate>=0.29.1; extra == "pt"
|
|
91
|
+
Requires-Dist: python-doctr==0.8.1; extra == "pt"
|
|
92
|
+
Requires-Dist: boto3==1.34.102; extra == "pt"
|
|
93
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
|
|
94
|
+
Requires-Dist: fasttext==0.9.2; extra == "pt"
|
|
95
|
+
Requires-Dist: jdeskew>=0.2.2; extra == "pt"
|
|
96
|
+
Requires-Dist: apted==1.0.3; extra == "pt"
|
|
97
|
+
Requires-Dist: distance==0.1.3; extra == "pt"
|
|
98
|
+
Requires-Dist: lxml>=4.9.1; extra == "pt"
|
|
99
|
+
Provides-Extra: docs
|
|
100
|
+
Requires-Dist: tensorpack==0.11; extra == "docs"
|
|
101
|
+
Requires-Dist: boto3==1.34.102; extra == "docs"
|
|
102
|
+
Requires-Dist: transformers>=4.36.0; extra == "docs"
|
|
103
|
+
Requires-Dist: accelerate>=0.29.1; extra == "docs"
|
|
104
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
|
|
105
|
+
Requires-Dist: lxml>=4.9.1; extra == "docs"
|
|
106
|
+
Requires-Dist: lxml-stubs>=0.5.1; extra == "docs"
|
|
107
|
+
Requires-Dist: jdeskew>=0.2.2; extra == "docs"
|
|
108
|
+
Requires-Dist: jinja2==3.0.3; extra == "docs"
|
|
109
|
+
Requires-Dist: mkdocs-material; extra == "docs"
|
|
110
|
+
Requires-Dist: mkdocstrings-python; extra == "docs"
|
|
111
|
+
Requires-Dist: griffe==0.25.0; extra == "docs"
|
|
112
|
+
Provides-Extra: dev
|
|
113
|
+
Requires-Dist: python-dotenv==1.0.0; extra == "dev"
|
|
114
|
+
Requires-Dist: click; extra == "dev"
|
|
115
|
+
Requires-Dist: black==23.7.0; extra == "dev"
|
|
116
|
+
Requires-Dist: isort==5.13.2; extra == "dev"
|
|
117
|
+
Requires-Dist: pylint==2.17.4; extra == "dev"
|
|
118
|
+
Requires-Dist: mypy==1.4.1; extra == "dev"
|
|
119
|
+
Requires-Dist: wandb; extra == "dev"
|
|
120
|
+
Requires-Dist: types-PyYAML>=6.0.12.12; extra == "dev"
|
|
121
|
+
Requires-Dist: types-termcolor>=1.1.3; extra == "dev"
|
|
122
|
+
Requires-Dist: types-tabulate>=0.9.0.3; extra == "dev"
|
|
123
|
+
Requires-Dist: types-tqdm>=4.66.0.5; extra == "dev"
|
|
124
|
+
Requires-Dist: lxml-stubs>=0.5.1; extra == "dev"
|
|
125
|
+
Requires-Dist: types-Pillow>=10.2.0.20240406; extra == "dev"
|
|
126
|
+
Requires-Dist: types-urllib3>=1.26.25.14; extra == "dev"
|
|
93
127
|
Provides-Extra: test
|
|
94
|
-
Requires-Dist: pytest
|
|
95
|
-
Requires-Dist: pytest-cov
|
|
96
|
-
Provides-Extra: tf
|
|
97
|
-
Requires-Dist: catalogue ==2.0.10 ; extra == 'tf'
|
|
98
|
-
Requires-Dist: huggingface-hub >=0.12.0 ; extra == 'tf'
|
|
99
|
-
Requires-Dist: importlib-metadata >=5.0.0 ; extra == 'tf'
|
|
100
|
-
Requires-Dist: jsonlines ==3.1.0 ; extra == 'tf'
|
|
101
|
-
Requires-Dist: lazy-imports ==0.3.1 ; extra == 'tf'
|
|
102
|
-
Requires-Dist: mock ==4.0.3 ; extra == 'tf'
|
|
103
|
-
Requires-Dist: networkx >=2.7.1 ; extra == 'tf'
|
|
104
|
-
Requires-Dist: numpy <2.0,>=1.21 ; extra == 'tf'
|
|
105
|
-
Requires-Dist: packaging >=20.0 ; extra == 'tf'
|
|
106
|
-
Requires-Dist: Pillow >=10.0.0 ; extra == 'tf'
|
|
107
|
-
Requires-Dist: pypdf >=3.16.0 ; extra == 'tf'
|
|
108
|
-
Requires-Dist: pyyaml >=6.0.1 ; extra == 'tf'
|
|
109
|
-
Requires-Dist: pyzmq >=16 ; extra == 'tf'
|
|
110
|
-
Requires-Dist: scipy >=1.13.1 ; extra == 'tf'
|
|
111
|
-
Requires-Dist: termcolor >=1.1 ; extra == 'tf'
|
|
112
|
-
Requires-Dist: tabulate >=0.7.7 ; extra == 'tf'
|
|
113
|
-
Requires-Dist: tqdm ==4.64.0 ; extra == 'tf'
|
|
114
|
-
Requires-Dist: tensorpack ==0.11 ; extra == 'tf'
|
|
115
|
-
Requires-Dist: protobuf ==3.20.1 ; extra == 'tf'
|
|
116
|
-
Requires-Dist: tensorflow-addons >=0.17.1 ; extra == 'tf'
|
|
117
|
-
Requires-Dist: tf2onnx >=1.9.2 ; extra == 'tf'
|
|
118
|
-
Requires-Dist: python-doctr ==0.8.1 ; extra == 'tf'
|
|
119
|
-
Requires-Dist: pycocotools >=2.0.2 ; extra == 'tf'
|
|
120
|
-
Requires-Dist: boto3 ==1.34.102 ; extra == 'tf'
|
|
121
|
-
Requires-Dist: pdfplumber >=0.11.0 ; extra == 'tf'
|
|
122
|
-
Requires-Dist: fasttext ==0.9.2 ; extra == 'tf'
|
|
123
|
-
Requires-Dist: jdeskew >=0.2.2 ; extra == 'tf'
|
|
124
|
-
Requires-Dist: apted ==1.0.3 ; extra == 'tf'
|
|
125
|
-
Requires-Dist: distance ==0.1.3 ; extra == 'tf'
|
|
126
|
-
Requires-Dist: lxml >=4.9.1 ; extra == 'tf'
|
|
128
|
+
Requires-Dist: pytest==8.0.2; extra == "test"
|
|
129
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
127
130
|
|
|
128
131
|
|
|
129
132
|
<p align="center">
|
|
@@ -172,13 +175,17 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
172
175
|
- Document layout analysis and table recognition now runs with
|
|
173
176
|
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
174
177
|
anymore for basic inference.
|
|
175
|
-
-
|
|
176
|
-
|
|
177
|
-
- [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
178
|
+
- More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
179
|
+
- Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
178
180
|
[**transformers**](https://github.com/huggingface/transformers).
|
|
179
181
|
We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
|
|
180
182
|
that seem to look promising, especially if you want to train a model on non-english data. The training script for
|
|
181
|
-
LayoutLM can be used for LiLT as well
|
|
183
|
+
LayoutLM can be used for LiLT as well.
|
|
184
|
+
- [**new**] There are two notebooks available that show how to write a
|
|
185
|
+
[custom predictor](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) based on
|
|
186
|
+
a third party library that has not been supported yet and how to use
|
|
187
|
+
[advanced configuration](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) to
|
|
188
|
+
get links between layout segments e.g. captions and tables or figures.
|
|
182
189
|
|
|
183
190
|
**deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
|
|
184
191
|
post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
|
|
@@ -263,7 +270,7 @@ documentation.
|
|
|
263
270
|
|
|
264
271
|
## Requirements
|
|
265
272
|
|
|
266
|
-

|
|
267
274
|
|
|
268
275
|
Everything listed in the overview below the **deep**doctection layer is a necessary requirement and has to be installed
|
|
269
276
|
separately.
|
|
@@ -272,13 +279,16 @@ separately.
|
|
|
272
279
|
- Python >= 3.9
|
|
273
280
|
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
274
281
|
In general, if you want to train or fine-tune models, a GPU is required.
|
|
275
|
-
|
|
276
|
-
images.
|
|
282
|
+
|
|
277
283
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
278
284
|
and [PyTorch](https://pytorch.org/get-started/locally/).
|
|
279
285
|
- [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
|
|
280
286
|
engine has to be installed separately.
|
|
281
287
|
|
|
288
|
+
|
|
289
|
+
- For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
|
|
290
|
+
documents into images. From release `v.0.35.0` onwards this dependency is optional.
|
|
291
|
+
|
|
282
292
|
The following overview shows the availability of the models in conjunction with the DL framework.
|
|
283
293
|
|
|
284
294
|
| Task | PyTorch | Torchscript | Tensorflow |
|
|
@@ -396,8 +406,8 @@ to develop this framework.
|
|
|
396
406
|
## Problems
|
|
397
407
|
|
|
398
408
|
We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
|
|
399
|
-
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every
|
|
400
|
-
to
|
|
409
|
+
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
|
|
410
|
+
to 12 weeks.
|
|
401
411
|
|
|
402
412
|
## If you like **deep**doctection ...
|
|
403
413
|
|