deepdoctection 0.34__py3-none-any.whl → 0.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

@@ -24,21 +24,25 @@ from typing import Optional
24
24
  from lazy_imports import try_import
25
25
 
26
26
  from ..utils.context import save_tmp_file
27
- from ..utils.file_utils import get_pdfplumber_requirement
27
+ from ..utils.file_utils import get_pdfplumber_requirement, get_pypdfium2_requirement
28
28
  from ..utils.settings import LayoutType, ObjectTypes
29
29
  from ..utils.types import Requirement
30
30
  from .base import DetectionResult, ModelCategories, PdfMiner
31
31
 
32
- with try_import() as import_guard:
32
+ with try_import() as pdfplumber_import_guard:
33
33
  from pdfplumber.pdf import PDF, Page
34
34
 
35
+ with try_import() as pypdfmium_import_guard:
36
+ import pypdfium2.raw as pypdfium_c
37
+ from pypdfium2 import PdfDocument
35
38
 
36
- def _to_detect_result(word: dict[str, str]) -> DetectionResult:
39
+
40
+ def _to_detect_result(word: dict[str, str], class_name: ObjectTypes) -> DetectionResult:
37
41
  return DetectionResult(
38
42
  box=[float(word["x0"]), float(word["top"]), float(word["x1"]), float(word["bottom"])],
39
43
  class_id=1,
40
44
  text=word["text"],
41
- class_name=LayoutType.WORD,
45
+ class_name=class_name,
42
46
  )
43
47
 
44
48
 
@@ -49,6 +53,7 @@ class PdfPlumberTextDetector(PdfMiner):
49
53
 
50
54
  pdf_plumber = PdfPlumberTextDetector()
51
55
  df = SerializerPdfDoc.load("path/to/document.pdf")
56
+ df.reset_state()
52
57
 
53
58
  for dp in df:
54
59
  detection_results = pdf_plumber.predict(dp["pdf_bytes"])
@@ -61,6 +66,8 @@ class PdfPlumberTextDetector(PdfMiner):
61
66
  pipe = DoctectionPipe([text_extract])
62
67
 
63
68
  df = pipe.analyze(path="path/to/document.pdf")
69
+ df.reset_state()
70
+
64
71
  for dp in df:
65
72
  ...
66
73
 
@@ -87,7 +94,7 @@ class PdfPlumberTextDetector(PdfMiner):
87
94
  self._page = PDF(fin).pages[0]
88
95
  self._pdf_bytes = pdf_bytes
89
96
  words = self._page.extract_words(x_tolerance=self.x_tolerance, y_tolerance=self.y_tolerance)
90
- detect_results = list(map(_to_detect_result, words))
97
+ detect_results = [_to_detect_result(word, self.get_category_names()[0]) for word in words]
91
98
  return detect_results
92
99
 
93
100
  @classmethod
@@ -113,3 +120,87 @@ class PdfPlumberTextDetector(PdfMiner):
113
120
 
114
121
  def get_category_names(self) -> tuple[ObjectTypes, ...]:
115
122
  return self.categories.get_categories(as_dict=False)
123
+
124
+
125
+ class Pdfmium2TextDetector(PdfMiner):
126
+ """
127
+ Text miner based on the pypdfium2 engine. It will return text on text line level and not on word level
128
+
129
+ pdfmium2 = Pdfmium2TextDetector()
130
+ df = SerializerPdfDoc.load("path/to/document.pdf")
131
+ df.reset_state()
132
+
133
+ for dp in df:
134
+ detection_results = pdfmium2.predict(dp["pdf_bytes"])
135
+
136
+ To use it in a more integrated way:
137
+
138
+ pdfmium2 = Pdfmium2TextDetector()
139
+ text_extract = TextExtractionService(pdfmium2)
140
+
141
+ pipe = DoctectionPipe([text_extract])
142
+
143
+ df = pipe.analyze(path="path/to/document.pdf")
144
+ df.reset_state()
145
+ for dp in df:
146
+ ...
147
+
148
+ """
149
+
150
+ def __init__(self) -> None:
151
+ self.name = "Pdfmium"
152
+ self.model_id = self.get_model_id()
153
+ self.categories = ModelCategories(init_categories={1: LayoutType.LINE})
154
+ self._page: Optional[Page] = None
155
+
156
+ def predict(self, pdf_bytes: bytes) -> list[DetectionResult]:
157
+ """
158
+ Calls pypdfium2 and returns detected text as detection results
159
+
160
+ :param pdf_bytes: bytes of a single pdf page
161
+ :return: A list of DetectionResult
162
+ """
163
+
164
+ pdf = PdfDocument(pdf_bytes)
165
+ page = pdf.get_page(0)
166
+ text = page.get_textpage()
167
+ words = []
168
+ height = page.get_height()
169
+ for obj in page.get_objects((pypdfium_c.FPDF_PAGEOBJ_TEXT,)):
170
+ box = obj.get_pos()
171
+ if all(x > 0 for x in box):
172
+ words.append(
173
+ {
174
+ "text": text.get_text_bounded(*box),
175
+ "x0": box[0],
176
+ "x1": box[2],
177
+ "top": height - box[3],
178
+ "bottom": height - box[1],
179
+ }
180
+ )
181
+ detect_results = [_to_detect_result(word, self.get_category_names()[0]) for word in words]
182
+ return detect_results
183
+
184
+ @classmethod
185
+ def get_requirements(cls) -> list[Requirement]:
186
+ return [get_pypdfium2_requirement()]
187
+
188
+ def get_width_height(self, pdf_bytes: bytes) -> tuple[float, float]:
189
+ """
190
+ Get the width and height of the full page
191
+ :param pdf_bytes: pdf_bytes generating the pdf
192
+ :return: width and height
193
+ """
194
+
195
+ if self._pdf_bytes == pdf_bytes and self._page is not None:
196
+ return self._page.bbox[2], self._page.bbox[3] # pylint: disable=E1101
197
+ # if the pdf bytes is not equal to the cached pdf, will recalculate values
198
+ pdf = PdfDocument(pdf_bytes)
199
+ self._page = pdf.get_page(0)
200
+ self._pdf_bytes = pdf_bytes
201
+ if self._page is not None:
202
+ return self._page.get_width(), self._page.get_height() # type: ignore
203
+ raise ValueError("Page not found")
204
+
205
+ def get_category_names(self) -> tuple[ObjectTypes, ...]:
206
+ return self.categories.get_categories(as_dict=False)
@@ -421,6 +421,7 @@ class TesseractRotationTransformer(ImageTransformer):
421
421
  def __init__(self) -> None:
422
422
  self.name = fspath(_TESS_PATH) + "-rotation"
423
423
  self.categories = ModelCategories(init_categories={1: PageType.ANGLE})
424
+ self.model_id = self.get_model_id()
424
425
 
425
426
  def transform(self, np_img: PixelValues, specification: DetectionResult) -> PixelValues:
426
427
  """
@@ -20,6 +20,10 @@ Some useful function for collecting environment information.
20
20
 
21
21
  This is also the place where we give an overview of the important environment variables.
22
22
 
23
+ For env variables with boolean character, use one of the following values:
24
+
25
+ {"1", "True", "TRUE", "true", "yes"}
26
+
23
27
  `USE_TENSORFLOW
24
28
  USE_PYTORCH
25
29
  USE_CUDA
@@ -35,6 +39,12 @@ decide what image processing library the `viz_handler` should use. The default l
35
39
  to be installed separately. However, if both libraries have been detected `viz_handler` will opt for OpenCV.
36
40
  Use the variables to let choose `viz_handler` according to your preferences.
37
41
 
42
+ `USE_DD_POPPLER
43
+ USE_DD_PDFIUM`
44
+
45
+ For PDF rendering we use PyPDFium2 as default but for legacy reasons, we also support Poppler. If you want to enforce
46
+ Poppler, set `USE_DD_POPPLER=True` and `USE_DD_PDFIUM=False`.
47
+
38
48
  `HF_CREDENTIALS`
39
49
 
40
50
  will be used by the `ModelDownloadManager` to pass your credentials if you have a model registered that resides in a
@@ -56,6 +66,7 @@ from typing import Optional
56
66
 
57
67
  import numpy as np
58
68
  from packaging import version
69
+ from pypdf.errors import DependencyError
59
70
  from tabulate import tabulate
60
71
 
61
72
  from .file_utils import (
@@ -75,6 +86,7 @@ from .file_utils import (
75
86
  pdf_to_cairo_available,
76
87
  pdf_to_ppm_available,
77
88
  pdfplumber_available,
89
+ pypdfium2_available,
78
90
  pytorch_available,
79
91
  qpdf_available,
80
92
  scipy_available,
@@ -88,7 +100,7 @@ from .file_utils import (
88
100
  from .logger import LoggingRecord, logger
89
101
  from .types import KeyValEnvInfos, PathLikeOrStr
90
102
 
91
- __all__ = ["collect_env_info", "auto_select_viz_library", "ENV_VARS_TRUE"]
103
+ __all__ = ["collect_env_info", "auto_select_viz_library", "auto_select_pdf_render_framework", "ENV_VARS_TRUE"]
92
104
 
93
105
  # pylint: disable=import-outside-toplevel
94
106
 
@@ -532,4 +544,21 @@ def auto_select_viz_library() -> None:
532
544
  os.environ["USE_DD_OPENCV"] = "False"
533
545
 
534
546
 
547
+ def auto_select_pdf_render_framework() -> None:
548
+ """Setting pdf2image as default pdf rendering library if pdfium is not installed"""
549
+
550
+ # if env variables are already set, don't change them
551
+ if os.environ.get("USE_DD_POPPLER") or os.environ.get("USE_DD_PDFIUM"):
552
+ return
553
+ if pypdfium2_available():
554
+ os.environ["USE_DD_POPPLER"] = "False"
555
+ os.environ["USE_DD_PDFIUM"] = "True"
556
+ return
557
+ if pdf_to_cairo_available() or pdf_to_ppm_available():
558
+ os.environ["USE_DD_POPPLER"] = "True"
559
+ os.environ["USE_DD_PDFIUM"] = "False"
560
+ return
561
+ raise DependencyError("No pdf rendering library found. Please install Poppler or pdfium.")
562
+
563
+
535
564
  # pylint: enable=import-outside-toplevel
@@ -616,6 +616,25 @@ def get_pillow_requirement() -> Requirement:
616
616
  return "pillow", pillow_available(), _PILLOW_ERR_MSG
617
617
 
618
618
 
619
+ # Pypdfium2
620
+ _PYPDFIUM2_AVAILABLE = importlib.util.find_spec("pypdfium2") is not None
621
+ _PYPDFIUM2_ERR_MSG = f"pypdfium2 must be installed. {_GENERIC_ERR_MSG}"
622
+
623
+
624
+ def pypdfium2_available() -> bool:
625
+ """
626
+ Returns True if pypdfium2 is installed
627
+ """
628
+ return bool(_PYPDFIUM2_AVAILABLE)
629
+
630
+
631
+ def get_pypdfium2_requirement() -> Requirement:
632
+ """
633
+ Return pypdfium2 requirement
634
+ """
635
+ return "pypdfium2", pypdfium2_available(), _PYPDFIUM2_ERR_MSG
636
+
637
+
619
638
  # SpaCy
620
639
  _SPACY_AVAILABLE = importlib.util.find_spec("spacy") is not None
621
640
  _SPACY_ERR_MSG = f"SpaCy must be installed. {_GENERIC_ERR_MSG}"
@@ -18,6 +18,7 @@
18
18
  """
19
19
  Class AttrDict for maintaining configs and some functions for generating and saving AttrDict instances to .yaml files
20
20
  """
21
+ from __future__ import annotations
21
22
 
22
23
  import pprint
23
24
  from typing import Any
@@ -105,6 +106,17 @@ class AttrDict:
105
106
  v = eval(v) # pylint: disable=C0103, W0123
106
107
  setattr(dic, key, v)
107
108
 
109
+ def overwrite_config(self, other_config: AttrDict) -> None:
110
+ """
111
+ Overwrite the current config with values from another config.
112
+
113
+ :param other_config: The other AttrDict instance to copy values from.
114
+ :raises AttributeError: If a key from other_config is not an attribute of self.
115
+ """
116
+ if self._freezed:
117
+ raise AttributeError("Config was freezed! Cannot overwrite config.")
118
+ self.from_dict(other_config.to_dict())
119
+
108
120
  def freeze(self, freezed: bool = True) -> None:
109
121
  """
110
122
  :param freezed: freeze the instance, so that no attributes can be added or changed
@@ -24,13 +24,16 @@ import subprocess
24
24
  import sys
25
25
  from errno import ENOENT
26
26
  from io import BytesIO
27
+ from pathlib import Path
27
28
  from shutil import copyfile
28
- from typing import Generator, Optional
29
+ from typing import Generator, Literal, Optional
29
30
 
31
+ from lazy_imports import try_import
30
32
  from numpy import uint8
31
33
  from pypdf import PdfReader, PdfWriter, errors
32
34
 
33
35
  from .context import save_tmp_file, timeout_manager
36
+ from .env_info import ENV_VARS_TRUE
34
37
  from .error import DependencyError, FileExtensionError
35
38
  from .file_utils import pdf_to_cairo_available, pdf_to_ppm_available, qpdf_available
36
39
  from .logger import LoggingRecord, logger
@@ -38,7 +41,17 @@ from .types import PathLikeOrStr, PixelValues
38
41
  from .utils import is_file_extension
39
42
  from .viz import viz_handler
40
43
 
41
- __all__ = ["decrypt_pdf_document", "get_pdf_file_reader", "get_pdf_file_writer", "PDFStreamer", "pdf_to_np_array"]
44
+ with try_import() as pt_import_guard:
45
+ import pypdfium2
46
+
47
+ __all__ = [
48
+ "decrypt_pdf_document",
49
+ "get_pdf_file_reader",
50
+ "get_pdf_file_writer",
51
+ "PDFStreamer",
52
+ "pdf_to_np_array",
53
+ "split_pdf",
54
+ ]
42
55
 
43
56
 
44
57
  def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
@@ -234,7 +247,7 @@ def _run_poppler(poppler_args: list[str]) -> None:
234
247
  raise PopplerError(status=proc.returncode, message="Syntax Error: PDF cannot be read with Poppler")
235
248
 
236
249
 
237
- def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
250
+ def pdf_to_np_array_poppler(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
238
251
  """
239
252
  Convert a single pdf page from its byte representation to a numpy array. This function will save the pdf to a tmp
240
253
  file and then call poppler via `pdftoppm` resp. `pdftocairo` if the former is not available.
@@ -250,3 +263,73 @@ def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dp
250
263
  image = viz_handler.read_image(tmp_name + "-1.png")
251
264
 
252
265
  return image.astype(uint8)
266
+
267
+
268
+ def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: int = 200) -> PixelValues:
269
+ """
270
+ Convert a single pdf page from its byte representation to a numpy array using pdfium.
271
+
272
+ :param pdf_bytes: Bytes representing the PDF file
273
+ :param dpi: Image quality in DPI/dots-per-inch (default 200)
274
+ :return: numpy array
275
+ """
276
+
277
+ page = pypdfium2.PdfDocument(pdf_bytes)[0]
278
+ return page.render(scale=dpi * 1 / 72).to_numpy().astype(uint8)
279
+
280
+
281
+ def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
282
+ """
283
+ Convert a single pdf page from its byte representation to a numpy array. This function will either use Poppler or
284
+ pdfium to render the pdf.
285
+
286
+ :param pdf_bytes: Bytes representing the PDF file
287
+ :param size: Size of the resulting image(s), uses (width, height) standard
288
+ :param dpi: Image quality in DPI/dots-per-inch (default 200)
289
+ :return: numpy array
290
+ """
291
+ if os.environ.get("USE_DD_PDFIUM", "False") in ENV_VARS_TRUE:
292
+ if size is not None:
293
+ logger.warning(
294
+ LoggingRecord(
295
+ f"pdf_to_np_array_pdfmium does not support the size parameter. Will use dpi = {dpi} instead."
296
+ )
297
+ )
298
+ return pdf_to_np_array_pdfmium(pdf_bytes, dpi)
299
+ return pdf_to_np_array_poppler(pdf_bytes, size, dpi)
300
+
301
+
302
+ def split_pdf(
303
+ pdf_path: PathLikeOrStr, output_dir: PathLikeOrStr, file_type: Literal["image", "pdf"], dpi: int = 200
304
+ ) -> None:
305
+ """
306
+ Split a pdf into single pages. The pages are saved as single pdf/png files in a subfolder of the output directory.
307
+
308
+ :param pdf_path: Path to the pdf file
309
+ :param output_dir: Path to the output directory
310
+ :param file_type: Type of the output file. Either "image" or "pdf"
311
+ :param dpi: Image quality in DPI/dots-per-inch (default 200)
312
+ """
313
+ pdf_path = Path(pdf_path)
314
+ filename = pdf_path.stem
315
+ output_dir = Path(output_dir)
316
+ file_dir = output_dir / filename
317
+ if not file_dir.exists():
318
+ os.makedirs(file_dir)
319
+
320
+ with open(pdf_path, "rb") as file:
321
+ pdf = PdfReader(file)
322
+ for i, page in enumerate(pdf.pages):
323
+ writer = PdfWriter()
324
+ writer.add_page(page)
325
+ if file_type == ".pdf":
326
+ with open(file_dir / f"{filename}_{i}.pdf", "wb") as out:
327
+ writer.write(out)
328
+ writer.close()
329
+ else:
330
+ with BytesIO() as buffer:
331
+ writer.write(buffer)
332
+ buffer.seek(0)
333
+ np_image = pdf_to_np_array(buffer.getvalue(), dpi=dpi)
334
+ viz_handler.write_image(file_dir / f"{filename}_{i}.png", np_image)
335
+ writer.close()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepdoctection
3
- Version: 0.34
3
+ Version: 0.35
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -17,7 +17,7 @@ Requires-Python: >=3.9
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
19
  Requires-Dist: catalogue ==2.0.10
20
- Requires-Dist: huggingface-hub >=0.12.0
20
+ Requires-Dist: huggingface-hub <0.26,>=0.12.0
21
21
  Requires-Dist: importlib-metadata >=5.0.0
22
22
  Requires-Dist: jsonlines ==3.1.0
23
23
  Requires-Dist: lazy-imports ==0.3.1
@@ -27,6 +27,7 @@ Requires-Dist: numpy <2.0,>=1.21
27
27
  Requires-Dist: packaging >=20.0
28
28
  Requires-Dist: Pillow >=10.0.0
29
29
  Requires-Dist: pypdf >=3.16.0
30
+ Requires-Dist: pypdfium2 >=4.30.0
30
31
  Requires-Dist: pyyaml >=6.0.1
31
32
  Requires-Dist: pyzmq >=16
32
33
  Requires-Dist: scipy >=1.13.1
@@ -63,7 +64,7 @@ Requires-Dist: mkdocstrings-python ; extra == 'docs'
63
64
  Requires-Dist: griffe ==0.25.0 ; extra == 'docs'
64
65
  Provides-Extra: pt
65
66
  Requires-Dist: catalogue ==2.0.10 ; extra == 'pt'
66
- Requires-Dist: huggingface-hub >=0.12.0 ; extra == 'pt'
67
+ Requires-Dist: huggingface-hub <0.26,>=0.12.0 ; extra == 'pt'
67
68
  Requires-Dist: importlib-metadata >=5.0.0 ; extra == 'pt'
68
69
  Requires-Dist: jsonlines ==3.1.0 ; extra == 'pt'
69
70
  Requires-Dist: lazy-imports ==0.3.1 ; extra == 'pt'
@@ -73,6 +74,7 @@ Requires-Dist: numpy <2.0,>=1.21 ; extra == 'pt'
73
74
  Requires-Dist: packaging >=20.0 ; extra == 'pt'
74
75
  Requires-Dist: Pillow >=10.0.0 ; extra == 'pt'
75
76
  Requires-Dist: pypdf >=3.16.0 ; extra == 'pt'
77
+ Requires-Dist: pypdfium2 >=4.30.0 ; extra == 'pt'
76
78
  Requires-Dist: pyyaml >=6.0.1 ; extra == 'pt'
77
79
  Requires-Dist: pyzmq >=16 ; extra == 'pt'
78
80
  Requires-Dist: scipy >=1.13.1 ; extra == 'pt'
@@ -95,7 +97,7 @@ Requires-Dist: pytest ==8.0.2 ; extra == 'test'
95
97
  Requires-Dist: pytest-cov ; extra == 'test'
96
98
  Provides-Extra: tf
97
99
  Requires-Dist: catalogue ==2.0.10 ; extra == 'tf'
98
- Requires-Dist: huggingface-hub >=0.12.0 ; extra == 'tf'
100
+ Requires-Dist: huggingface-hub <0.26,>=0.12.0 ; extra == 'tf'
99
101
  Requires-Dist: importlib-metadata >=5.0.0 ; extra == 'tf'
100
102
  Requires-Dist: jsonlines ==3.1.0 ; extra == 'tf'
101
103
  Requires-Dist: lazy-imports ==0.3.1 ; extra == 'tf'
@@ -105,6 +107,7 @@ Requires-Dist: numpy <2.0,>=1.21 ; extra == 'tf'
105
107
  Requires-Dist: packaging >=20.0 ; extra == 'tf'
106
108
  Requires-Dist: Pillow >=10.0.0 ; extra == 'tf'
107
109
  Requires-Dist: pypdf >=3.16.0 ; extra == 'tf'
110
+ Requires-Dist: pypdfium2 >=4.30.0 ; extra == 'tf'
108
111
  Requires-Dist: pyyaml >=6.0.1 ; extra == 'tf'
109
112
  Requires-Dist: pyzmq >=16 ; extra == 'tf'
110
113
  Requires-Dist: scipy >=1.13.1 ; extra == 'tf'
@@ -172,9 +175,9 @@ pipelines. Its core function does not depend on any specific deep learning libra
172
175
  - Document layout analysis and table recognition now runs with
173
176
  [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
174
177
  anymore for basic inference.
175
- - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
178
+ - More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
176
179
  (not contained in the built-in Analyzer).
177
- - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
180
+ - Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
178
181
  [**transformers**](https://github.com/huggingface/transformers).
179
182
  We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
180
183
  that seem to look promising, especially if you want to train a model on non-english data. The training script for
@@ -263,7 +266,7 @@ documentation.
263
266
 
264
267
  ## Requirements
265
268
 
266
- ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection.png)
269
+ ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection_081124.png)
267
270
 
268
271
  Everything in the overview listed below the **deep**doctection layer are necessary requirements and have to be installed
269
272
  separately.
@@ -272,13 +275,16 @@ separately.
272
275
  - Python >= 3.9
273
276
  - 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
274
277
  In general, if you want to train or fine-tune models, a GPU is required.
275
- - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
276
- images.
278
+
277
279
  - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
278
280
  and [PyTorch](https://pytorch.org/get-started/locally/).
279
281
  - [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
280
282
  engine has to be installed separately.
281
283
 
284
+
285
+ - For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
286
+ documents into images. For release `v.0.35.0` this dependency will be optional.
287
+
282
288
  The following overview shows the availability of the models in conjunction with the DL framework.
283
289
 
284
290
  | Task | PyTorch | Torchscript | Tensorflow |
@@ -396,8 +402,8 @@ to develop this framework.
396
402
  ## Problems
397
403
 
398
404
  We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
399
- repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 4
400
- to 6 weeks.
405
+ repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
406
+ to 12 weeks.
401
407
 
402
408
  ## If you like **deep**doctection ...
403
409
 
@@ -1,9 +1,11 @@
1
- deepdoctection/__init__.py,sha256=lgfD5PlxwSqTwMnKBtcpzKH9emJ4UtyWaWrpM9Pn0Ng,12596
1
+ deepdoctection/__init__.py,sha256=RZpawNRTJPKNPFuONawVOsYWdr-rI8PPNXZhlPtOKtc,12580
2
2
  deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- deepdoctection/analyzer/__init__.py,sha256=g86MeZz_BIQ2-b8kDIss7osPUNrFhT-Z3Eu7Wm02pFI,706
4
- deepdoctection/analyzer/dd.py,sha256=j3G6PFmXe9XBTwtu8-g9D3yAx7obaNzfZ2yl7rEOUqg,20234
3
+ deepdoctection/analyzer/__init__.py,sha256=icClxrd20XutD6LxLgEPIWceSs4j_QfI3szCE-9BL2w,729
4
+ deepdoctection/analyzer/_config.py,sha256=0cWtaI2e3jHNhufHZAqMje0YTTDAogKAHVl4VpYojAo,4874
5
+ deepdoctection/analyzer/dd.py,sha256=DUOhOtwipHw5nabYqn3WGR9aZcgP0ma_bi_tjf9xscw,5973
6
+ deepdoctection/analyzer/factory.py,sha256=T9jxtVLNFhocbsfWIGLPfFrEv21zQJzM6VdFt0yxMyg,23849
5
7
  deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
6
- deepdoctection/configs/conf_dd_one.yaml,sha256=d4ZTMQ1oTIYMFctQAaQBKK6iQP4LsViUDrPvsnaLumo,2220
8
+ deepdoctection/configs/conf_dd_one.yaml,sha256=orP-oeqtWbz5S9FJZJKxy1UqMwOYjL9g0DOX-wbamqU,2239
7
9
  deepdoctection/configs/conf_tesseract.yaml,sha256=oF6szDyoi15FHvq7yFUNIEjfA_jNLhGxoowiRsz_zY4,35
8
10
  deepdoctection/dataflow/__init__.py,sha256=CWRHMpmJaPk4xY_oIIFubCt-z11SguWrMWxHZ7rdrvY,845
9
11
  deepdoctection/dataflow/base.py,sha256=z4DCComSj5wStEPjtk0093cNNGfUMiDqx8dqz36nS_o,6221
@@ -14,11 +16,11 @@ deepdoctection/dataflow/parallel_map.py,sha256=8FhxJBWV-kjJrJ27jQtP3yYF6Ev6rz98w
14
16
  deepdoctection/dataflow/serialize.py,sha256=4pYC7m9h53JCu99waVeKpHDpsCDDdYCrSZpP2QYSsgs,4555
15
17
  deepdoctection/dataflow/stats.py,sha256=Bsr6v7lcesKXUYtO9wjqlzx_Yq_uyIF3Lel-tQ0i4wI,9619
16
18
  deepdoctection/datapoint/__init__.py,sha256=3K406GbOPhoEp8koVaSbMocmSsmWifnSZ1SPb7C1lOY,1643
17
- deepdoctection/datapoint/annotation.py,sha256=3hDwNf3bm7qi0xnvfKn459hxZe4BdiLPiFt03hJBbUQ,22517
19
+ deepdoctection/datapoint/annotation.py,sha256=FEgz4COxVDfjic0gG7kS6iHnWLBIgFnquQ63Cbj2a4Y,22531
18
20
  deepdoctection/datapoint/box.py,sha256=tkFuVM6xfx2jL7W4UED4qHXV572LSRdIsVJbrEiyIxI,23524
19
- deepdoctection/datapoint/convert.py,sha256=9L3YS89nGPAV8dqPZ-KOLVxvatj_zax2yP5RD-fuZCU,6718
20
- deepdoctection/datapoint/image.py,sha256=WyGcVYNrC-sv7bxODunEttxcQCFhplpWkWLLVQ266C0,32766
21
- deepdoctection/datapoint/view.py,sha256=Hdz67F8UtIkQjFW6U2mKeQ1WIdaXq4dOx95ymyQFLiU,42137
21
+ deepdoctection/datapoint/convert.py,sha256=Gw2IjNiEotPu1yuMZqrIYB0mCAwafKt-VgMnrHj6S7U,6808
22
+ deepdoctection/datapoint/image.py,sha256=EvZlVwJjMAcL1z8RNPBvZ8fwdJvkGuGpcFxCP1y26Go,33045
23
+ deepdoctection/datapoint/view.py,sha256=7qSX4DQw9OPQQSKfSjV8e5i6jLyu6hOMceSKJAob2N8,42154
22
24
  deepdoctection/datasets/__init__.py,sha256=-A3aR90aDsHPmVM35JavfnQ2itYSCn3ujl4krRni1QU,1076
23
25
  deepdoctection/datasets/adapter.py,sha256=Ly_vbOAgVI73V41FUccnSX1ECTOyesW_qsuvQuvOZbw,7796
24
26
  deepdoctection/datasets/base.py,sha256=DT4i-d74sIEiUNC6UspIHNJuHSK0t1dBv7qwadg4rLw,22341
@@ -57,8 +59,8 @@ deepdoctection/extern/hfdetr.py,sha256=1NPW_u5eH2tP3ixZ91l4WR-O-wLVcrFsLWA7BqID0
57
59
  deepdoctection/extern/hflayoutlm.py,sha256=KfoWx9_Rpa1Y2L51HLrYvenfWaTB4SVTmVJH00Cqb-s,56510
58
60
  deepdoctection/extern/hflm.py,sha256=kwS6kcSlY_2m9u0RzBLTRq-UMM7c1PhyUaDTvSdejus,9217
59
61
  deepdoctection/extern/model.py,sha256=ViHHKPvbGmLCPw7ZESv_rmjlkA90UiBU6oZiHOMqNSw,59869
60
- deepdoctection/extern/pdftext.py,sha256=9EvDstMBeOeCFXM21wKaj5iTOUJSt8_50RfGdMcMjIA,4048
61
- deepdoctection/extern/tessocr.py,sha256=GCTcVHm6oOXS2Xq76j-xY9etRDDJA5qfqWJ5AJ-Kn8k,17400
62
+ deepdoctection/extern/pdftext.py,sha256=KS_t27SUiYn_IOS_J2lF9lSSo22vLagxmxvYCY3CqXA,7228
63
+ deepdoctection/extern/tessocr.py,sha256=tG7etMvZ-jHFdq-jJAHYMJii3ujDjMfAFYUsjBp3nKI,17444
62
64
  deepdoctection/extern/texocr.py,sha256=yMt5ZzKtsjd7ogrcNXba7zccGGGF9LXK194EtER6YNQ,5804
63
65
  deepdoctection/extern/tpdetect.py,sha256=yAk1duQdoX-_pHLHgvhU7OOSiDy863q6XUMpjpYR734,8477
64
66
  deepdoctection/extern/pt/__init__.py,sha256=3Cu0ZHjbYsJomru7-RQXEHihEQLegZrmLetlHiqS58I,742
@@ -124,23 +126,23 @@ deepdoctection/utils/__init__.py,sha256=brBceRWeov9WXMiJTjyJOF2rHMP8trGGRRjhMdZ6
124
126
  deepdoctection/utils/concurrency.py,sha256=nIhpkSncmv0LBB8PtcOLY-BsRGlfcDpz7foVdgzZd20,4598
125
127
  deepdoctection/utils/context.py,sha256=VSnJnTtRGuq3w-0-syTf9DXOhR7WsPvWLLWTxKIBYec,4186
126
128
  deepdoctection/utils/develop.py,sha256=4HyTarkFbJwctL-Hgu1TU_LSJppHvaroDbcyHsxhIA8,3444
127
- deepdoctection/utils/env_info.py,sha256=Fm6A4XfJsYQmW5TzPmwn7_jh9qx5jqYlt00k9NK0yR8,18007
129
+ deepdoctection/utils/env_info.py,sha256=TnCA-LOTj4WIHd9yvn1AaoPWsLmPgc42l-BJmGV6zmM,19147
128
130
  deepdoctection/utils/error.py,sha256=_3q9VepKfEhsM3H033_Fu0hwBzMSjsWALsjyJbGAZr8,2367
129
- deepdoctection/utils/file_utils.py,sha256=koYsfHtl0-nh8T9nUb215Rc1X-WDvk2gEjyw-YJVZ34,19019
131
+ deepdoctection/utils/file_utils.py,sha256=IRElrcND0YEiU1QELw5hfXeNA39uE2_nyzh9-X7YcxI,19477
130
132
  deepdoctection/utils/fs.py,sha256=C4ktrzjoVtX9kgycv5YrEigDI9byi65b6_D0aKsGM4Y,10161
131
133
  deepdoctection/utils/identifier.py,sha256=QkNaGGqPynHwDPnd3_m8iur4Cv64rcQa7qolCE7Qphk,2159
132
134
  deepdoctection/utils/logger.py,sha256=J0OVKiXP_2A82MWbbJoOeMEJ-75aZu5npgaS_yI6mVA,10003
133
- deepdoctection/utils/metacfg.py,sha256=AGAE-KOymOLsarpUBBYawpVSXImvJyUeOD4LD2W_7Yo,5196
135
+ deepdoctection/utils/metacfg.py,sha256=hD76KQ_RnD_5B02qLI2Zxf3WfnsnXhEI_KUTKpw91RI,5711
134
136
  deepdoctection/utils/mocks.py,sha256=IkN3-IzAl4eX0ibgKIHg8IY7ykVw6BnpF6XnxKnKaZI,2389
135
- deepdoctection/utils/pdf_utils.py,sha256=H5BdLXvDlvTEfb-3zcRjy207PeqEnaymkG122R7UA4o,8635
137
+ deepdoctection/utils/pdf_utils.py,sha256=OAQjE9xHVNcDsFqAvX47Lu-mgmoMpVXqIf5pOK8AwxY,11595
136
138
  deepdoctection/utils/settings.py,sha256=k6OyuWbj-IPeaO9zT9RZ-5Yad1wNhWGYqGLZdtgXAZY,12464
137
139
  deepdoctection/utils/tqdm.py,sha256=cBUtR0L1x0KMeYrLP2rrzyzCamCjpQAKroHXLv81_pk,1820
138
140
  deepdoctection/utils/transform.py,sha256=3kCgsEeRkG1efCdkfvj7tUFMs-e2jbjbflq826F2GPU,8502
139
141
  deepdoctection/utils/types.py,sha256=_3dmPdCIZNLbgU5QP5k_c5phDf18xLe1kYL6t2nM45s,2953
140
142
  deepdoctection/utils/utils.py,sha256=ANzyIX6AY1yc-4gcn6yxksV84sPrJDaUurUNVatAFu8,5168
141
143
  deepdoctection/utils/viz.py,sha256=Xm6pKlhM29UWBBGZHlWFl9XYFDAqaYDdwHXwe26Hvqo,25728
142
- deepdoctection-0.34.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
143
- deepdoctection-0.34.dist-info/METADATA,sha256=YJ5XJnf7zMlDmr6f7vqvFNL11hy-ZEz8VbdYgii0AQo,19169
144
- deepdoctection-0.34.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
145
- deepdoctection-0.34.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
146
- deepdoctection-0.34.dist-info/RECORD,,
144
+ deepdoctection-0.35.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
145
+ deepdoctection-0.35.dist-info/METADATA,sha256=B6pPQjRYWcqd1p-3ul3PhflYOcKq2ZpP5D-i8kr7qgk,19403
146
+ deepdoctection-0.35.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
147
+ deepdoctection-0.35.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
148
+ deepdoctection-0.35.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.1.0)
2
+ Generator: setuptools (75.3.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5