deepdoctection 0.34__py3-none-any.whl → 0.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

@@ -20,6 +20,10 @@ Some useful function for collecting environment information.
20
20
 
21
21
  This is also the place where we give an overview of the important environment variables.
22
22
 
23
+ For boolean env variables, set one of the following values to enable them:
24
+
25
+ {"1", "True", "TRUE", "true", "yes"}
26
+
23
27
  `USE_TENSORFLOW
24
28
  USE_PYTORCH
25
29
  USE_CUDA
@@ -35,6 +39,12 @@ decide what image processing library the `viz_handler` should use. The default l
35
39
  to be installed separately. However, if both libraries have been detected `viz_handler` will opt for OpenCV.
36
40
  Use these variables to let `viz_handler` choose a library according to your preferences.
37
41
 
42
+ `USE_DD_POPPLER
43
+ USE_DD_PDFIUM`
44
+
45
+ For PDF rendering we use PyPDFium2 as default but for legacy reasons, we also support Poppler. If you want to enforce
46
+ Poppler, set `USE_DD_POPPLER=True` and `USE_DD_PDFIUM=False`.
47
+
38
48
  `HF_CREDENTIALS`
39
49
 
40
50
  will be used by the `ModelDownloadManager` to pass your credentials if you have a model registered that resides in a
@@ -56,6 +66,7 @@ from typing import Optional
56
66
 
57
67
  import numpy as np
58
68
  from packaging import version
69
+ from pypdf.errors import DependencyError
59
70
  from tabulate import tabulate
60
71
 
61
72
  from .file_utils import (
@@ -75,6 +86,7 @@ from .file_utils import (
75
86
  pdf_to_cairo_available,
76
87
  pdf_to_ppm_available,
77
88
  pdfplumber_available,
89
+ pypdfium2_available,
78
90
  pytorch_available,
79
91
  qpdf_available,
80
92
  scipy_available,
@@ -88,7 +100,7 @@ from .file_utils import (
88
100
  from .logger import LoggingRecord, logger
89
101
  from .types import KeyValEnvInfos, PathLikeOrStr
90
102
 
91
- __all__ = ["collect_env_info", "auto_select_viz_library", "ENV_VARS_TRUE"]
103
+ __all__ = ["collect_env_info", "auto_select_viz_library", "auto_select_pdf_render_framework", "ENV_VARS_TRUE"]
92
104
 
93
105
  # pylint: disable=import-outside-toplevel
94
106
 
@@ -532,4 +544,21 @@ def auto_select_viz_library() -> None:
532
544
  os.environ["USE_DD_OPENCV"] = "False"
533
545
 
534
546
 
547
def auto_select_pdf_render_framework() -> None:
    """
    Choose the PDF rendering backend by populating the environment variables `USE_DD_POPPLER` and `USE_DD_PDFIUM`.

    If either variable already holds a non-empty value, the environment is left untouched. Otherwise pdfium is
    selected when `pypdfium2_available()` is true, and Poppler is selected as fallback when
    `pdf_to_cairo_available()` or `pdf_to_ppm_available()` is true.

    :raises DependencyError: if none of the availability checks succeeds
    """

    # a non-empty value in either variable counts as an explicit user choice
    if any(os.environ.get(name) for name in ("USE_DD_POPPLER", "USE_DD_PDFIUM")):
        return

    if pypdfium2_available():
        poppler_flag, pdfium_flag = "False", "True"
    elif pdf_to_cairo_available() or pdf_to_ppm_available():
        poppler_flag, pdfium_flag = "True", "False"
    else:
        raise DependencyError("No pdf rendering library found. Please install Poppler or pdfium.")

    os.environ["USE_DD_POPPLER"] = poppler_flag
    os.environ["USE_DD_PDFIUM"] = pdfium_flag
562
+
563
+
535
564
  # pylint: enable=import-outside-toplevel
@@ -616,6 +616,25 @@ def get_pillow_requirement() -> Requirement:
616
616
  return "pillow", pillow_available(), _PILLOW_ERR_MSG
617
617
 
618
618
 
619
+ # Pypdfium2
620
+ _PYPDFIUM2_AVAILABLE = importlib.util.find_spec("pypdfium2") is not None
621
+ _PYPDFIUM2_ERR_MSG = f"pypdfium2 must be installed. {_GENERIC_ERR_MSG}"
622
+
623
+
624
def pypdfium2_available() -> bool:
    """
    Returns True if pypdfium2 is installed, as recorded in the module-level flag `_PYPDFIUM2_AVAILABLE`

    :return: Value of `_PYPDFIUM2_AVAILABLE`, converted to bool
    """
    return bool(_PYPDFIUM2_AVAILABLE)
629
+
630
+
631
def get_pypdfium2_requirement() -> Requirement:
    """
    Return pypdfium2 requirement

    :return: Tuple of the package name, the result of `pypdfium2_available()` and the error message
             `_PYPDFIUM2_ERR_MSG`
    """
    return "pypdfium2", pypdfium2_available(), _PYPDFIUM2_ERR_MSG
636
+
637
+
619
638
  # SpaCy
620
639
  _SPACY_AVAILABLE = importlib.util.find_spec("spacy") is not None
621
640
  _SPACY_ERR_MSG = f"SpaCy must be installed. {_GENERIC_ERR_MSG}"
@@ -18,6 +18,7 @@
18
18
  """
19
19
  Class AttrDict for maintaining configs and some functions for generating and saving AttrDict instances to .yaml files
20
20
  """
21
+ from __future__ import annotations
21
22
 
22
23
  import pprint
23
24
  from typing import Any
@@ -105,6 +106,17 @@ class AttrDict:
105
106
  v = eval(v) # pylint: disable=C0103, W0123
106
107
  setattr(dic, key, v)
107
108
 
109
def overwrite_config(self, other_config: AttrDict) -> None:
    """
    Overwrite the current config with values from another config. The values are transferred by passing
    `other_config.to_dict()` to `self.from_dict`.

    :param other_config: The other AttrDict instance to copy values from.
    :raises AttributeError: If this config has been frozen.
    """
    # NOTE(review): the previous docstring also listed an AttributeError for keys of other_config that are not
    # attributes of self; that would have to come from `from_dict`, which is not visible here -- confirm.
    if self._freezed:
        raise AttributeError("Config was freezed! Cannot overwrite config.")
    self.from_dict(other_config.to_dict())
119
+
108
120
  def freeze(self, freezed: bool = True) -> None:
109
121
  """
110
122
  :param freezed: freeze the instance, so that no attributes can be added or changed
@@ -24,13 +24,16 @@ import subprocess
24
24
  import sys
25
25
  from errno import ENOENT
26
26
  from io import BytesIO
27
+ from pathlib import Path
27
28
  from shutil import copyfile
28
- from typing import Generator, Optional
29
+ from typing import Generator, Literal, Optional
29
30
 
31
+ from lazy_imports import try_import
30
32
  from numpy import uint8
31
33
  from pypdf import PdfReader, PdfWriter, errors
32
34
 
33
35
  from .context import save_tmp_file, timeout_manager
36
+ from .env_info import ENV_VARS_TRUE
34
37
  from .error import DependencyError, FileExtensionError
35
38
  from .file_utils import pdf_to_cairo_available, pdf_to_ppm_available, qpdf_available
36
39
  from .logger import LoggingRecord, logger
@@ -38,7 +41,17 @@ from .types import PathLikeOrStr, PixelValues
38
41
  from .utils import is_file_extension
39
42
  from .viz import viz_handler
40
43
 
41
- __all__ = ["decrypt_pdf_document", "get_pdf_file_reader", "get_pdf_file_writer", "PDFStreamer", "pdf_to_np_array"]
44
+ with try_import() as pt_import_guard:
45
+ import pypdfium2
46
+
47
+ __all__ = [
48
+ "decrypt_pdf_document",
49
+ "get_pdf_file_reader",
50
+ "get_pdf_file_writer",
51
+ "PDFStreamer",
52
+ "pdf_to_np_array",
53
+ "split_pdf",
54
+ ]
42
55
 
43
56
 
44
57
  def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
@@ -234,7 +247,7 @@ def _run_poppler(poppler_args: list[str]) -> None:
234
247
  raise PopplerError(status=proc.returncode, message="Syntax Error: PDF cannot be read with Poppler")
235
248
 
236
249
 
237
- def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
250
+ def pdf_to_np_array_poppler(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
238
251
  """
239
252
  Convert a single pdf page from its byte representation to a numpy array. This function will save the pdf to a tmp
240
253
  file and then call poppler via `pdftoppm` resp. `pdftocairo` if the former is not available.
@@ -250,3 +263,73 @@ def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dp
250
263
  image = viz_handler.read_image(tmp_name + "-1.png")
251
264
 
252
265
  return image.astype(uint8)
266
+
267
+
268
def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: int = 200) -> PixelValues:
    """
    Render the first page of a pdf, given by its byte representation, into a numpy array using pdfium.

    :param pdf_bytes: Bytes representing the PDF file
    :param dpi: Image quality in DPI/dots-per-inch (default 200)
    :return: numpy array of dtype uint8
    """

    # the render scale is expressed relative to 72 -- presumably the pdf base resolution in points per inch
    render_scale = dpi / 72
    document = pypdfium2.PdfDocument(pdf_bytes)
    first_page = document[0]
    bitmap = first_page.render(scale=render_scale)
    return bitmap.to_numpy().astype(uint8)
279
+
280
+
281
def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
    """
    Convert a single pdf page from its byte representation to a numpy array. The rendering is delegated to pdfium if
    the environment variable `USE_DD_PDFIUM` holds one of the values in `ENV_VARS_TRUE`, and to Poppler otherwise.

    :param pdf_bytes: Bytes representing the PDF file
    :param size: Size of the resulting image(s), uses (width, height) standard. Only passed on to the Poppler
                 renderer; with pdfium a warning is logged and `dpi` determines the resolution
    :param dpi: Image quality in DPI/dots-per-inch (default 200)
    :return: numpy array
    """
    pdfium_selected = os.environ.get("USE_DD_PDFIUM", "False") in ENV_VARS_TRUE
    if not pdfium_selected:
        return pdf_to_np_array_poppler(pdf_bytes, size, dpi)

    if size is not None:
        # the pdfium renderer takes no size argument, so tell the caller that the request is dropped
        record = LoggingRecord(
            f"pdf_to_np_array_pdfmium does not support the size parameter. Will use dpi = {dpi} instead."
        )
        logger.warning(record)
    return pdf_to_np_array_pdfmium(pdf_bytes, dpi)
300
+
301
+
302
def split_pdf(
    pdf_path: PathLikeOrStr, output_dir: PathLikeOrStr, file_type: Literal["image", "pdf"], dpi: int = 200
) -> None:
    """
    Split a pdf into single pages. The pages are saved as single pdf/png files in a subfolder of the output directory.
    The subfolder is named after the pdf file (without extension) and the files are named `<name>_<page index>`.

    :param pdf_path: Path to the pdf file
    :param output_dir: Path to the output directory
    :param file_type: Type of the output file. Either "image" or "pdf". Any value other than "pdf" (or the legacy
                      spelling ".pdf") results in png images
    :param dpi: Image quality in DPI/dots-per-inch (default 200). Only used when rendering images
    """
    pdf_path = Path(pdf_path)
    filename = pdf_path.stem
    file_dir = Path(output_dir) / filename
    # create the target folder (and missing parents) without a separate existence check
    file_dir.mkdir(parents=True, exist_ok=True)

    # The documented value is "pdf". ".pdf" is accepted as well because it was the only value that selected pdf
    # output before this comparison was fixed.
    save_as_pdf = file_type in ("pdf", ".pdf")

    with open(pdf_path, "rb") as file:
        pdf = PdfReader(file)
        for i, page in enumerate(pdf.pages):
            writer = PdfWriter()
            try:
                writer.add_page(page)
                if save_as_pdf:
                    with open(file_dir / f"{filename}_{i}.pdf", "wb") as out:
                        writer.write(out)
                else:
                    # serialize the single page in memory and hand the bytes to the renderer
                    with BytesIO() as buffer:
                        writer.write(buffer)
                        np_image = pdf_to_np_array(buffer.getvalue(), dpi=dpi)
                    viz_handler.write_image(file_dir / f"{filename}_{i}.png", np_image)
            finally:
                # close the writer even if writing or rendering of this page fails
                writer.close()
@@ -155,3 +155,42 @@ def is_file_extension(file_name: PathLikeOrStr, extension: Union[str, Sequence[s
155
155
  if isinstance(extension, str):
156
156
  return os.path.splitext(file_name)[-1].lower() == extension
157
157
  return os.path.splitext(file_name)[-1].lower() in extension
158
+
159
+
160
def partition_list(base_list: list[str], stop_value: str) -> list[list[str]]:
    """
    Partitions a list of strings into sublists. A new sublist is started whenever the stop value occurs directly
    after an element that is not the stop value. Consecutive stop values are grouped together in the same sublist.
    Elements that precede the first stop value form a sublist of their own, so only that leading sublist may start
    with something other than the stop value.

    :param base_list: The list of strings to be partitioned.
    :param stop_value: The string value that indicates the start of a new partition.
    :return: A list of lists, where each sublist is a partition of the original list. Concatenating the sublists
             gives back the original list. An empty input returns an empty list.

    **Example:**

        strings = ['a', 'a', 'c', 'c', 'b', 'd', 'c', 'c', 'a', 'b', 'a', 'b', 'a', 'a']
        stop_string = 'a'
        partition_list(strings, stop_string)

        # Output [['a', 'a', 'c', 'c', 'b', 'd', 'c', 'c'], ['a', 'b'], ['a', 'b'], ['a', 'a']]
    """

    partitions: list[list[str]] = []
    current_partition: list[str] = []
    previous_was_stop = False

    for item in base_list:
        is_stop = item == stop_value
        # only the first stop value of a run closes the partition collected so far
        if is_stop and not previous_was_stop and current_partition:
            partitions.append(current_partition)
            current_partition = []
        current_partition.append(item)
        previous_was_stop = is_stop

    if current_partition:
        partitions.append(current_partition)

    return partitions
@@ -205,6 +205,7 @@ def draw_boxes(
205
205
  font_scale: float = 1.0,
206
206
  rectangle_thickness: int = 4,
207
207
  box_color_by_category: bool = True,
208
+ show_palette: bool = True,
208
209
  ) -> PixelValues:
209
210
  """
210
211
  Draw bounding boxes with category names into image.
@@ -216,6 +217,7 @@ def draw_boxes(
216
217
  :param font_scale: Font scale of text box
217
218
  :param rectangle_thickness: Thickness of bounding box
218
219
  :param box_color_by_category:
220
+ :param show_palette: Whether to show a color palette of the categories
219
221
  :return: A new image np.ndarray
220
222
  """
221
223
  if color is not None:
@@ -261,19 +263,20 @@ def draw_boxes(
261
263
  )
262
264
 
263
265
  # draw a (very ugly) color palette
264
- y_0 = np_image.shape[0]
265
- for category, col in category_to_color.items():
266
- if category is not None:
267
- np_image = viz_handler.draw_text(
268
- np_image,
269
- (np_image.shape[1], y_0),
270
- category,
271
- color=col,
272
- font_scale=font_scale * 3,
273
- rectangle_thickness=rectangle_thickness,
274
- )
275
- _, text_h = viz_handler.get_text_size(category, font_scale * 2)
276
- y_0 = y_0 - int(10 * text_h)
266
+ if show_palette:
267
+ y_0 = np_image.shape[0]
268
+ for category, col in category_to_color.items():
269
+ if category is not None:
270
+ np_image = viz_handler.draw_text(
271
+ np_image,
272
+ (np_image.shape[1], y_0),
273
+ category,
274
+ color=col,
275
+ font_scale=font_scale,
276
+ rectangle_thickness=rectangle_thickness,
277
+ )
278
+ _, text_h = viz_handler.get_text_size(category, font_scale * 2)
279
+ y_0 = y_0 - int(1 * text_h)
277
280
 
278
281
  return np_image
279
282
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepdoctection
3
- Version: 0.34
3
+ Version: 0.36
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -16,114 +16,117 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
16
  Requires-Python: >=3.9
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
- Requires-Dist: catalogue ==2.0.10
20
- Requires-Dist: huggingface-hub >=0.12.0
21
- Requires-Dist: importlib-metadata >=5.0.0
22
- Requires-Dist: jsonlines ==3.1.0
23
- Requires-Dist: lazy-imports ==0.3.1
24
- Requires-Dist: mock ==4.0.3
25
- Requires-Dist: networkx >=2.7.1
26
- Requires-Dist: numpy <2.0,>=1.21
27
- Requires-Dist: packaging >=20.0
28
- Requires-Dist: Pillow >=10.0.0
29
- Requires-Dist: pypdf >=3.16.0
30
- Requires-Dist: pyyaml >=6.0.1
31
- Requires-Dist: pyzmq >=16
32
- Requires-Dist: scipy >=1.13.1
33
- Requires-Dist: termcolor >=1.1
34
- Requires-Dist: tabulate >=0.7.7
35
- Requires-Dist: tqdm ==4.64.0
36
- Provides-Extra: dev
37
- Requires-Dist: python-dotenv ==1.0.0 ; extra == 'dev'
38
- Requires-Dist: click ; extra == 'dev'
39
- Requires-Dist: black ==23.7.0 ; extra == 'dev'
40
- Requires-Dist: isort ==5.13.2 ; extra == 'dev'
41
- Requires-Dist: pylint ==2.17.4 ; extra == 'dev'
42
- Requires-Dist: mypy ==1.4.1 ; extra == 'dev'
43
- Requires-Dist: wandb ; extra == 'dev'
44
- Requires-Dist: types-PyYAML >=6.0.12.12 ; extra == 'dev'
45
- Requires-Dist: types-termcolor >=1.1.3 ; extra == 'dev'
46
- Requires-Dist: types-tabulate >=0.9.0.3 ; extra == 'dev'
47
- Requires-Dist: types-tqdm >=4.66.0.5 ; extra == 'dev'
48
- Requires-Dist: lxml-stubs >=0.5.1 ; extra == 'dev'
49
- Requires-Dist: types-Pillow >=10.2.0.20240406 ; extra == 'dev'
50
- Requires-Dist: types-urllib3 >=1.26.25.14 ; extra == 'dev'
51
- Provides-Extra: docs
52
- Requires-Dist: tensorpack ==0.11 ; extra == 'docs'
53
- Requires-Dist: boto3 ==1.34.102 ; extra == 'docs'
54
- Requires-Dist: transformers >=4.36.0 ; extra == 'docs'
55
- Requires-Dist: accelerate >=0.29.1 ; extra == 'docs'
56
- Requires-Dist: pdfplumber >=0.11.0 ; extra == 'docs'
57
- Requires-Dist: lxml >=4.9.1 ; extra == 'docs'
58
- Requires-Dist: lxml-stubs >=0.5.1 ; extra == 'docs'
59
- Requires-Dist: jdeskew >=0.2.2 ; extra == 'docs'
60
- Requires-Dist: jinja2 ==3.0.3 ; extra == 'docs'
61
- Requires-Dist: mkdocs-material ; extra == 'docs'
62
- Requires-Dist: mkdocstrings-python ; extra == 'docs'
63
- Requires-Dist: griffe ==0.25.0 ; extra == 'docs'
19
+ Requires-Dist: catalogue==2.0.10
20
+ Requires-Dist: huggingface_hub<0.26,>=0.12.0
21
+ Requires-Dist: importlib-metadata>=5.0.0
22
+ Requires-Dist: jsonlines==3.1.0
23
+ Requires-Dist: lazy-imports==0.3.1
24
+ Requires-Dist: mock==4.0.3
25
+ Requires-Dist: networkx>=2.7.1
26
+ Requires-Dist: numpy<2.0,>=1.21
27
+ Requires-Dist: packaging>=20.0
28
+ Requires-Dist: Pillow>=10.0.0
29
+ Requires-Dist: pypdf>=3.16.0
30
+ Requires-Dist: pypdfium2>=4.30.0
31
+ Requires-Dist: pyyaml>=6.0.1
32
+ Requires-Dist: pyzmq>=16
33
+ Requires-Dist: scipy>=1.13.1
34
+ Requires-Dist: termcolor>=1.1
35
+ Requires-Dist: tabulate>=0.7.7
36
+ Requires-Dist: tqdm==4.64.0
37
+ Provides-Extra: tf
38
+ Requires-Dist: catalogue==2.0.10; extra == "tf"
39
+ Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "tf"
40
+ Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
41
+ Requires-Dist: jsonlines==3.1.0; extra == "tf"
42
+ Requires-Dist: lazy-imports==0.3.1; extra == "tf"
43
+ Requires-Dist: mock==4.0.3; extra == "tf"
44
+ Requires-Dist: networkx>=2.7.1; extra == "tf"
45
+ Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
46
+ Requires-Dist: packaging>=20.0; extra == "tf"
47
+ Requires-Dist: Pillow>=10.0.0; extra == "tf"
48
+ Requires-Dist: pypdf>=3.16.0; extra == "tf"
49
+ Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
50
+ Requires-Dist: pyyaml>=6.0.1; extra == "tf"
51
+ Requires-Dist: pyzmq>=16; extra == "tf"
52
+ Requires-Dist: scipy>=1.13.1; extra == "tf"
53
+ Requires-Dist: termcolor>=1.1; extra == "tf"
54
+ Requires-Dist: tabulate>=0.7.7; extra == "tf"
55
+ Requires-Dist: tqdm==4.64.0; extra == "tf"
56
+ Requires-Dist: tensorpack==0.11; extra == "tf"
57
+ Requires-Dist: protobuf==3.20.1; extra == "tf"
58
+ Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
59
+ Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
60
+ Requires-Dist: python-doctr==0.8.1; extra == "tf"
61
+ Requires-Dist: pycocotools>=2.0.2; extra == "tf"
62
+ Requires-Dist: boto3==1.34.102; extra == "tf"
63
+ Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
64
+ Requires-Dist: fasttext==0.9.2; extra == "tf"
65
+ Requires-Dist: jdeskew>=0.2.2; extra == "tf"
66
+ Requires-Dist: apted==1.0.3; extra == "tf"
67
+ Requires-Dist: distance==0.1.3; extra == "tf"
68
+ Requires-Dist: lxml>=4.9.1; extra == "tf"
64
69
  Provides-Extra: pt
65
- Requires-Dist: catalogue ==2.0.10 ; extra == 'pt'
66
- Requires-Dist: huggingface-hub >=0.12.0 ; extra == 'pt'
67
- Requires-Dist: importlib-metadata >=5.0.0 ; extra == 'pt'
68
- Requires-Dist: jsonlines ==3.1.0 ; extra == 'pt'
69
- Requires-Dist: lazy-imports ==0.3.1 ; extra == 'pt'
70
- Requires-Dist: mock ==4.0.3 ; extra == 'pt'
71
- Requires-Dist: networkx >=2.7.1 ; extra == 'pt'
72
- Requires-Dist: numpy <2.0,>=1.21 ; extra == 'pt'
73
- Requires-Dist: packaging >=20.0 ; extra == 'pt'
74
- Requires-Dist: Pillow >=10.0.0 ; extra == 'pt'
75
- Requires-Dist: pypdf >=3.16.0 ; extra == 'pt'
76
- Requires-Dist: pyyaml >=6.0.1 ; extra == 'pt'
77
- Requires-Dist: pyzmq >=16 ; extra == 'pt'
78
- Requires-Dist: scipy >=1.13.1 ; extra == 'pt'
79
- Requires-Dist: termcolor >=1.1 ; extra == 'pt'
80
- Requires-Dist: tabulate >=0.7.7 ; extra == 'pt'
81
- Requires-Dist: tqdm ==4.64.0 ; extra == 'pt'
82
- Requires-Dist: timm >=0.9.16 ; extra == 'pt'
83
- Requires-Dist: transformers >=4.36.0 ; extra == 'pt'
84
- Requires-Dist: accelerate >=0.29.1 ; extra == 'pt'
85
- Requires-Dist: python-doctr ==0.8.1 ; extra == 'pt'
86
- Requires-Dist: boto3 ==1.34.102 ; extra == 'pt'
87
- Requires-Dist: pdfplumber >=0.11.0 ; extra == 'pt'
88
- Requires-Dist: fasttext ==0.9.2 ; extra == 'pt'
89
- Requires-Dist: jdeskew >=0.2.2 ; extra == 'pt'
90
- Requires-Dist: apted ==1.0.3 ; extra == 'pt'
91
- Requires-Dist: distance ==0.1.3 ; extra == 'pt'
92
- Requires-Dist: lxml >=4.9.1 ; extra == 'pt'
70
+ Requires-Dist: catalogue==2.0.10; extra == "pt"
71
+ Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "pt"
72
+ Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
73
+ Requires-Dist: jsonlines==3.1.0; extra == "pt"
74
+ Requires-Dist: lazy-imports==0.3.1; extra == "pt"
75
+ Requires-Dist: mock==4.0.3; extra == "pt"
76
+ Requires-Dist: networkx>=2.7.1; extra == "pt"
77
+ Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
78
+ Requires-Dist: packaging>=20.0; extra == "pt"
79
+ Requires-Dist: Pillow>=10.0.0; extra == "pt"
80
+ Requires-Dist: pypdf>=3.16.0; extra == "pt"
81
+ Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
82
+ Requires-Dist: pyyaml>=6.0.1; extra == "pt"
83
+ Requires-Dist: pyzmq>=16; extra == "pt"
84
+ Requires-Dist: scipy>=1.13.1; extra == "pt"
85
+ Requires-Dist: termcolor>=1.1; extra == "pt"
86
+ Requires-Dist: tabulate>=0.7.7; extra == "pt"
87
+ Requires-Dist: tqdm==4.64.0; extra == "pt"
88
+ Requires-Dist: timm>=0.9.16; extra == "pt"
89
+ Requires-Dist: transformers>=4.36.0; extra == "pt"
90
+ Requires-Dist: accelerate>=0.29.1; extra == "pt"
91
+ Requires-Dist: python-doctr==0.8.1; extra == "pt"
92
+ Requires-Dist: boto3==1.34.102; extra == "pt"
93
+ Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
94
+ Requires-Dist: fasttext==0.9.2; extra == "pt"
95
+ Requires-Dist: jdeskew>=0.2.2; extra == "pt"
96
+ Requires-Dist: apted==1.0.3; extra == "pt"
97
+ Requires-Dist: distance==0.1.3; extra == "pt"
98
+ Requires-Dist: lxml>=4.9.1; extra == "pt"
99
+ Provides-Extra: docs
100
+ Requires-Dist: tensorpack==0.11; extra == "docs"
101
+ Requires-Dist: boto3==1.34.102; extra == "docs"
102
+ Requires-Dist: transformers>=4.36.0; extra == "docs"
103
+ Requires-Dist: accelerate>=0.29.1; extra == "docs"
104
+ Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
105
+ Requires-Dist: lxml>=4.9.1; extra == "docs"
106
+ Requires-Dist: lxml-stubs>=0.5.1; extra == "docs"
107
+ Requires-Dist: jdeskew>=0.2.2; extra == "docs"
108
+ Requires-Dist: jinja2==3.0.3; extra == "docs"
109
+ Requires-Dist: mkdocs-material; extra == "docs"
110
+ Requires-Dist: mkdocstrings-python; extra == "docs"
111
+ Requires-Dist: griffe==0.25.0; extra == "docs"
112
+ Provides-Extra: dev
113
+ Requires-Dist: python-dotenv==1.0.0; extra == "dev"
114
+ Requires-Dist: click; extra == "dev"
115
+ Requires-Dist: black==23.7.0; extra == "dev"
116
+ Requires-Dist: isort==5.13.2; extra == "dev"
117
+ Requires-Dist: pylint==2.17.4; extra == "dev"
118
+ Requires-Dist: mypy==1.4.1; extra == "dev"
119
+ Requires-Dist: wandb; extra == "dev"
120
+ Requires-Dist: types-PyYAML>=6.0.12.12; extra == "dev"
121
+ Requires-Dist: types-termcolor>=1.1.3; extra == "dev"
122
+ Requires-Dist: types-tabulate>=0.9.0.3; extra == "dev"
123
+ Requires-Dist: types-tqdm>=4.66.0.5; extra == "dev"
124
+ Requires-Dist: lxml-stubs>=0.5.1; extra == "dev"
125
+ Requires-Dist: types-Pillow>=10.2.0.20240406; extra == "dev"
126
+ Requires-Dist: types-urllib3>=1.26.25.14; extra == "dev"
93
127
  Provides-Extra: test
94
- Requires-Dist: pytest ==8.0.2 ; extra == 'test'
95
- Requires-Dist: pytest-cov ; extra == 'test'
96
- Provides-Extra: tf
97
- Requires-Dist: catalogue ==2.0.10 ; extra == 'tf'
98
- Requires-Dist: huggingface-hub >=0.12.0 ; extra == 'tf'
99
- Requires-Dist: importlib-metadata >=5.0.0 ; extra == 'tf'
100
- Requires-Dist: jsonlines ==3.1.0 ; extra == 'tf'
101
- Requires-Dist: lazy-imports ==0.3.1 ; extra == 'tf'
102
- Requires-Dist: mock ==4.0.3 ; extra == 'tf'
103
- Requires-Dist: networkx >=2.7.1 ; extra == 'tf'
104
- Requires-Dist: numpy <2.0,>=1.21 ; extra == 'tf'
105
- Requires-Dist: packaging >=20.0 ; extra == 'tf'
106
- Requires-Dist: Pillow >=10.0.0 ; extra == 'tf'
107
- Requires-Dist: pypdf >=3.16.0 ; extra == 'tf'
108
- Requires-Dist: pyyaml >=6.0.1 ; extra == 'tf'
109
- Requires-Dist: pyzmq >=16 ; extra == 'tf'
110
- Requires-Dist: scipy >=1.13.1 ; extra == 'tf'
111
- Requires-Dist: termcolor >=1.1 ; extra == 'tf'
112
- Requires-Dist: tabulate >=0.7.7 ; extra == 'tf'
113
- Requires-Dist: tqdm ==4.64.0 ; extra == 'tf'
114
- Requires-Dist: tensorpack ==0.11 ; extra == 'tf'
115
- Requires-Dist: protobuf ==3.20.1 ; extra == 'tf'
116
- Requires-Dist: tensorflow-addons >=0.17.1 ; extra == 'tf'
117
- Requires-Dist: tf2onnx >=1.9.2 ; extra == 'tf'
118
- Requires-Dist: python-doctr ==0.8.1 ; extra == 'tf'
119
- Requires-Dist: pycocotools >=2.0.2 ; extra == 'tf'
120
- Requires-Dist: boto3 ==1.34.102 ; extra == 'tf'
121
- Requires-Dist: pdfplumber >=0.11.0 ; extra == 'tf'
122
- Requires-Dist: fasttext ==0.9.2 ; extra == 'tf'
123
- Requires-Dist: jdeskew >=0.2.2 ; extra == 'tf'
124
- Requires-Dist: apted ==1.0.3 ; extra == 'tf'
125
- Requires-Dist: distance ==0.1.3 ; extra == 'tf'
126
- Requires-Dist: lxml >=4.9.1 ; extra == 'tf'
128
+ Requires-Dist: pytest==8.0.2; extra == "test"
129
+ Requires-Dist: pytest-cov; extra == "test"
127
130
 
128
131
 
129
132
  <p align="center">
@@ -172,13 +175,17 @@ pipelines. Its core function does not depend on any specific deep learning libra
172
175
  - Document layout analysis and table recognition now runs with
173
176
  [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
174
177
  anymore for basic inference.
175
- - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
176
- (not contained in the built-in Analyzer).
177
- - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
178
+ - More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
179
+ - Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
178
180
  [**transformers**](https://github.com/huggingface/transformers).
179
181
  We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
180
182
  that seem to look promising, especially if you want to train a model on non-english data. The training script for
181
- LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
183
+ LayoutLM can be used for LiLT as well.
184
+ - [**new**] There are two notebooks available that show how to write a
185
+ [custom predictor](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) based on
186
+ a third party library that has not been supported yet and how to use
187
+ [advanced configuration](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) to
188
+ get links between layout segments e.g. captions and tables or figures.
182
189
 
183
190
  **deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
184
191
  post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
@@ -263,7 +270,7 @@ documentation.
263
270
 
264
271
  ## Requirements
265
272
 
266
- ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection.png)
273
+ ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection_081124.png)
267
274
 
268
275
  Everything in the overview listed below the **deep**doctection layer are necessary requirements and have to be installed
269
276
  separately.
@@ -272,13 +279,16 @@ separately.
272
279
  - Python >= 3.9
273
280
  - 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
274
281
  In general, if you want to train or fine-tune models, a GPU is required.
275
- - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
276
- images.
282
+
277
283
  - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
278
284
  and [PyTorch](https://pytorch.org/get-started/locally/).
279
285
  - [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
280
286
  engine has to be installed separately.
281
287
 
288
+
289
+ - For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
290
+ documents into images. For release `v.0.35.0` this dependency will be optional.
291
+
282
292
  The following overview shows the availability of the models in conjunction with the DL framework.
283
293
 
284
294
  | Task | PyTorch | Torchscript | Tensorflow |
@@ -396,8 +406,8 @@ to develop this framework.
396
406
  ## Problems
397
407
 
398
408
  We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
399
- repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 4
400
- to 6 weeks.
409
+ repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
410
+ to 12 weeks.
401
411
 
402
412
  ## If you like **deep**doctection ...
403
413