deepdoctection 0.35__py3-none-any.whl → 0.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

@@ -26,7 +26,7 @@ from errno import ENOENT
26
26
  from io import BytesIO
27
27
  from pathlib import Path
28
28
  from shutil import copyfile
29
- from typing import Generator, Literal, Optional
29
+ from typing import Generator, Literal, Optional, Union
30
30
 
31
31
  from lazy_imports import try_import
32
32
  from numpy import uint8
@@ -46,6 +46,7 @@ with try_import() as pt_import_guard:
46
46
 
47
47
  __all__ = [
48
48
  "decrypt_pdf_document",
49
+ "decrypt_pdf_document_from_bytes",
49
50
  "get_pdf_file_reader",
50
51
  "get_pdf_file_writer",
51
52
  "PDFStreamer",
@@ -68,7 +69,6 @@ def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
68
69
  :param path: A path to the pdf file
69
70
  :return: True if document has been successfully decrypted
70
71
  """
71
-
72
72
  if qpdf_available():
73
73
  path_base, file_name = os.path.split(path)
74
74
  file_name_tmp = os.path.splitext(file_name)[0] + "tmp.pdf"
@@ -86,41 +86,69 @@ def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
86
86
  return False
87
87
 
88
88
 
89
- def get_pdf_file_reader(path: PathLikeOrStr) -> PdfReader:
89
+ def decrypt_pdf_document_from_bytes(input_bytes: bytes) -> bytes:
90
+ """
91
+ Decrypting a pdf given as bytes. Under the hood, it saves the bytes to a temporary file and then calls
92
+
93
+ qpdf: <http://qpdf.sourceforge.net/>
94
+
95
+ :param input_bytes: A bytes object representing the pdf file
96
+ :return: The decrypted bytes object
97
+ """
98
+ with save_tmp_file(input_bytes, "pdf_") as (_, input_file_name):
99
+ is_decrypted = decrypt_pdf_document(input_file_name)
100
+ if is_decrypted:
101
+ with open(input_file_name, "rb") as file:
102
+ return file.read()
103
+ else:
104
+ logger.error(LoggingRecord("pdf bytes cannot be decrypted and therefore cannot be processed further."))
105
+ sys.exit()
106
+
107
+
108
+ def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader:
90
109
  """
91
110
  Creates a file reader object from a pdf document. Will try to decrypt the document if it is
92
111
  encrypted. (See `decrypt_pdf_document` to understand what is meant with "decrypt").
93
112
 
94
- :param path: A path to a pdf document
113
+ :param path_or_bytes: A path to a pdf document
95
114
  :return: A file reader object from which you can iterate through the document.
96
115
  """
97
116
 
98
- if not os.path.isfile(path):
99
- raise FileNotFoundError(str(path))
100
- file_name = os.path.split(path)[1]
117
+ if isinstance(path_or_bytes, bytes):
118
+ try:
119
+ reader = PdfReader(BytesIO(path_or_bytes))
120
+ except (errors.PdfReadError, AttributeError):
121
+ decrypted_bytes = decrypt_pdf_document_from_bytes(path_or_bytes)
122
+ reader = PdfReader(BytesIO(decrypted_bytes))
123
+ return reader
124
+
125
+ if not os.path.isfile(path_or_bytes):
126
+ raise FileNotFoundError(str(path_or_bytes))
127
+ file_name = os.path.split(path_or_bytes)[1]
101
128
  if not is_file_extension(file_name, ".pdf"):
102
129
  raise FileExtensionError(f"must be a pdf file: {file_name}")
103
130
 
104
- with open(path, "rb") as file:
131
+ with open(path_or_bytes, "rb") as file:
105
132
  qpdf_called = False
106
133
  try:
107
- input_pdf_as_bytes = PdfReader(file)
134
+ reader = PdfReader(file)
108
135
  except (errors.PdfReadError, AttributeError):
109
- _ = decrypt_pdf_document(path)
136
+ _ = decrypt_pdf_document(path_or_bytes)
110
137
  qpdf_called = True
111
138
 
112
139
  if not qpdf_called:
113
- if input_pdf_as_bytes.is_encrypted:
114
- is_decrypted = decrypt_pdf_document(path)
140
+ if reader.is_encrypted:
141
+ is_decrypted = decrypt_pdf_document(path_or_bytes)
115
142
  if not is_decrypted:
116
143
  logger.error(
117
144
  LoggingRecord(
118
- f"pdf document {path} cannot be decrypted and therefore cannot be " f"processed further."
145
+ f"pdf document {path_or_bytes} cannot be decrypted and therefore cannot "
146
+ f"be processed further."
119
147
  )
120
148
  )
121
149
  sys.exit()
122
150
 
123
- return PdfReader(os.fspath(path))
151
+ return PdfReader(os.fspath(path_or_bytes))
124
152
 
125
153
 
126
154
  def get_pdf_file_writer() -> PdfWriter:
@@ -157,11 +185,11 @@ class PDFStreamer:
157
185
 
158
186
  """
159
187
 
160
- def __init__(self, path: PathLikeOrStr) -> None:
188
+ def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes]) -> None:
161
189
  """
162
- :param path: to a pdf.
190
+ :param path_or_bytes: to a pdf.
163
191
  """
164
- self.file_reader = get_pdf_file_reader(path)
192
+ self.file_reader = get_pdf_file_reader(path_or_bytes)
165
193
  self.file_writer = PdfWriter()
166
194
 
167
195
  def __len__(self) -> int:
@@ -155,3 +155,42 @@ def is_file_extension(file_name: PathLikeOrStr, extension: Union[str, Sequence[s
155
155
  if isinstance(extension, str):
156
156
  return os.path.splitext(file_name)[-1].lower() == extension
157
157
  return os.path.splitext(file_name)[-1].lower() in extension
158
+
159
+
160
+ def partition_list(base_list: list[str], stop_value: str) -> list[list[str]]:
161
+ """
162
+ Partitions a list of strings into sublists, where each sublist starts with the first occurrence of the stop value.
163
+ Consecutive stop values are grouped together in the same sublist.
164
+
165
+ :param base_list: The list of strings to be partitioned.
166
+ :param stop_value: The string value that indicates the start of a new partition.
167
+ :return: A list of lists, where each sublist is a partition of the original list.
168
+
169
+ ** Example:**
170
+
171
+ strings = ['a', 'a', 'c', 'c', 'b', 'd', 'c', 'c', 'a', 'b', 'a', 'b', 'a', 'a']
172
+ stop_string = 'a'
173
+ partition_list(strings, stop_string)
174
+
175
+ # Output [['a', 'a', 'c', 'c', 'b', 'd', 'c', 'c'], ['a', 'b'], ['a', 'b'], ['a', 'a']]
176
+ """
177
+
178
+ partitions = []
179
+ current_partition: list[str] = []
180
+ stop_found = False
181
+
182
+ for s in base_list:
183
+ if s == stop_value:
184
+ if not stop_found and current_partition:
185
+ partitions.append(current_partition)
186
+ current_partition = []
187
+ current_partition.append(s)
188
+ stop_found = True
189
+ else:
190
+ current_partition.append(s)
191
+ stop_found = False
192
+
193
+ if current_partition:
194
+ partitions.append(current_partition)
195
+
196
+ return partitions
@@ -205,6 +205,7 @@ def draw_boxes(
205
205
  font_scale: float = 1.0,
206
206
  rectangle_thickness: int = 4,
207
207
  box_color_by_category: bool = True,
208
+ show_palette: bool = True,
208
209
  ) -> PixelValues:
209
210
  """
210
211
  Dray bounding boxes with category names into image.
@@ -216,6 +217,7 @@ def draw_boxes(
216
217
  :param font_scale: Font scale of text box
217
218
  :param rectangle_thickness: Thickness of bounding box
218
219
  :param box_color_by_category:
220
+ :param show_palette: Whether to show a color palette of the categories
219
221
  :return: A new image np.ndarray
220
222
  """
221
223
  if color is not None:
@@ -261,19 +263,20 @@ def draw_boxes(
261
263
  )
262
264
 
263
265
  # draw a (very ugly) color palette
264
- y_0 = np_image.shape[0]
265
- for category, col in category_to_color.items():
266
- if category is not None:
267
- np_image = viz_handler.draw_text(
268
- np_image,
269
- (np_image.shape[1], y_0),
270
- category,
271
- color=col,
272
- font_scale=font_scale * 3,
273
- rectangle_thickness=rectangle_thickness,
274
- )
275
- _, text_h = viz_handler.get_text_size(category, font_scale * 2)
276
- y_0 = y_0 - int(10 * text_h)
266
+ if show_palette:
267
+ y_0 = np_image.shape[0]
268
+ for category, col in category_to_color.items():
269
+ if category is not None:
270
+ np_image = viz_handler.draw_text(
271
+ np_image,
272
+ (np_image.shape[1], y_0),
273
+ category,
274
+ color=col,
275
+ font_scale=font_scale,
276
+ rectangle_thickness=rectangle_thickness,
277
+ )
278
+ _, text_h = viz_handler.get_text_size(category, font_scale * 2)
279
+ y_0 = y_0 - int(1 * text_h)
277
280
 
278
281
  return np_image
279
282
 
@@ -309,6 +312,7 @@ class VizPackageHandler:
309
312
  "interactive_imshow": "_cv2_interactive_imshow",
310
313
  "encode": "_cv2_encode",
311
314
  "rotate_image": "_cv2_rotate_image",
315
+ "convert_bytes_to_np": "_cv2_convert_bytes_to_np",
312
316
  },
313
317
  "pillow": {
314
318
  "read_image": "_pillow_read_image",
@@ -322,6 +326,7 @@ class VizPackageHandler:
322
326
  "interactive_imshow": "_pillow_interactive_imshow",
323
327
  "encode": "_pillow_encode",
324
328
  "rotate_image": "_pillow_rotate_image",
329
+ "convert_bytes_to_np": "_pillow_convert_bytes_to_np",
325
330
  },
326
331
  }
327
332
 
@@ -481,6 +486,37 @@ class VizPackageHandler:
481
486
  pil_image = Image.open(im_file)
482
487
  return np.array(pil_image)[:, :, ::-1]
483
488
 
489
+ def convert_bytes_to_np(self, image_bytes: bytes) -> PixelValues:
490
+ """Converting an image as bytes into np.array
491
+
492
+ :param image_bytes: Image as np.array
493
+ """
494
+ return getattr(self, self.pkg_func_dict["convert_bytes_to_np"])(image_bytes)
495
+
496
+ @staticmethod
497
+ def _cv2_convert_bytes_to_np(image_bytes: bytes) -> PixelValues:
498
+ """
499
+ Convert image bytes to a numpy array using OpenCV.
500
+
501
+ :param image_bytes: Image bytes
502
+ :return: Image as numpy array
503
+ """
504
+ np_array = np.frombuffer(image_bytes, np.uint8)
505
+ np_image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
506
+ return np_image
507
+
508
+ @staticmethod
509
+ def _pillow_convert_bytes_to_np(image_bytes: bytes) -> PixelValues:
510
+ """
511
+ Convert image bytes to a numpy array using Pillow.
512
+
513
+ :param image_bytes: Image bytes
514
+ :return: Image as numpy array
515
+ """
516
+ image = Image.open(BytesIO(image_bytes))
517
+ np_image = np.array(image)
518
+ return np_image
519
+
484
520
  def resize(self, image: PixelValues, width: int, height: int, interpolation: str) -> PixelValues:
485
521
  """
486
522
  Resize a given image to new width, height. Specifying an interpolation method is required. Depending on the
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepdoctection
3
- Version: 0.35
3
+ Version: 0.37
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -16,117 +16,117 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
16
  Requires-Python: >=3.9
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
- Requires-Dist: catalogue ==2.0.10
20
- Requires-Dist: huggingface-hub <0.26,>=0.12.0
21
- Requires-Dist: importlib-metadata >=5.0.0
22
- Requires-Dist: jsonlines ==3.1.0
23
- Requires-Dist: lazy-imports ==0.3.1
24
- Requires-Dist: mock ==4.0.3
25
- Requires-Dist: networkx >=2.7.1
26
- Requires-Dist: numpy <2.0,>=1.21
27
- Requires-Dist: packaging >=20.0
28
- Requires-Dist: Pillow >=10.0.0
29
- Requires-Dist: pypdf >=3.16.0
30
- Requires-Dist: pypdfium2 >=4.30.0
31
- Requires-Dist: pyyaml >=6.0.1
32
- Requires-Dist: pyzmq >=16
33
- Requires-Dist: scipy >=1.13.1
34
- Requires-Dist: termcolor >=1.1
35
- Requires-Dist: tabulate >=0.7.7
36
- Requires-Dist: tqdm ==4.64.0
37
- Provides-Extra: dev
38
- Requires-Dist: python-dotenv ==1.0.0 ; extra == 'dev'
39
- Requires-Dist: click ; extra == 'dev'
40
- Requires-Dist: black ==23.7.0 ; extra == 'dev'
41
- Requires-Dist: isort ==5.13.2 ; extra == 'dev'
42
- Requires-Dist: pylint ==2.17.4 ; extra == 'dev'
43
- Requires-Dist: mypy ==1.4.1 ; extra == 'dev'
44
- Requires-Dist: wandb ; extra == 'dev'
45
- Requires-Dist: types-PyYAML >=6.0.12.12 ; extra == 'dev'
46
- Requires-Dist: types-termcolor >=1.1.3 ; extra == 'dev'
47
- Requires-Dist: types-tabulate >=0.9.0.3 ; extra == 'dev'
48
- Requires-Dist: types-tqdm >=4.66.0.5 ; extra == 'dev'
49
- Requires-Dist: lxml-stubs >=0.5.1 ; extra == 'dev'
50
- Requires-Dist: types-Pillow >=10.2.0.20240406 ; extra == 'dev'
51
- Requires-Dist: types-urllib3 >=1.26.25.14 ; extra == 'dev'
52
- Provides-Extra: docs
53
- Requires-Dist: tensorpack ==0.11 ; extra == 'docs'
54
- Requires-Dist: boto3 ==1.34.102 ; extra == 'docs'
55
- Requires-Dist: transformers >=4.36.0 ; extra == 'docs'
56
- Requires-Dist: accelerate >=0.29.1 ; extra == 'docs'
57
- Requires-Dist: pdfplumber >=0.11.0 ; extra == 'docs'
58
- Requires-Dist: lxml >=4.9.1 ; extra == 'docs'
59
- Requires-Dist: lxml-stubs >=0.5.1 ; extra == 'docs'
60
- Requires-Dist: jdeskew >=0.2.2 ; extra == 'docs'
61
- Requires-Dist: jinja2 ==3.0.3 ; extra == 'docs'
62
- Requires-Dist: mkdocs-material ; extra == 'docs'
63
- Requires-Dist: mkdocstrings-python ; extra == 'docs'
64
- Requires-Dist: griffe ==0.25.0 ; extra == 'docs'
19
+ Requires-Dist: catalogue==2.0.10
20
+ Requires-Dist: huggingface_hub<0.26,>=0.12.0
21
+ Requires-Dist: importlib-metadata>=5.0.0
22
+ Requires-Dist: jsonlines==3.1.0
23
+ Requires-Dist: lazy-imports==0.3.1
24
+ Requires-Dist: mock==4.0.3
25
+ Requires-Dist: networkx>=2.7.1
26
+ Requires-Dist: numpy<2.0,>=1.21
27
+ Requires-Dist: packaging>=20.0
28
+ Requires-Dist: Pillow>=10.0.0
29
+ Requires-Dist: pypdf>=3.16.0
30
+ Requires-Dist: pypdfium2>=4.30.0
31
+ Requires-Dist: pyyaml>=6.0.1
32
+ Requires-Dist: pyzmq>=16
33
+ Requires-Dist: scipy>=1.13.1
34
+ Requires-Dist: termcolor>=1.1
35
+ Requires-Dist: tabulate>=0.7.7
36
+ Requires-Dist: tqdm==4.64.0
37
+ Provides-Extra: tf
38
+ Requires-Dist: catalogue==2.0.10; extra == "tf"
39
+ Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "tf"
40
+ Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
41
+ Requires-Dist: jsonlines==3.1.0; extra == "tf"
42
+ Requires-Dist: lazy-imports==0.3.1; extra == "tf"
43
+ Requires-Dist: mock==4.0.3; extra == "tf"
44
+ Requires-Dist: networkx>=2.7.1; extra == "tf"
45
+ Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
46
+ Requires-Dist: packaging>=20.0; extra == "tf"
47
+ Requires-Dist: Pillow>=10.0.0; extra == "tf"
48
+ Requires-Dist: pypdf>=3.16.0; extra == "tf"
49
+ Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
50
+ Requires-Dist: pyyaml>=6.0.1; extra == "tf"
51
+ Requires-Dist: pyzmq>=16; extra == "tf"
52
+ Requires-Dist: scipy>=1.13.1; extra == "tf"
53
+ Requires-Dist: termcolor>=1.1; extra == "tf"
54
+ Requires-Dist: tabulate>=0.7.7; extra == "tf"
55
+ Requires-Dist: tqdm==4.64.0; extra == "tf"
56
+ Requires-Dist: tensorpack==0.11; extra == "tf"
57
+ Requires-Dist: protobuf==3.20.1; extra == "tf"
58
+ Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
59
+ Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
60
+ Requires-Dist: python-doctr==0.8.1; extra == "tf"
61
+ Requires-Dist: pycocotools>=2.0.2; extra == "tf"
62
+ Requires-Dist: boto3==1.34.102; extra == "tf"
63
+ Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
64
+ Requires-Dist: fasttext==0.9.2; extra == "tf"
65
+ Requires-Dist: jdeskew>=0.2.2; extra == "tf"
66
+ Requires-Dist: apted==1.0.3; extra == "tf"
67
+ Requires-Dist: distance==0.1.3; extra == "tf"
68
+ Requires-Dist: lxml>=4.9.1; extra == "tf"
65
69
  Provides-Extra: pt
66
- Requires-Dist: catalogue ==2.0.10 ; extra == 'pt'
67
- Requires-Dist: huggingface-hub <0.26,>=0.12.0 ; extra == 'pt'
68
- Requires-Dist: importlib-metadata >=5.0.0 ; extra == 'pt'
69
- Requires-Dist: jsonlines ==3.1.0 ; extra == 'pt'
70
- Requires-Dist: lazy-imports ==0.3.1 ; extra == 'pt'
71
- Requires-Dist: mock ==4.0.3 ; extra == 'pt'
72
- Requires-Dist: networkx >=2.7.1 ; extra == 'pt'
73
- Requires-Dist: numpy <2.0,>=1.21 ; extra == 'pt'
74
- Requires-Dist: packaging >=20.0 ; extra == 'pt'
75
- Requires-Dist: Pillow >=10.0.0 ; extra == 'pt'
76
- Requires-Dist: pypdf >=3.16.0 ; extra == 'pt'
77
- Requires-Dist: pypdfium2 >=4.30.0 ; extra == 'pt'
78
- Requires-Dist: pyyaml >=6.0.1 ; extra == 'pt'
79
- Requires-Dist: pyzmq >=16 ; extra == 'pt'
80
- Requires-Dist: scipy >=1.13.1 ; extra == 'pt'
81
- Requires-Dist: termcolor >=1.1 ; extra == 'pt'
82
- Requires-Dist: tabulate >=0.7.7 ; extra == 'pt'
83
- Requires-Dist: tqdm ==4.64.0 ; extra == 'pt'
84
- Requires-Dist: timm >=0.9.16 ; extra == 'pt'
85
- Requires-Dist: transformers >=4.36.0 ; extra == 'pt'
86
- Requires-Dist: accelerate >=0.29.1 ; extra == 'pt'
87
- Requires-Dist: python-doctr ==0.8.1 ; extra == 'pt'
88
- Requires-Dist: boto3 ==1.34.102 ; extra == 'pt'
89
- Requires-Dist: pdfplumber >=0.11.0 ; extra == 'pt'
90
- Requires-Dist: fasttext ==0.9.2 ; extra == 'pt'
91
- Requires-Dist: jdeskew >=0.2.2 ; extra == 'pt'
92
- Requires-Dist: apted ==1.0.3 ; extra == 'pt'
93
- Requires-Dist: distance ==0.1.3 ; extra == 'pt'
94
- Requires-Dist: lxml >=4.9.1 ; extra == 'pt'
70
+ Requires-Dist: catalogue==2.0.10; extra == "pt"
71
+ Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "pt"
72
+ Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
73
+ Requires-Dist: jsonlines==3.1.0; extra == "pt"
74
+ Requires-Dist: lazy-imports==0.3.1; extra == "pt"
75
+ Requires-Dist: mock==4.0.3; extra == "pt"
76
+ Requires-Dist: networkx>=2.7.1; extra == "pt"
77
+ Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
78
+ Requires-Dist: packaging>=20.0; extra == "pt"
79
+ Requires-Dist: Pillow>=10.0.0; extra == "pt"
80
+ Requires-Dist: pypdf>=3.16.0; extra == "pt"
81
+ Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
82
+ Requires-Dist: pyyaml>=6.0.1; extra == "pt"
83
+ Requires-Dist: pyzmq>=16; extra == "pt"
84
+ Requires-Dist: scipy>=1.13.1; extra == "pt"
85
+ Requires-Dist: termcolor>=1.1; extra == "pt"
86
+ Requires-Dist: tabulate>=0.7.7; extra == "pt"
87
+ Requires-Dist: tqdm==4.64.0; extra == "pt"
88
+ Requires-Dist: timm>=0.9.16; extra == "pt"
89
+ Requires-Dist: transformers>=4.36.0; extra == "pt"
90
+ Requires-Dist: accelerate>=0.29.1; extra == "pt"
91
+ Requires-Dist: python-doctr==0.8.1; extra == "pt"
92
+ Requires-Dist: boto3==1.34.102; extra == "pt"
93
+ Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
94
+ Requires-Dist: fasttext==0.9.2; extra == "pt"
95
+ Requires-Dist: jdeskew>=0.2.2; extra == "pt"
96
+ Requires-Dist: apted==1.0.3; extra == "pt"
97
+ Requires-Dist: distance==0.1.3; extra == "pt"
98
+ Requires-Dist: lxml>=4.9.1; extra == "pt"
99
+ Provides-Extra: docs
100
+ Requires-Dist: tensorpack==0.11; extra == "docs"
101
+ Requires-Dist: boto3==1.34.102; extra == "docs"
102
+ Requires-Dist: transformers>=4.36.0; extra == "docs"
103
+ Requires-Dist: accelerate>=0.29.1; extra == "docs"
104
+ Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
105
+ Requires-Dist: lxml>=4.9.1; extra == "docs"
106
+ Requires-Dist: lxml-stubs>=0.5.1; extra == "docs"
107
+ Requires-Dist: jdeskew>=0.2.2; extra == "docs"
108
+ Requires-Dist: jinja2==3.0.3; extra == "docs"
109
+ Requires-Dist: mkdocs-material; extra == "docs"
110
+ Requires-Dist: mkdocstrings-python; extra == "docs"
111
+ Requires-Dist: griffe==0.25.0; extra == "docs"
112
+ Provides-Extra: dev
113
+ Requires-Dist: python-dotenv==1.0.0; extra == "dev"
114
+ Requires-Dist: click; extra == "dev"
115
+ Requires-Dist: black==23.7.0; extra == "dev"
116
+ Requires-Dist: isort==5.13.2; extra == "dev"
117
+ Requires-Dist: pylint==2.17.4; extra == "dev"
118
+ Requires-Dist: mypy==1.4.1; extra == "dev"
119
+ Requires-Dist: wandb; extra == "dev"
120
+ Requires-Dist: types-PyYAML>=6.0.12.12; extra == "dev"
121
+ Requires-Dist: types-termcolor>=1.1.3; extra == "dev"
122
+ Requires-Dist: types-tabulate>=0.9.0.3; extra == "dev"
123
+ Requires-Dist: types-tqdm>=4.66.0.5; extra == "dev"
124
+ Requires-Dist: lxml-stubs>=0.5.1; extra == "dev"
125
+ Requires-Dist: types-Pillow>=10.2.0.20240406; extra == "dev"
126
+ Requires-Dist: types-urllib3>=1.26.25.14; extra == "dev"
95
127
  Provides-Extra: test
96
- Requires-Dist: pytest ==8.0.2 ; extra == 'test'
97
- Requires-Dist: pytest-cov ; extra == 'test'
98
- Provides-Extra: tf
99
- Requires-Dist: catalogue ==2.0.10 ; extra == 'tf'
100
- Requires-Dist: huggingface-hub <0.26,>=0.12.0 ; extra == 'tf'
101
- Requires-Dist: importlib-metadata >=5.0.0 ; extra == 'tf'
102
- Requires-Dist: jsonlines ==3.1.0 ; extra == 'tf'
103
- Requires-Dist: lazy-imports ==0.3.1 ; extra == 'tf'
104
- Requires-Dist: mock ==4.0.3 ; extra == 'tf'
105
- Requires-Dist: networkx >=2.7.1 ; extra == 'tf'
106
- Requires-Dist: numpy <2.0,>=1.21 ; extra == 'tf'
107
- Requires-Dist: packaging >=20.0 ; extra == 'tf'
108
- Requires-Dist: Pillow >=10.0.0 ; extra == 'tf'
109
- Requires-Dist: pypdf >=3.16.0 ; extra == 'tf'
110
- Requires-Dist: pypdfium2 >=4.30.0 ; extra == 'tf'
111
- Requires-Dist: pyyaml >=6.0.1 ; extra == 'tf'
112
- Requires-Dist: pyzmq >=16 ; extra == 'tf'
113
- Requires-Dist: scipy >=1.13.1 ; extra == 'tf'
114
- Requires-Dist: termcolor >=1.1 ; extra == 'tf'
115
- Requires-Dist: tabulate >=0.7.7 ; extra == 'tf'
116
- Requires-Dist: tqdm ==4.64.0 ; extra == 'tf'
117
- Requires-Dist: tensorpack ==0.11 ; extra == 'tf'
118
- Requires-Dist: protobuf ==3.20.1 ; extra == 'tf'
119
- Requires-Dist: tensorflow-addons >=0.17.1 ; extra == 'tf'
120
- Requires-Dist: tf2onnx >=1.9.2 ; extra == 'tf'
121
- Requires-Dist: python-doctr ==0.8.1 ; extra == 'tf'
122
- Requires-Dist: pycocotools >=2.0.2 ; extra == 'tf'
123
- Requires-Dist: boto3 ==1.34.102 ; extra == 'tf'
124
- Requires-Dist: pdfplumber >=0.11.0 ; extra == 'tf'
125
- Requires-Dist: fasttext ==0.9.2 ; extra == 'tf'
126
- Requires-Dist: jdeskew >=0.2.2 ; extra == 'tf'
127
- Requires-Dist: apted ==1.0.3 ; extra == 'tf'
128
- Requires-Dist: distance ==0.1.3 ; extra == 'tf'
129
- Requires-Dist: lxml >=4.9.1 ; extra == 'tf'
128
+ Requires-Dist: pytest==8.0.2; extra == "test"
129
+ Requires-Dist: pytest-cov; extra == "test"
130
130
 
131
131
 
132
132
  <p align="center">
@@ -176,12 +176,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
176
176
  [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
177
177
  anymore for basic inference.
178
178
  - More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
179
- (not contained in the built-in Analyzer).
180
179
  - Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
181
180
  [**transformers**](https://github.com/huggingface/transformers).
182
181
  We have added a model wrapper for token classification with LiLT and added a some LiLT models to the model catalog
183
182
  that seem to look promising, especially if you want to train a model on non-english data. The training script for
184
- LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
183
+ LayoutLM can be used for LiLT as well.
184
+ - [**new**] There are two notebooks available that show, how to write a
185
+ [custom predictor](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) based on
186
+ a third party library that has not been supported yet and how to use
187
+ [advanced configuration](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) to
188
+ get links between layout segments e.g. captions and tables or figures.
185
189
 
186
190
  **deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
187
191
  post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
@@ -1,33 +1,33 @@
1
- deepdoctection/__init__.py,sha256=RZpawNRTJPKNPFuONawVOsYWdr-rI8PPNXZhlPtOKtc,12580
1
+ deepdoctection/__init__.py,sha256=7VELexCFRaBTCXHQpBoKhVi4hqUUgpcsLTqvHXHjufQ,12651
2
2
  deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  deepdoctection/analyzer/__init__.py,sha256=icClxrd20XutD6LxLgEPIWceSs4j_QfI3szCE-9BL2w,729
4
- deepdoctection/analyzer/_config.py,sha256=0cWtaI2e3jHNhufHZAqMje0YTTDAogKAHVl4VpYojAo,4874
4
+ deepdoctection/analyzer/_config.py,sha256=NZl_REM8Ge2xfxvHN-mZR5KURcHfZii3xfMlKQwckbA,4864
5
5
  deepdoctection/analyzer/dd.py,sha256=DUOhOtwipHw5nabYqn3WGR9aZcgP0ma_bi_tjf9xscw,5973
6
- deepdoctection/analyzer/factory.py,sha256=T9jxtVLNFhocbsfWIGLPfFrEv21zQJzM6VdFt0yxMyg,23849
6
+ deepdoctection/analyzer/factory.py,sha256=xmo5F9X7I6lp0ZWJv8QavpMyG8UWYLvMi4qogsZV1_s,31507
7
7
  deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
8
- deepdoctection/configs/conf_dd_one.yaml,sha256=orP-oeqtWbz5S9FJZJKxy1UqMwOYjL9g0DOX-wbamqU,2239
8
+ deepdoctection/configs/conf_dd_one.yaml,sha256=td7XsyVhdXkhh5Pie7sT_WNjGTaxBOWgpxhkobHd1H0,2325
9
9
  deepdoctection/configs/conf_tesseract.yaml,sha256=oF6szDyoi15FHvq7yFUNIEjfA_jNLhGxoowiRsz_zY4,35
10
10
  deepdoctection/dataflow/__init__.py,sha256=CWRHMpmJaPk4xY_oIIFubCt-z11SguWrMWxHZ7rdrvY,845
11
11
  deepdoctection/dataflow/base.py,sha256=z4DCComSj5wStEPjtk0093cNNGfUMiDqx8dqz36nS_o,6221
12
12
  deepdoctection/dataflow/common.py,sha256=MyGA2VWlNMjQdIN_Jd-o0Ec3bDJmjQit4Nv0v43OCSQ,10119
13
13
  deepdoctection/dataflow/custom.py,sha256=3CK_1oL9p6nbOq8WtH5_vQUo70_8Z8pXY7kG0OFqzug,6803
14
- deepdoctection/dataflow/custom_serialize.py,sha256=CKeyw2Ayq_qAl0O5BoKkIOFJgteCt78h9QFTI23XhmQ,22818
14
+ deepdoctection/dataflow/custom_serialize.py,sha256=WocuiYo2gkih5Z9lWAoIIfUewwYSDOhHzG7ZZjKlUic,22827
15
15
  deepdoctection/dataflow/parallel_map.py,sha256=8FhxJBWV-kjJrJ27jQtP3yYF6Ev6rz98worO60oi96c,15837
16
16
  deepdoctection/dataflow/serialize.py,sha256=4pYC7m9h53JCu99waVeKpHDpsCDDdYCrSZpP2QYSsgs,4555
17
17
  deepdoctection/dataflow/stats.py,sha256=Bsr6v7lcesKXUYtO9wjqlzx_Yq_uyIF3Lel-tQ0i4wI,9619
18
18
  deepdoctection/datapoint/__init__.py,sha256=3K406GbOPhoEp8koVaSbMocmSsmWifnSZ1SPb7C1lOY,1643
19
19
  deepdoctection/datapoint/annotation.py,sha256=FEgz4COxVDfjic0gG7kS6iHnWLBIgFnquQ63Cbj2a4Y,22531
20
20
  deepdoctection/datapoint/box.py,sha256=tkFuVM6xfx2jL7W4UED4qHXV572LSRdIsVJbrEiyIxI,23524
21
- deepdoctection/datapoint/convert.py,sha256=Gw2IjNiEotPu1yuMZqrIYB0mCAwafKt-VgMnrHj6S7U,6808
22
- deepdoctection/datapoint/image.py,sha256=EvZlVwJjMAcL1z8RNPBvZ8fwdJvkGuGpcFxCP1y26Go,33045
23
- deepdoctection/datapoint/view.py,sha256=7qSX4DQw9OPQQSKfSjV8e5i6jLyu6hOMceSKJAob2N8,42154
21
+ deepdoctection/datapoint/convert.py,sha256=O7920pIomyEkzXwxpFsrzfhn7Pl6UzVGhNzv90VcuKU,7099
22
+ deepdoctection/datapoint/image.py,sha256=DIXXXD2yKsacg47Wt_GEYEIe1MQkrd06Yr5xAWv_n64,33047
23
+ deepdoctection/datapoint/view.py,sha256=1rVMuqucCrI5zlwyXMADJQBV38V_zSNFqFyBi3cMA1E,44914
24
24
  deepdoctection/datasets/__init__.py,sha256=-A3aR90aDsHPmVM35JavfnQ2itYSCn3ujl4krRni1QU,1076
25
25
  deepdoctection/datasets/adapter.py,sha256=Ly_vbOAgVI73V41FUccnSX1ECTOyesW_qsuvQuvOZbw,7796
26
26
  deepdoctection/datasets/base.py,sha256=DT4i-d74sIEiUNC6UspIHNJuHSK0t1dBv7qwadg4rLw,22341
27
27
  deepdoctection/datasets/dataflow_builder.py,sha256=cYU2zV3gZW2bFvMHimlO9VIl3BAUaCwML08cCIQ8Em4,4107
28
28
  deepdoctection/datasets/info.py,sha256=6y5TfiUhQppynbMFP5JmUPk95ggsVCtGIw4dYh2lVus,20501
29
29
  deepdoctection/datasets/registry.py,sha256=ZjzVzjsCgNXJuZZZtR98_yKocADmh4EBGV5JqJbGjWk,2543
30
- deepdoctection/datasets/save.py,sha256=khYQ4t94FOu9RWMimP9E4kASq25f61SIow78NHaX1pg,3349
30
+ deepdoctection/datasets/save.py,sha256=Y9508Qqp8gIGN7pbGgVBBnkiC6NdCb9L2YR4wVvEUxM,3350
31
31
  deepdoctection/datasets/instances/__init__.py,sha256=XEc_4vT5lDn6bbZID9ujDEumWu8Ec2W-QS4pI_bfWWE,1388
32
32
  deepdoctection/datasets/instances/doclaynet.py,sha256=wRZT7wMTilZBLZ1gKY2cWReD1EGT735vOOTy0pD0N6M,12038
33
33
  deepdoctection/datasets/instances/fintabnet.py,sha256=qYzFK1dWF6MEPkHamP255DvAzlQT_GnkvDe1aM7CgjA,12006
@@ -44,13 +44,13 @@ deepdoctection/datasets/instances/xsl/pascal_voc.xsl,sha256=DlzFV2P8NtQKXVe96i-m
44
44
  deepdoctection/eval/__init__.py,sha256=rbns4tSEQ30QLj8h0mm3A0dCaKuN9LDxxpVypKKSXSE,932
45
45
  deepdoctection/eval/accmetric.py,sha256=4bND-xz9AZu9ACYRkEzn9V6Jn8MEiqnF7kxSp4k_baE,19655
46
46
  deepdoctection/eval/base.py,sha256=gCvhTdwEaCKplYTWPMjGvtB_0Vbq2KBJWFHq8mMlLPA,4814
47
- deepdoctection/eval/cocometric.py,sha256=Co7XaLQzp7qxw8UQaG2D68PzY6eA9aRNueeo_zaMJLM,8777
47
+ deepdoctection/eval/cocometric.py,sha256=4cpNmF3xZjInCOWOoVU_7itQxLI-zr0O6suNjPU2xWc,11020
48
48
  deepdoctection/eval/eval.py,sha256=B9PUZBjj6KzXHLOxUVn3QHiOcBQogfJmp9mjopbMo9k,19721
49
49
  deepdoctection/eval/registry.py,sha256=v4mp-s67vBVRu1nQzuGlYPViQnMSeIXEcF_WmvfUCoU,1051
50
50
  deepdoctection/eval/tedsmetric.py,sha256=rKw-734Y9CpBtIfkBSPQF2vAZxnIdWrI9Zc723P7RxI,9529
51
51
  deepdoctection/eval/tp_eval_callback.py,sha256=SXsXumoyxq-MIH9Cep5eUOwnNshMbKmC6mYOGwCg0pM,5283
52
52
  deepdoctection/extern/__init__.py,sha256=9Iks9b4Q_LynjcV167TVCoK8YsQRUcA2jjmAmDNA_X8,1056
53
- deepdoctection/extern/base.py,sha256=ajzFzD9BrFwnly4SziN8PadI-PBOzzVRlIGPm_sNllE,24142
53
+ deepdoctection/extern/base.py,sha256=ONPgappl_P5HSwQr42FatuRnwMTvUPecPsCztDTN0Hw,24108
54
54
  deepdoctection/extern/d2detect.py,sha256=zrKv1yurApnjD7QZIZk_8LYCahjmN82MQUjHjv8zvkQ,22127
55
55
  deepdoctection/extern/deskew.py,sha256=sPoixu8S9he-0wbs-jgxtPE2V9BiP4-3uZlb6F5Y1SA,3077
56
56
  deepdoctection/extern/doctrocr.py,sha256=T3_tvlih22_dVCBZypS1Y8tjQQB1fkAxIbGdUGHIapQ,24473
@@ -94,8 +94,8 @@ deepdoctection/mapper/d2struct.py,sha256=Dx-YnycsIQH4a5-9Gn_yMhiQ-gOFgMueNeH3rhX
94
94
  deepdoctection/mapper/hfstruct.py,sha256=2PjGKsYturVJBimLT1CahYh09KSRAFEHz_QNtC162kQ,5551
95
95
  deepdoctection/mapper/laylmstruct.py,sha256=abMZkYU2W0e_VcCm_c0ZXNFuv-lfMFWcTedcZS5EYvE,42935
96
96
  deepdoctection/mapper/maputils.py,sha256=eI6ZcDg9W5uB6xQNBZpMIdEd86HlCxTtkJuyROdTqiw,8146
97
- deepdoctection/mapper/match.py,sha256=E7Qna6zLDIxlI7puOL9BjjZKuRry-zONs8TLWmyEMIQ,9580
98
- deepdoctection/mapper/misc.py,sha256=rCqHOcsCfVPXs36AWK0rZ2kk0CUM3yXV370_zyIGBJ4,6518
97
+ deepdoctection/mapper/match.py,sha256=pCWZpz2R8JahiKXCw7dxKRTLiPgJXeVDgkddDPLy_c0,9643
98
+ deepdoctection/mapper/misc.py,sha256=NLSSgk066Tkrrdi075HkqV7cP-iqT9fv_MtyAJ-8gOg,6743
99
99
  deepdoctection/mapper/pascalstruct.py,sha256=TzVU1p0oiw0nOuxTFFbEB9vXJxH1v6VUvTJ7MD0manU,3828
100
100
  deepdoctection/mapper/prodigystruct.py,sha256=Re4Sd_zAp6qOvbXZLmMJeG0IGEfMQxebuyDeZgMcTa8,6827
101
101
  deepdoctection/mapper/pubstruct.py,sha256=YxsrZ-E0pD45Mm_VCPQB9yEgHsTPkw4htt-3DwCRX1k,23361
@@ -106,7 +106,7 @@ deepdoctection/pipe/anngen.py,sha256=3319l4aaXzcY4w6ItVBNPX8LGS5fHFDVtyVY9KMefac
106
106
  deepdoctection/pipe/base.py,sha256=Davjkf3D837y9AIITcx7yXdebmVaz6Moyw_5Wi3nfmg,13561
107
107
  deepdoctection/pipe/common.py,sha256=haOb4v0jLX3r41BSC8cVseX2E320_HkSrGlZsQiKE2g,17728
108
108
  deepdoctection/pipe/concurrency.py,sha256=AAKRsVgaBEYNluntbDa46SBF1JZ_XqnWLDSWrNvAzEo,9657
109
- deepdoctection/pipe/doctectionpipe.py,sha256=I6B6HT_BG2ByQ3Rjsui3-Ct31yLmodx-iuZnujXaiSc,8953
109
+ deepdoctection/pipe/doctectionpipe.py,sha256=uhsrSuwaHcOMj8b8i6wCpPaZlSxCTaeHVhMokJ8vRSI,11835
110
110
  deepdoctection/pipe/language.py,sha256=5zI0UQC6Fh12_r2pfVL42HoCGz2hpHrOhpXAn5m-rYw,5451
111
111
  deepdoctection/pipe/layout.py,sha256=xIhnJpyUSbvLbhTXyAKXY1hmG9352jihGYFSclTH_1g,5567
112
112
  deepdoctection/pipe/lm.py,sha256=Sp-b7smeslNDyioEfNjuNBUxAuFKn3-OKpCZkGXri_c,16643
@@ -129,20 +129,20 @@ deepdoctection/utils/develop.py,sha256=4HyTarkFbJwctL-Hgu1TU_LSJppHvaroDbcyHsxhI
129
129
  deepdoctection/utils/env_info.py,sha256=TnCA-LOTj4WIHd9yvn1AaoPWsLmPgc42l-BJmGV6zmM,19147
130
130
  deepdoctection/utils/error.py,sha256=_3q9VepKfEhsM3H033_Fu0hwBzMSjsWALsjyJbGAZr8,2367
131
131
  deepdoctection/utils/file_utils.py,sha256=IRElrcND0YEiU1QELw5hfXeNA39uE2_nyzh9-X7YcxI,19477
132
- deepdoctection/utils/fs.py,sha256=C4ktrzjoVtX9kgycv5YrEigDI9byi65b6_D0aKsGM4Y,10161
132
+ deepdoctection/utils/fs.py,sha256=x842BxUP5bbjJ2cofw-g4dKJv4QAaGzda4qnAazabO4,10281
133
133
  deepdoctection/utils/identifier.py,sha256=QkNaGGqPynHwDPnd3_m8iur4Cv64rcQa7qolCE7Qphk,2159
134
134
  deepdoctection/utils/logger.py,sha256=J0OVKiXP_2A82MWbbJoOeMEJ-75aZu5npgaS_yI6mVA,10003
135
135
  deepdoctection/utils/metacfg.py,sha256=hD76KQ_RnD_5B02qLI2Zxf3WfnsnXhEI_KUTKpw91RI,5711
136
136
  deepdoctection/utils/mocks.py,sha256=IkN3-IzAl4eX0ibgKIHg8IY7ykVw6BnpF6XnxKnKaZI,2389
137
- deepdoctection/utils/pdf_utils.py,sha256=OAQjE9xHVNcDsFqAvX47Lu-mgmoMpVXqIf5pOK8AwxY,11595
137
+ deepdoctection/utils/pdf_utils.py,sha256=G0m8kUn2HwwyZWH_BcrDkm-m3MP9GN9SWHj5VhB7swY,12845
138
138
  deepdoctection/utils/settings.py,sha256=k6OyuWbj-IPeaO9zT9RZ-5Yad1wNhWGYqGLZdtgXAZY,12464
139
139
  deepdoctection/utils/tqdm.py,sha256=cBUtR0L1x0KMeYrLP2rrzyzCamCjpQAKroHXLv81_pk,1820
140
140
  deepdoctection/utils/transform.py,sha256=3kCgsEeRkG1efCdkfvj7tUFMs-e2jbjbflq826F2GPU,8502
141
141
  deepdoctection/utils/types.py,sha256=_3dmPdCIZNLbgU5QP5k_c5phDf18xLe1kYL6t2nM45s,2953
142
- deepdoctection/utils/utils.py,sha256=ANzyIX6AY1yc-4gcn6yxksV84sPrJDaUurUNVatAFu8,5168
143
- deepdoctection/utils/viz.py,sha256=Xm6pKlhM29UWBBGZHlWFl9XYFDAqaYDdwHXwe26Hvqo,25728
144
- deepdoctection-0.35.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
145
- deepdoctection-0.35.dist-info/METADATA,sha256=B6pPQjRYWcqd1p-3ul3PhflYOcKq2ZpP5D-i8kr7qgk,19403
146
- deepdoctection-0.35.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
147
- deepdoctection-0.35.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
148
- deepdoctection-0.35.dist-info/RECORD,,
142
+ deepdoctection/utils/utils.py,sha256=csVs_VvCq4QBETPoE2JdTTL4MFYnD4xh-Js5vRb612g,6492
143
+ deepdoctection/utils/viz.py,sha256=Jf8ePNYWlpuyaS6SeTYQ4OyA3eNhtgjvAQZnGNdgHC0,27051
144
+ deepdoctection-0.37.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
145
+ deepdoctection-0.37.dist-info/METADATA,sha256=0qGgmf07xmNRJx55yfMagHcfAoQG6GO9KTw6b0tv0uA,19543
146
+ deepdoctection-0.37.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
147
+ deepdoctection-0.37.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
148
+ deepdoctection-0.37.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.3.0)
2
+ Generator: setuptools (75.6.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5