deepdoctection 0.36__py3-none-any.whl → 0.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

@@ -24,7 +24,7 @@ from .utils.logger import LoggingRecord, logger
24
24
 
25
25
  # pylint: enable=wrong-import-position
26
26
 
27
- __version__ = 0.36
27
+ __version__ = 0.37
28
28
 
29
29
  _IMPORT_STRUCTURE = {
30
30
  "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
@@ -87,6 +87,7 @@ _IMPORT_STRUCTURE = {
87
87
  "convert_b64_to_np_array",
88
88
  "convert_np_array_to_b64",
89
89
  "convert_np_array_to_b64_b",
90
+ "convert_bytes_to_np_array",
90
91
  "convert_pdf_bytes_to_np_array_v2",
91
92
  "box_to_point4",
92
93
  "point4_to_box",
@@ -371,6 +372,7 @@ _IMPORT_STRUCTURE = {
371
372
  "save_config_to_yaml",
372
373
  "config_to_cli_str",
373
374
  "decrypt_pdf_document",
375
+ "decrypt_pdf_document_from_bytes",
374
376
  "get_pdf_file_reader",
375
377
  "get_pdf_file_writer",
376
378
  "PDFStreamer",
@@ -593,7 +593,7 @@ class SerializerPdfDoc:
593
593
  file_name = os.path.split(path)[1]
594
594
  prefix, suffix = os.path.splitext(file_name)
595
595
  df: DataFlow
596
- df = CustomDataFromIterable(PDFStreamer(path=path), max_datapoints=max_datapoints)
596
+ df = CustomDataFromIterable(PDFStreamer(path_or_bytes=path), max_datapoints=max_datapoints)
597
597
  df = MapData(
598
598
  df,
599
599
  lambda dp: {
@@ -40,6 +40,7 @@ __all__ = [
40
40
  "convert_b64_to_np_array",
41
41
  "convert_np_array_to_b64",
42
42
  "convert_np_array_to_b64_b",
43
+ "convert_bytes_to_np_array",
43
44
  "convert_pdf_bytes_to_np_array_v2",
44
45
  "box_to_point4",
45
46
  "point4_to_box",
@@ -107,6 +108,16 @@ def convert_np_array_to_b64_b(np_image: PixelValues) -> bytes:
107
108
  return viz_handler.encode(np_image)
108
109
 
109
110
 
111
+ def convert_bytes_to_np_array(image_bytes: bytes) -> PixelValues:
112
+ """
113
+ Converts an image in bytes to a numpy array
114
+
115
+ :param image_bytes: An image as bytes.
116
+ :return: numpy array.
117
+ """
118
+ return viz_handler.convert_bytes_to_np(image_bytes)
119
+
120
+
110
121
  @deprecated("Use convert_pdf_bytes_to_np_array_v2", "2022-02-23")
111
122
  def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
112
123
  """
@@ -587,7 +587,7 @@ class Image:
587
587
  )
588
588
  ann.image.dump(sub_image)
589
589
 
590
- def remove_image_from_lower_hierachy(self, pixel_values_only: bool = False) -> None:
590
+ def remove_image_from_lower_hierarchy(self, pixel_values_only: bool = False) -> None:
591
591
  """Will remove all images from image annotations."""
592
592
  for ann in self.annotations:
593
593
  if pixel_values_only:
@@ -717,7 +717,7 @@ class Image:
717
717
  else:
718
718
  path_json = fspath(path) + ".json"
719
719
  if highest_hierarchy_only:
720
- self.remove_image_from_lower_hierachy()
720
+ self.remove_image_from_lower_hierarchy()
721
721
  export_dict = self.as_dict()
722
722
  export_dict["location"] = fspath(export_dict["location"])
723
723
  if not image_to_json:
@@ -62,7 +62,7 @@ def dataflow_to_json(
62
62
  if highest_hierarchy_only:
63
63
 
64
64
  def _remove_hh(dp: Image) -> Image:
65
- dp.remove_image_from_lower_hierachy()
65
+ dp.remove_image_from_lower_hierarchy()
66
66
  return dp
67
67
 
68
68
  df = MapData(df, _remove_hh)
@@ -69,8 +69,7 @@ class ModelCategories:
69
69
  if self.init_categories:
70
70
  self._init_categories = MappingProxyType({key: get_type(val) for key, val in self.init_categories.items()})
71
71
  else:
72
- if self._init_categories is None:
73
- self._init_categories = MappingProxyType({})
72
+ self._init_categories = MappingProxyType({})
74
73
  self.categories = self._init_categories
75
74
 
76
75
  @overload
@@ -181,7 +180,7 @@ class NerModelCategories(ModelCategories):
181
180
  self._init_categories = self.merge_bio_semantics_categories(
182
181
  self._categories_semantics, self._categories_bio
183
182
  )
184
- super().__post_init__()
183
+ self.categories = self._init_categories
185
184
 
186
185
  @staticmethod
187
186
  def merge_bio_semantics_categories(
@@ -27,7 +27,7 @@ from typing import Mapping, Optional, Sequence, Union
27
27
 
28
28
  from lazy_imports import try_import
29
29
 
30
- from ..datapoint.convert import convert_pdf_bytes_to_np_array_v2
30
+ from ..datapoint.convert import convert_bytes_to_np_array, convert_pdf_bytes_to_np_array_v2
31
31
  from ..datapoint.image import Image
32
32
  from ..utils.fs import get_load_image_func, load_image_from_file
33
33
  from ..utils.types import JsonDict
@@ -49,6 +49,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
49
49
 
50
50
  file_name: Optional[str]
51
51
  location: Optional[str]
52
+ image_bytes: Optional[bytes] = None
52
53
 
53
54
  if isinstance(dp, str):
54
55
  _, file_name = os.path.split(dp)
@@ -62,6 +63,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
62
63
  document_id = dp.get("document_id")
63
64
  if location == "":
64
65
  location = str(dp.get("path", ""))
66
+ image_bytes = dp.get("image_bytes")
65
67
  else:
66
68
  raise TypeError("datapoint not of expected type for converting to image")
67
69
 
@@ -76,6 +78,8 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
76
78
  if dp_image.pdf_bytes is not None:
77
79
  if isinstance(dp_image.pdf_bytes, bytes):
78
80
  dp_image.image = convert_pdf_bytes_to_np_array_v2(dp_image.pdf_bytes, dpi=dpi)
81
+ elif image_bytes is not None:
82
+ dp_image.image = convert_bytes_to_np_array(image_bytes)
79
83
  else:
80
84
  dp_image.image = load_image_from_file(location)
81
85
 
@@ -23,31 +23,38 @@ import os
23
23
  from pathlib import Path
24
24
  from typing import List, Mapping, Optional, Sequence, Tuple, Union
25
25
 
26
- from ..dataflow import DataFlow, MapData
26
+ from ..dataflow import CustomDataFromIterable, DataFlow, DataFromList, MapData
27
27
  from ..dataflow.custom_serialize import SerializerFiles, SerializerPdfDoc
28
28
  from ..datapoint.image import Image
29
29
  from ..datapoint.view import IMAGE_DEFAULTS
30
30
  from ..mapper.maputils import curry
31
31
  from ..mapper.misc import to_image
32
32
  from ..utils.fs import maybe_path_or_pdf
33
+ from ..utils.identifier import get_uuid_from_str
33
34
  from ..utils.logger import LoggingRecord, logger
35
+ from ..utils.pdf_utils import PDFStreamer
34
36
  from ..utils.types import PathLikeOrStr
37
+ from ..utils.utils import is_file_extension
35
38
  from .base import Pipeline, PipelineComponent
36
39
  from .common import PageParsingService
37
40
 
38
41
 
39
42
  def _collect_from_kwargs(
40
- **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
41
- ) -> Tuple[Optional[str], Optional[str], bool, int, str, DataFlow]:
43
+ **kwargs: Union[Optional[str], bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
44
+ ) -> Tuple[Optional[str], Union[str, Sequence[str]], bool, int, str, DataFlow, Optional[bytes]]:
45
+ b_bytes = kwargs.get("bytes")
42
46
  dataset_dataflow = kwargs.get("dataset_dataflow")
43
47
  path = kwargs.get("path")
44
48
  if path is None and dataset_dataflow is None:
45
49
  raise ValueError("Pass either path or dataset_dataflow as argument")
50
+ if path is None and b_bytes:
51
+ raise ValueError("When passing bytes, a path to the source document must be provided")
46
52
 
47
53
  shuffle = kwargs.get("shuffle", False)
48
54
  if not isinstance(shuffle, bool):
49
55
  raise TypeError(f"shuffle must be of type bool but is of type {type(shuffle)}")
50
56
 
57
+ file_type = None
51
58
  doc_path = None
52
59
  if path:
53
60
  if not isinstance(path, (str, Path)):
@@ -56,15 +63,27 @@ def _collect_from_kwargs(
56
63
  if path_type == 2:
57
64
  doc_path = path
58
65
  path = None
66
+ file_type = ".pdf"
67
+ elif path_type == 3:
68
+ if is_file_extension(path, ".jpg"):
69
+ file_type = ".jpg"
70
+ if is_file_extension(path, ".png"):
71
+ file_type = ".png"
72
+ if is_file_extension(path, ".jpeg"):
73
+ file_type = ".jpeg"
74
+ if not b_bytes:
75
+ raise ValueError("When passing a path to a single image, bytes of the image must be passed")
59
76
  elif not path_type:
60
77
  raise ValueError("Pass only a path to a directory or to a pdf file")
61
78
 
62
- file_type = kwargs.get("file_type", [".jpg", ".png", ".tif"])
79
+ file_type = kwargs.get(
80
+ "file_type", [".jpg", ".png", ".jpeg", ".tif"] if file_type is None else file_type # type: ignore
81
+ )
63
82
 
64
83
  max_datapoints = kwargs.get("max_datapoints")
65
84
  if not isinstance(max_datapoints, (int, type(None))):
66
85
  raise TypeError(f"max_datapoints must be of type int, but is of type {type(max_datapoints)}")
67
- return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow # type: ignore
86
+ return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes # type: ignore
68
87
 
69
88
 
70
89
  @curry
@@ -142,12 +161,18 @@ class DoctectionPipe(Pipeline):
142
161
 
143
162
  super().__init__(pipeline_component_list)
144
163
 
145
- def _entry(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
146
- path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow = _collect_from_kwargs(**kwargs)
164
+ def _entry(self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) \
165
+ -> DataFlow:
166
+ path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes = _collect_from_kwargs(**kwargs)
147
167
 
148
168
  df: DataFlow
149
169
 
150
- if isinstance(path, (str, Path)):
170
+ if isinstance(b_bytes, bytes):
171
+ df = DoctectionPipe.bytes_to_dataflow(path=doc_path if path is None else path,
172
+ b_bytes=b_bytes,
173
+ file_type=file_type)
174
+
175
+ elif isinstance(path, (str, Path)):
151
176
  if not isinstance(file_type, (str, list)):
152
177
  raise TypeError(f"file_type must be of type string or list, but is of type {type(file_type)}")
153
178
  df = DoctectionPipe.path_to_dataflow(path=path, file_type=file_type, shuffle=shuffle)
@@ -162,7 +187,7 @@ class DoctectionPipe(Pipeline):
162
187
 
163
188
  df = MapData(df, _proto_process(path, doc_path))
164
189
  if dataset_dataflow is None:
165
- df = MapData(df, _to_image(dpi=300)) # pylint: disable=E1120
190
+ df = MapData(df, _to_image(dpi=os.environ.get("DPI", 300))) # pylint: disable=E1120
166
191
  return df
167
192
 
168
193
  @staticmethod
@@ -197,6 +222,44 @@ class DoctectionPipe(Pipeline):
197
222
  """
198
223
  return _doc_to_dataflow(path, max_datapoints)
199
224
 
225
+ @staticmethod
226
+ def bytes_to_dataflow(
227
+ path: str, b_bytes: bytes, file_type: Union[str, Sequence[str]], max_datapoints: Optional[int] = None
228
+ ) -> DataFlow:
229
+ """
230
+ Converts a bytes object to a dataflow
231
+
232
+ :param path: path of the pdf document or image file the bytes belong to (used for the `path` and `file_name` entries)
233
+ :param b_bytes: bytes object
234
+ :param file_type: e.g. ".pdf", ".jpg" or [".jpg", ".png", ".jpeg", ".tif"]
235
+ :param max_datapoints: max number of datapoints to consider
236
+ :return: DataFlow
237
+ """
238
+
239
+ file_name = os.path.split(path)[1]
240
+ if isinstance(file_type, str):
241
+ if file_type == ".pdf":
242
+ prefix, suffix = os.path.splitext(file_name)
243
+ df: DataFlow
244
+ df = CustomDataFromIterable(PDFStreamer(path_or_bytes=b_bytes), max_datapoints=max_datapoints)
245
+ df = MapData(
246
+ df,
247
+ lambda dp: {
248
+ "path": path,
249
+ "file_name": prefix + f"_{dp[1]}" + suffix,
250
+ "pdf_bytes": dp[0],
251
+ "page_number": dp[1],
252
+ "document_id": get_uuid_from_str(prefix),
253
+ },
254
+ )
255
+ else:
256
+ df = DataFromList(lst=[{"path": path, "file_name": file_name, "image_bytes": b_bytes}])
257
+ return df
258
+ raise ValueError(
259
+ f"pass: {path}, b_bytes: {b_bytes!r}, file_type: {file_type} and max_datapoints: {max_datapoints} "
260
+ f"not supported"
261
+ )
262
+
200
263
  def dataflow_to_page(self, df: DataFlow) -> DataFlow:
201
264
  """
202
265
  Converts a dataflow of images to a dataflow of pages
@@ -206,7 +269,9 @@ class DoctectionPipe(Pipeline):
206
269
  """
207
270
  return self.page_parser.predict_dataflow(df)
208
271
 
209
- def analyze(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
272
+ def analyze(
273
+ self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
274
+ ) -> DataFlow:
210
275
  """
211
276
  `kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
212
277
 
@@ -215,6 +280,8 @@ class DoctectionPipe(Pipeline):
215
280
  only the first page is processed through the pipeline.
216
281
  Alternatively, a path to a pdf document with multiple pages.
217
282
 
283
+ `kwargs key bytes:` A bytes object of an image or of a pdf document. Requires `path` to be passed as well.
284
+
218
285
  `kwargs key file_type:` Selection of the file type, if: args:`file_type` is passed
219
286
 
220
287
  `kwargs key max_datapoints:` Stops processing as soon as max_datapoints images have been processed
@@ -227,20 +227,21 @@ def get_load_image_func(
227
227
 
228
228
  def maybe_path_or_pdf(path: PathLikeOrStr) -> int:
229
229
  """
230
- Checks if the path points to a directory or a pdf document. Returns 1 if the path points to a directory, 2
231
- if the path points to a pdf doc or 0, if none of the previous is true.
230
+ Checks if the path points to a directory, a pdf document or a single image. Returns 1 if the path points to a
231
+ directory, 2 if the path points to a pdf doc, 3 if the path points to a PNG, JPG, JPEG or TIF image, or 0 if none of the
232
+ previous is true.
232
233
 
233
234
  :param path: A path
234
- :return: A value of 0,1,2
235
+ :return: A value of 0,1,2,3
235
236
  """
236
237
 
237
- is_dir = os.path.isdir(path)
238
- if is_dir:
238
+ if os.path.isdir(path):
239
239
  return 1
240
240
  file_name = os.path.split(path)[1]
241
- is_pdf = is_file_extension(file_name, ".pdf")
242
- if is_pdf:
241
+ if is_file_extension(file_name, ".pdf"):
243
242
  return 2
243
+ if is_file_extension(file_name, [".png", ".jpeg", ".jpg", ".tif"]):
244
+ return 3
244
245
  return 0
245
246
 
246
247
 
@@ -26,7 +26,7 @@ from errno import ENOENT
26
26
  from io import BytesIO
27
27
  from pathlib import Path
28
28
  from shutil import copyfile
29
- from typing import Generator, Literal, Optional
29
+ from typing import Generator, Literal, Optional, Union
30
30
 
31
31
  from lazy_imports import try_import
32
32
  from numpy import uint8
@@ -46,6 +46,7 @@ with try_import() as pt_import_guard:
46
46
 
47
47
  __all__ = [
48
48
  "decrypt_pdf_document",
49
+ "decrypt_pdf_document_from_bytes",
49
50
  "get_pdf_file_reader",
50
51
  "get_pdf_file_writer",
51
52
  "PDFStreamer",
@@ -68,7 +69,6 @@ def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
68
69
  :param path: A path to the pdf file
69
70
  :return: True if document has been successfully decrypted
70
71
  """
71
-
72
72
  if qpdf_available():
73
73
  path_base, file_name = os.path.split(path)
74
74
  file_name_tmp = os.path.splitext(file_name)[0] + "tmp.pdf"
@@ -86,41 +86,69 @@ def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
86
86
  return False
87
87
 
88
88
 
89
- def get_pdf_file_reader(path: PathLikeOrStr) -> PdfReader:
89
+ def decrypt_pdf_document_from_bytes(input_bytes: bytes) -> bytes:
90
+ """
91
+ Decrypting a pdf given as bytes. Under the hood, it saves the bytes to a temporary file and then calls
92
+
93
+ qpdf: <http://qpdf.sourceforge.net/>
94
+
95
+ :param input_bytes: A bytes object representing the pdf file
96
+ :return: The decrypted bytes object
97
+ """
98
+ with save_tmp_file(input_bytes, "pdf_") as (_, input_file_name):
99
+ is_decrypted = decrypt_pdf_document(input_file_name)
100
+ if is_decrypted:
101
+ with open(input_file_name, "rb") as file:
102
+ return file.read()
103
+ else:
104
+ logger.error(LoggingRecord("pdf bytes cannot be decrypted and therefore cannot be processed further."))
105
+ sys.exit()
106
+
107
+
108
+ def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader:
90
109
  """
91
110
  Creates a file reader object from a pdf document. Will try to decrypt the document if it is
92
111
  encrypted. (See `decrypt_pdf_document` to understand what is meant with "decrypt").
93
112
 
94
- :param path: A path to a pdf document
113
+ :param path_or_bytes: A path to a pdf document, or the pdf document as bytes
95
114
  :return: A file reader object from which you can iterate through the document.
96
115
  """
97
116
 
98
- if not os.path.isfile(path):
99
- raise FileNotFoundError(str(path))
100
- file_name = os.path.split(path)[1]
117
+ if isinstance(path_or_bytes, bytes):
118
+ try:
119
+ reader = PdfReader(BytesIO(path_or_bytes))
120
+ except (errors.PdfReadError, AttributeError):
121
+ decrypted_bytes = decrypt_pdf_document_from_bytes(path_or_bytes)
122
+ reader = PdfReader(BytesIO(decrypted_bytes))
123
+ return reader
124
+
125
+ if not os.path.isfile(path_or_bytes):
126
+ raise FileNotFoundError(str(path_or_bytes))
127
+ file_name = os.path.split(path_or_bytes)[1]
101
128
  if not is_file_extension(file_name, ".pdf"):
102
129
  raise FileExtensionError(f"must be a pdf file: {file_name}")
103
130
 
104
- with open(path, "rb") as file:
131
+ with open(path_or_bytes, "rb") as file:
105
132
  qpdf_called = False
106
133
  try:
107
- input_pdf_as_bytes = PdfReader(file)
134
+ reader = PdfReader(file)
108
135
  except (errors.PdfReadError, AttributeError):
109
- _ = decrypt_pdf_document(path)
136
+ _ = decrypt_pdf_document(path_or_bytes)
110
137
  qpdf_called = True
111
138
 
112
139
  if not qpdf_called:
113
- if input_pdf_as_bytes.is_encrypted:
114
- is_decrypted = decrypt_pdf_document(path)
140
+ if reader.is_encrypted:
141
+ is_decrypted = decrypt_pdf_document(path_or_bytes)
115
142
  if not is_decrypted:
116
143
  logger.error(
117
144
  LoggingRecord(
118
- f"pdf document {path} cannot be decrypted and therefore cannot be " f"processed further."
145
+ f"pdf document {path_or_bytes} cannot be decrypted and therefore cannot "
146
+ f"be processed further."
119
147
  )
120
148
  )
121
149
  sys.exit()
122
150
 
123
- return PdfReader(os.fspath(path))
151
+ return PdfReader(os.fspath(path_or_bytes))
124
152
 
125
153
 
126
154
  def get_pdf_file_writer() -> PdfWriter:
@@ -157,11 +185,11 @@ class PDFStreamer:
157
185
 
158
186
  """
159
187
 
160
- def __init__(self, path: PathLikeOrStr) -> None:
188
+ def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes]) -> None:
161
189
  """
162
- :param path: to a pdf.
190
+ :param path_or_bytes: path to a pdf, or the pdf as bytes.
163
191
  """
164
- self.file_reader = get_pdf_file_reader(path)
192
+ self.file_reader = get_pdf_file_reader(path_or_bytes)
165
193
  self.file_writer = PdfWriter()
166
194
 
167
195
  def __len__(self) -> int:
@@ -312,6 +312,7 @@ class VizPackageHandler:
312
312
  "interactive_imshow": "_cv2_interactive_imshow",
313
313
  "encode": "_cv2_encode",
314
314
  "rotate_image": "_cv2_rotate_image",
315
+ "convert_bytes_to_np": "_cv2_convert_bytes_to_np",
315
316
  },
316
317
  "pillow": {
317
318
  "read_image": "_pillow_read_image",
@@ -325,6 +326,7 @@ class VizPackageHandler:
325
326
  "interactive_imshow": "_pillow_interactive_imshow",
326
327
  "encode": "_pillow_encode",
327
328
  "rotate_image": "_pillow_rotate_image",
329
+ "convert_bytes_to_np": "_pillow_convert_bytes_to_np",
328
330
  },
329
331
  }
330
332
 
@@ -484,6 +486,37 @@ class VizPackageHandler:
484
486
  pil_image = Image.open(im_file)
485
487
  return np.array(pil_image)[:, :, ::-1]
486
488
 
489
+ def convert_bytes_to_np(self, image_bytes: bytes) -> PixelValues:
490
+ """Converting an image as bytes into np.array
491
+
492
+ :param image_bytes: Image as np.array
493
+ """
494
+ return getattr(self, self.pkg_func_dict["convert_bytes_to_np"])(image_bytes)
495
+
496
+ @staticmethod
497
+ def _cv2_convert_bytes_to_np(image_bytes: bytes) -> PixelValues:
498
+ """
499
+ Convert image bytes to a numpy array using OpenCV.
500
+
501
+ :param image_bytes: Image bytes
502
+ :return: Image as numpy array
503
+ """
504
+ np_array = np.frombuffer(image_bytes, np.uint8)
505
+ np_image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
506
+ return np_image
507
+
508
+ @staticmethod
509
+ def _pillow_convert_bytes_to_np(image_bytes: bytes) -> PixelValues:
510
+ """
511
+ Convert image bytes to a numpy array using Pillow.
512
+
513
+ :param image_bytes: Image bytes
514
+ :return: Image as numpy array
515
+ """
516
+ image = Image.open(BytesIO(image_bytes))
517
+ np_image = np.array(image)
518
+ return np_image
519
+
487
520
  def resize(self, image: PixelValues, width: int, height: int, interpolation: str) -> PixelValues:
488
521
  """
489
522
  Resize a given image to new width, height. Specifying an interpolation method is required. Depending on the
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepdoctection
3
- Version: 0.36
3
+ Version: 0.37
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -1,4 +1,4 @@
1
- deepdoctection/__init__.py,sha256=fNUbaFAlK1JUXgPCmTu2UOLUMqW4HIgkaW4uOUYjYYg,12571
1
+ deepdoctection/__init__.py,sha256=7VELexCFRaBTCXHQpBoKhVi4hqUUgpcsLTqvHXHjufQ,12651
2
2
  deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  deepdoctection/analyzer/__init__.py,sha256=icClxrd20XutD6LxLgEPIWceSs4j_QfI3szCE-9BL2w,729
4
4
  deepdoctection/analyzer/_config.py,sha256=NZl_REM8Ge2xfxvHN-mZR5KURcHfZii3xfMlKQwckbA,4864
@@ -11,15 +11,15 @@ deepdoctection/dataflow/__init__.py,sha256=CWRHMpmJaPk4xY_oIIFubCt-z11SguWrMWxHZ
11
11
  deepdoctection/dataflow/base.py,sha256=z4DCComSj5wStEPjtk0093cNNGfUMiDqx8dqz36nS_o,6221
12
12
  deepdoctection/dataflow/common.py,sha256=MyGA2VWlNMjQdIN_Jd-o0Ec3bDJmjQit4Nv0v43OCSQ,10119
13
13
  deepdoctection/dataflow/custom.py,sha256=3CK_1oL9p6nbOq8WtH5_vQUo70_8Z8pXY7kG0OFqzug,6803
14
- deepdoctection/dataflow/custom_serialize.py,sha256=CKeyw2Ayq_qAl0O5BoKkIOFJgteCt78h9QFTI23XhmQ,22818
14
+ deepdoctection/dataflow/custom_serialize.py,sha256=WocuiYo2gkih5Z9lWAoIIfUewwYSDOhHzG7ZZjKlUic,22827
15
15
  deepdoctection/dataflow/parallel_map.py,sha256=8FhxJBWV-kjJrJ27jQtP3yYF6Ev6rz98worO60oi96c,15837
16
16
  deepdoctection/dataflow/serialize.py,sha256=4pYC7m9h53JCu99waVeKpHDpsCDDdYCrSZpP2QYSsgs,4555
17
17
  deepdoctection/dataflow/stats.py,sha256=Bsr6v7lcesKXUYtO9wjqlzx_Yq_uyIF3Lel-tQ0i4wI,9619
18
18
  deepdoctection/datapoint/__init__.py,sha256=3K406GbOPhoEp8koVaSbMocmSsmWifnSZ1SPb7C1lOY,1643
19
19
  deepdoctection/datapoint/annotation.py,sha256=FEgz4COxVDfjic0gG7kS6iHnWLBIgFnquQ63Cbj2a4Y,22531
20
20
  deepdoctection/datapoint/box.py,sha256=tkFuVM6xfx2jL7W4UED4qHXV572LSRdIsVJbrEiyIxI,23524
21
- deepdoctection/datapoint/convert.py,sha256=Gw2IjNiEotPu1yuMZqrIYB0mCAwafKt-VgMnrHj6S7U,6808
22
- deepdoctection/datapoint/image.py,sha256=EvZlVwJjMAcL1z8RNPBvZ8fwdJvkGuGpcFxCP1y26Go,33045
21
+ deepdoctection/datapoint/convert.py,sha256=O7920pIomyEkzXwxpFsrzfhn7Pl6UzVGhNzv90VcuKU,7099
22
+ deepdoctection/datapoint/image.py,sha256=DIXXXD2yKsacg47Wt_GEYEIe1MQkrd06Yr5xAWv_n64,33047
23
23
  deepdoctection/datapoint/view.py,sha256=1rVMuqucCrI5zlwyXMADJQBV38V_zSNFqFyBi3cMA1E,44914
24
24
  deepdoctection/datasets/__init__.py,sha256=-A3aR90aDsHPmVM35JavfnQ2itYSCn3ujl4krRni1QU,1076
25
25
  deepdoctection/datasets/adapter.py,sha256=Ly_vbOAgVI73V41FUccnSX1ECTOyesW_qsuvQuvOZbw,7796
@@ -27,7 +27,7 @@ deepdoctection/datasets/base.py,sha256=DT4i-d74sIEiUNC6UspIHNJuHSK0t1dBv7qwadg4r
27
27
  deepdoctection/datasets/dataflow_builder.py,sha256=cYU2zV3gZW2bFvMHimlO9VIl3BAUaCwML08cCIQ8Em4,4107
28
28
  deepdoctection/datasets/info.py,sha256=6y5TfiUhQppynbMFP5JmUPk95ggsVCtGIw4dYh2lVus,20501
29
29
  deepdoctection/datasets/registry.py,sha256=ZjzVzjsCgNXJuZZZtR98_yKocADmh4EBGV5JqJbGjWk,2543
30
- deepdoctection/datasets/save.py,sha256=khYQ4t94FOu9RWMimP9E4kASq25f61SIow78NHaX1pg,3349
30
+ deepdoctection/datasets/save.py,sha256=Y9508Qqp8gIGN7pbGgVBBnkiC6NdCb9L2YR4wVvEUxM,3350
31
31
  deepdoctection/datasets/instances/__init__.py,sha256=XEc_4vT5lDn6bbZID9ujDEumWu8Ec2W-QS4pI_bfWWE,1388
32
32
  deepdoctection/datasets/instances/doclaynet.py,sha256=wRZT7wMTilZBLZ1gKY2cWReD1EGT735vOOTy0pD0N6M,12038
33
33
  deepdoctection/datasets/instances/fintabnet.py,sha256=qYzFK1dWF6MEPkHamP255DvAzlQT_GnkvDe1aM7CgjA,12006
@@ -50,7 +50,7 @@ deepdoctection/eval/registry.py,sha256=v4mp-s67vBVRu1nQzuGlYPViQnMSeIXEcF_WmvfUC
50
50
  deepdoctection/eval/tedsmetric.py,sha256=rKw-734Y9CpBtIfkBSPQF2vAZxnIdWrI9Zc723P7RxI,9529
51
51
  deepdoctection/eval/tp_eval_callback.py,sha256=SXsXumoyxq-MIH9Cep5eUOwnNshMbKmC6mYOGwCg0pM,5283
52
52
  deepdoctection/extern/__init__.py,sha256=9Iks9b4Q_LynjcV167TVCoK8YsQRUcA2jjmAmDNA_X8,1056
53
- deepdoctection/extern/base.py,sha256=ajzFzD9BrFwnly4SziN8PadI-PBOzzVRlIGPm_sNllE,24142
53
+ deepdoctection/extern/base.py,sha256=ONPgappl_P5HSwQr42FatuRnwMTvUPecPsCztDTN0Hw,24108
54
54
  deepdoctection/extern/d2detect.py,sha256=zrKv1yurApnjD7QZIZk_8LYCahjmN82MQUjHjv8zvkQ,22127
55
55
  deepdoctection/extern/deskew.py,sha256=sPoixu8S9he-0wbs-jgxtPE2V9BiP4-3uZlb6F5Y1SA,3077
56
56
  deepdoctection/extern/doctrocr.py,sha256=T3_tvlih22_dVCBZypS1Y8tjQQB1fkAxIbGdUGHIapQ,24473
@@ -95,7 +95,7 @@ deepdoctection/mapper/hfstruct.py,sha256=2PjGKsYturVJBimLT1CahYh09KSRAFEHz_QNtC1
95
95
  deepdoctection/mapper/laylmstruct.py,sha256=abMZkYU2W0e_VcCm_c0ZXNFuv-lfMFWcTedcZS5EYvE,42935
96
96
  deepdoctection/mapper/maputils.py,sha256=eI6ZcDg9W5uB6xQNBZpMIdEd86HlCxTtkJuyROdTqiw,8146
97
97
  deepdoctection/mapper/match.py,sha256=pCWZpz2R8JahiKXCw7dxKRTLiPgJXeVDgkddDPLy_c0,9643
98
- deepdoctection/mapper/misc.py,sha256=rCqHOcsCfVPXs36AWK0rZ2kk0CUM3yXV370_zyIGBJ4,6518
98
+ deepdoctection/mapper/misc.py,sha256=NLSSgk066Tkrrdi075HkqV7cP-iqT9fv_MtyAJ-8gOg,6743
99
99
  deepdoctection/mapper/pascalstruct.py,sha256=TzVU1p0oiw0nOuxTFFbEB9vXJxH1v6VUvTJ7MD0manU,3828
100
100
  deepdoctection/mapper/prodigystruct.py,sha256=Re4Sd_zAp6qOvbXZLmMJeG0IGEfMQxebuyDeZgMcTa8,6827
101
101
  deepdoctection/mapper/pubstruct.py,sha256=YxsrZ-E0pD45Mm_VCPQB9yEgHsTPkw4htt-3DwCRX1k,23361
@@ -106,7 +106,7 @@ deepdoctection/pipe/anngen.py,sha256=3319l4aaXzcY4w6ItVBNPX8LGS5fHFDVtyVY9KMefac
106
106
  deepdoctection/pipe/base.py,sha256=Davjkf3D837y9AIITcx7yXdebmVaz6Moyw_5Wi3nfmg,13561
107
107
  deepdoctection/pipe/common.py,sha256=haOb4v0jLX3r41BSC8cVseX2E320_HkSrGlZsQiKE2g,17728
108
108
  deepdoctection/pipe/concurrency.py,sha256=AAKRsVgaBEYNluntbDa46SBF1JZ_XqnWLDSWrNvAzEo,9657
109
- deepdoctection/pipe/doctectionpipe.py,sha256=I6B6HT_BG2ByQ3Rjsui3-Ct31yLmodx-iuZnujXaiSc,8953
109
+ deepdoctection/pipe/doctectionpipe.py,sha256=uhsrSuwaHcOMj8b8i6wCpPaZlSxCTaeHVhMokJ8vRSI,11835
110
110
  deepdoctection/pipe/language.py,sha256=5zI0UQC6Fh12_r2pfVL42HoCGz2hpHrOhpXAn5m-rYw,5451
111
111
  deepdoctection/pipe/layout.py,sha256=xIhnJpyUSbvLbhTXyAKXY1hmG9352jihGYFSclTH_1g,5567
112
112
  deepdoctection/pipe/lm.py,sha256=Sp-b7smeslNDyioEfNjuNBUxAuFKn3-OKpCZkGXri_c,16643
@@ -129,20 +129,20 @@ deepdoctection/utils/develop.py,sha256=4HyTarkFbJwctL-Hgu1TU_LSJppHvaroDbcyHsxhI
129
129
  deepdoctection/utils/env_info.py,sha256=TnCA-LOTj4WIHd9yvn1AaoPWsLmPgc42l-BJmGV6zmM,19147
130
130
  deepdoctection/utils/error.py,sha256=_3q9VepKfEhsM3H033_Fu0hwBzMSjsWALsjyJbGAZr8,2367
131
131
  deepdoctection/utils/file_utils.py,sha256=IRElrcND0YEiU1QELw5hfXeNA39uE2_nyzh9-X7YcxI,19477
132
- deepdoctection/utils/fs.py,sha256=C4ktrzjoVtX9kgycv5YrEigDI9byi65b6_D0aKsGM4Y,10161
132
+ deepdoctection/utils/fs.py,sha256=x842BxUP5bbjJ2cofw-g4dKJv4QAaGzda4qnAazabO4,10281
133
133
  deepdoctection/utils/identifier.py,sha256=QkNaGGqPynHwDPnd3_m8iur4Cv64rcQa7qolCE7Qphk,2159
134
134
  deepdoctection/utils/logger.py,sha256=J0OVKiXP_2A82MWbbJoOeMEJ-75aZu5npgaS_yI6mVA,10003
135
135
  deepdoctection/utils/metacfg.py,sha256=hD76KQ_RnD_5B02qLI2Zxf3WfnsnXhEI_KUTKpw91RI,5711
136
136
  deepdoctection/utils/mocks.py,sha256=IkN3-IzAl4eX0ibgKIHg8IY7ykVw6BnpF6XnxKnKaZI,2389
137
- deepdoctection/utils/pdf_utils.py,sha256=OAQjE9xHVNcDsFqAvX47Lu-mgmoMpVXqIf5pOK8AwxY,11595
137
+ deepdoctection/utils/pdf_utils.py,sha256=G0m8kUn2HwwyZWH_BcrDkm-m3MP9GN9SWHj5VhB7swY,12845
138
138
  deepdoctection/utils/settings.py,sha256=k6OyuWbj-IPeaO9zT9RZ-5Yad1wNhWGYqGLZdtgXAZY,12464
139
139
  deepdoctection/utils/tqdm.py,sha256=cBUtR0L1x0KMeYrLP2rrzyzCamCjpQAKroHXLv81_pk,1820
140
140
  deepdoctection/utils/transform.py,sha256=3kCgsEeRkG1efCdkfvj7tUFMs-e2jbjbflq826F2GPU,8502
141
141
  deepdoctection/utils/types.py,sha256=_3dmPdCIZNLbgU5QP5k_c5phDf18xLe1kYL6t2nM45s,2953
142
142
  deepdoctection/utils/utils.py,sha256=csVs_VvCq4QBETPoE2JdTTL4MFYnD4xh-Js5vRb612g,6492
143
- deepdoctection/utils/viz.py,sha256=Mok1d0V7NwlhAvO1S1Iq5YitKpVmOfH_XHTSlRelCB0,25902
144
- deepdoctection-0.36.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
145
- deepdoctection-0.36.dist-info/METADATA,sha256=E-zXgx0bTdSqbd88D_abscR_poEJaKJGIwlv2RFbQs8,19543
146
- deepdoctection-0.36.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
147
- deepdoctection-0.36.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
148
- deepdoctection-0.36.dist-info/RECORD,,
143
+ deepdoctection/utils/viz.py,sha256=Jf8ePNYWlpuyaS6SeTYQ4OyA3eNhtgjvAQZnGNdgHC0,27051
144
+ deepdoctection-0.37.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
145
+ deepdoctection-0.37.dist-info/METADATA,sha256=0qGgmf07xmNRJx55yfMagHcfAoQG6GO9KTw6b0tv0uA,19543
146
+ deepdoctection-0.37.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
147
+ deepdoctection-0.37.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
148
+ deepdoctection-0.37.dist-info/RECORD,,