deepdoctection 0.35__py3-none-any.whl → 0.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -25,6 +25,7 @@ from copy import copy
 from typing import Any, Mapping, Optional, Sequence, Type, TypedDict, Union, no_type_check
 
 import numpy as np
+from typing_extensions import LiteralString
 
 from ..utils.error import AnnotationError, ImageError
 from ..utils.logger import LoggingRecord, logger
@@ -40,10 +41,12 @@ from ..utils.settings import (
     WordType,
     get_type,
 )
+from ..utils.transform import ResizeTransform
 from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, Text_, csv
 from ..utils.viz import draw_boxes, interactive_imshow, viz_handler
 from .annotation import CategoryAnnotation, ContainerAnnotation, ImageAnnotation, ann_from_dict
 from .box import BoundingBox, crop_box_from_image
+from .convert import box_to_point4, point4_to_box
 from .image import Image
 
 
@@ -101,7 +104,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
             return np_image
         raise AnnotationError(f"base_page.image is None for {self.annotation_id}")
 
-    def __getattr__(self, item: str) -> Optional[Union[str, int, list[str]]]:
+    def __getattr__(self, item: str) -> Optional[Union[str, int, list[str], list[ImageAnnotationBaseView]]]:
         """
         Get attributes defined by registered `self.get_attribute_names()` in a multi step process:
 
@@ -126,6 +129,9 @@ class ImageAnnotationBaseView(ImageAnnotation):
             if isinstance(sub_cat, ContainerAnnotation):
                 return sub_cat.value
             return sub_cat.category_id
+        if item in self.relationships:
+            relationship_ids = self.get_relationship(get_type(item))
+            return self.base_page.get_annotation(annotation_ids=relationship_ids)
         if self.image is not None:
             if item in self.image.summary.sub_categories:
                 sub_cat = self.get_summary(get_type(item))
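The new branch above lets attribute lookup fall through to relationships: if the requested name is a registered relationship, the linked annotations are pulled from the base page. A hedged sketch of what this enables, assuming the pipeline has written a `Relationships.LAYOUT_LINK` relationship and that the enum member maps to the attribute name `layout_link` (both assumptions, not shown in this diff):

# `word` is any ImageAnnotationBaseView from a parsed page, e.g. page.layouts[0].words[0]
linked = word.layout_link  # resolved via get_relationship() and base_page.get_annotation()
if linked:
    print([ann.category_name for ann in linked])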
@@ -165,7 +171,11 @@ class Word(ImageAnnotationBaseView):
     """
 
     def get_attribute_names(self) -> set[str]:
-        return set(WordType).union(super().get_attribute_names()).union({Relationships.READING_ORDER})
+        return (
+            set(WordType)
+            .union(super().get_attribute_names())
+            .union({Relationships.READING_ORDER, Relationships.LAYOUT_LINK})
+        )
 
 
 class Layout(ImageAnnotationBaseView):
@@ -246,7 +256,11 @@ class Layout(ImageAnnotationBaseView):
     }
 
     def get_attribute_names(self) -> set[str]:
-        return {"words", "text"}.union(super().get_attribute_names()).union({Relationships.READING_ORDER})
+        return (
+            {"words", "text"}
+            .union(super().get_attribute_names())
+            .union({Relationships.READING_ORDER, Relationships.LAYOUT_LINK})
+        )
 
     def __len__(self) -> int:
         """len of text counted by number of characters"""
@@ -433,8 +447,8 @@ class ImageDefaults(TypedDict):
     """ImageDefaults"""
 
     text_container: LayoutType
-    floating_text_block_categories: tuple[LayoutType, ...]
-    text_block_categories: tuple[LayoutType, ...]
+    floating_text_block_categories: tuple[Union[LayoutType, CellType], ...]
+    text_block_categories: tuple[Union[LayoutType, CellType], ...]
 
 
 IMAGE_DEFAULTS: ImageDefaults = {
@@ -448,9 +462,13 @@ IMAGE_DEFAULTS: ImageDefaults = {
     "text_block_categories": (
         LayoutType.TEXT,
         LayoutType.TITLE,
-        LayoutType.FIGURE,
         LayoutType.LIST,
         LayoutType.CELL,
+        LayoutType.FIGURE,
+        CellType.COLUMN_HEADER,
+        CellType.PROJECTED_ROW_HEADER,
+        CellType.SPANNING,
+        CellType.ROW_HEADER,
     ),
 }
 
@@ -510,6 +528,8 @@ class Page(Image):
         "document_id",
         "page_number",
         "angle",
+        "figures",
+        "residual_layouts",
     }
     include_residual_text_container: bool = True
 
@@ -608,6 +628,41 @@ class Page(Image):
         """
         return self.get_annotation(category_names=LayoutType.TABLE)
 
+    @property
+    def figures(self) -> list[ImageAnnotationBaseView]:
+        """
+        A list of all figures.
+        """
+        return self.get_annotation(category_names=LayoutType.FIGURE)
+
+    @property
+    def residual_layouts(self) -> list[ImageAnnotationBaseView]:
+        """
+        A list of all residual layouts. Residual layouts are all layouts that are
+        - not floating text blocks,
+        - not text containers,
+        - not tables,
+        - not figures,
+        - not cells,
+        - not rows,
+        - not columns
+        """
+        return self.get_annotation(category_names=self._get_residual_layout())
+
+    def _get_residual_layout(self) -> list[LiteralString]:
+        layouts = copy(list(self.floating_text_block_categories))
+        layouts.extend(
+            [
+                LayoutType.TABLE,
+                LayoutType.FIGURE,
+                self.text_container,
+                LayoutType.CELL,
+                LayoutType.ROW,
+                LayoutType.COLUMN,
+            ]
+        )
+        return [layout for layout in LayoutType if layout not in layouts]
+
     @classmethod
     def from_image(
         cls,
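The two new `Page` properties make figures and left-over layout sections directly accessible. A minimal usage sketch, assuming a page produced by the standard analyzer (`get_dd_analyzer`, `analyze` and the `reset_state` iteration pattern are not part of this diff):

import deepdoctection as dd

analyzer = dd.get_dd_analyzer()
df = analyzer.analyze(path="sample.pdf")  # placeholder path
df.reset_state()
page = next(iter(df))

for figure in page.figures:  # every LayoutType.FIGURE annotation of the page
    print(figure.category_name, figure.bbox)

for layout in page.residual_layouts:  # layouts that are neither text blocks, tables, figures, cells, rows nor columns
    print(layout.category_name, layout.bbox)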
@@ -801,12 +856,15 @@ class Page(Image):
         self,
         show_tables: bool = True,
         show_layouts: bool = True,
+        show_figures: bool = False,
+        show_residual_layouts: bool = False,
         show_cells: bool = True,
         show_table_structure: bool = True,
         show_words: bool = False,
         show_token_class: bool = True,
         ignore_default_token_class: bool = False,
         interactive: bool = False,
+        scaled_width: int = 600,
         **debug_kwargs: str,
     ) -> Optional[PixelValues]:
         """
@@ -827,12 +885,14 @@ class Page(Image):
 
         :param show_tables: Will display all tables boxes as well as cells, rows and columns
         :param show_layouts: Will display all other layout components.
+        :param show_figures: Will display all figures
         :param show_cells: Will display cells within tables. (Only available if `show_tables=True`)
         :param show_table_structure: Will display rows and columns
         :param show_words: Will display bounding boxes around words labeled with token class and bio tag (experimental)
         :param show_token_class: Will display token class instead of token tags (i.e. token classes with tags)
         :param interactive: If set to True will open an interactive image, otherwise it will return a numpy array that
                             can be displayed differently.
+        :param scaled_width: Width of the image to display
         :param ignore_default_token_class: Will ignore displaying word bounding boxes with default or None token class
                                            label
         :return: If `interactive=False` will return a numpy array.
@@ -858,6 +918,11 @@ class Page(Image):
                 box_stack.append(item.bbox)
                 category_names_list.append(item.category_name.value)
 
+        if show_figures and not debug_kwargs:
+            for item in self.figures:
+                box_stack.append(item.bbox)
+                category_names_list.append(item.category_name.value)
+
         if show_tables and not debug_kwargs:
             for table in self.tables:
                 box_stack.append(table.bbox)
@@ -914,24 +979,34 @@ class Page(Image):
                 else:
                     category_names_list.append(word.token_tag.value if word.token_tag is not None else None)
 
+        if show_residual_layouts and not debug_kwargs:
+            for item in self.residual_layouts:
+                box_stack.append(item.bbox)
+                category_names_list.append(item.category_name.value)
+
         if self.image is not None:
+            scale_fx = scaled_width / self.width
+            scaled_height = int(self.height * scale_fx)
+            img = viz_handler.resize(self.image, scaled_width, scaled_height, "VIZ")
+
             if box_stack:
                 boxes = np.vstack(box_stack)
+                boxes = box_to_point4(boxes)
+                resizer = ResizeTransform(self.height, self.width, scaled_height, scaled_width, "VIZ")
+                boxes = resizer.apply_coords(boxes)
+                boxes = point4_to_box(boxes)
                 if show_words:
                     img = draw_boxes(
-                        self.image,
-                        boxes,
-                        category_names_list,
+                        np_image=img,
+                        boxes=boxes,
+                        category_names_list=category_names_list,
                         font_scale=1.0,
                         rectangle_thickness=4,
                     )
                 else:
-                    img = draw_boxes(self.image, boxes, category_names_list)
-                scale_fx, scale_fy = 1.3, 1.3
-                scaled_width, scaled_height = int(self.width * scale_fx), int(self.height * scale_fy)
-                img = viz_handler.resize(img, scaled_width, scaled_height, "VIZ")
-            else:
-                img = self.image
+                    img = draw_boxes(
+                        np_image=img, boxes=boxes, category_names_list=category_names_list, show_palette=False
+                    )
 
             if interactive:
                 interactive_imshow(img)
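`viz` now resizes the rendered page to `scaled_width` and maps all boxes through `ResizeTransform` instead of applying a fixed 1.3 scaling factor. Reusing the `page` object from the sketch above, a hedged example of the new switches (matplotlib is assumed for display):

np_image = page.viz(
    show_figures=True,           # new in 0.37
    show_residual_layouts=True,  # new in 0.37
    scaled_width=800,            # output width in pixels; boxes are rescaled to match
)

from matplotlib import pyplot as plt

plt.figure(figsize=(12, 17))
plt.axis("off")
plt.imshow(np_image)
plt.show()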
@@ -62,7 +62,7 @@ def dataflow_to_json(
     if highest_hierarchy_only:
 
         def _remove_hh(dp: Image) -> Image:
-            dp.remove_image_from_lower_hierachy()
+            dp.remove_image_from_lower_hierarchy()
            return dp
 
         df = MapData(df, _remove_hh)
@@ -71,8 +71,8 @@ https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeva
 
 
 def _summarize( # type: ignore
-    self, ap: int = 1, iouThr: float = 0.9, areaRng: str = "all", maxDets: int = 100
-) -> float:
+    self, ap: int = 1, iouThr: float = 0.9, areaRng: str = "all", maxDets: int = 100, per_category: bool = False
+) -> Union[float, list[float]]:
     # pylint: disable=C0103
     p = self.params
     iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
@@ -86,6 +86,36 @@ def _summarize( # type: ignore
 
     aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
     mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
+    if per_category:
+        if ap == 1:
+            s = self.eval["precision"]
+            num_classes = s.shape[2]
+            results_per_class = []
+            for idx in range(num_classes):
+                if iouThr is not None:
+                    s = self.eval["precision"]
+                    t = np.where(iouThr == p.iouThrs)[0]
+                    s = s[t]
+                precision = s[:, :, idx, aind, mind]
+                precision = precision[precision > -1]
+                res = np.mean(precision) if precision.size else float("nan")
+                results_per_class.append(float(res))
+                print(f"Precision for class {idx+1}: @[ IoU={iouStr} | area={areaRng} | maxDets={maxDets} ] = {res}")
+        else:
+            s = self.eval["recall"]
+            num_classes = s.shape[1]
+            results_per_class = []
+            for idx in range(num_classes):
+                if iouThr is not None:
+                    s = self.eval["recall"]
+                    t = np.where(iouThr == p.iouThrs)[0]
+                    s = s[t]
+                recall = s[:, idx, aind, mind]
+                recall = recall[recall > -1]
+                res = np.mean(recall) if recall.size else float("nan")
+                results_per_class.append(float(res))
+                print(f"Recall for class {idx+1}: @[ IoU={iouStr} | area={areaRng} | maxDets={maxDets} ] = {res}")
+        return results_per_class
     if ap == 1:
         # dimension of precision: [TxRxKxAxM]
         s = self.eval["precision"]
@@ -124,6 +154,7 @@ class CocoMetric(MetricBase):
     mapper = image_to_coco
     _f1_score = None
    _f1_iou = None
+    _per_category = False
     _params: dict[str, Union[list[int], list[list[int]]]] = {}
 
     @classmethod
@@ -176,18 +207,28 @@
 
         if cls._f1_score:
             summary_bbox = [
-                metric.summarize_f1(1, cls._f1_iou, maxDets=metric.params.maxDets[2]),
-                metric.summarize_f1(0, cls._f1_iou, maxDets=metric.params.maxDets[2]),
+                metric.summarize_f1(1, cls._f1_iou, maxDets=metric.params.maxDets[2], per_category=cls._per_category),
+                metric.summarize_f1(0, cls._f1_iou, maxDets=metric.params.maxDets[2], per_category=cls._per_category),
             ]
         else:
             metric.summarize()
             summary_bbox = metric.stats
 
         results = []
-        for params, value in zip(cls.get_summary_default_parameters(), summary_bbox):
+
+        default_parameters = cls.get_summary_default_parameters()
+        if cls._per_category:
+            default_parameters = default_parameters * len(summary_bbox[0])
+            summary_bbox = [item for pair in zip(*summary_bbox) for item in pair]
+        val = 0
+        for idx, (params, value) in enumerate(zip(default_parameters, summary_bbox)):
             params = copy(params)
             params["mode"] = "bbox"
             params["val"] = value
+            if cls._per_category:
+                if idx % 2 == 0:
+                    val += 1
+                params["category_id"] = val
             results.append(params)
 
         return results
@@ -201,15 +242,16 @@
         area range and maximum detections.
         """
         if cls._f1_score:
+            for el, idx in zip(_F1_DEFAULTS, [2, 2]):
+                if cls._params:
+                    if cls._params.get("maxDets") is not None:
+                        el["maxDets"] = cls._params["maxDets"][idx]
+                el["iouThr"] = cls._f1_iou
+            return _F1_DEFAULTS
+
+        for el, idx in zip(_COCOEVAL_DEFAULTS, _MAX_DET_INDEX):
             if cls._params:
                 if cls._params.get("maxDets") is not None:
-                    for el, idx in zip(_F1_DEFAULTS, [2, 2]):
-                        el["maxDets"] = cls._params["maxDets"][idx]
-                        el["iouThr"] = cls._f1_iou
-                    return _F1_DEFAULTS
-        if cls._params:
-            if cls._params.get("maxDets") is not None:
-                for el, idx in zip(_COCOEVAL_DEFAULTS, _MAX_DET_INDEX):
                     el["maxDets"] = cls._params["maxDets"][idx]
         return _COCOEVAL_DEFAULTS
 
@@ -220,13 +262,16 @@
         area_range: Optional[list[list[int]]] = None,
         f1_score: bool = False,
         f1_iou: float = 0.9,
+        per_category: bool = False,
     ) -> None:
         """
         Setting params for different coco metric modes.
 
         :param max_detections: The maximum number of detections to consider
         :param area_range: The area range to classify objects as "all", "small", "medium" and "large"
-        :param f1_score: Will use f1 score setting with default iouThr 0.9
+        :param f1_score: Will use f1 score setting with default iouThr 0.9. To be more precise it does not calculate
+                         the f1 score but the precision and recall for a given iou threshold. Use the harmonic mean to
+                         get the ultimate f1 score.
         :param f1_iou: Use with f1_score True and reset the f1 iou threshold
         """
         if max_detections is not None:
@@ -238,6 +283,7 @@
 
         cls._f1_score = f1_score
         cls._f1_iou = f1_iou
+        cls._per_category = per_category
 
     @classmethod
     def get_requirements(cls) -> list[Requirement]:
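The per-category switch threads from `set_params` through `summarize_f1` into the patched `_summarize`, which then reports one precision and one recall value per class. A sketch of the caller side; the top-level `dd.CocoMetric` re-export is an assumption, and the harmonic-mean step follows the hint in the `set_params` docstring rather than anything the metric computes itself:

import deepdoctection as dd

# report precision and recall per category at IoU 0.9 instead of one aggregate value
dd.CocoMetric.set_params(f1_score=True, f1_iou=0.9, per_category=True)


def f1(precision: float, recall: float) -> float:
    # harmonic mean of a precision/recall pair, as suggested by the docstring above
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0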
@@ -69,8 +69,7 @@ class ModelCategories:
         if self.init_categories:
             self._init_categories = MappingProxyType({key: get_type(val) for key, val in self.init_categories.items()})
         else:
-            if self._init_categories is None:
-                self._init_categories = MappingProxyType({})
+            self._init_categories = MappingProxyType({})
         self.categories = self._init_categories
 
     @overload
@@ -181,7 +180,7 @@ class NerModelCategories(ModelCategories):
             self._init_categories = self.merge_bio_semantics_categories(
                 self._categories_semantics, self._categories_bio
             )
-        super().__post_init__()
+        self.categories = self._init_categories
 
     @staticmethod
     def merge_bio_semantics_categories(
@@ -193,5 +193,7 @@ def match_anns_by_distance(
     child_anns = dp.get_annotation(annotation_ids=child_ann_ids, category_names=child_ann_category_names)
     child_centers = [block.get_bounding_box(dp.image_id).center for block in child_anns]
     parent_centers = [block.get_bounding_box(dp.image_id).center for block in parent_anns]
-    child_indices = distance.cdist(parent_centers, child_centers).argmin(axis=1)
-    return [(parent_anns[i], child_anns[j]) for i, j in enumerate(child_indices)]
+    if child_centers and parent_centers:
+        child_indices = distance.cdist(parent_centers, child_centers).argmin(axis=1)
+        return [(parent_anns[i], child_anns[j]) for i, j in enumerate(child_indices)]
+    return []
@@ -27,7 +27,7 @@ from typing import Mapping, Optional, Sequence, Union
 
 from lazy_imports import try_import
 
-from ..datapoint.convert import convert_pdf_bytes_to_np_array_v2
+from ..datapoint.convert import convert_bytes_to_np_array, convert_pdf_bytes_to_np_array_v2
 from ..datapoint.image import Image
 from ..utils.fs import get_load_image_func, load_image_from_file
 from ..utils.types import JsonDict
@@ -49,6 +49,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
 
     file_name: Optional[str]
     location: Optional[str]
+    image_bytes: Optional[bytes] = None
 
     if isinstance(dp, str):
         _, file_name = os.path.split(dp)
@@ -62,6 +63,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
         document_id = dp.get("document_id")
         if location == "":
             location = str(dp.get("path", ""))
+        image_bytes = dp.get("image_bytes")
     else:
         raise TypeError("datapoint not of expected type for converting to image")
 
@@ -76,6 +78,8 @@
     if dp_image.pdf_bytes is not None:
         if isinstance(dp_image.pdf_bytes, bytes):
             dp_image.image = convert_pdf_bytes_to_np_array_v2(dp_image.pdf_bytes, dpi=dpi)
+    elif image_bytes is not None:
+        dp_image.image = convert_bytes_to_np_array(image_bytes)
     else:
         dp_image.image = load_image_from_file(location)
 
@@ -23,31 +23,38 @@ import os
 from pathlib import Path
 from typing import List, Mapping, Optional, Sequence, Tuple, Union
 
-from ..dataflow import DataFlow, MapData
+from ..dataflow import CustomDataFromIterable, DataFlow, DataFromList, MapData
 from ..dataflow.custom_serialize import SerializerFiles, SerializerPdfDoc
 from ..datapoint.image import Image
 from ..datapoint.view import IMAGE_DEFAULTS
 from ..mapper.maputils import curry
 from ..mapper.misc import to_image
 from ..utils.fs import maybe_path_or_pdf
+from ..utils.identifier import get_uuid_from_str
 from ..utils.logger import LoggingRecord, logger
+from ..utils.pdf_utils import PDFStreamer
 from ..utils.types import PathLikeOrStr
+from ..utils.utils import is_file_extension
 from .base import Pipeline, PipelineComponent
 from .common import PageParsingService
 
 
 def _collect_from_kwargs(
-    **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
-) -> Tuple[Optional[str], Optional[str], bool, int, str, DataFlow]:
+    **kwargs: Union[Optional[str], bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
+) -> Tuple[Optional[str], Union[str, Sequence[str]], bool, int, str, DataFlow, Optional[bytes]]:
+    b_bytes = kwargs.get("bytes")
     dataset_dataflow = kwargs.get("dataset_dataflow")
     path = kwargs.get("path")
     if path is None and dataset_dataflow is None:
         raise ValueError("Pass either path or dataset_dataflow as argument")
+    if path is None and b_bytes:
+        raise ValueError("When passing bytes, a path to the source document must be provided")
 
     shuffle = kwargs.get("shuffle", False)
     if not isinstance(shuffle, bool):
         raise TypeError(f"shuffle must be of type bool but is of type {type(shuffle)}")
 
+    file_type = None
     doc_path = None
     if path:
         if not isinstance(path, (str, Path)):
@@ -56,15 +63,27 @@ def _collect_from_kwargs(
         if path_type == 2:
             doc_path = path
             path = None
+            file_type = ".pdf"
+        elif path_type == 3:
+            if is_file_extension(path, ".jpg"):
+                file_type = ".jpg"
+            if is_file_extension(path, ".png"):
+                file_type = ".png"
+            if is_file_extension(path, ".jpeg"):
+                file_type = ".jpeg"
+            if not b_bytes:
+                raise ValueError("When passing a path to a single image, bytes of the image must be passed")
         elif not path_type:
             raise ValueError("Pass only a path to a directory or to a pdf file")
 
-    file_type = kwargs.get("file_type", [".jpg", ".png", ".tif"])
+    file_type = kwargs.get(
+        "file_type", [".jpg", ".png", ".jpeg", ".tif"] if file_type is None else file_type  # type: ignore
+    )
 
     max_datapoints = kwargs.get("max_datapoints")
     if not isinstance(max_datapoints, (int, type(None))):
         raise TypeError(f"max_datapoints must be of type int, but is of type {type(max_datapoints)}")
-    return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow  # type: ignore
+    return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes  # type: ignore
 
 
 @curry
@@ -142,12 +161,18 @@ class DoctectionPipe(Pipeline):
 
         super().__init__(pipeline_component_list)
 
-    def _entry(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
-        path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow = _collect_from_kwargs(**kwargs)
+    def _entry(self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) \
+            -> DataFlow:
+        path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes = _collect_from_kwargs(**kwargs)
 
         df: DataFlow
 
-        if isinstance(path, (str, Path)):
+        if isinstance(b_bytes, bytes):
+            df = DoctectionPipe.bytes_to_dataflow(path=doc_path if path is None else path,
+                                                  b_bytes=b_bytes,
+                                                  file_type=file_type)
+
+        elif isinstance(path, (str, Path)):
             if not isinstance(file_type, (str, list)):
                 raise TypeError(f"file_type must be of type string or list, but is of type {type(file_type)}")
             df = DoctectionPipe.path_to_dataflow(path=path, file_type=file_type, shuffle=shuffle)
@@ -162,7 +187,7 @@
 
         df = MapData(df, _proto_process(path, doc_path))
         if dataset_dataflow is None:
-            df = MapData(df, _to_image(dpi=300))  # pylint: disable=E1120
+            df = MapData(df, _to_image(dpi=os.environ.get("DPI", 300)))  # pylint: disable=E1120
         return df
 
     @staticmethod
@@ -197,6 +222,44 @@
         """
         return _doc_to_dataflow(path, max_datapoints)
 
+    @staticmethod
+    def bytes_to_dataflow(
+        path: str, b_bytes: bytes, file_type: Union[str, Sequence[str]], max_datapoints: Optional[int] = None
+    ) -> DataFlow:
+        """
+        Converts a bytes object to a dataflow
+
+        :param path: path to directory or an image file
+        :param b_bytes: bytes object
+        :param file_type: e.g. ".pdf", ".jpg" or [".jpg", ".png", ".jpeg", ".tif"]
+        :param max_datapoints: max number of datapoints to consider
+        :return: DataFlow
+        """
+
+        file_name = os.path.split(path)[1]
+        if isinstance(file_type, str):
+            if file_type == ".pdf":
+                prefix, suffix = os.path.splitext(file_name)
+                df: DataFlow
+                df = CustomDataFromIterable(PDFStreamer(path_or_bytes=b_bytes), max_datapoints=max_datapoints)
+                df = MapData(
+                    df,
+                    lambda dp: {
+                        "path": path,
+                        "file_name": prefix + f"_{dp[1]}" + suffix,
+                        "pdf_bytes": dp[0],
+                        "page_number": dp[1],
+                        "document_id": get_uuid_from_str(prefix),
+                    },
+                )
+            else:
+                df = DataFromList(lst=[{"path": path, "file_name": file_name, "image_bytes": b_bytes}])
+            return df
+        raise ValueError(
+            f"pass: {path}, b_bytes: {b_bytes!r}, file_type: {file_type} and max_datapoints: {max_datapoints} "
+            f"not supported"
+        )
+
     def dataflow_to_page(self, df: DataFlow) -> DataFlow:
         """
         Converts a dataflow of images to a dataflow of pages
@@ -206,7 +269,9 @@
         """
         return self.page_parser.predict_dataflow(df)
 
-    def analyze(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
+    def analyze(
+        self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
+    ) -> DataFlow:
         """
         `kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
 
@@ -215,6 +280,8 @@
                               only the first page is processed through the pipeline.
                               Alternatively, a path to a pdf document with multiple pages.
 
+        `kwargs key bytes:` A bytes object of an image
+
         `kwargs key file_type:` Selection of the file type, if: args:`file_type` is passed
 
         `kwargs key max_datapoints:` Stops processing as soon as max_datapoints images have been processed
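With the new `bytes` kwarg a document that is already in memory can be fed to the pipeline; the path is still required so that file name and type can be derived. A minimal sketch (the analyzer construction is assumed and not part of this diff):

from pathlib import Path

import deepdoctection as dd

analyzer = dd.get_dd_analyzer()

pdf_path = "sample.pdf"  # placeholder; used only to derive file name and type
pdf_bytes = Path(pdf_path).read_bytes()

df = analyzer.analyze(path=pdf_path, bytes=pdf_bytes)
df.reset_state()
for page in df:
    print(page.file_name, len(page.text))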
@@ -227,20 +227,21 @@ def get_load_image_func(
 
 def maybe_path_or_pdf(path: PathLikeOrStr) -> int:
     """
-    Checks if the path points to a directory or a pdf document. Returns 1 if the path points to a directory, 2
-    if the path points to a pdf doc or 0, if none of the previous is true.
+    Checks if the path points to a directory, a pdf document or a single image. Returns 1 if the path points to a
+    directory, 2 if the path points to a pdf doc and 3 if path points to either a PNG, JPG or JPEG or 0 if none of the
+    previous is true.
 
     :param path: A path
-    :return: A value of 0,1,2
+    :return: A value of 0,1,2,3
     """
 
-    is_dir = os.path.isdir(path)
-    if is_dir:
+    if os.path.isdir(path):
         return 1
     file_name = os.path.split(path)[1]
-    is_pdf = is_file_extension(file_name, ".pdf")
-    if is_pdf:
+    if is_file_extension(file_name, ".pdf"):
         return 2
+    if is_file_extension(file_name, [".png", ".jpeg", ".jpg", ".tif"]):
+        return 3
     return 0
 
 