deepdoctection 0.33__py3-none-any.whl → 0.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -129,7 +129,7 @@ def image_to_coco(dp: Image) -> tuple[JsonDict, list[JsonDict]]:
     img["height"] = dp.height
     img["file_name"] = dp.file_name
 
-    for img_ann in dp.get_annotation_iter():
+    for img_ann in dp.get_annotation():
         ann: JsonDict = {
             "id": int("".join([s for s in img_ann.annotation_id if s.isdigit()])),
             "image_id": img["id"],
@@ -139,7 +139,11 @@ def image_to_coco(dp: Image) -> tuple[JsonDict, list[JsonDict]]:
         ann["score"] = img_ann.score
         ann["iscrowd"] = 0
         bounding_box = img_ann.get_bounding_box(dp.image_id)
-        ann["area"] = bounding_box.area
+        ann["area"] = (
+            bounding_box.area
+            if bounding_box.absolute_coords
+            else bounding_box.transform(dp.width, dp.height, absolute_coords=True).area
+        )
         ann["bbox"] = bounding_box.to_list(mode="xywh")
         anns.append(ann)
 
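
The new conditional guards the COCO export against relative coordinates: COCO expects `area` in absolute pixel units, so a box stored with relative coordinates is transformed to absolute coordinates before its area is taken. A minimal plain-Python sketch of the arithmetic (not the deepdoctection `BoundingBox` API; the numbers are made up):

    # Page size in pixels and a box in relative coordinates (ulx, uly, lrx, lry); illustrative values only.
    width, height = 1000, 800
    rel_box = (0.1, 0.2, 0.3, 0.4)

    # Area computed on relative coordinates -- not what COCO expects.
    rel_area = (rel_box[2] - rel_box[0]) * (rel_box[3] - rel_box[1])  # ~0.04

    # Transform to absolute coordinates first, then take the area.
    abs_box = (rel_box[0] * width, rel_box[1] * height, rel_box[2] * width, rel_box[3] * height)
    abs_area = (abs_box[2] - abs_box[0]) * (abs_box[3] - abs_box[1])  # ~32000 square pixels

    # Scaling the relative area by width * height gives the same result (up to float rounding).
    assert abs(abs_area - rel_area * width * height) < 1e-6
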
@@ -41,7 +41,7 @@ with try_import() as d2_import_guard:
     from detectron2.structures import BoxMode
 
 with try_import() as wb_import_guard:
-    from wandb import Classes
+    from wandb import Classes  # type: ignore
     from wandb import Image as Wbimage
 
 
@@ -189,6 +189,7 @@ def to_wandb_image(
         class_set = Classes([{"name": val, "id": key} for key, val in sub_categories.items()])
     else:
         class_set = Classes([{"name": val, "id": key} for key, val in categories.items()])
+    class_labels = dict(categories.items())
 
     for ann in anns:
         bounding_box = ann.get_bounding_box(dp.image_id)
@@ -127,7 +127,7 @@ def image_to_raw_layoutlm_features(
     all_boxes = []
     all_labels: list[int] = []
 
-    anns = dp.get_annotation_iter(category_names=LayoutType.WORD)
+    anns = dp.get_annotation(category_names=LayoutType.WORD)
 
     word_id_to_segment_box = {}
     if segment_positions:
@@ -23,6 +23,7 @@ from typing import Any, Literal, Optional, Sequence, Union
 
 import numpy as np
 from numpy.typing import NDArray
+from scipy.spatial import distance
 
 from ..datapoint.annotation import ImageAnnotation
 from ..datapoint.box import iou
@@ -164,3 +165,33 @@ def match_anns_by_intersection(
         return [], [], [], []
 
     return child_index, parent_index, child_anns, parent_anns
+
+
+def match_anns_by_distance(
+    dp: Image,
+    parent_ann_category_names: Union[TypeOrStr, Sequence[TypeOrStr]],
+    child_ann_category_names: Union[TypeOrStr, Sequence[TypeOrStr]],
+    parent_ann_ids: Optional[Union[Sequence[str], str]] = None,
+    child_ann_ids: Optional[Union[str, Sequence[str]]] = None,
+) -> list[tuple[ImageAnnotation, ImageAnnotation]]:
+    """
+    Generates pairs of parent and child annotations by calculating the Euclidean distance between the centers of the
+    parent and child bounding boxes. It will return the closest child for each parent. Note that a child can be
+    assigned multiple times to different parents.
+
+    :param dp: image datapoint
+    :param parent_ann_category_names: single str or list of category names
+    :param child_ann_category_names: single str or list of category names
+    :param parent_ann_ids: Additional filter condition. If some ids are selected, it will ignore all other parent
+                           candidates which are not in the list.
+    :param child_ann_ids: Additional filter condition. If some ids are selected, it will ignore all other child
+                          candidates which are not in the list.
+    :return: A list of tuples of parent and child annotations
+    """
+
+    parent_anns = dp.get_annotation(annotation_ids=parent_ann_ids, category_names=parent_ann_category_names)
+    child_anns = dp.get_annotation(annotation_ids=child_ann_ids, category_names=child_ann_category_names)
+    child_centers = [block.get_bounding_box(dp.image_id).center for block in child_anns]
+    parent_centers = [block.get_bounding_box(dp.image_id).center for block in parent_anns]
+    child_indices = distance.cdist(parent_centers, child_centers).argmin(axis=1)
+    return [(parent_anns[i], child_anns[j]) for i, j in enumerate(child_indices)]
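
A self-contained sketch of the matching step inside `match_anns_by_distance`: `scipy.spatial.distance.cdist` builds the matrix of pairwise distances between parent and child centers, and `argmin(axis=1)` picks the closest child for each parent (the center coordinates below are invented for illustration):

    import numpy as np
    from scipy.spatial import distance

    # Hypothetical box centers (x, y): two parents (e.g. figures) and three children (e.g. captions).
    parent_centers = np.array([[100.0, 100.0], [400.0, 250.0]])
    child_centers = np.array([[110.0, 130.0], [390.0, 300.0], [800.0, 600.0]])

    # Row i holds the distances from parent i to every child.
    dist_matrix = distance.cdist(parent_centers, child_centers)

    # Index of the closest child per parent; a child may be picked by several parents.
    closest = dist_matrix.argmin(axis=1)
    print(closest)  # [0 1]
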
@@ -145,7 +145,7 @@ def image_ann_to_image(dp: Image, category_names: Union[str, list[str]], crop_im
     :return: Image
     """
 
-    img_anns = dp.get_annotation_iter(category_names=category_names)
+    img_anns = dp.get_annotation(category_names=category_names)
     for ann in img_anns:
         dp.image_ann_to_image(annotation_id=ann.annotation_id, crop_image=crop_image)
 
@@ -163,7 +163,7 @@ def image_to_prodigy(dp: Image, category_names: Optional[Sequence[ObjectTypes]]
     output["image_id"] = dp.image_id
 
     spans = []
-    for ann in dp.get_annotation_iter(category_names=category_names):
+    for ann in dp.get_annotation(category_names=category_names):
         bounding_box = ann.get_bounding_box(dp.image_id)
         if not bounding_box.absolute_coords:
             bounding_box = bounding_box.transform(dp.width, dp.height, absolute_coords=True)
@@ -272,6 +272,33 @@ class DatapointManager:
             return None
         return cont_ann.annotation_id
 
+    def set_relationship_annotation(
+        self, relationship_name: ObjectTypes, target_annotation_id: str, annotation_id: str
+    ) -> Optional[str]:
+        """
+        Create a relationship annotation and dump it to the target annotation.
+
+        :param relationship_name: The relationship key
+        :param target_annotation_id: Annotation_id of the parent `ImageAnnotation`
+        :param annotation_id: The annotation_id to dump the relationship to
+
+        :return: Annotation_id of the parent `ImageAnnotation` for reference if the dump has been successful
+        """
+        self.assert_datapoint_passed()
+        with MappingContextManager(
+            dp_name=self.datapoint.file_name,
+            filter_level="annotation",
+            relationship_annotation={
+                "relationship_name": relationship_name.value,
+                "target_annotation_id": target_annotation_id,
+                "annotation_id": annotation_id,
+            },
+        ) as annotation_context:
+            self._cache_anns[target_annotation_id].dump_relationship(relationship_name, annotation_id)
+        if annotation_context.context_error:
+            return None
+        return target_annotation_id
+
     def set_summary_annotation(
         self,
         summary_key: ObjectTypes,
@@ -163,6 +163,29 @@ class PipelineComponent(ABC):
                 return True
         return False
 
+    def _undo(self, dp: Image) -> Image:
+        """
+        Undo the processing of the pipeline component. It will remove `ImageAnnotation`, `CategoryAnnotation` and
+        `ContainerAnnotation` with the service_id of the pipeline component.
+        """
+        if self.timer_on:
+            with timed_operation(self.__class__.__name__):
+                self.dp_manager.datapoint = dp
+                dp.remove(service_ids=self.service_id)
+        else:
+            self.dp_manager.datapoint = dp
+            dp.remove(service_ids=self.service_id)
+        return self.dp_manager.datapoint
+
+    def undo(self, df: DataFlow) -> DataFlow:
+        """
+        Mapping a datapoint via `_undo` within a dataflow pipeline
+
+        :param df: An input dataflow of Images
+        :return: An output dataflow of Images
+        """
+        return MapData(df, self._undo)
+
 
 class Pipeline(ABC):
     """
@@ -29,8 +29,7 @@ import numpy as np
 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
 from ..datapoint.view import IMAGE_DEFAULTS, Page
-from ..mapper.maputils import MappingContextManager
-from ..mapper.match import match_anns_by_intersection
+from ..mapper.match import match_anns_by_distance, match_anns_by_intersection
 from ..mapper.misc import to_image
 from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
 from .base import MetaAnnotation, PipelineComponent
@@ -76,21 +75,23 @@ class ImageCroppingService(PipelineComponent):
         pass
 
 
-@pipeline_component_registry.register("MatchingService")
-class MatchingService(PipelineComponent):
+class IntersectionMatcher:
     """
-    Objects of two object classes can be assigned to one another by determining their pairwise average. If this is above
-    a limit, a relation is created between them.
-    The parent object class (based on its category) and the child object class are defined for the service. A child
-    relation is created in the parent class if the conditions are met.
+    Objects of two object classes can be assigned to one another by determining their pairwise intersection. If this is
+    above a limit, a relation is created between them.
+    The parent object class (based on its category) and the child object class are defined for the service.
 
     Either `iou` (intersection-over-union) or `ioa` (intersection-over-area) can be selected as the matching rule.
 
        # the following will assign word annotations to text and title annotation, provided that their ioa-threshold
       # is above 0.7. words below that threshold will not be assigned.
 
-        match = MatchingService(parent_categories=["TEXT","TITLE"],child_categories="WORD",matching_rule="ioa",
-                                threshold=0.7)
+        matcher = IntersectionMatcher(matching_rule="ioa", threshold=0.7)
+
+        match_service = MatchingService(parent_categories=["text","title"],
+                                        child_categories="word",
+                                        matcher=matcher,
+                                        relationship_key=Relationships.CHILD)
 
        # Assigning means that text and title annotation will receive a relationship called "CHILD" which is a list
        of annotation ids of mapped words.
@@ -98,16 +99,12 @@ class MatchingService(PipelineComponent):
 
     def __init__(
         self,
-        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
-        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
         matching_rule: Literal["iou", "ioa"],
         threshold: float,
         use_weighted_intersections: bool = False,
         max_parent_only: bool = False,
    ) -> None:
        """
-        :param parent_categories: list of categories to be used a for parent class. Will generate a child-relationship
-        :param child_categories: list of categories to be used for a child class.
        :param matching_rule: "iou" or "ioa"
        :param threshold: iou/ioa threshold. Value between [0,1]
        :param use_weighted_intersections: This is currently only implemented for matching_rule 'ioa'. Instead of using
@@ -115,7 +112,105 @@ class MatchingService(PipelineComponent):
                                            that intersections with more cells will likely decrease the ioa value. By
                                            multiplying the ioa with the number of all intersection for each child this
                                            value calibrate the ioa.
-        :param max_parent_only: Will assign to each child at most one parent with maximum ioa
+        :param max_parent_only: Will assign to each child at most one parent with maximum ioa"""
+
+        if matching_rule not in ("iou", "ioa"):
+            raise ValueError("segment rule must be either iou or ioa")
+        self.matching_rule = matching_rule
+        self.threshold = threshold
+        self.use_weighted_intersections = use_weighted_intersections
+        self.max_parent_only = max_parent_only
+
+    def match(
+        self,
+        dp: Image,
+        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+    ) -> list[tuple[str, str]]:
+        """
+        The matching algorithm
+
+        :param dp: datapoint image
+        :param parent_categories: list of categories to be used for a parent class. Will generate a child-relationship
+        :param child_categories: list of categories to be used for a child class.
+
+        :return: A list of tuples with parent and child annotation ids
+        """
+        child_index, parent_index, child_anns, parent_anns = match_anns_by_intersection(
+            dp,
+            parent_ann_category_names=parent_categories,
+            child_ann_category_names=child_categories,
+            matching_rule=self.matching_rule,
+            threshold=self.threshold,
+            use_weighted_intersections=self.use_weighted_intersections,
+            max_parent_only=self.max_parent_only,
+        )
+
+        matched_child_anns = np.take(child_anns, child_index)  # type: ignore
+        matched_parent_anns = np.take(parent_anns, parent_index)  # type: ignore
+
+        all_parent_child_relations = []
+        for idx, parent in enumerate(matched_parent_anns):
+            all_parent_child_relations.append((parent.annotation_id, matched_child_anns[idx].annotation_id))
+
+        return all_parent_child_relations
+
+
+class NeighbourMatcher:
+    """
+    Objects of two object classes can be assigned to one another by determining their pairwise distance.
+
+        # the following will assign caption annotations to figure annotations
+
+        matcher = NeighbourMatcher()
+
+        match_service = MatchingService(parent_categories=["figure"],
+                                        child_categories="caption",
+                                        matcher=matcher,
+                                        relationship_key=Relationships.LAYOUT_LINK)
+
+    """
+
+    def match(
+        self,
+        dp: Image,
+        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+    ) -> list[tuple[str, str]]:
+        """
+        The matching algorithm
+
+        :param dp: datapoint image
+        :param parent_categories: list of categories to be used for a parent class. Will generate a child-relationship
+        :param child_categories: list of categories to be used for a child class.
+
+        :return: A list of tuples with parent and child annotation ids
+        """
+
+        return [
+            (pair[0].annotation_id, pair[1].annotation_id)
+            for pair in match_anns_by_distance(dp, parent_categories, child_categories)
+        ]
+
+
+@pipeline_component_registry.register("MatchingService")
+class MatchingService(PipelineComponent):
+    """
+    A service to match annotations of two categories by intersection or distance. The matched annotations will be
+    assigned a relationship. The parent category will receive a relationship to the child category.
+    """
+
+    def __init__(
+        self,
+        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        matcher: Union[IntersectionMatcher, NeighbourMatcher],
+        relationship_key: Relationships,
+    ) -> None:
+        """
+        :param parent_categories: list of categories to be used for a parent class. Will generate a child-relationship
+        :param child_categories: list of categories to be used for a child class.
+
        """
        self.parent_categories = (
            (get_type(parent_categories),)
@@ -127,13 +222,8 @@ class MatchingService(PipelineComponent):
             if isinstance(child_categories, str)
             else (tuple(get_type(category_name) for category_name in child_categories))
         )
-        if matching_rule not in ("iou", "ioa"):
-            raise ValueError("segment rule must be either iou or ioa")
-
-        self.matching_rule = matching_rule
-        self.threshold = threshold
-        self.use_weighted_intersections = use_weighted_intersections
-        self.max_parent_only = max_parent_only
+        self.matcher = matcher
+        self.relationship_key = relationship_key
         super().__init__("matching")
 
     def serve(self, dp: Image) -> None:
@@ -143,24 +233,14 @@ class MatchingService(PipelineComponent):
 
         :param dp: datapoint image
         """
-        child_index, parent_index, child_anns, parent_anns = match_anns_by_intersection(
-            dp,
-            parent_ann_category_names=self.parent_categories,
-            child_ann_category_names=self.child_categories,
-            matching_rule=self.matching_rule,
-            threshold=self.threshold,
-            use_weighted_intersections=self.use_weighted_intersections,
-            max_parent_only=self.max_parent_only,
-        )
 
-        with MappingContextManager(dp_name=dp.file_name):
-            matched_child_anns = np.take(child_anns, child_index)  # type: ignore
-            matched_parent_anns = np.take(parent_anns, parent_index)  # type: ignore
-            for idx, parent in enumerate(matched_parent_anns):
-                parent.dump_relationship(Relationships.CHILD, matched_child_anns[idx].annotation_id)
+        matched_pairs = self.matcher.match(dp, self.parent_categories, self.child_categories)
+
+        for pair in matched_pairs:
+            self.dp_manager.set_relationship_annotation(self.relationship_key, pair[0], pair[1])
 
     def clone(self) -> PipelineComponent:
-        return self.__class__(self.parent_categories, self.child_categories, self.matching_rule, self.threshold)
+        return self.__class__(self.parent_categories, self.child_categories, self.matcher, self.relationship_key)
 
     def get_meta_annotation(self) -> MetaAnnotation:
         return MetaAnnotation(
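
For readers upgrading from 0.33: the matching rule now lives in a matcher object and the relationship key is passed explicitly. A migration sketch (module paths are assumed from the file layout visible in this diff; categories and threshold are illustrative):

    from deepdoctection.pipe.common import IntersectionMatcher, MatchingService  # path assumed
    from deepdoctection.utils.settings import Relationships

    # 0.33 constructor (no longer available):
    # match = MatchingService(parent_categories=["text", "title"], child_categories="word",
    #                         matching_rule="ioa", threshold=0.7)

    # 0.34: the rule is encapsulated in a matcher and the relationship key is explicit.
    matcher = IntersectionMatcher(matching_rule="ioa", threshold=0.7)
    match = MatchingService(
        parent_categories=["text", "title"],
        child_categories="word",
        matcher=matcher,
        relationship_key=Relationships.CHILD,
    )
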
@@ -215,7 +295,12 @@ class PageParsingService(PipelineComponent):
         :param dp: Image
         :return: Page
         """
-        return Page.from_image(dp, self.text_container, self.floating_text_block_categories)
+        return Page.from_image(
+            dp,
+            text_container=self.text_container,
+            floating_text_block_categories=self.floating_text_block_categories,
+            include_residual_text_container=self.include_residual_text_container,
+        )
 
     def _init_sanity_checks(self) -> None:
         assert self.text_container in (
@@ -372,7 +372,7 @@ def stretch_items(
     :param remove_iou_threshold_cols: iou threshold for removing overlapping columns
     :return: An Image
     """
-    table_anns = dp.get_annotation_iter(category_names=table_name)
+    table_anns = dp.get_annotation(category_names=table_name)
 
     for table in table_anns:
         dp = stretch_item_per_table(dp, table, row_name, col_name, remove_iou_threshold_rows, remove_iou_threshold_cols)
@@ -190,7 +190,7 @@ class SubImageLayoutService(PipelineComponent):
         - Optionally invoke the DetectResultGenerator
         - Generate ImageAnnotations and dump to parent image and sub image.
         """
-        sub_image_anns = dp.get_annotation_iter(category_names=self.sub_image_name)
+        sub_image_anns = dp.get_annotation(category_names=self.sub_image_name)
         for sub_image_ann in sub_image_anns:
             np_image = self.prepare_np_image(sub_image_ann)
             detect_result_list = self.predictor.predict(np_image)
@@ -176,7 +176,7 @@ def collect_installed_dependencies(data: KeyValEnvInfos) -> KeyValEnvInfos:
         data.append(("Pycocotools", "None"))
 
     if scipy_available():
-        import scipy  # type: ignore
+        import scipy
 
         data.append(("Scipy", scipy.__version__))
     else:
@@ -25,6 +25,7 @@ import os
 from base64 import b64encode
 from io import BytesIO
 from pathlib import Path
+from shutil import copyfile
 from typing import Callable, Literal, Optional, Protocol, Union, overload
 from urllib.request import urlretrieve
 
@@ -50,6 +51,7 @@ __all__ = [
     "get_configs_dir_path",
     "get_weights_dir_path",
     "get_dataset_dir_path",
+    "maybe_copy_config_to_cache",
 ]
 
 
@@ -254,34 +256,55 @@ def load_json(path_ann: PathLikeOrStr) -> JsonDict:
     return json_dict
 
 
-def get_package_path() -> PathLikeOrStr:
+def get_package_path() -> Path:
     """
     :return: full base path of this package
     """
     return PATH
 
 
-def get_weights_dir_path() -> PathLikeOrStr:
+def get_weights_dir_path() -> Path:
     """
     :return: full base path to the model dir
     """
     return MODEL_DIR
 
 
-def get_configs_dir_path() -> PathLikeOrStr:
+def get_configs_dir_path() -> Path:
     """
     :return: full base path to the configs dir
     """
     return CONFIGS
 
 
-def get_dataset_dir_path() -> PathLikeOrStr:
+def get_dataset_dir_path() -> Path:
     """
     :return: full base path to the dataset dir
     """
     return DATASET_DIR
 
 
+def maybe_copy_config_to_cache(
+    package_path: PathLikeOrStr, configs_dir_path: PathLikeOrStr, file_name: str, force_copy: bool = True
+) -> str:
+    """
+    Initial copying of various files
+    :param package_path: base path to the directory of the source file `file_name`
+    :param configs_dir_path: base path to the target directory
+    :param file_name: file to copy
+    :param force_copy: If the file is already in the target directory, it will be re-copied anyway
+
+    :return: path to the copied file_name
+    """
+
+    absolute_path_source = os.path.join(package_path, file_name)
+    absolute_path = os.path.join(configs_dir_path, os.path.join(os.path.split(file_name)[1]))
+    mkdir_p(os.path.split(absolute_path)[0])
+    if not os.path.isfile(absolute_path) or force_copy:
+        copyfile(absolute_path_source, absolute_path)
+    return absolute_path
+
+
 @deprecated("Use pathlib operations instead", "2022-06-08")
 def sub_path(anchor_dir: PathLikeOrStr, *paths: PathLikeOrStr) -> PathLikeOrStr:
     """
@@ -107,8 +107,7 @@ def get_pdf_file_reader(path: PathLikeOrStr) -> PdfReader:
         )
         sys.exit()
 
-    file_reader = PdfReader(open(path, "rb"))  # pylint: disable=R1732
-    return file_reader
+    return PdfReader(os.fspath(path))
 
 
 def get_pdf_file_writer() -> PdfWriter:
@@ -125,12 +124,24 @@ class PDFStreamer:
 
     **Example:**
 
-        df = dataflow.DataFromIterable.PDFStreamer(path=path)
+        # Building a Dataflow with a PDFStreamer
+        df = dataflow.DataFromIterable(PDFStreamer(path=path))
         df.reset_state()
 
         for page in df:
             ... # do whatever you like
 
+        # Something else you can do:
+        streamer = PDFStreamer(path=path)
+        pages = len(streamer)  # get the number of pages
+        random_int = random.sample(range(0, pages), 2)  # select some pages
+        for ran in random_int:
+            pdf_bytes = streamer[ran]  # get the page bytes directly
+
+        streamer.close()  # Do not forget to close the streamer, otherwise the file will never be closed and might
+                          # cause memory leaks if you open many files.
+
+
     """
 
     def __init__(self, path: PathLikeOrStr) -> None:
@@ -150,6 +161,20 @@ class PDFStreamer:
             writer.add_page(self.file_reader.pages[k])
             writer.write(buffer)
             yield buffer.getvalue(), k
+        self.file_reader.close()
+
+    def __getitem__(self, index: int) -> bytes:
+        buffer = BytesIO()
+        writer = get_pdf_file_writer()
+        writer.add_page(self.file_reader.pages[index])
+        writer.write(buffer)
+        return buffer.getvalue()
+
+    def close(self) -> None:
+        """
+        Close the file reader
+        """
+        self.file_reader.close()
 
 
 # The following functions are modified versions from the Python poppler wrapper
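
Because `PDFStreamer` now keeps the underlying file open until `close` is called, wrapping it in `contextlib.closing` (standard library; works with any object exposing `close()`) makes the cleanup automatic. The import path and the PDF path below are assumptions for illustration:

    from contextlib import closing

    from deepdoctection.utils.pdf_utils import PDFStreamer  # module path assumed

    with closing(PDFStreamer(path="sample.pdf")) as streamer:  # "sample.pdf" is a placeholder
        number_of_pages = len(streamer)
        first_page_bytes = streamer[0]  # random access via the new __getitem__
    # streamer.close() has been called automatically at this point
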
@@ -101,6 +101,7 @@ class DocumentType(ObjectTypes):
     GOVERNMENT_TENDERS = "government_tenders"
     MANUALS = "manuals"
     PATENTS = "patents"
+    MARK = "mark"
 
 
 @object_types_registry.register("LayoutType")
@@ -130,6 +131,7 @@ class LayoutType(ObjectTypes):
     BACKGROUND = "background"
     PAGE_NUMBER = "page_number"
     KEY_VALUE_AREA = "key_value_area"
+    LIST_ITEM = "list_item"
 
 
 @object_types_registry.register("TableType")
@@ -221,6 +223,7 @@ class Relationships(ObjectTypes):
     CHILD = "child"
     READING_ORDER = "reading_order"
     SEMANTIC_ENTITY_LINK = "semantic_entity_link"
+    LAYOUT_LINK = "layout_link"
 
 
 @object_types_registry.register("Languages")
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepdoctection
-Version: 0.33
+Version: 0.34
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer
@@ -29,6 +29,7 @@ Requires-Dist: Pillow >=10.0.0
 Requires-Dist: pypdf >=3.16.0
 Requires-Dist: pyyaml >=6.0.1
 Requires-Dist: pyzmq >=16
+Requires-Dist: scipy >=1.13.1
 Requires-Dist: termcolor >=1.1
 Requires-Dist: tabulate >=0.7.7
 Requires-Dist: tqdm ==4.64.0
@@ -74,6 +75,7 @@ Requires-Dist: Pillow >=10.0.0 ; extra == 'pt'
 Requires-Dist: pypdf >=3.16.0 ; extra == 'pt'
 Requires-Dist: pyyaml >=6.0.1 ; extra == 'pt'
 Requires-Dist: pyzmq >=16 ; extra == 'pt'
+Requires-Dist: scipy >=1.13.1 ; extra == 'pt'
 Requires-Dist: termcolor >=1.1 ; extra == 'pt'
 Requires-Dist: tabulate >=0.7.7 ; extra == 'pt'
 Requires-Dist: tqdm ==4.64.0 ; extra == 'pt'
@@ -105,6 +107,7 @@ Requires-Dist: Pillow >=10.0.0 ; extra == 'tf'
 Requires-Dist: pypdf >=3.16.0 ; extra == 'tf'
 Requires-Dist: pyyaml >=6.0.1 ; extra == 'tf'
 Requires-Dist: pyzmq >=16 ; extra == 'tf'
+Requires-Dist: scipy >=1.13.1 ; extra == 'tf'
 Requires-Dist: termcolor >=1.1 ; extra == 'tf'
 Requires-Dist: tabulate >=0.7.7 ; extra == 'tf'
 Requires-Dist: tqdm ==4.64.0 ; extra == 'tf'