deepdoctection 0.38__py3-none-any.whl → 0.39.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

@@ -6,6 +6,7 @@ Init file for deepdoctection package. This file is used to import all submodules
6
6
  """
7
7
 
8
8
  import importlib.util
9
+ import os
9
10
 
10
11
  # Before doing anything else, check if the .env file exists and load it
11
12
  if importlib.util.find_spec("dotenv") is not None:
@@ -24,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
24
25
 
25
26
  # pylint: enable=wrong-import-position
26
27
 
27
- __version__ = "0.38"
28
+ __version__ = "0.39.1"
28
29
 
29
30
  _IMPORT_STRUCTURE = {
30
31
  "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
@@ -423,6 +424,9 @@ _IMPORT_STRUCTURE = {
423
424
  env_info = collect_env_info()
424
425
  logger.debug(LoggingRecord(msg=env_info))
425
426
  auto_select_pdf_render_framework()
427
+ os.environ["DPI"] = "300"
428
+ os.environ["IMAGE_WIDTH"] = ""
429
+ os.environ["IMAGE_HEIGHT"] = ""
426
430
 
427
431
  # Direct imports for type-checking
428
432
  if TYPE_CHECKING:
@@ -32,7 +32,6 @@ from ..extern.pt.ptutils import get_torch_device
32
32
  from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
33
33
  from ..pipe.doctectionpipe import DoctectionPipe
34
34
  from ..utils.env_info import ENV_VARS_TRUE
35
- from ..utils.error import DependencyError
36
35
  from ..utils.file_utils import tensorpack_available
37
36
  from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
38
37
  from ..utils.logger import LoggingRecord, logger
@@ -118,13 +117,15 @@ def get_dd_analyzer(
118
117
  :return: A DoctectionPipe instance with given configs
119
118
  """
120
119
  config_overwrite = [] if config_overwrite is None else config_overwrite
121
- lib = "TF" if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE else "PT"
122
- if lib == "TF":
120
+ if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE:
121
+ lib = "TF"
123
122
  device = get_tf_device()
124
- elif lib == "PT":
123
+ elif os.environ.get("DD_USE_TORCH", "0") in ENV_VARS_TRUE:
124
+ lib = "PT"
125
125
  device = get_torch_device()
126
126
  else:
127
- raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
127
+ lib = None
128
+ device = None
128
129
  dd_one_config_path = maybe_copy_config_to_cache(
129
130
  get_package_path(), get_configs_dir_path() / "dd", _DD_ONE, reset_config_file
130
131
  )
@@ -48,6 +48,7 @@ from ..pipe.segment import PubtablesSegmentationService, TableSegmentationServic
48
48
  from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
49
49
  from ..pipe.text import TextExtractionService
50
50
  from ..pipe.transform import SimpleTransformService
51
+ from ..utils.error import DependencyError
51
52
  from ..utils.file_utils import detectron2_available
52
53
  from ..utils.fs import get_configs_dir_path
53
54
  from ..utils.metacfg import AttrDict
@@ -62,8 +63,6 @@ __all__ = [
62
63
  "ServiceFactory",
63
64
  ]
64
65
 
65
- # from ._config import cfg
66
-
67
66
 
68
67
  class ServiceFactory:
69
68
  """
@@ -94,6 +93,8 @@ class ServiceFactory:
94
93
  :param config: configuration object
95
94
  :param mode: either `LAYOUT`,`CELL` or `ITEM`
96
95
  """
96
+ if config.LIB is None:
97
+ raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
97
98
  weights = (
98
99
  getattr(config.TF, mode).WEIGHTS
99
100
  if config.LIB == "TF"
@@ -310,6 +311,8 @@ class ServiceFactory:
310
311
  config_overwrite=[f"LANGUAGES={config.LANGUAGE}"] if config.LANGUAGE is not None else None,
311
312
  )
312
313
  if config.OCR.USE_DOCTR:
314
+ if config.LIB is None:
315
+ raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
313
316
  weights = (
314
317
  config.OCR.WEIGHTS.DOCTR_RECOGNITION.TF
315
318
  if config.LIB == "TF"
@@ -353,6 +356,8 @@ class ServiceFactory:
353
356
  :param config: configuration object
354
357
  :return: DoctrTextlineDetector
355
358
  """
359
+ if config.LIB is None:
360
+ raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
356
361
  weights = config.OCR.WEIGHTS.DOCTR_WORD.TF if config.LIB == "TF" else config.OCR.WEIGHTS.DOCTR_WORD.PT
357
362
  weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
358
363
  profile = ModelCatalog.get_profile(weights)
@@ -154,7 +154,9 @@ def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -
154
154
  return np_array.astype(uint8)
155
155
 
156
156
 
157
- def convert_pdf_bytes_to_np_array_v2(pdf_bytes: bytes, dpi: Optional[int] = 200) -> PixelValues:
157
+ def convert_pdf_bytes_to_np_array_v2(
158
+ pdf_bytes: bytes, dpi: Optional[int] = None, width: Optional[int] = None, height: Optional[int] = None
159
+ ) -> PixelValues:
158
160
  """
159
161
  Converts a pdf passed as bytes into a numpy array. We use poppler or pdfmium to convert the pdf to an image.
160
162
  If both is available you can steer the selection of the render engine with environment variables:
@@ -165,17 +167,21 @@ def convert_pdf_bytes_to_np_array_v2(pdf_bytes: bytes, dpi: Optional[int] = 200)
165
167
  :param pdf_bytes: A pdf as bytes object. A byte representation can from a pdf file can be generated e.g. with
166
168
  `utils.fs.load_bytes_from_pdf_file`
167
169
  :param dpi: The dpi value of the resulting output image. For high resolution set dpi=300.
170
+ :param width: The width of the resulting output image. This option does only work when using Poppler as
171
+ PDF renderer
172
+ :param height: The height of the resulting output image. This option does only work when using Poppler as
173
+ PDF renderer
168
174
  :return: Image as numpy array.
169
175
  """
170
176
 
171
- with BytesIO(pdf_bytes) as pdf_file:
172
- pdf = PdfReader(pdf_file).pages[0]
173
- shape = pdf.mediabox # pylint: disable=E1101
174
- height = shape[3] - shape[1]
175
- width = shape[2] - shape[0]
176
-
177
177
  if dpi is None:
178
- return pdf_to_np_array(pdf_bytes, size=(int(width), int(height)))
178
+ if width is None or height is None:
179
+ with BytesIO(pdf_bytes) as pdf_file:
180
+ pdf = PdfReader(pdf_file).pages[0]
181
+ shape = pdf.mediabox # pylint: disable=E1101
182
+ height = shape[3] - shape[1]
183
+ width = shape[2] - shape[0]
184
+ return pdf_to_np_array(pdf_bytes, size=(int(width), int(height))) # type: ignore
179
185
  return pdf_to_np_array(pdf_bytes, dpi=dpi)
180
186
 
181
187
 
@@ -153,7 +153,7 @@ class Image:
153
153
  self.set_width_height(self._image.shape[1], self._image.shape[0])
154
154
  self._self_embedding()
155
155
  elif isinstance(image, bytes):
156
- self._image = convert_pdf_bytes_to_np_array_v2(image, dpi=environ.get("DPI", 300)) # type: ignore
156
+ self._image = convert_pdf_bytes_to_np_array_v2(image, dpi=int(environ["DPI"]))
157
157
  self.set_width_height(self._image.shape[1], self._image.shape[0])
158
158
  self._self_embedding()
159
159
  else:
@@ -228,23 +228,33 @@ class Layout(ImageAnnotationBaseView):
228
228
 
229
229
  """
230
230
  words = self.get_ordered_words()
231
- characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
232
- *[
233
- (
234
- word.characters,
235
- word.annotation_id,
236
- word.token_class,
237
- word.token_tag,
238
- word.get_sub_category(WordType.TOKEN_CLASS).category_id
239
- if WordType.TOKEN_CLASS in word.sub_categories
240
- else None,
241
- word.get_sub_category(WordType.TOKEN_TAG).category_id
242
- if WordType.TOKEN_TAG in word.sub_categories
243
- else None,
244
- )
245
- for word in words
246
- ]
247
- )
231
+ if words:
232
+ characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
233
+ *[
234
+ (
235
+ word.characters,
236
+ word.annotation_id,
237
+ word.token_class,
238
+ word.token_tag,
239
+ word.get_sub_category(WordType.TOKEN_CLASS).category_id
240
+ if WordType.TOKEN_CLASS in word.sub_categories
241
+ else None,
242
+ word.get_sub_category(WordType.TOKEN_TAG).category_id
243
+ if WordType.TOKEN_TAG in word.sub_categories
244
+ else None,
245
+ )
246
+ for word in words
247
+ ]
248
+ )
249
+ else:
250
+ characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = (
251
+ [], # type: ignore
252
+ [], # type: ignore
253
+ [], # type: ignore
254
+ [], # type: ignore
255
+ [], # type: ignore
256
+ [], # type: ignore
257
+ )
248
258
  return {
249
259
  "text": " ".join(characters),
250
260
  "words": characters,
@@ -327,7 +337,7 @@ class Table(Layout):
327
337
  :return: A list of `Cell` objects that are row headers.
328
338
  """
329
339
  all_relation_ids = self.get_relationship(Relationships.CHILD)
330
- all_cells: list[Cell] = self.base_page.get_annotation( # type: ignore
340
+ all_cells: list[Cell] = self.base_page.get_annotation( # type: ignore
331
341
  category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
332
342
  )
333
343
  row_header_cells = list(filter(lambda cell: CellType.ROW_HEADER in cell.sub_categories, all_cells))
@@ -363,18 +373,18 @@ class Table(Layout):
363
373
  category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
364
374
  )
365
375
  row_cells = list(
366
- filter(
367
- lambda c: row_number in (c.row_number, c.row_number + c.row_span), all_cells # type: ignore
368
- )
376
+ filter(lambda c: row_number in (c.row_number, c.row_number + c.row_span), all_cells) # type: ignore
369
377
  )
370
- row_cells.sort(key=lambda c: c.column_number) # type: ignore
378
+ row_cells.sort(key=lambda c: c.column_number) # type: ignore
371
379
  column_header_cells = self.column_header_cells
372
380
 
373
381
  kv_dict: Mapping[str, str] = {}
374
382
  for cell in row_cells:
375
383
  for header in column_header_cells:
376
- if (cell.column_number == header.column_number and # type: ignore
377
- cell.annotation_id != header.annotation_id): # type: ignore
384
+ if (
385
+ cell.column_number == header.column_number # type: ignore
386
+ and cell.annotation_id != header.annotation_id # type: ignore
387
+ ):
378
388
  kv_dict[(header.column_number, header.text)] = cell.text # type: ignore
379
389
  break
380
390
  return kv_dict
@@ -24,7 +24,7 @@ from dataclasses import asdict, dataclass, field
24
24
  from typing import Any, Mapping, Optional, Union
25
25
 
26
26
  import jsonlines
27
- from huggingface_hub import cached_download, hf_hub_url # type: ignore
27
+ from huggingface_hub import hf_hub_download
28
28
  from tabulate import tabulate
29
29
  from termcolor import colored
30
30
 
@@ -136,51 +136,6 @@ class ModelCatalog:
136
136
  dl_library="TF",
137
137
  model_wrapper="TPFrcnnDetector",
138
138
  ),
139
- "item/model-1620000.data-00000-of-00001": ModelProfile(
140
- name="item/model-1620000.data-00000-of-00001",
141
- description="Tensorpack row/column detection model trained on Pubtabnet",
142
- config="dd/tp/conf_frcnn_rows.yaml",
143
- size=[823546048, 25787],
144
- tp_model=True,
145
- hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc",
146
- hf_model_name="model-1620000",
147
- hf_config_file=["conf_frcnn_rows.yaml"],
148
- categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
149
- dl_library="TF",
150
- model_wrapper="TPFrcnnDetector",
151
- ),
152
- "layout/model-800000.data-00000-of-00001": ModelProfile(
153
- name="layout/model-800000.data-00000-of-00001",
154
- description="Tensorpack layout detection model trained on Publaynet",
155
- config="dd/tp/conf_frcnn_layout.yaml",
156
- size=[823656748, 25796],
157
- tp_model=True,
158
- hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet",
159
- hf_model_name="model-800000",
160
- hf_config_file=["conf_frcnn_layout.yaml"],
161
- dl_library="TF",
162
- categories={
163
- 1: LayoutType.TEXT,
164
- 2: LayoutType.TITLE,
165
- 3: LayoutType.LIST,
166
- 4: LayoutType.TABLE,
167
- 5: LayoutType.FIGURE,
168
- },
169
- model_wrapper="TPFrcnnDetector",
170
- ),
171
- "cell/model-1800000.data-00000-of-00001": ModelProfile(
172
- name="cell/model-1800000.data-00000-of-00001",
173
- description="Tensorpack cell detection model trained on Pubtabnet",
174
- config="dd/tp/conf_frcnn_cell.yaml",
175
- size=[823509160, 25905],
176
- tp_model=True,
177
- hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c",
178
- hf_model_name="model-1800000",
179
- hf_config_file=["conf_frcnn_cell.yaml"],
180
- categories={1: LayoutType.CELL},
181
- dl_library="TF",
182
- model_wrapper="TPFrcnnDetector",
183
- ),
184
139
  "layout/d2_model_0829999_layout_inf_only.pt": ModelProfile(
185
140
  name="layout/d2_model_0829999_layout_inf_only.pt",
186
141
  description="Detectron2 layout detection model trained on Publaynet",
@@ -200,25 +155,6 @@ class ModelCatalog:
200
155
  dl_library="PT",
201
156
  model_wrapper="D2FrcnnDetector",
202
157
  ),
203
- "layout/d2_model_0829999_layout.pth": ModelProfile(
204
- name="layout/d2_model_0829999_layout.pth",
205
- description="Detectron2 layout detection model trained on Publaynet. Checkpoint for resuming training",
206
- config="dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
207
- size=[548377327],
208
- tp_model=False,
209
- hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet_inference_only",
210
- hf_model_name="d2_model_0829999_layout.pth",
211
- hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
212
- categories={
213
- 1: LayoutType.TEXT,
214
- 2: LayoutType.TITLE,
215
- 3: LayoutType.LIST,
216
- 4: LayoutType.TABLE,
217
- 5: LayoutType.FIGURE,
218
- },
219
- dl_library="PT",
220
- model_wrapper="D2FrcnnDetector",
221
- ),
222
158
  "layout/d2_model_0829999_layout_inf_only.ts": ModelProfile(
223
159
  name="layout/d2_model_0829999_layout_inf_only.ts",
224
160
  description="Detectron2 layout detection model trained on Publaynet. Torchscript export",
@@ -264,32 +200,6 @@ class ModelCatalog:
264
200
  dl_library="PT",
265
201
  model_wrapper="D2FrcnnTracingDetector",
266
202
  ),
267
- "cell/d2_model_1849999_cell.pth": ModelProfile(
268
- name="cell/d2_model_1849999_cell.pth",
269
- description="Detectron2 cell detection inference only model trained on Pubtabnet",
270
- config="dd/d2/cell/CASCADE_RCNN_R_50_FPN_GN.yaml",
271
- size=[548279023],
272
- tp_model=False,
273
- hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
274
- hf_model_name="cell/d2_model_1849999_cell.pth",
275
- hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
276
- categories={1: LayoutType.CELL},
277
- dl_library="PT",
278
- model_wrapper="D2FrcnnDetector",
279
- ),
280
- "item/d2_model_1639999_item.pth": ModelProfile(
281
- name="item/d2_model_1639999_item.pth",
282
- description="Detectron2 item detection model trained on Pubtabnet",
283
- config="dd/d2/item/CASCADE_RCNN_R_50_FPN_GN.yaml",
284
- size=[548303599],
285
- tp_model=False,
286
- hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
287
- hf_model_name="d2_model_1639999_item.pth",
288
- hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
289
- categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
290
- dl_library="PT",
291
- model_wrapper="D2FrcnnDetector",
292
- ),
293
203
  "item/d2_model_1639999_item_inf_only.pt": ModelProfile(
294
204
  name="item/d2_model_1639999_item_inf_only.pt",
295
205
  description="Detectron2 item detection model inference only trained on Pubtabnet",
@@ -1232,20 +1142,19 @@ class ModelDownloadManager:
1232
1142
  def _load_from_hf_hub(
1233
1143
  repo_id: str, file_name: str, cache_directory: PathLikeOrStr, force_download: bool = False
1234
1144
  ) -> int:
1235
- url = hf_hub_url(repo_id=repo_id, filename=file_name)
1236
1145
  token = os.environ.get("HF_CREDENTIALS", None)
1237
- f_path = cached_download(
1238
- url,
1239
- cache_dir=cache_directory,
1146
+ f_path = hf_hub_download(
1147
+ repo_id,
1148
+ file_name,
1149
+ local_dir=cache_directory, # type: ignore
1240
1150
  force_filename=file_name,
1241
1151
  force_download=force_download,
1242
1152
  token=token,
1243
- legacy_cache_layout=True,
1244
1153
  )
1245
1154
  if f_path:
1246
1155
  stat_info = os.stat(f_path)
1247
1156
  size = stat_info.st_size
1248
1157
 
1249
- assert size > 0, f"Downloaded an empty file from {url}!"
1158
+ assert size > 0, f"Downloaded an empty file from {f_path}!"
1250
1159
  return size
1251
1160
  raise TypeError("Returned value from cached_download cannot be Null")
@@ -73,18 +73,21 @@ def re_assign_cat_ids(
73
73
  Annotations that are not in the dictionary provided will be removed.
74
74
 
75
75
  :param dp: Image
76
- :param categories_dict_name_as_key: e.g. `{LayoutType.word: '1'}`
76
+ :param categories_dict_name_as_key: e.g. `{LayoutType.word: 1}`
77
77
  :param cat_to_sub_cat_mapping: e.g. `{<LayoutType.word>:
78
78
  {<WordType.token_class>:
79
- {<FundsFirstPage.report_date>: '1',
80
- <FundsFirstPage.report_type>: '2',
81
- <FundsFirstPage.umbrella>: '3',
82
- <FundsFirstPage.fund_name>: '4',
83
- <TokenClasses.other>: '5'},
84
- <WordType.tag>:
85
- {<BioTag.inside>: '1',
86
- <BioTag.outside>: '2',
87
- <BioTag.begin>: '3'}}}`
79
+ {<FundsFirstPage.REPORT_DATE>: 1,
80
+ <FundsFirstPage.REPORT_TYPE>: 2,
81
+ <FundsFirstPage.UMBRELLA>: 3,
82
+ <FundsFirstPage.FUND_NAME>: 4,
83
+ <TokenClasses.OTHER>: 5},
84
+ <WordType.TAG>:
85
+ {<BioTag.INSIDE>: 1,
86
+ <BioTag.OUTSIDE>: 2,
87
+ <BioTag.BEGIN>: 3}}}`
88
+ To re-assign the category ids of an image summary, use the key 'default_type' for the default category, e.g.
89
+ `{DefaultType.DEFAULT_TYPE: {<PageType.DOCUMENT_TYPE>: {<DocumentType.INVOICE>:1,
90
+ <DocumentType.BANK_STATEMENT>:2}}}`
88
91
  :return: Image
89
92
  """
90
93
 
@@ -104,6 +107,14 @@ def re_assign_cat_ids(
104
107
  sub_category = ann.get_sub_category(key)
105
108
  sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
106
109
 
110
+ if cat_to_sub_cat_mapping:
111
+ if "default_type" in cat_to_sub_cat_mapping:
112
+ sub_cat_keys_to_sub_cat_values = cat_to_sub_cat_mapping[get_type("default_type")]
113
+ for key in sub_cat_keys_to_sub_cat_values:
114
+ sub_cat_values_dict = sub_cat_keys_to_sub_cat_values[key]
115
+ sub_category = dp.summary.get_sub_category(key)
116
+ sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
117
+
107
118
  dp.remove(annotation_ids=ann_ids_to_remove)
108
119
 
109
120
  return dp
@@ -101,17 +101,6 @@ def match_anns_by_intersection(
101
101
  ]
102
102
  )
103
103
 
104
- # second try, if ann has empty image
105
- n_dim = child_ann_boxes.ndim
106
- if n_dim != 2:
107
- child_ann_boxes = np.array(
108
- [
109
- ann.bounding_box.transform(dp.width, dp.height, absolute_coords=True).to_list(mode="xyxy")
110
- for ann in child_anns
111
- if ann.bounding_box is not None
112
- ]
113
- )
114
-
115
104
  parent_anns = dp.get_annotation(annotation_ids=parent_ann_ids, category_names=parent_ann_category_names)
116
105
  parent_ann_boxes = np.array(
117
106
  [
@@ -120,17 +109,6 @@ def match_anns_by_intersection(
120
109
  ]
121
110
  )
122
111
 
123
- # same for parent
124
- n_dim = parent_ann_boxes.ndim
125
- if n_dim != 2:
126
- parent_ann_boxes = np.array(
127
- [
128
- ann.bounding_box.transform(dp.width, dp.height, absolute_coords=True).to_list(mode="xyxy")
129
- for ann in parent_anns
130
- if ann.bounding_box is not None
131
- ]
132
- )
133
-
134
112
  if matching_rule in ["iou"] and parent_anns and child_anns:
135
113
  iou_matrix = iou(child_ann_boxes, parent_ann_boxes)
136
114
  output = iou_matrix > threshold
@@ -38,12 +38,20 @@ with try_import() as import_guard:
38
38
  from lxml import etree # pylint: disable=W0611
39
39
 
40
40
 
41
- def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int] = None) -> Optional[Image]:
41
+ def to_image(
42
+ dp: Union[str, Mapping[str, Union[str, bytes]]],
43
+ dpi: Optional[int] = None,
44
+ width: Optional[int] = None,
45
+ height: Optional[int] = None,
46
+ ) -> Optional[Image]:
42
47
  """
43
48
  Mapping an input from `dataflow.SerializerFiles` or similar to an Image
44
49
 
45
50
  :param dp: Image
46
51
  :param dpi: dot per inch definition for pdf resolution when converting to numpy array
52
+ :param width: target width of the image. This option does only work when using Poppler as PDF renderer
53
 + :param height: target height of the image. This option does only work when using Poppler as PDF renderer
47
55
  :return: Image
48
56
  """
49
57
 
@@ -77,7 +85,9 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
77
85
  dp_image.pdf_bytes = dp.get("pdf_bytes")
78
86
  if dp_image.pdf_bytes is not None:
79
87
  if isinstance(dp_image.pdf_bytes, bytes):
80
- dp_image.image = convert_pdf_bytes_to_np_array_v2(dp_image.pdf_bytes, dpi=dpi)
88
+ dp_image.image = convert_pdf_bytes_to_np_array_v2(
89
+ dp_image.pdf_bytes, dpi=dpi, width=width, height=height
90
+ )
81
91
  elif image_bytes is not None:
82
92
  dp_image.image = convert_bytes_to_np_array(image_bytes)
83
93
  else:
@@ -393,7 +393,7 @@ def pub_to_image_uncur( # pylint: disable=R0914
393
393
  np_image = load_image_from_file(dp["filename"])
394
394
  if is_file_extension(dp["filename"], ".pdf"):
395
395
  pdf_bytes = load_bytes_from_pdf_file(dp["filename"])
396
- np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes)
396
+ np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes, dpi=200)
397
397
  dp = _convert_boxes(dp, np_image.shape[0])
398
398
 
399
399
  if load_image and np_image is not None:
@@ -24,7 +24,7 @@ from __future__ import annotations
24
24
  from abc import ABC, abstractmethod
25
25
  from collections import defaultdict
26
26
  from dataclasses import dataclass, field
27
- from typing import Any, Mapping, Optional, Union
27
+ from typing import Any, Mapping, Optional, Union, Callable
28
28
  from uuid import uuid1
29
29
 
30
30
  from ..dataflow import DataFlow, MapData
@@ -33,6 +33,7 @@ from ..mapper.misc import curry
33
33
  from ..utils.context import timed_operation
34
34
  from ..utils.identifier import get_uuid_from_str
35
35
  from ..utils.settings import ObjectTypes
36
+ from ..utils.types import DP
36
37
  from .anngen import DatapointManager
37
38
 
38
39
 
@@ -76,6 +77,30 @@ class PipelineComponent(ABC):
76
77
  self.service_id = self.get_service_id()
77
78
  self.dp_manager = DatapointManager(self.service_id, model_id)
78
79
  self.timer_on = False
80
+ self.filter_func: Callable[[DP], bool] = lambda dp: False
81
+
82
+ def set_inbound_filter(self, filter_func: Callable[[DP], bool]) -> None:
83
+ """
84
 + Set a filter function to decide whether an image of the inbound dataflow should be passed to self.serve.
85
+ The filter function should return a boolean value. If the function returns True, the image will not be processed
86
+ by this pipeline component.
87
+
88
+ **Example:**
89
+
90
+ ```python
91
+ def do_not_process_tables(dp: Image) -> bool:
92
+ if "table" not in dp.get_categories_from_current_state():
93
+ return True
94
+ return False
95
+
96
+ layout_component = ImageLayoutService(...)
97
+ layout_component.set_inbound_filter(do_not_process_tables)
98
+ ```
99
+
100
+
101
+ :param filter_func: A function that takes an image datapoint and returns a boolean value
102
+ """
103
+ self.filter_func = filter_func # type: ignore
79
104
 
80
105
  @abstractmethod
81
106
  def serve(self, dp: Image) -> None:
@@ -92,6 +117,12 @@ class PipelineComponent(ABC):
92
117
  """
93
118
  raise NotImplementedError()
94
119
 
120
+ def _pass_datapoint(self, dp: Image) -> None:
121
+ self.dp_manager.datapoint = dp
122
+ if not self.filter_func(dp):
123
+ self.serve(dp)
124
+
125
+
95
126
  def pass_datapoint(self, dp: Image) -> Image:
96
127
  """
97
128
  Acceptance, handover to dp_manager, transformation and forwarding of dp. To measure the time, use
@@ -103,11 +134,9 @@ class PipelineComponent(ABC):
103
134
  """
104
135
  if self.timer_on:
105
136
  with timed_operation(self.__class__.__name__):
106
- self.dp_manager.datapoint = dp
107
- self.serve(dp)
137
+ self._pass_datapoint(dp)
108
138
  else:
109
- self.dp_manager.datapoint = dp
110
- self.serve(dp)
139
+ self._pass_datapoint(dp)
111
140
  return self.dp_manager.datapoint
112
141
 
113
142
  def predict_dataflow(self, df: DataFlow) -> DataFlow:
@@ -205,6 +234,7 @@ class Pipeline(ABC):
205
234
 
206
235
  **Example:**
207
236
 
237
+ ```python
208
238
  layout = LayoutPipeComponent(layout_detector ...)
209
239
  text = TextExtractPipeComponent(text_detector ...)
210
240
  simple_pipe = MyPipeline(pipeline_component = [layout, text])
@@ -212,6 +242,7 @@ class Pipeline(ABC):
212
242
 
213
243
  for page in doc_dataflow:
214
244
  print(page)
245
+ ```
215
246
 
216
247
  In doing so, page contains all document structures determined via the pipeline (either directly from the Image core
217
248
  model or already processed further).
@@ -225,10 +256,12 @@ class Pipeline(ABC):
225
256
 
226
257
  **Example:**
227
258
 
259
+ ```python
228
260
  pipe = MyPipeline(pipeline_component = [layout, text])
229
261
  pipe.set_session_id = True
230
262
 
231
263
  df = pipe.analyze(input = "path/to/dir") # session_id is generated automatically
264
+ ```
232
265
  """
233
266
 
234
267
  def __init__(self, pipeline_component_list: list[PipelineComponent]) -> None:
@@ -349,8 +349,8 @@ class AnnotationNmsService(PipelineComponent):
349
349
  def __init__(
350
350
  self,
351
351
  nms_pairs: Sequence[Sequence[TypeOrStr]],
352
- thresholds: Union[float, list[float]],
353
- priority: Optional[list[Union[Optional[TypeOrStr]]]] = None,
352
+ thresholds: Union[float, Sequence[float]],
353
+ priority: Optional[Sequence[Union[Optional[TypeOrStr]]]] = None,
354
354
  ):
355
355
  """
356
356
  :param nms_pairs: Groups of categories, either as string or by `ObjectType`.
@@ -362,7 +362,7 @@ class AnnotationNmsService(PipelineComponent):
362
362
  self.threshold = [thresholds for _ in self.nms_pairs]
363
363
  else:
364
364
  assert len(self.nms_pairs) == len(thresholds), "Sequences of nms_pairs and thresholds must have same length"
365
- self.threshold = thresholds
365
+ self.threshold = thresholds # type: ignore
366
366
  if priority:
367
367
  assert len(self.nms_pairs) == len(priority), "Sequences of nms_pairs and priority must have same length"
368
368
 
@@ -109,8 +109,13 @@ def _proto_process(
109
109
 
110
110
 
111
111
  @curry
112
- def _to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int] = None) -> Optional[Image]:
113
- return to_image(dp, dpi)
112
+ def _to_image(
113
+ dp: Union[str, Mapping[str, Union[str, bytes]]],
114
+ dpi: Optional[int] = None,
115
+ width: Optional[int] = None,
116
+ height: Optional[int] = None,
117
+ ) -> Optional[Image]:
118
+ return to_image(dp, dpi, width, height)
114
119
 
115
120
 
116
121
  def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
@@ -188,7 +193,19 @@ class DoctectionPipe(Pipeline):
188
193
 
189
194
  df = MapData(df, _proto_process(path, doc_path))
190
195
  if dataset_dataflow is None:
191
- df = MapData(df, _to_image(dpi=int(os.environ.get("DPI", 300)))) # pylint: disable=E1120
196
+ if dpi := os.environ["DPI"]:
197
+ df = MapData(df, _to_image(dpi=int(dpi))) # pylint: disable=E1120
198
+ else:
199
+ width, height = kwargs.get("width", ""), kwargs.get("height", "")
200
+ if not width or not height:
201
+ width = os.environ["IMAGE_WIDTH"]
202
+ height = os.environ["IMAGE_HEIGHT"]
203
+ if not width or not height:
204
+ raise ValueError(
205
+ "DPI, IMAGE_WIDTH and IMAGE_HEIGHT are all None, but "
206
+ "either DPI or IMAGE_WIDTH and IMAGE_HEIGHT must be set"
207
+ )
208
+ df = MapData(df, _to_image(width=int(width), height=int(height))) # pylint: disable=E1120
192
209
  return df
193
210
 
194
211
  @staticmethod
deepdoctection/pipe/lm.py CHANGED
@@ -24,6 +24,7 @@ from copy import copy
24
24
  from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union
25
25
 
26
26
  from ..datapoint.image import Image
27
+ from ..extern.base import SequenceClassResult
27
28
  from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
28
29
  from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
29
30
  from .base import MetaAnnotation, PipelineComponent
@@ -264,6 +265,7 @@ class LMSequenceClassifierService(PipelineComponent):
264
265
  padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
265
266
  truncation: bool = True,
266
267
  return_overflowing_tokens: bool = False,
268
+ use_other_as_default_category: bool = False
267
269
  ) -> None:
268
270
  """
269
271
  :param tokenizer: Tokenizer, typing allows currently anything. This will be changed in the future
@@ -279,11 +281,16 @@ class LMSequenceClassifierService(PipelineComponent):
279
281
  :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
280
282
  can be returned as an additional batch element. Note that in this case, the number of input
281
283
  batch samples will be smaller than the output batch samples.
284
+ :param use_other_as_default_category: When predicting document classes, it might be possible that some pages
285
+ do not get sent to the model because they are empty. If set to `True` it
286
+ will assign images with no features the category `TokenClasses.OTHER`.
287
+
282
288
  """
283
289
  self.language_model = language_model
284
290
  self.padding = padding
285
291
  self.truncation = truncation
286
292
  self.return_overflowing_tokens = return_overflowing_tokens
293
+ self.use_other_as_default_category = use_other_as_default_category
287
294
  self.tokenizer = tokenizer
288
295
  self.mapping_to_lm_input_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
289
296
  super().__init__(self._get_name(), self.language_model.model_id)
@@ -299,12 +306,20 @@ class LMSequenceClassifierService(PipelineComponent):
299
306
 
300
307
  def serve(self, dp: Image) -> None:
301
308
  lm_input = self.mapping_to_lm_input_func(**self.required_kwargs)(dp)
309
+ lm_output = None
302
310
  if lm_input is None:
303
- return
304
- lm_output = self.language_model.predict(**lm_input)
305
- self.dp_manager.set_summary_annotation(
306
- PageType.DOCUMENT_TYPE, lm_output.class_name, lm_output.class_id, None, lm_output.score
307
- )
311
+ if self.use_other_as_default_category:
312
+ class_id = self.language_model.categories.get_categories(as_dict=True,
313
+ name_as_key=True).get(TokenClasses.OTHER, 1)
314
+ lm_output = SequenceClassResult(class_name=TokenClasses.OTHER,
315
+ class_id = class_id,
316
+ score=-1.)
317
+ else:
318
+ lm_output = self.language_model.predict(**lm_input)
319
+ if lm_output:
320
+ self.dp_manager.set_summary_annotation(
321
+ PageType.DOCUMENT_TYPE, lm_output.class_name, lm_output.class_id, None, lm_output.score
322
+ )
308
323
 
309
324
  def clone(self) -> LMSequenceClassifierService:
310
325
  return self.__class__(
@@ -1191,17 +1191,13 @@ class PubtablesSegmentationService(PipelineComponent):
1191
1191
  if key[idx] == item_number:
1192
1192
  cell_ann = dp.get_annotation(annotation_ids=value)[0]
1193
1193
  self.dp_manager.set_category_annotation(
1194
- item_header_cell_name,
1195
- None,
1196
- item_header_cell_name,
1197
- cell_ann.annotation_id
1194
+ item_header_cell_name, None, item_header_cell_name, cell_ann.annotation_id
1198
1195
  )
1199
1196
  else:
1200
1197
  cell_ann = dp.get_annotation(annotation_ids=value)[0]
1201
- self.dp_manager.set_category_annotation(item_header_cell_name,
1202
- None,
1203
- CellType.BODY,
1204
- cell_ann.annotation_id)
1198
+ self.dp_manager.set_category_annotation(
1199
+ item_header_cell_name, None, CellType.BODY, cell_ann.annotation_id
1200
+ )
1205
1201
 
1206
1202
  # TODO: the summaries should be sub categories of the underlying ann
1207
1203
  self.dp_manager.set_summary_annotation(
@@ -73,7 +73,7 @@ class DetrDerivedTrainer(Trainer):
73
73
  model: Union[PreTrainedModel, nn.Module],
74
74
  args: TrainingArguments,
75
75
  data_collator: DetrDataCollator,
76
- train_dataset: Dataset[Any],
76
+ train_dataset: DatasetAdapter,
77
77
  ):
78
78
  self.evaluator: Optional[Evaluator] = None
79
79
  self.build_eval_kwargs: Optional[dict[str, Any]] = None
@@ -499,7 +499,9 @@ def train_hf_layoutlm(
499
499
  )
500
500
  pipeline_component_cls = pipeline_component_registry.get(pipeline_component_name)
501
501
  if dataset_type == DatasetType.SEQUENCE_CLASSIFICATION:
502
- pipeline_component = pipeline_component_cls(tokenizer_fast, dd_model)
502
+ pipeline_component = pipeline_component_cls(tokenizer_fast,
503
+ dd_model,
504
+ use_other_as_default_category=True)
503
505
  else:
504
506
  pipeline_component = pipeline_component_cls(
505
507
  tokenizer_fast,
@@ -181,8 +181,6 @@ class PDFStreamer:
181
181
 
182
182
  streamer.close() # Do not forget to close the streamer, otherwise the file will never be closed and might
183
183
  # cause memory leaks if you open many files.
184
-
185
-
186
184
  """
187
185
 
188
186
  def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes]) -> None:
@@ -223,7 +221,10 @@ class PDFStreamer:
223
221
 
224
222
 
225
223
  def _input_to_cli_str(
226
- input_file_name: PathLikeOrStr, output_file_name: PathLikeOrStr, dpi: int, size: Optional[tuple[int, int]] = None
224
+ input_file_name: PathLikeOrStr,
225
+ output_file_name: PathLikeOrStr,
226
+ dpi: Optional[int] = None,
227
+ size: Optional[tuple[int, int]] = None,
227
228
  ) -> list[str]:
228
229
  cmd_args: list[str] = []
229
230
 
@@ -237,7 +238,10 @@ def _input_to_cli_str(
237
238
  if platform.system() == "Windows":
238
239
  command = command + ".exe"
239
240
  cmd_args.append(command)
240
- cmd_args.extend(["-r", str(dpi), str(input_file_name)])
241
+
242
+ if dpi:
243
+ cmd_args.extend(["-r", str(dpi)])
244
+ cmd_args.append(str(input_file_name))
241
245
  cmd_args.append("-png")
242
246
  cmd_args.append(str(output_file_name))
243
247
 
@@ -275,7 +279,9 @@ def _run_poppler(poppler_args: list[str]) -> None:
275
279
  raise PopplerError(status=proc.returncode, message="Syntax Error: PDF cannot be read with Poppler")
276
280
 
277
281
 
278
- def pdf_to_np_array_poppler(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
282
+ def pdf_to_np_array_poppler(
283
+ pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: Optional[int] = None
284
+ ) -> PixelValues:
279
285
  """
280
286
  Convert a single pdf page from its byte representation to a numpy array. This function will save the pdf as to a tmp
281
287
  file and then call poppler via `pdftoppm` resp. `pdftocairo` if the former is not available.
@@ -285,7 +291,8 @@ def pdf_to_np_array_poppler(pdf_bytes: bytes, size: Optional[tuple[int, int]] =
285
291
  :param dpi: Image quality in DPI/dots-per-inch (default 200)
286
292
  :return: numpy array
287
293
  """
288
-
294
+ if dpi is None and size is None:
295
+ raise ValueError("Either dpi or size must be provided.")
289
296
  with save_tmp_file(pdf_bytes, "pdf_") as (tmp_name, input_file_name):
290
297
  _run_poppler(_input_to_cli_str(input_file_name, tmp_name, dpi, size))
291
298
  image = viz_handler.read_image(tmp_name + "-1.png")
@@ -293,7 +300,7 @@ def pdf_to_np_array_poppler(pdf_bytes: bytes, size: Optional[tuple[int, int]] =
293
300
  return image.astype(uint8)
294
301
 
295
302
 
296
- def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: int = 200) -> PixelValues:
303
+ def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
297
304
  """
298
305
  Convert a single pdf page from its byte representation to a numpy array using pdfium.
299
306
 
@@ -301,12 +308,13 @@ def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: int = 200) -> PixelValues:
301
308
  :param dpi: Image quality in DPI/dots-per-inch (default 200)
302
309
  :return: numpy array
303
310
  """
304
-
311
+ if dpi is None:
312
+ raise ValueError("dpi must be provided.")
305
313
  page = pypdfium2.PdfDocument(pdf_bytes)[0]
306
314
  return page.render(scale=dpi * 1 / 72).to_numpy().astype(uint8)
307
315
 
308
316
 
309
- def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: int = 200) -> PixelValues:
317
+ def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: Optional[int] = None) -> PixelValues:
310
318
  """
311
319
  Convert a single pdf page from its byte representation to a numpy array. This function will either use Poppler or
312
320
  pdfium to render the pdf.
@@ -101,7 +101,6 @@ class DocumentType(ObjectTypes):
101
101
  GOVERNMENT_TENDERS = "government_tenders"
102
102
  MANUALS = "manuals"
103
103
  PATENTS = "patents"
104
- MARK = "mark"
105
104
 
106
105
 
107
106
  @object_types_registry.register("LayoutType")
@@ -132,6 +131,7 @@ class LayoutType(ObjectTypes):
132
131
  PAGE_NUMBER = "page_number"
133
132
  KEY_VALUE_AREA = "key_value_area"
134
133
  LIST_ITEM = "list_item"
134
+ MARK = "mark"
135
135
 
136
136
 
137
137
  @object_types_registry.register("TableType")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: deepdoctection
3
- Version: 0.38
3
+ Version: 0.39.1
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -17,7 +17,7 @@ Requires-Python: >=3.9
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
19
  Requires-Dist: catalogue==2.0.10
20
- Requires-Dist: huggingface_hub<0.26,>=0.12.0
20
+ Requires-Dist: huggingface_hub>=0.26.0
21
21
  Requires-Dist: importlib-metadata>=5.0.0
22
22
  Requires-Dist: jsonlines==3.1.0
23
23
  Requires-Dist: lazy-imports==0.3.1
@@ -36,7 +36,7 @@ Requires-Dist: tabulate>=0.7.7
36
36
  Requires-Dist: tqdm==4.64.0
37
37
  Provides-Extra: tf
38
38
  Requires-Dist: catalogue==2.0.10; extra == "tf"
39
- Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "tf"
39
+ Requires-Dist: huggingface_hub>=0.26.0; extra == "tf"
40
40
  Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
41
41
  Requires-Dist: jsonlines==3.1.0; extra == "tf"
42
42
  Requires-Dist: lazy-imports==0.3.1; extra == "tf"
@@ -61,14 +61,14 @@ Requires-Dist: python-doctr==0.8.1; extra == "tf"
61
61
  Requires-Dist: pycocotools>=2.0.2; extra == "tf"
62
62
  Requires-Dist: boto3==1.34.102; extra == "tf"
63
63
  Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
64
- Requires-Dist: fasttext==0.9.2; extra == "tf"
64
+ Requires-Dist: fasttext-wheel; extra == "tf"
65
65
  Requires-Dist: jdeskew>=0.2.2; extra == "tf"
66
66
  Requires-Dist: apted==1.0.3; extra == "tf"
67
67
  Requires-Dist: distance==0.1.3; extra == "tf"
68
68
  Requires-Dist: lxml>=4.9.1; extra == "tf"
69
69
  Provides-Extra: pt
70
70
  Requires-Dist: catalogue==2.0.10; extra == "pt"
71
- Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "pt"
71
+ Requires-Dist: huggingface_hub>=0.26.0; extra == "pt"
72
72
  Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
73
73
  Requires-Dist: jsonlines==3.1.0; extra == "pt"
74
74
  Requires-Dist: lazy-imports==0.3.1; extra == "pt"
@@ -86,12 +86,12 @@ Requires-Dist: termcolor>=1.1; extra == "pt"
86
86
  Requires-Dist: tabulate>=0.7.7; extra == "pt"
87
87
  Requires-Dist: tqdm==4.64.0; extra == "pt"
88
88
  Requires-Dist: timm>=0.9.16; extra == "pt"
89
- Requires-Dist: transformers>=4.36.0; extra == "pt"
89
+ Requires-Dist: transformers>=4.48.0; extra == "pt"
90
90
  Requires-Dist: accelerate>=0.29.1; extra == "pt"
91
91
  Requires-Dist: python-doctr==0.8.1; extra == "pt"
92
92
  Requires-Dist: boto3==1.34.102; extra == "pt"
93
93
  Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
94
- Requires-Dist: fasttext==0.9.2; extra == "pt"
94
+ Requires-Dist: fasttext-wheel; extra == "pt"
95
95
  Requires-Dist: jdeskew>=0.2.2; extra == "pt"
96
96
  Requires-Dist: apted==1.0.3; extra == "pt"
97
97
  Requires-Dist: distance==0.1.3; extra == "pt"
@@ -99,7 +99,7 @@ Requires-Dist: lxml>=4.9.1; extra == "pt"
99
99
  Provides-Extra: docs
100
100
  Requires-Dist: tensorpack==0.11; extra == "docs"
101
101
  Requires-Dist: boto3==1.34.102; extra == "docs"
102
- Requires-Dist: transformers>=4.36.0; extra == "docs"
102
+ Requires-Dist: transformers>=4.48.0; extra == "docs"
103
103
  Requires-Dist: accelerate>=0.29.1; extra == "docs"
104
104
  Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
105
105
  Requires-Dist: lxml>=4.9.1; extra == "docs"
@@ -1,9 +1,9 @@
1
- deepdoctection/__init__.py,sha256=EpkATv3al-4H6AomNHcSpFPChv5KqFdZJBzg97FVOWo,12653
1
+ deepdoctection/__init__.py,sha256=uDowNayqaYZGYaqnGzPSz6pVuHQhtDVRAN_bvPq85Ko,12754
2
2
  deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  deepdoctection/analyzer/__init__.py,sha256=icClxrd20XutD6LxLgEPIWceSs4j_QfI3szCE-9BL2w,729
4
4
  deepdoctection/analyzer/_config.py,sha256=OZMOPlyFv4gcyabPG6KO08EYx-0tUH82Ehs9YDv2B1Q,5027
5
- deepdoctection/analyzer/dd.py,sha256=DUOhOtwipHw5nabYqn3WGR9aZcgP0ma_bi_tjf9xscw,5973
6
- deepdoctection/analyzer/factory.py,sha256=idvIMuohtvyECBcAVBtUFGouNpMZ_DrXBbizSxieZWI,31899
5
+ deepdoctection/analyzer/dd.py,sha256=bfR7e1JV7BwUNDRLu0jYZU7qQXnyA_vbRAJl2Ylrq5o,5905
6
+ deepdoctection/analyzer/factory.py,sha256=Kf3Ztv5FEcF5yJf6i4I557aOIUHybuxIP0moHryguTQ,32344
7
7
  deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
8
8
  deepdoctection/configs/conf_dd_one.yaml,sha256=qnrDAST1PHBtdIKE_hdkZexW22FqVvNTI-PEo9wvinM,3025
9
9
  deepdoctection/configs/conf_tesseract.yaml,sha256=oF6szDyoi15FHvq7yFUNIEjfA_jNLhGxoowiRsz_zY4,35
@@ -18,9 +18,9 @@ deepdoctection/dataflow/stats.py,sha256=Bsr6v7lcesKXUYtO9wjqlzx_Yq_uyIF3Lel-tQ0i
18
18
  deepdoctection/datapoint/__init__.py,sha256=3K406GbOPhoEp8koVaSbMocmSsmWifnSZ1SPb7C1lOY,1643
19
19
  deepdoctection/datapoint/annotation.py,sha256=FEgz4COxVDfjic0gG7kS6iHnWLBIgFnquQ63Cbj2a4Y,22531
20
20
  deepdoctection/datapoint/box.py,sha256=UAdSnLexvFyg4KK1u9kXdJxhaWTwRxTU-cnQcvl37Q8,23410
21
- deepdoctection/datapoint/convert.py,sha256=O7920pIomyEkzXwxpFsrzfhn7Pl6UzVGhNzv90VcuKU,7099
22
- deepdoctection/datapoint/image.py,sha256=0ipkaF5k5sCe-qVQsWA8FOYF90UBAbAVLfAFwtq_sLg,33639
23
- deepdoctection/datapoint/view.py,sha256=AgSEZlKK-cm1erQ872ZWGUN3gomNpsQ39LkTR8Cg0BQ,49019
21
+ deepdoctection/datapoint/convert.py,sha256=gJbHY2V8nlMht1N5VdxTmWSsOeydpFPTJsaJHp6XGgE,7516
22
+ deepdoctection/datapoint/image.py,sha256=S6yfsIRQgMCl6HYAcHYJSBcbfdYKKtebtkEkkkrXsMQ,33619
23
+ deepdoctection/datapoint/view.py,sha256=srMyPQGsK4OSiorxkyG6UAIgpViM6Ks1CI3b5k97cjY,49452
24
24
  deepdoctection/datasets/__init__.py,sha256=-A3aR90aDsHPmVM35JavfnQ2itYSCn3ujl4krRni1QU,1076
25
25
  deepdoctection/datasets/adapter.py,sha256=Ly_vbOAgVI73V41FUccnSX1ECTOyesW_qsuvQuvOZbw,7796
26
26
  deepdoctection/datasets/base.py,sha256=DT4i-d74sIEiUNC6UspIHNJuHSK0t1dBv7qwadg4rLw,22341
@@ -58,7 +58,7 @@ deepdoctection/extern/fastlang.py,sha256=F4gK-SEwcCujjxH327ZDzMGWToJ49xS_dCKcePQ
58
58
  deepdoctection/extern/hfdetr.py,sha256=JzHrrTyzS9qh6T2TsvKboAGZkIhno2txmSoLQ5Vd-lo,12077
59
59
  deepdoctection/extern/hflayoutlm.py,sha256=tFaf90FRbZzhSycdp8rGkeiPywQa6UcTEEwbayIXkr0,57023
60
60
  deepdoctection/extern/hflm.py,sha256=kwS6kcSlY_2m9u0RzBLTRq-UMM7c1PhyUaDTvSdejus,9217
61
- deepdoctection/extern/model.py,sha256=ViHHKPvbGmLCPw7ZESv_rmjlkA90UiBU6oZiHOMqNSw,59869
61
+ deepdoctection/extern/model.py,sha256=lbVwDa3vD6VwCD_dsozcI8b4xDZs4KJ1628SxaDdtHQ,55378
62
62
  deepdoctection/extern/pdftext.py,sha256=KS_t27SUiYn_IOS_J2lF9lSSo22vLagxmxvYCY3CqXA,7228
63
63
  deepdoctection/extern/tessocr.py,sha256=tG7etMvZ-jHFdq-jJAHYMJii3ujDjMfAFYUsjBp3nKI,17444
64
64
  deepdoctection/extern/texocr.py,sha256=yMt5ZzKtsjd7ogrcNXba7zccGGGF9LXK194EtER6YNQ,5804
@@ -88,39 +88,39 @@ deepdoctection/extern/tp/tpfrcnn/utils/__init__.py,sha256=kiPlXxHlTGN9eI7YE9Bgwt
88
88
  deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py,sha256=aBLqPg_ApaiimtBRaOsLKTZZFIBh87vVtqjLPMaX9fQ,2379
89
89
  deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py,sha256=O-q1GQiOEd1lN1MQDsJvHwD2OmBO-qHNeqJ1Qnec93g,3539
90
90
  deepdoctection/mapper/__init__.py,sha256=Xqb34aCjslZDQnqQgCSvnloL5DbdT9eHhn-StpVPbzE,1130
91
- deepdoctection/mapper/cats.py,sha256=EsYdUw8LAPsyqAfGfhNa6gAPVpUhP8GGOchSIKto_e0,15741
91
+ deepdoctection/mapper/cats.py,sha256=s73JzONV2UQ71szfljurk7H1-UjDBWsW4oNLs5xePUk,16474
92
92
  deepdoctection/mapper/cocostruct.py,sha256=GcbUpPFUg67pcOHQluWBFOFcGaYnlZcTmwBDERBVgCA,5978
93
93
  deepdoctection/mapper/d2struct.py,sha256=Dx-YnycsIQH4a5-9Gn_yMhiQ-gOFgMueNeH3rhXjuCU,8555
94
94
  deepdoctection/mapper/hfstruct.py,sha256=2PjGKsYturVJBimLT1CahYh09KSRAFEHz_QNtC162kQ,5551
95
95
  deepdoctection/mapper/laylmstruct.py,sha256=abMZkYU2W0e_VcCm_c0ZXNFuv-lfMFWcTedcZS5EYvE,42935
96
96
  deepdoctection/mapper/maputils.py,sha256=eI6ZcDg9W5uB6xQNBZpMIdEd86HlCxTtkJuyROdTqiw,8146
97
- deepdoctection/mapper/match.py,sha256=pCWZpz2R8JahiKXCw7dxKRTLiPgJXeVDgkddDPLy_c0,9643
98
- deepdoctection/mapper/misc.py,sha256=NLSSgk066Tkrrdi075HkqV7cP-iqT9fv_MtyAJ-8gOg,6743
97
+ deepdoctection/mapper/match.py,sha256=Ed9FsuVPNp_faaW5PKnvUHZoEXcRcrO-muduTMzjp1s,8937
98
+ deepdoctection/mapper/misc.py,sha256=vX-fV420Te00eD-cqTiWBV2twHqdBcBV2_7rAFRgPRg,7164
99
99
  deepdoctection/mapper/pascalstruct.py,sha256=TzVU1p0oiw0nOuxTFFbEB9vXJxH1v6VUvTJ7MD0manU,3828
100
100
  deepdoctection/mapper/prodigystruct.py,sha256=Re4Sd_zAp6qOvbXZLmMJeG0IGEfMQxebuyDeZgMcTa8,6827
101
- deepdoctection/mapper/pubstruct.py,sha256=YxsrZ-E0pD45Mm_VCPQB9yEgHsTPkw4htt-3DwCRX1k,23361
101
+ deepdoctection/mapper/pubstruct.py,sha256=PAJ2N1HSPNS6F2ZrIwlD7PiBhIM-rJscK_Ti8OR_IGs,23370
102
102
  deepdoctection/mapper/tpstruct.py,sha256=YNABRibvcISD5Lavg3jouoE4FMdqXEJoM-hNoB_rnww,4481
103
103
  deepdoctection/mapper/xfundstruct.py,sha256=_3r3c0K82fnF2h1HxA85h-9ETYrHwcERa6MNc6Ko6Z8,8807
104
104
  deepdoctection/pipe/__init__.py,sha256=ywTVoetftdL6plXg2YlBzMfmqBZupq7yXblSVyvvkcQ,1127
105
105
  deepdoctection/pipe/anngen.py,sha256=3319l4aaXzcY4w6ItVBNPX8LGS5fHFDVtyVY9KMefac,16393
106
- deepdoctection/pipe/base.py,sha256=ynNg5SSRuUVxN69VWOO3Oi7WSeGrYwn3A56NQMBJDvw,14222
107
- deepdoctection/pipe/common.py,sha256=haOb4v0jLX3r41BSC8cVseX2E320_HkSrGlZsQiKE2g,17728
106
+ deepdoctection/pipe/base.py,sha256=F4NusbZ-xYc6wuO-XAngmC8uzahT2ubsu2g9NO8PpVw,15390
107
+ deepdoctection/pipe/common.py,sha256=vlWzvwn8wl7baPbK-917HUWujEGJEkHur_-ilkweKjk,17751
108
108
  deepdoctection/pipe/concurrency.py,sha256=AAKRsVgaBEYNluntbDa46SBF1JZ_XqnWLDSWrNvAzEo,9657
109
- deepdoctection/pipe/doctectionpipe.py,sha256=xrDK2_84tVUMsRG7bzqGKiOCsoO-49tweTOK2Je1fls,11770
109
+ deepdoctection/pipe/doctectionpipe.py,sha256=bGW3ugky-fb-nEe-3bvO6Oc_4_6w82cQboGM_6p2eIo,12530
110
110
  deepdoctection/pipe/language.py,sha256=5zI0UQC6Fh12_r2pfVL42HoCGz2hpHrOhpXAn5m-rYw,5451
111
111
  deepdoctection/pipe/layout.py,sha256=xIhnJpyUSbvLbhTXyAKXY1hmG9352jihGYFSclTH_1g,5567
112
- deepdoctection/pipe/lm.py,sha256=tLuCtML-S_kTEYcDAEtM3NBYmR7Aovv9p5TcXYL_AAg,16693
112
+ deepdoctection/pipe/lm.py,sha256=Ygj6MmBvBZ7l4RGCwBuhmMsOM0Ep3LWteNg7bzh-UmI,17703
113
113
  deepdoctection/pipe/order.py,sha256=PnJZiCnxFluJiECXLTZT0c1Rr66vIRBFraa_G41UA2k,40121
114
114
  deepdoctection/pipe/refine.py,sha256=dTfI396xydPdbzpfo4yqFcuxl3UAB1y-WbSQn1o76ec,22367
115
115
  deepdoctection/pipe/registry.py,sha256=aFx-Tn0xhVA5l5H18duNW5QoTNKQltybsEUEzsMgUfg,902
116
- deepdoctection/pipe/segment.py,sha256=WhIi-m6Wwm9JjHOBomw9q5XUUzmt7-BFNpdcU1m2LH8,59386
116
+ deepdoctection/pipe/segment.py,sha256=CR83HQMW0hrRG8W6pFuB0YibxQMWpqI7_LaUIcJcQwo,59116
117
117
  deepdoctection/pipe/sub_layout.py,sha256=N1RcID-boORcwsW_j0l64HpUu3rff0ge5qEanudLYgk,13838
118
118
  deepdoctection/pipe/text.py,sha256=h9q6d3HFOs7LOg-iwdLUPiQxrPqgunBVNmtYMBrfRQE,11180
119
119
  deepdoctection/pipe/transform.py,sha256=9Om7X7hJeL4jgUwHM1CHa4sb5v7Qo1PtVG0ls_3nI7w,3798
120
120
  deepdoctection/train/__init__.py,sha256=YFTRAZF1F7cEAKTdAIi1BLyYb6rSRcwq09Ui5Lu8d6E,1071
121
121
  deepdoctection/train/d2_frcnn_train.py,sha256=sFc_G-mEpaM8d1CCE0_6Gl4nBh11X2RYRBA3p_ylFJQ,16000
122
- deepdoctection/train/hf_detr_train.py,sha256=8ydysxzOPE_IPoNFGaHb7PbKr9Nbl41rcY4lbylQavU,10783
123
- deepdoctection/train/hf_layoutlm_train.py,sha256=BNjPgPAvxm4beHULqzo58u-gW7GcTGiZAk2rF6TootM,22532
122
+ deepdoctection/train/hf_detr_train.py,sha256=eHSdI11U8oGy93noZxAISfukhRBElj4dBerJ4Xcercw,10785
123
+ deepdoctection/train/hf_layoutlm_train.py,sha256=irSg-IpbVoSlaw1-vZCej2mCZcctONtXr5Z2NQAc_a4,22680
124
124
  deepdoctection/train/tp_frcnn_train.py,sha256=pEpXokSVGveqo82pRnhnAmHPmjQ_8wQWpqM4ZyNHJgs,13049
125
125
  deepdoctection/utils/__init__.py,sha256=brBceRWeov9WXMiJTjyJOF2rHMP8trGGRRjhMdZ61nI,2371
126
126
  deepdoctection/utils/concurrency.py,sha256=nIhpkSncmv0LBB8PtcOLY-BsRGlfcDpz7foVdgzZd20,4598
@@ -134,15 +134,15 @@ deepdoctection/utils/identifier.py,sha256=QkNaGGqPynHwDPnd3_m8iur4Cv64rcQa7qolCE
134
134
  deepdoctection/utils/logger.py,sha256=J0OVKiXP_2A82MWbbJoOeMEJ-75aZu5npgaS_yI6mVA,10003
135
135
  deepdoctection/utils/metacfg.py,sha256=hD76KQ_RnD_5B02qLI2Zxf3WfnsnXhEI_KUTKpw91RI,5711
136
136
  deepdoctection/utils/mocks.py,sha256=IkN3-IzAl4eX0ibgKIHg8IY7ykVw6BnpF6XnxKnKaZI,2389
137
- deepdoctection/utils/pdf_utils.py,sha256=G0m8kUn2HwwyZWH_BcrDkm-m3MP9GN9SWHj5VhB7swY,12845
138
- deepdoctection/utils/settings.py,sha256=k6OyuWbj-IPeaO9zT9RZ-5Yad1wNhWGYqGLZdtgXAZY,12464
137
+ deepdoctection/utils/pdf_utils.py,sha256=Fi0eZ2GbnO7N61Rd8b8YRKRff4dalHAzkcn3zpGPoic,13119
138
+ deepdoctection/utils/settings.py,sha256=hDD6yDX_4pQXwR5ILVwJIj6hb7NXA0-ifnC25ldcUjA,12464
139
139
  deepdoctection/utils/tqdm.py,sha256=cBUtR0L1x0KMeYrLP2rrzyzCamCjpQAKroHXLv81_pk,1820
140
140
  deepdoctection/utils/transform.py,sha256=3kCgsEeRkG1efCdkfvj7tUFMs-e2jbjbflq826F2GPU,8502
141
141
  deepdoctection/utils/types.py,sha256=_3dmPdCIZNLbgU5QP5k_c5phDf18xLe1kYL6t2nM45s,2953
142
142
  deepdoctection/utils/utils.py,sha256=csVs_VvCq4QBETPoE2JdTTL4MFYnD4xh-Js5vRb612g,6492
143
143
  deepdoctection/utils/viz.py,sha256=Jf8ePNYWlpuyaS6SeTYQ4OyA3eNhtgjvAQZnGNdgHC0,27051
144
- deepdoctection-0.38.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
145
- deepdoctection-0.38.dist-info/METADATA,sha256=WoWX8R8jC04bj81VPQWYzBJgB9mREE5Ng7LCZtqGylc,19759
146
- deepdoctection-0.38.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
147
- deepdoctection-0.38.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
148
- deepdoctection-0.38.dist-info/RECORD,,
144
+ deepdoctection-0.39.1.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
145
+ deepdoctection-0.39.1.dist-info/METADATA,sha256=NBN2dqFMUiXkcJ28xJDwyN6eNP-MmFw64F7dm3kUWTA,19741
146
+ deepdoctection-0.39.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
147
+ deepdoctection-0.39.1.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
148
+ deepdoctection-0.39.1.dist-info/RECORD,,