deepdoctection 0.37.3__py3-none-any.whl → 0.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

@@ -28,7 +28,7 @@ import numpy as np
28
28
  from typing_extensions import LiteralString
29
29
 
30
30
  from ..utils.error import AnnotationError, ImageError
31
- from ..utils.logger import LoggingRecord, logger
31
+ from ..utils.logger import LoggingRecord, log_once, logger
32
32
  from ..utils.settings import (
33
33
  CellType,
34
34
  LayoutType,
@@ -228,23 +228,33 @@ class Layout(ImageAnnotationBaseView):
228
228
 
229
229
  """
230
230
  words = self.get_ordered_words()
231
- characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
232
- *[
233
- (
234
- word.characters,
235
- word.annotation_id,
236
- word.token_class,
237
- word.token_tag,
238
- word.get_sub_category(WordType.TOKEN_CLASS).category_id
239
- if WordType.TOKEN_CLASS in word.sub_categories
240
- else None,
241
- word.get_sub_category(WordType.TOKEN_TAG).category_id
242
- if WordType.TOKEN_TAG in word.sub_categories
243
- else None,
244
- )
245
- for word in words
246
- ]
247
- )
231
+ if words:
232
+ characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
233
+ *[
234
+ (
235
+ word.characters,
236
+ word.annotation_id,
237
+ word.token_class,
238
+ word.token_tag,
239
+ word.get_sub_category(WordType.TOKEN_CLASS).category_id
240
+ if WordType.TOKEN_CLASS in word.sub_categories
241
+ else None,
242
+ word.get_sub_category(WordType.TOKEN_TAG).category_id
243
+ if WordType.TOKEN_TAG in word.sub_categories
244
+ else None,
245
+ )
246
+ for word in words
247
+ ]
248
+ )
249
+ else:
250
+ characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = (
251
+ [], # type: ignore
252
+ [], # type: ignore
253
+ [], # type: ignore
254
+ [], # type: ignore
255
+ [], # type: ignore
256
+ [], # type: ignore
257
+ )
248
258
  return {
249
259
  "text": " ".join(characters),
250
260
  "words": characters,
@@ -282,25 +292,103 @@ class Table(Layout):
282
292
  """
283
293
 
284
294
  @property
285
- def cells(self) -> list[ImageAnnotationBaseView]:
295
+ def cells(self) -> list[Cell]:
286
296
  """
287
297
  A list of a table cells.
288
298
  """
289
299
  all_relation_ids = self.get_relationship(Relationships.CHILD)
290
- cell_anns = self.base_page.get_annotation(
300
+ cell_anns: list[Cell] = self.base_page.get_annotation( # type: ignore
291
301
  annotation_ids=all_relation_ids,
292
302
  category_names=[
293
303
  LayoutType.CELL,
294
304
  CellType.HEADER,
295
305
  CellType.BODY,
296
- CellType.PROJECTED_ROW_HEADER,
297
306
  CellType.SPANNING,
298
- CellType.ROW_HEADER,
299
- CellType.COLUMN_HEADER,
300
307
  ],
301
308
  )
302
309
  return cell_anns
303
310
 
311
+ @property
312
+ def column_header_cells(self) -> list[Cell]:
313
+ """
314
+ Retrieve a list of cells that are column headers in the table.
315
+
316
+ This property filters and sorts the cells in the table to return only those that are column headers.
317
+ The cells are sorted by their column number.
318
+
319
+ :return: A list of `Cell` objects that are column headers.
320
+ """
321
+ all_relation_ids = self.get_relationship(Relationships.CHILD)
322
+ all_cells: list[Cell] = self.base_page.get_annotation( # type: ignore
323
+ category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
324
+ )
325
+ headers = list(filter(lambda cell: CellType.COLUMN_HEADER in cell.sub_categories, all_cells))
326
+ headers.sort(key=lambda x: x.column_number) # type: ignore
327
+ return headers
328
+
329
+ @property
330
+ def row_header_cells(self) -> list[Cell]:
331
+ """
332
+ Retrieve a list of cells that are row headers in the table.
333
+
334
+ This property filters and sorts the cells in the table to return only those that are row headers.
335
+ The cells are sorted by their column number.
336
+
337
+ :return: A list of `Cell` objects that are row headers.
338
+ """
339
+ all_relation_ids = self.get_relationship(Relationships.CHILD)
340
+ all_cells: list[Cell] = self.base_page.get_annotation( # type: ignore
341
+ category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
342
+ )
343
+ row_header_cells = list(filter(lambda cell: CellType.ROW_HEADER in cell.sub_categories, all_cells))
344
+ row_header_cells.sort(key=lambda x: x.column_number) # type: ignore
345
+ return row_header_cells
346
+
347
+ def kv_header_rows(self, row_number: int) -> Mapping[str, str]:
348
+ """
349
+ For a given row number, returns a dictionary mapping column headers to cell values in that row.
350
+
351
+ This method retrieves all cells in the specified row and matches them with their corresponding column headers.
352
+ It then creates a key-value pair where the key is a tuple containing the column number and header text,
353
+ and the value is the cell text.
354
+
355
+ :param row_number: The row number for which to retrieve the key-value pairs.
356
+ :return: A dictionary where keys are tuples of (column number, header text) and values are cell texts.
357
+
358
+ Example:
359
+ If the table has the following structure:
360
+ | Header1 | Header2 |
361
+ |---------|---------|
362
+ | Value1 | Value2 |
363
+ | Value3 | Value4 |
364
+
365
+ Calling kv_header_rows(1) would return:
366
+ {
367
+ (1, 'Header1'): 'Value1',
368
+ (2, 'Header2'): 'Value2'
369
+ }
370
+ """
371
+ all_relation_ids = self.get_relationship(Relationships.CHILD)
372
+ all_cells = self.base_page.get_annotation(
373
+ category_names=[LayoutType.CELL, CellType.SPANNING], annotation_ids=all_relation_ids
374
+ )
375
+ row_cells = list(
376
+ filter(lambda c: row_number in (c.row_number, c.row_number + c.row_span), all_cells) # type: ignore
377
+ )
378
+ row_cells.sort(key=lambda c: c.column_number) # type: ignore
379
+ column_header_cells = self.column_header_cells
380
+
381
+ kv_dict: Mapping[str, str] = {}
382
+ for cell in row_cells:
383
+ for header in column_header_cells:
384
+ if (
385
+ cell.column_number == header.column_number # type: ignore
386
+ and cell.annotation_id != header.annotation_id # type: ignore
387
+ ):
388
+ kv_dict[(header.column_number, header.text)] = cell.text # type: ignore
389
+ break
390
+ return kv_dict
391
+
304
392
  @property
305
393
  def rows(self) -> list[ImageAnnotationBaseView]:
306
394
  """
@@ -335,7 +423,7 @@ class Table(Layout):
335
423
  try:
336
424
  html_index = html_list.index(cell.annotation_id)
337
425
  html_list.pop(html_index)
338
- html_list.insert(html_index, cell.text) # type: ignore
426
+ html_list.insert(html_index, cell.text)
339
427
  except ValueError:
340
428
  logger.warning(LoggingRecord("html construction not possible", {"annotation_id": cell.annotation_id}))
341
429
 
@@ -357,6 +445,12 @@ class Table(Layout):
357
445
  cells = self.cells
358
446
  table_list = [["" for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)] # type: ignore
359
447
  for cell in cells:
448
+ if cell.category_name == CellType.SPANNING:
449
+ log_once(
450
+ "Table has spanning cells. This implies, that the .csv output will not be correct."
451
+ "To prevent spanning cell table creation set PT.ITEM.FILTER=['table','spanning'] ",
452
+ "error",
453
+ )
360
454
  table_list[cell.row_number - 1][cell.column_number - 1] = ( # type: ignore
361
455
  table_list[cell.row_number - 1][cell.column_number - 1] + cell.text + " " # type: ignore
362
456
  )
@@ -386,13 +480,13 @@ class Table(Layout):
386
480
  token_class_ids: list[str] = []
387
481
  token_tag_ids: list[str] = []
388
482
  for cell in cells:
389
- text.extend(cell.text_["text"]) # type: ignore
390
- words.extend(cell.text_["words"]) # type: ignore
391
- ann_ids.extend(cell.text_["ann_ids"]) # type: ignore
392
- token_classes.extend(cell.text_["token_classes"]) # type: ignore
393
- token_tags.extend(cell.text_["token_tags"]) # type: ignore
394
- token_class_ids.extend(cell.text_["token_class_ids"]) # type: ignore
395
- token_tag_ids.extend(cell.text_["token_tag_ids"]) # type: ignore
483
+ text.extend(cell.text_["text"])
484
+ words.extend(cell.text_["words"])
485
+ ann_ids.extend(cell.text_["ann_ids"])
486
+ token_classes.extend(cell.text_["token_classes"])
487
+ token_tags.extend(cell.text_["token_tags"])
488
+ token_class_ids.extend(cell.text_["token_class_ids"])
489
+ token_tag_ids.extend(cell.text_["token_tag_ids"])
396
490
  return {
397
491
  "text": " ".join(text),
398
492
  "words": words,
@@ -414,7 +508,7 @@ class Table(Layout):
414
508
  if not cells:
415
509
  return super().words
416
510
  for cell in cells:
417
- all_words.extend(cell.words) # type: ignore
511
+ all_words.extend(cell.words)
418
512
  return all_words
419
513
 
420
514
  def get_ordered_words(self) -> list[ImageAnnotationBaseView]:
@@ -424,7 +518,7 @@ class Table(Layout):
424
518
  all_words = []
425
519
  cells.sort(key=lambda x: (x.ROW_NUMBER, x.COLUMN_NUMBER))
426
520
  for cell in cells:
427
- all_words.extend(cell.get_ordered_words()) # type: ignore
521
+ all_words.extend(cell.get_ordered_words())
428
522
  return all_words
429
523
  except (TypeError, AnnotationError):
430
524
  return super().get_ordered_words()
@@ -436,10 +530,10 @@ IMAGE_ANNOTATION_TO_LAYOUTS: dict[ObjectTypes, Type[Union[Layout, Table, Word]]]
436
530
  LayoutType.TABLE_ROTATED: Table,
437
531
  LayoutType.WORD: Word,
438
532
  LayoutType.CELL: Cell,
439
- CellType.PROJECTED_ROW_HEADER: Cell,
440
533
  CellType.SPANNING: Cell,
441
534
  CellType.ROW_HEADER: Cell,
442
535
  CellType.COLUMN_HEADER: Cell,
536
+ CellType.PROJECTED_ROW_HEADER: Cell,
443
537
  }
444
538
 
445
539
 
@@ -465,10 +559,7 @@ IMAGE_DEFAULTS: ImageDefaults = {
465
559
  LayoutType.LIST,
466
560
  LayoutType.CELL,
467
561
  LayoutType.FIGURE,
468
- CellType.COLUMN_HEADER,
469
- CellType.PROJECTED_ROW_HEADER,
470
562
  CellType.SPANNING,
471
- CellType.ROW_HEADER,
472
563
  ),
473
564
  }
474
565
 
@@ -851,6 +942,16 @@ class Page(Image):
851
942
  """
852
943
  return self._make_text(False)
853
944
 
945
+ def _ann_viz_bbox(self, ann: ImageAnnotationBaseView) -> list[float]:
946
+ """
947
+ Get the bounding box as list and in absolute coordinates of the base page.
948
+ """
949
+ bounding_box = ann.get_bounding_box(self.image_id)
950
+
951
+ if not bounding_box.absolute_coords:
952
+ bounding_box = bounding_box.transform(self.width, self.height, absolute_coords=True)
953
+ return bounding_box.to_list(mode="xyxy")
954
+
854
955
  @no_type_check
855
956
  def viz(
856
957
  self,
@@ -886,6 +987,7 @@ class Page(Image):
886
987
  :param show_tables: Will display all tables boxes as well as cells, rows and columns
887
988
  :param show_layouts: Will display all other layout components.
888
989
  :param show_figures: Will display all figures
990
+ :param show_residual_layouts: Will display all residual layouts
889
991
  :param show_cells: Will display cells within tables. (Only available if `show_tables=True`)
890
992
  :param show_table_structure: Will display rows and columns
891
993
  :param show_words: Will display bounding boxes around words labeled with token class and bio tag (experimental)
@@ -910,50 +1012,46 @@ class Page(Image):
910
1012
  if debug_kwargs:
911
1013
  anns = self.get_annotation(category_names=list(debug_kwargs.keys()))
912
1014
  for ann in anns:
913
- box_stack.append(ann.bbox)
1015
+ box_stack.append(self._ann_viz_bbox(ann))
914
1016
  category_names_list.append(str(getattr(ann, debug_kwargs[ann.category_name])))
915
1017
 
916
1018
  if show_layouts and not debug_kwargs:
917
1019
  for item in self.layouts:
918
- box_stack.append(item.bbox)
1020
+ box_stack.append(self._ann_viz_bbox(item))
919
1021
  category_names_list.append(item.category_name.value)
920
1022
 
921
1023
  if show_figures and not debug_kwargs:
922
1024
  for item in self.figures:
923
- box_stack.append(item.bbox)
1025
+ box_stack.append(self._ann_viz_bbox(item))
924
1026
  category_names_list.append(item.category_name.value)
925
1027
 
926
1028
  if show_tables and not debug_kwargs:
927
1029
  for table in self.tables:
928
- box_stack.append(table.bbox)
1030
+ box_stack.append(self._ann_viz_bbox(table))
929
1031
  category_names_list.append(LayoutType.TABLE.value)
930
1032
  if show_cells:
931
1033
  for cell in table.cells:
932
1034
  if cell.category_name in {
933
1035
  LayoutType.CELL,
934
- CellType.PROJECTED_ROW_HEADER,
935
1036
  CellType.SPANNING,
936
- CellType.ROW_HEADER,
937
- CellType.COLUMN_HEADER,
938
1037
  }:
939
1038
  cells_found = True
940
- box_stack.append(cell.bbox)
1039
+ box_stack.append(self._ann_viz_bbox(cell))
941
1040
  category_names_list.append(None)
942
1041
  if show_table_structure:
943
1042
  rows = table.rows
944
1043
  cols = table.columns
945
1044
  for row in rows:
946
- box_stack.append(row.bbox)
1045
+ box_stack.append(self._ann_viz_bbox(row))
947
1046
  category_names_list.append(None)
948
1047
  for col in cols:
949
- box_stack.append(col.bbox)
1048
+ box_stack.append(self._ann_viz_bbox(col))
950
1049
  category_names_list.append(None)
951
1050
 
952
1051
  if show_cells and not cells_found and not debug_kwargs:
953
- for ann in self.annotations:
954
- if isinstance(ann, Cell) and ann.active:
955
- box_stack.append(ann.bbox)
956
- category_names_list.append(None)
1052
+ for ann in self.get_annotation(category_names=[LayoutType.CELL, CellType.SPANNING]):
1053
+ box_stack.append(self._ann_viz_bbox(ann))
1054
+ category_names_list.append(None)
957
1055
 
958
1056
  if show_words and not debug_kwargs:
959
1057
  all_words = []
@@ -965,7 +1063,7 @@ class Page(Image):
965
1063
  all_words = self.get_annotation(category_names=LayoutType.WORD)
966
1064
  if not ignore_default_token_class:
967
1065
  for word in all_words:
968
- box_stack.append(word.bbox)
1066
+ box_stack.append(self._ann_viz_bbox(word))
969
1067
  if show_token_class:
970
1068
  category_names_list.append(word.token_class.value if word.token_class is not None else None)
971
1069
  else:
@@ -973,7 +1071,7 @@ class Page(Image):
973
1071
  else:
974
1072
  for word in all_words:
975
1073
  if word.token_class is not None and word.token_class != TokenClasses.OTHER:
976
- box_stack.append(word.bbox)
1074
+ box_stack.append(self._ann_viz_bbox(word))
977
1075
  if show_token_class:
978
1076
  category_names_list.append(word.token_class.value if word.token_class is not None else None)
979
1077
  else:
@@ -41,6 +41,7 @@ with try_import() as tr_import_guard:
41
41
  from transformers import ( # pylint: disable=W0611
42
42
  AutoFeatureExtractor,
43
43
  DetrFeatureExtractor,
44
+ DetrImageProcessor,
44
45
  PretrainedConfig,
45
46
  TableTransformerForObjectDetection,
46
47
  )
@@ -55,7 +56,7 @@ def _detr_post_processing(
55
56
  def detr_predict_image(
56
57
  np_img: PixelValues,
57
58
  predictor: TableTransformerForObjectDetection,
58
- feature_extractor: DetrFeatureExtractor,
59
+ feature_extractor: DetrImageProcessor,
59
60
  device: torch.device,
60
61
  threshold: float,
61
62
  nms_threshold: float,
@@ -224,13 +225,13 @@ class HFDetrDerivedDetector(HFDetrDerivedDetectorMixin):
224
225
  )
225
226
 
226
227
  @staticmethod
227
- def get_pre_processor(path_feature_extractor_config: PathLikeOrStr) -> DetrFeatureExtractor:
228
+ def get_pre_processor(path_feature_extractor_config: PathLikeOrStr) -> DetrImageProcessor:
228
229
  """
229
230
  Builds the feature extractor
230
231
 
231
232
  :return: DetrFeatureExtractor
232
233
  """
233
- return AutoFeatureExtractor.from_pretrained(
234
+ return DetrImageProcessor.from_pretrained(
234
235
  pretrained_model_name_or_path=os.fspath(path_feature_extractor_config)
235
236
  )
236
237
 
@@ -24,7 +24,7 @@ from dataclasses import asdict, dataclass, field
24
24
  from typing import Any, Mapping, Optional, Union
25
25
 
26
26
  import jsonlines
27
- from huggingface_hub import cached_download, hf_hub_url # type: ignore
27
+ from huggingface_hub import hf_hub_download
28
28
  from tabulate import tabulate
29
29
  from termcolor import colored
30
30
 
@@ -136,51 +136,6 @@ class ModelCatalog:
136
136
  dl_library="TF",
137
137
  model_wrapper="TPFrcnnDetector",
138
138
  ),
139
- "item/model-1620000.data-00000-of-00001": ModelProfile(
140
- name="item/model-1620000.data-00000-of-00001",
141
- description="Tensorpack row/column detection model trained on Pubtabnet",
142
- config="dd/tp/conf_frcnn_rows.yaml",
143
- size=[823546048, 25787],
144
- tp_model=True,
145
- hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc",
146
- hf_model_name="model-1620000",
147
- hf_config_file=["conf_frcnn_rows.yaml"],
148
- categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
149
- dl_library="TF",
150
- model_wrapper="TPFrcnnDetector",
151
- ),
152
- "layout/model-800000.data-00000-of-00001": ModelProfile(
153
- name="layout/model-800000.data-00000-of-00001",
154
- description="Tensorpack layout detection model trained on Publaynet",
155
- config="dd/tp/conf_frcnn_layout.yaml",
156
- size=[823656748, 25796],
157
- tp_model=True,
158
- hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet",
159
- hf_model_name="model-800000",
160
- hf_config_file=["conf_frcnn_layout.yaml"],
161
- dl_library="TF",
162
- categories={
163
- 1: LayoutType.TEXT,
164
- 2: LayoutType.TITLE,
165
- 3: LayoutType.LIST,
166
- 4: LayoutType.TABLE,
167
- 5: LayoutType.FIGURE,
168
- },
169
- model_wrapper="TPFrcnnDetector",
170
- ),
171
- "cell/model-1800000.data-00000-of-00001": ModelProfile(
172
- name="cell/model-1800000.data-00000-of-00001",
173
- description="Tensorpack cell detection model trained on Pubtabnet",
174
- config="dd/tp/conf_frcnn_cell.yaml",
175
- size=[823509160, 25905],
176
- tp_model=True,
177
- hf_repo_id="deepdoctection/tp_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c",
178
- hf_model_name="model-1800000",
179
- hf_config_file=["conf_frcnn_cell.yaml"],
180
- categories={1: LayoutType.CELL},
181
- dl_library="TF",
182
- model_wrapper="TPFrcnnDetector",
183
- ),
184
139
  "layout/d2_model_0829999_layout_inf_only.pt": ModelProfile(
185
140
  name="layout/d2_model_0829999_layout_inf_only.pt",
186
141
  description="Detectron2 layout detection model trained on Publaynet",
@@ -200,25 +155,6 @@ class ModelCatalog:
200
155
  dl_library="PT",
201
156
  model_wrapper="D2FrcnnDetector",
202
157
  ),
203
- "layout/d2_model_0829999_layout.pth": ModelProfile(
204
- name="layout/d2_model_0829999_layout.pth",
205
- description="Detectron2 layout detection model trained on Publaynet. Checkpoint for resuming training",
206
- config="dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
207
- size=[548377327],
208
- tp_model=False,
209
- hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet_inference_only",
210
- hf_model_name="d2_model_0829999_layout.pth",
211
- hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
212
- categories={
213
- 1: LayoutType.TEXT,
214
- 2: LayoutType.TITLE,
215
- 3: LayoutType.LIST,
216
- 4: LayoutType.TABLE,
217
- 5: LayoutType.FIGURE,
218
- },
219
- dl_library="PT",
220
- model_wrapper="D2FrcnnDetector",
221
- ),
222
158
  "layout/d2_model_0829999_layout_inf_only.ts": ModelProfile(
223
159
  name="layout/d2_model_0829999_layout_inf_only.ts",
224
160
  description="Detectron2 layout detection model trained on Publaynet. Torchscript export",
@@ -264,32 +200,6 @@ class ModelCatalog:
264
200
  dl_library="PT",
265
201
  model_wrapper="D2FrcnnTracingDetector",
266
202
  ),
267
- "cell/d2_model_1849999_cell.pth": ModelProfile(
268
- name="cell/d2_model_1849999_cell.pth",
269
- description="Detectron2 cell detection inference only model trained on Pubtabnet",
270
- config="dd/d2/cell/CASCADE_RCNN_R_50_FPN_GN.yaml",
271
- size=[548279023],
272
- tp_model=False,
273
- hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
274
- hf_model_name="cell/d2_model_1849999_cell.pth",
275
- hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
276
- categories={1: LayoutType.CELL},
277
- dl_library="PT",
278
- model_wrapper="D2FrcnnDetector",
279
- ),
280
- "item/d2_model_1639999_item.pth": ModelProfile(
281
- name="item/d2_model_1639999_item.pth",
282
- description="Detectron2 item detection model trained on Pubtabnet",
283
- config="dd/d2/item/CASCADE_RCNN_R_50_FPN_GN.yaml",
284
- size=[548303599],
285
- tp_model=False,
286
- hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
287
- hf_model_name="d2_model_1639999_item.pth",
288
- hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
289
- categories={1: LayoutType.ROW, 2: LayoutType.COLUMN},
290
- dl_library="PT",
291
- model_wrapper="D2FrcnnDetector",
292
- ),
293
203
  "item/d2_model_1639999_item_inf_only.pt": ModelProfile(
294
204
  name="item/d2_model_1639999_item_inf_only.pt",
295
205
  description="Detectron2 item detection model inference only trained on Pubtabnet",
@@ -1232,20 +1142,19 @@ class ModelDownloadManager:
1232
1142
  def _load_from_hf_hub(
1233
1143
  repo_id: str, file_name: str, cache_directory: PathLikeOrStr, force_download: bool = False
1234
1144
  ) -> int:
1235
- url = hf_hub_url(repo_id=repo_id, filename=file_name)
1236
1145
  token = os.environ.get("HF_CREDENTIALS", None)
1237
- f_path = cached_download(
1238
- url,
1239
- cache_dir=cache_directory,
1146
+ f_path = hf_hub_download(
1147
+ repo_id,
1148
+ file_name,
1149
+ local_dir=cache_directory, # type: ignore
1240
1150
  force_filename=file_name,
1241
1151
  force_download=force_download,
1242
1152
  token=token,
1243
- legacy_cache_layout=True,
1244
1153
  )
1245
1154
  if f_path:
1246
1155
  stat_info = os.stat(f_path)
1247
1156
  size = stat_info.st_size
1248
1157
 
1249
- assert size > 0, f"Downloaded an empty file from {url}!"
1158
+ assert size > 0, f"Downloaded an empty file from {f_path}!"
1250
1159
  return size
1251
1160
  raise TypeError("Returned value from cached_download cannot be Null")
@@ -73,18 +73,21 @@ def re_assign_cat_ids(
73
73
  Annotations that are not in the dictionary provided will be removed.
74
74
 
75
75
  :param dp: Image
76
- :param categories_dict_name_as_key: e.g. `{LayoutType.word: '1'}`
76
+ :param categories_dict_name_as_key: e.g. `{LayoutType.word: 1}`
77
77
  :param cat_to_sub_cat_mapping: e.g. `{<LayoutType.word>:
78
78
  {<WordType.token_class>:
79
- {<FundsFirstPage.report_date>: '1',
80
- <FundsFirstPage.report_type>: '2',
81
- <FundsFirstPage.umbrella>: '3',
82
- <FundsFirstPage.fund_name>: '4',
83
- <TokenClasses.other>: '5'},
84
- <WordType.tag>:
85
- {<BioTag.inside>: '1',
86
- <BioTag.outside>: '2',
87
- <BioTag.begin>: '3'}}}`
79
+ {<FundsFirstPage.REPORT_DATE>: 1,
80
+ <FundsFirstPage.REPORT_TYPE>: 2,
81
+ <FundsFirstPage.UMBRELLA>: 3,
82
+ <FundsFirstPage.FUND_NAME>: 4,
83
+ <TokenClasses.OTHER>: 5},
84
+ <WordType.TAG>:
85
+ {<BioTag.INSIDE>: 1,
86
+ <BioTag.OUTSIDE>: 2,
87
+ <BioTag.BEGIN>: 3}}}`
88
+ To re-assign the category ids of an image summary, use the key 'default_type' for the default category, e.g.
89
+ `{DefaultType.DEFAULT_TYPE: {<PageType.DOCUMENT_TYPE>: {<DocumentType.INVOICE>:1,
90
+ <DocumentType.BANK_STATEMENT>:2}}}`
88
91
  :return: Image
89
92
  """
90
93
 
@@ -104,6 +107,14 @@ def re_assign_cat_ids(
104
107
  sub_category = ann.get_sub_category(key)
105
108
  sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
106
109
 
110
+ if cat_to_sub_cat_mapping:
111
+ if "default_type" in cat_to_sub_cat_mapping:
112
+ sub_cat_keys_to_sub_cat_values = cat_to_sub_cat_mapping[get_type("default_type")]
113
+ for key in sub_cat_keys_to_sub_cat_values:
114
+ sub_cat_values_dict = sub_cat_keys_to_sub_cat_values[key]
115
+ sub_category = dp.summary.get_sub_category(key)
116
+ sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
117
+
107
118
  dp.remove(annotation_ids=ann_ids_to_remove)
108
119
 
109
120
  return dp
@@ -101,17 +101,6 @@ def match_anns_by_intersection(
101
101
  ]
102
102
  )
103
103
 
104
- # second try, if ann has empty image
105
- n_dim = child_ann_boxes.ndim
106
- if n_dim != 2:
107
- child_ann_boxes = np.array(
108
- [
109
- ann.bounding_box.transform(dp.width, dp.height, absolute_coords=True).to_list(mode="xyxy")
110
- for ann in child_anns
111
- if ann.bounding_box is not None
112
- ]
113
- )
114
-
115
104
  parent_anns = dp.get_annotation(annotation_ids=parent_ann_ids, category_names=parent_ann_category_names)
116
105
  parent_ann_boxes = np.array(
117
106
  [
@@ -120,17 +109,6 @@ def match_anns_by_intersection(
120
109
  ]
121
110
  )
122
111
 
123
- # same for parent
124
- n_dim = parent_ann_boxes.ndim
125
- if n_dim != 2:
126
- parent_ann_boxes = np.array(
127
- [
128
- ann.bounding_box.transform(dp.width, dp.height, absolute_coords=True).to_list(mode="xyxy")
129
- for ann in parent_anns
130
- if ann.bounding_box is not None
131
- ]
132
- )
133
-
134
112
  if matching_rule in ["iou"] and parent_anns and child_anns:
135
113
  iou_matrix = iou(child_ann_boxes, parent_ann_boxes)
136
114
  output = iou_matrix > threshold
@@ -38,12 +38,20 @@ with try_import() as import_guard:
38
38
  from lxml import etree # pylint: disable=W0611
39
39
 
40
40
 
41
- def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int] = None) -> Optional[Image]:
41
+ def to_image(
42
+ dp: Union[str, Mapping[str, Union[str, bytes]]],
43
+ dpi: Optional[int] = None,
44
+ width: Optional[int] = None,
45
+ height: Optional[int] = None,
46
+ ) -> Optional[Image]:
42
47
  """
43
48
  Mapping an input from `dataflow.SerializerFiles` or similar to an Image
44
49
 
45
50
  :param dp: Image
46
51
  :param dpi: dot per inch definition for pdf resolution when converting to numpy array
52
+ :param width: target width of the image. This option does only work when using Poppler as PDF renderer
53
+ :param height: target width of the image. This option does only work when using Poppler as PDF renderer
54
+ :param height: target height of the image
47
55
  :return: Image
48
56
  """
49
57
 
@@ -77,7 +85,9 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
77
85
  dp_image.pdf_bytes = dp.get("pdf_bytes")
78
86
  if dp_image.pdf_bytes is not None:
79
87
  if isinstance(dp_image.pdf_bytes, bytes):
80
- dp_image.image = convert_pdf_bytes_to_np_array_v2(dp_image.pdf_bytes, dpi=dpi)
88
+ dp_image.image = convert_pdf_bytes_to_np_array_v2(
89
+ dp_image.pdf_bytes, dpi=dpi, width=width, height=height
90
+ )
81
91
  elif image_bytes is not None:
82
92
  dp_image.image = convert_bytes_to_np_array(image_bytes)
83
93
  else:
@@ -393,7 +393,7 @@ def pub_to_image_uncur( # pylint: disable=R0914
393
393
  np_image = load_image_from_file(dp["filename"])
394
394
  if is_file_extension(dp["filename"], ".pdf"):
395
395
  pdf_bytes = load_bytes_from_pdf_file(dp["filename"])
396
- np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes)
396
+ np_image = convert_pdf_bytes_to_np_array_v2(pdf_bytes, dpi=200)
397
397
  dp = _convert_boxes(dp, np_image.shape[0])
398
398
 
399
399
  if load_image and np_image is not None: