deepdoctection 0.37.2__py3-none-any.whl → 0.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -41,6 +41,7 @@ with try_import() as tr_import_guard:
     from transformers import (  # pylint: disable=W0611
         AutoFeatureExtractor,
         DetrFeatureExtractor,
+        DetrImageProcessor,
         PretrainedConfig,
         TableTransformerForObjectDetection,
     )
@@ -55,7 +56,7 @@ def _detr_post_processing(
 def detr_predict_image(
     np_img: PixelValues,
     predictor: TableTransformerForObjectDetection,
-    feature_extractor: DetrFeatureExtractor,
+    feature_extractor: DetrImageProcessor,
     device: torch.device,
     threshold: float,
     nms_threshold: float,
@@ -224,13 +225,13 @@ class HFDetrDerivedDetector(HFDetrDerivedDetectorMixin):
         )

     @staticmethod
-    def get_pre_processor(path_feature_extractor_config: PathLikeOrStr) -> DetrFeatureExtractor:
+    def get_pre_processor(path_feature_extractor_config: PathLikeOrStr) -> DetrImageProcessor:
         """
         Builds the feature extractor

         :return: DetrFeatureExtractor
         """
-        return AutoFeatureExtractor.from_pretrained(
+        return DetrImageProcessor.from_pretrained(
             pretrained_model_name_or_path=os.fspath(path_feature_extractor_config)
         )

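The three hunks above track a transformers deprecation: the vision `FeatureExtractor` classes gave way to `ImageProcessor` classes, and `DetrImageProcessor` is the drop-in replacement for `DetrFeatureExtractor`. A minimal sketch of the new preprocessor construction, with a placeholder config path:

    import os

    from transformers import DetrImageProcessor

    # Placeholder path; deepdoctection resolves it from the profile of the
    # downloaded Table Transformer weights.
    path_feature_extractor_config = "/path/to/preprocessor_config.json"

    # from_pretrained accepts a hub id, a local directory or a JSON config file,
    # just like the AutoFeatureExtractor call it replaces.
    preprocessor = DetrImageProcessor.from_pretrained(
        pretrained_model_name_or_path=os.fspath(path_feature_extractor_config)
    )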
@@ -48,7 +48,7 @@ with try_import() as pt_import_guard:
     import torch.nn.functional as F

 with try_import() as tr_import_guard:
-    from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD  # type:ignore
+    from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
     from transformers import (
         LayoutLMForSequenceClassification,
         LayoutLMForTokenClassification,
@@ -67,20 +67,6 @@ with try_import() as tr_import_guard:
     )

 if TYPE_CHECKING:
-    LayoutTokenModels: TypeAlias = Union[
-        LayoutLMForTokenClassification,
-        LayoutLMv2ForTokenClassification,
-        LayoutLMv3ForTokenClassification,
-        LiltForTokenClassification,
-    ]
-
-    LayoutSequenceModels: TypeAlias = Union[
-        LayoutLMForSequenceClassification,
-        LayoutLMv2ForSequenceClassification,
-        LayoutLMv3ForSequenceClassification,
-        LiltForSequenceClassification,
-    ]
-
     HfLayoutTokenModels: TypeAlias = Union[
         LayoutLMForTokenClassification,
         LayoutLMv2ForTokenClassification,
@@ -147,7 +133,7 @@ def predict_token_classes(
     token_type_ids: torch.Tensor,
     boxes: torch.Tensor,
     tokens: list[list[str]],
-    model: LayoutTokenModels,
+    model: HfLayoutTokenModels,
     images: Optional[torch.Tensor] = None,
 ) -> list[TokenClassResult]:
     """
@@ -205,7 +191,7 @@ def predict_sequence_classes(
     attention_mask: torch.Tensor,
     token_type_ids: torch.Tensor,
     boxes: torch.Tensor,
-    model: LayoutSequenceModels,
+    model: HfLayoutSequenceModels,
     images: Optional[torch.Tensor] = None,
 ) -> SequenceClassResult:
     """
@@ -254,6 +240,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
         categories: Optional[Mapping[int, TypeOrStr]] = None,
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -281,6 +268,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
             init_categories=categories, categories_semantics=categories_semantics, categories_bio=categories_bio
         )
         self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer

     @classmethod
     def get_requirements(cls) -> list[Requirement]:
@@ -342,6 +330,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
             self.categories.categories_bio,
             self.categories.get_categories(),
             self.device,
+            self.use_xlm_tokenizer,
         )

     @staticmethod
@@ -427,13 +416,15 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
         :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
                                   Tokenizer.
         """
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LayoutLM")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -540,13 +531,15 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
         :param use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a LayoutLMv2 model keep the
                                   default value.
         """
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LayoutLMv2")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
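Because `use_xlm_tokenizer` is now stored on the base class and forwarded in the `@@ -342` hunk above, the flag survives cloning instead of silently falling back to the default. A hedged usage sketch, with placeholder paths and category labels:

    # LayoutXLM reuses the LayoutLMv2 architecture but requires the
    # XLM-RoBERTa tokenizer, which is what the flag selects.
    model = HFLayoutLmv2TokenClassifier(
        path_config_json="/path/to/config.json",
        path_weights="/path/to/pytorch_model.bin",
        categories={1: "B-header", 2: "I-header", 3: "O"},  # placeholder labels
        use_xlm_tokenizer=True,
    )
    # As of 0.38 the clone keeps the tokenizer choice.
    model_clone = model.clone()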
@@ -666,13 +659,15 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
         :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLMv3 model with a different
                                   tokenizer.
         """
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LayoutLMv3")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
        )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -746,19 +741,23 @@ class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
         path_weights: PathLikeOrStr,
         categories: Mapping[int, TypeOrStr],
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         self.path_config = Path(path_config_json)
         self.path_weights = Path(path_weights)
         self.categories = ModelCategories(init_categories=categories)

         self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer

     @classmethod
     def get_requirements(cls) -> list[Requirement]:
         return [get_pytorch_requirement(), get_transformers_requirement()]

     def clone(self) -> HFLayoutLmSequenceClassifierBase:
-        return self.__class__(self.path_config, self.path_weights, self.categories.get_categories(), self.device)
+        return self.__class__(
+            self.path_config, self.path_weights, self.categories.get_categories(), self.device, self.use_xlm_tokenizer
+        )

     def _validate_encodings(
         self, **encodings: Union[list[list[str]], torch.Tensor]
@@ -856,13 +855,13 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLM")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -939,13 +938,13 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLMv2")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -1029,13 +1028,13 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLMv3")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -1142,13 +1141,15 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
         :param device: The device (cpu,"cuda"), where to place the model.
         """

-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LiLT")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -1232,13 +1233,13 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LiLT")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -1270,3 +1271,19 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):

     def clear_model(self) -> None:
         self.model = None
+
+
+if TYPE_CHECKING:
+    LayoutTokenModels: TypeAlias = Union[
+        HFLayoutLmTokenClassifier,
+        HFLayoutLmv2TokenClassifier,
+        HFLayoutLmv3TokenClassifier,
+        HFLiltTokenClassifier,
+    ]
+
+    LayoutSequenceModels: TypeAlias = Union[
+        HFLayoutLmSequenceClassifier,
+        HFLayoutLmv2SequenceClassifier,
+        HFLayoutLmv3SequenceClassifier,
+        HFLiltSequenceClassifier,
+    ]
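With this hunk the `LayoutTokenModels` and `LayoutSequenceModels` aliases, removed from the import section in the `@@ -67,20 +67,6` hunk, reappear as unions over deepdoctection's own wrapper classes rather than the raw transformers models, which the `Hf...` aliases continue to cover. Since they live behind `TYPE_CHECKING`, the aliases are resolved only by the type checker and cost nothing at runtime, roughly as in this sketch:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Only evaluated by the type checker; no transformers import at runtime.
        from deepdoctection.extern.hflayoutlm import LayoutTokenModels

    def run_token_classifier(model: "LayoutTokenModels") -> None:
        # The string annotation defers evaluation, so this is valid even though
        # LayoutTokenModels does not exist when the module actually runs.
        ...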
@@ -188,7 +188,7 @@ class DoctectionPipe(Pipeline):

         df = MapData(df, _proto_process(path, doc_path))
         if dataset_dataflow is None:
-            df = MapData(df, _to_image(dpi=os.environ.get("DPI", 300)))  # pylint: disable=E1120
+            df = MapData(df, _to_image(dpi=int(os.environ.get("DPI", 300))))  # pylint: disable=E1120
         return df

     @staticmethod
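The added `int(...)` cast fixes a genuine bug: `os.environ` values are always strings, so the old code passed `dpi` as a `str` whenever the `DPI` variable was set, and as an `int` only via the fallback. A quick illustration:

    import os

    os.environ["DPI"] = "200"
    os.environ.get("DPI", 300)        # -> "200" (str): environment values are strings
    int(os.environ.get("DPI", 300))   # -> 200 (int), and -> 300 when DPI is unset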
deepdoctection/pipe/lm.py CHANGED
@@ -30,7 +30,7 @@ from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry

 if TYPE_CHECKING:
-    from ..extern.hflayoutlm import HfLayoutSequenceModels, HfLayoutTokenModels
+    from ..extern.hflayoutlm import LayoutSequenceModels, LayoutTokenModels


 @pipeline_component_registry.register("LMTokenClassifierService")
@@ -66,7 +66,7 @@ class LMTokenClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model: HfLayoutTokenModels,
+        language_model: LayoutTokenModels,
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -155,11 +155,11 @@ class LMTokenClassifierService(PipelineComponent):
             else:
                 token_class_name_id = None
             self.dp_manager.set_category_annotation(
-                token.semantic_name, token_class_name_id, WordType.TOKEN_CLASS, token.uuid
+                token.semantic_name, token_class_name_id, WordType.TOKEN_CLASS, token.uuid, token.score
             )
             self.dp_manager.set_category_annotation(token.bio_tag, None, WordType.TAG, token.uuid)
             self.dp_manager.set_category_annotation(
-                token.class_name, token.class_id, WordType.TOKEN_TAG, token.uuid
+                token.class_name, token.class_id, WordType.TOKEN_TAG, token.uuid, token.score
             )
             words_populated.append(token.uuid)

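Passing `token.score` through means the `TOKEN_CLASS` and `TOKEN_TAG` sub-categories now record the model confidence instead of dropping it. A hedged readback sketch, assuming deepdoctection's usual annotation accessors (attribute names not verified against 0.38):

    # Iterate the word annotations of a processed datapoint `dp`:
    for word in dp.get_annotation(category_names=LayoutType.WORD):
        token_class = word.get_sub_category(WordType.TOKEN_CLASS)
        print(token_class.category_name, token_class.score)  # score no longer None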
@@ -188,7 +188,7 @@ class LMTokenClassifierService(PipelineComponent):
         # multiple threads
         return self.__class__(
             copy(self.tokenizer),
-            self.language_model.clone(),
+            self.language_model.clone(),  # type: ignore
             self.padding,
             self.truncation,
             self.return_overflowing_tokens,
@@ -260,7 +260,7 @@ class LMSequenceClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model: HfLayoutSequenceModels,
+        language_model: LayoutSequenceModels,
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -309,7 +309,7 @@ class LMSequenceClassifierService(PipelineComponent):
     def clone(self) -> LMSequenceClassifierService:
         return self.__class__(
             copy(self.tokenizer),
-            self.language_model.clone(),
+            self.language_model.clone(),  # type: ignore
             self.padding,
             self.truncation,
             self.return_overflowing_tokens,
@@ -295,28 +295,21 @@ def _html_table(
     return html


-def generate_html_string(table: ImageAnnotation) -> list[str]:
+def generate_html_string(table: ImageAnnotation, cell_names: Sequence[ObjectTypes]) -> list[str]:
     """
     Takes the table segmentation by using table cells row number, column numbers etc. and generates a html
     representation.

     :param table: An annotation that has a not None image and fully segmented cell annotation.
+    :param cell_names: List of cell names that are used for the table segmentation. Note: It must be ensured
+        that all cells have a row number, column number, row span and column span and that the dissection
+        by rows and columns is completely covered by cells.
     :return: HTML representation of the table
     """
     if table.image is None:
         raise ImageError("table.image cannot be None")
     table_image = table.image
-    cells = table_image.get_annotation(
-        category_names=[
-            LayoutType.CELL,
-            CellType.HEADER,
-            CellType.BODY,
-            CellType.SPANNING,
-            CellType.ROW_HEADER,
-            CellType.COLUMN_HEADER,
-            CellType.PROJECTED_ROW_HEADER,
-        ]
-    )
+    cells = table_image.get_annotation(category_names=cell_names)
     number_of_rows = table_image.summary.get_sub_category(TableType.NUMBER_OF_ROWS).category_id
     number_of_cols = table_image.summary.get_sub_category(TableType.NUMBER_OF_COLUMNS).category_id
     table_list = []
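The caller now decides which cell categories enter the HTML generation instead of relying on the hard-coded list. A hedged call sketch reusing some of the category names removed above; in the `TableSegmentationRefinementService` hunk below, the list comes from the service's own `self.cell_names`, so it is driven by pipeline configuration:

    html_rows = generate_html_string(
        table,
        cell_names=[
            LayoutType.CELL,
            CellType.SPANNING,
            CellType.ROW_HEADER,
            CellType.COLUMN_HEADER,
        ],
    )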
@@ -485,7 +478,7 @@ class TableSegmentationRefinementService(PipelineComponent):
         self.dp_manager.set_summary_annotation(
             TableType.MAX_COL_SPAN, TableType.MAX_COL_SPAN, max_col_span, annotation_id=table.annotation_id
         )
-        html = generate_html_string(table)
+        html = generate_html_string(table, self.cell_names)
         self.dp_manager.set_container_annotation(TableType.HTML, -1, TableType.HTML, table.annotation_id, html)

     def clone(self) -> TableSegmentationRefinementService: