deepdoctection 0.44.0__py3-none-any.whl → 0.45.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -20,16 +20,28 @@ Wrapper for the HF Language Model for sequence and token classification
 """
 from __future__ import annotations
 
+import os
 from abc import ABC
+from collections import defaultdict
 from pathlib import Path
-from typing import Literal, Mapping, Optional, Union
+from typing import TYPE_CHECKING, Any, Literal, Mapping, Optional, Sequence, Union
 
 from lazy_imports import try_import
+from typing_extensions import TypeAlias
 
 from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
 from ..utils.settings import TypeOrStr
 from ..utils.types import JsonDict, PathLikeOrStr, Requirement
-from .base import LMSequenceClassifier, ModelCategories, SequenceClassResult
+from .base import (
+    DetectionResult,
+    LanguageDetector,
+    LMSequenceClassifier,
+    LMTokenClassifier,
+    ModelCategories,
+    NerModelCategories,
+    SequenceClassResult,
+    TokenClassResult,
+)
 from .hflayoutlm import get_tokenizer_from_model_class
 from .pt.ptutils import get_torch_device
 
@@ -38,14 +50,63 @@ with try_import() as pt_import_guard:
     import torch.nn.functional as F
 
 with try_import() as tr_import_guard:
-    from transformers import PretrainedConfig, XLMRobertaForSequenceClassification
+    from transformers import (
+        PretrainedConfig,
+        XLMRobertaForSequenceClassification,
+        XLMRobertaForTokenClassification,
+        XLMRobertaTokenizerFast,
+    )
 
 
-def predict_sequence_classes(
+def predict_token_classes_from_lm(
+    uuids: list[list[str]],
     input_ids: torch.Tensor,
     attention_mask: torch.Tensor,
     token_type_ids: torch.Tensor,
-    model: Union[XLMRobertaForSequenceClassification],
+    tokens: list[list[str]],
+    model: XLMRobertaForTokenClassification,
+) -> list[TokenClassResult]:
+    """
+    Args:
+        uuids: A list of uuids that correspond to the words that induce the resulting tokens
+        input_ids: Tokens converted to ids, taken from the tokenizer (e.g. `XLMRobertaTokenizerFast`)
+        attention_mask: The associated attention masks of padded sequences, taken from the tokenizer
+        token_type_ids: Torch tensor of token type ids, taken from the tokenizer
+        tokens: List of original tokens, taken from the tokenizer
+        model: XLM-RoBERTa model for token classification
+
+    Returns:
+        A list of `TokenClassResult`s
+    """
+
+    outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
+
+    soft_max = F.softmax(outputs.logits, dim=2)
+    score = torch.max(soft_max, dim=2)[0].tolist()
+    token_class_predictions_ = outputs.logits.argmax(-1).tolist()
+    input_ids_list = input_ids.tolist()
+
+    all_results = defaultdict(list)
+    for idx, uuid_list in enumerate(uuids):
+        for pos, token in enumerate(uuid_list):
+            all_results[token].append(
+                (input_ids_list[idx][pos], token_class_predictions_[idx][pos], tokens[idx][pos], score[idx][pos])
+            )
+    all_token_classes = []
+    for uuid, res in all_results.items():
+        res.sort(key=lambda x: x[3], reverse=True)
+        output = res[0]
+        all_token_classes.append(
+            TokenClassResult(uuid=uuid, token_id=output[0], class_id=output[1], token=output[2], score=output[3])
+        )
+    return all_token_classes
+
+
+def predict_sequence_classes_from_lm(
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    token_type_ids: torch.Tensor,
+    model: XLMRobertaForSequenceClassification,
 ) -> SequenceClassResult:
     """
     Args:
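
The core of the new `predict_token_classes_from_lm` is the word-level reduction: every sub-token carries the uuid of the word it was split from, candidates are grouped per uuid, and the prediction with the highest softmax score wins. A minimal, self-contained sketch of that reduction with toy values in place of model outputs (all data below is illustrative, not from the package):

```python
from collections import defaultdict

# Toy stand-ins for the batch in predict_token_classes_from_lm:
# two words ("w1", "w2"); "w1" was split into two sub-tokens.
uuids = [["w1", "w1", "w2"]]
input_ids = [[101, 102, 103]]
class_ids = [[1, 2, 1]]
tokens = [["fo", "rm", "date"]]
scores = [[0.55, 0.91, 0.88]]

all_results = defaultdict(list)
for idx, uuid_list in enumerate(uuids):
    for pos, uuid in enumerate(uuid_list):
        all_results[uuid].append(
            (input_ids[idx][pos], class_ids[idx][pos], tokens[idx][pos], scores[idx][pos])
        )

# Per word, keep the sub-token prediction with the highest score.
for uuid, candidates in all_results.items():
    best = max(candidates, key=lambda c: c[3])
    print(uuid, best)  # w1 -> (102, 2, 'rm', 0.91), w2 -> (103, 1, 'date', 0.88)
```

Using `max` here plays the role of the sort-and-take-first in the diff; the result is the same best-scoring candidate per word.
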
@@ -66,6 +127,250 @@ def predict_sequence_classes(
     return SequenceClassResult(class_id=sequence_class_predictions, score=float(score))  # type: ignore
 
 
+class HFLmTokenClassifierBase(LMTokenClassifier, ABC):
+    """
+    Abstract base class for wrapping Bert-like models for token classification into the framework.
+    """
+
+    def __init__(
+        self,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories_semantics: Optional[Sequence[TypeOrStr]] = None,
+        categories_bio: Optional[Sequence[TypeOrStr]] = None,
+        categories: Optional[Mapping[int, TypeOrStr]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
+    ):
+        """
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+            categories_semantics: A sequence of category names for `NER` semantics, i.e. the entities themselves.
+                                  To be consistent with detectors use only values `>0`. Conversion will be done
+                                  internally.
+            categories_bio: A sequence of category names for `NER` tags (i.e. `BIO`). To be consistent with
+                            detectors use only values `>0`. Conversion will be done internally.
+            categories: If you have a pre-trained model you can pass a complete dict of NER categories
+            device: The device ("cpu" or "cuda") where to place the model.
+            use_xlm_tokenizer: `True` if one uses `LayoutXLM` or a LiLT model built with an XLM language model,
+                               e.g. `info-xlm` or `roberta-xlm`. (`LayoutXLM` cannot be distinguished from
+                               `LayoutLMv2`.)
+        """
+
+        if categories is None:
+            if categories_semantics is None:
+                raise ValueError("If categories is None then categories_semantics cannot be None")
+            if categories_bio is None:
+                raise ValueError("If categories is None then categories_bio cannot be None")
+
+        self.path_config = Path(path_config_json)
+        self.path_weights = Path(path_weights)
+        self.categories = NerModelCategories(
+            init_categories=categories, categories_semantics=categories_semantics, categories_bio=categories_bio
+        )
+        self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer
+
+    @classmethod
+    def get_requirements(cls) -> list[Requirement]:
+        return [get_pytorch_requirement(), get_transformers_requirement()]
+
+    def _map_category_names(self, token_results: list[TokenClassResult]) -> list[TokenClassResult]:
+        for result in token_results:
+            result.class_name = self.categories.categories[result.class_id + 1]
+            output = self.categories.disentangle_token_class_and_tag(result.class_name)
+            if output is not None:
+                token_class, tag = output
+                result.semantic_name = token_class
+                result.bio_tag = tag
+            else:
+                result.semantic_name = result.class_name
+            result.class_id += 1
+        return token_results
+
+    def _validate_encodings(
+        self, **encodings: Any
+    ) -> tuple[list[list[str]], list[str], torch.Tensor, torch.Tensor, torch.Tensor, list[list[str]]]:
+        image_ids = encodings.get("image_ids", [])
+        ann_ids = encodings.get("ann_ids")
+        input_ids = encodings.get("input_ids")
+        attention_mask = encodings.get("attention_mask")
+        token_type_ids = encodings.get("token_type_ids")
+        tokens = encodings.get("tokens")
+
+        assert isinstance(ann_ids, list), type(ann_ids)
+        if len(set(image_ids)) > 1:
+            raise ValueError("HFLmTokenClassifier accepts for inference only one image.")
+        if isinstance(input_ids, torch.Tensor):
+            input_ids = input_ids.to(self.device)
+        else:
+            raise ValueError(f"input_ids must be a torch.Tensor but is {type(input_ids)}")
+        if isinstance(attention_mask, torch.Tensor):
+            attention_mask = attention_mask.to(self.device)
+        else:
+            raise ValueError(f"attention_mask must be a torch.Tensor but is {type(attention_mask)}")
+        if isinstance(token_type_ids, torch.Tensor):
+            token_type_ids = token_type_ids.to(self.device)
+        else:
+            raise ValueError(f"token_type_ids must be a torch.Tensor but is {type(token_type_ids)}")
+        if not isinstance(tokens, list):
+            raise ValueError(f"tokens must be a list but is {type(tokens)}")
+
+        return ann_ids, image_ids, input_ids, attention_mask, token_type_ids, tokens
+
+    def clone(self) -> HFLmTokenClassifierBase:
+        return self.__class__(
+            self.path_config,
+            self.path_weights,
+            self.categories.categories_semantics,
+            self.categories.categories_bio,
+            self.categories.get_categories(),
+            self.device,
+            self.use_xlm_tokenizer,
+        )
+
+    @staticmethod
+    def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
+        """Returns the name of the model"""
+        return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+    @staticmethod
+    def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
+        """
+        A refinement for adding the tokenizer class name to the model configs.
+
+        Args:
+            model_class_name: The model name, e.g. `model.__class__.__name__`
+            use_xlm_tokenizer: Whether to use an `XLM` tokenizer.
+
+        Returns:
+            The name of the tokenizer class.
+        """
+        tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
+        return tokenizer.__class__.__name__
+
+    @staticmethod
+    def image_to_raw_features_mapping() -> str:
+        """Returns the mapping function to convert images into raw features."""
+        return "image_to_raw_lm_features"
+
+    @staticmethod
+    def image_to_features_mapping() -> str:
+        """Returns the mapping function to convert images into features."""
+        return "image_to_lm_features"
+
+
+class HFLmTokenClassifier(HFLmTokenClassifierBase):
+    """
+    A wrapper class for `transformers.XLMRobertaForTokenClassification` and similar models to use within a pipeline
+    component. Check <https://huggingface.co/docs/transformers/model_doc/xlm-roberta> for documentation of the
+    model itself.
+    Note that this model is equipped with a head that is only useful for classifying tokens. For sequence
+    classification and other tasks please use another model of the family.
+
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
+
+        # hf tokenizer and token classifier
+        tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
+        roberta = HFLmTokenClassifier("path/to/config.json", "path/to/model.bin",
+                                      categories={1: "first_name", 2: "surname", 3: "street"})
+
+        # token classification service
+        roberta_service = LMTokenClassifierService(tokenizer, roberta)
+
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service, roberta_service])
+
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
+
+        for dp in df:
+            ...
+        ```
+    """
+
+    def __init__(
+        self,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories_semantics: Optional[Sequence[TypeOrStr]] = None,
+        categories_bio: Optional[Sequence[TypeOrStr]] = None,
+        categories: Optional[Mapping[int, TypeOrStr]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = True,
+    ):
+        """
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+            categories_semantics: A sequence of category names for `NER` semantics, i.e. the entities themselves.
+                                  To be consistent with detectors use only values `>0`. Conversion will be done
+                                  internally.
+            categories_bio: A sequence of category names for `NER` tags (i.e. `BIO`). To be consistent with
+                            detectors use only values `>0`. Conversion will be done internally.
+            categories: If you have a pre-trained model you can pass a complete dict of NER categories
+            device: The device ("cpu" or "cuda") where to place the model.
+            use_xlm_tokenizer: Do not change this value unless you pre-trained a bert-like model with a different
+                               tokenizer.
+        """
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
+        self.name = self.get_name(path_weights, "bert-like-token-classification")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(
+            self.model.__class__.__name__, self.use_xlm_tokenizer
+        )
+
+    def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
+        """
+        Launch inference on bert-like models for token classification. Pass the following arguments
+
+        Args:
+            encodings: input_ids: Tokens converted to ids, taken from the tokenizer
+                       attention_mask: The associated attention masks of padded sequences, taken from the tokenizer
+                       token_type_ids: Torch tensor of token type ids, taken from the tokenizer
+                       tokens: List of original tokens, taken from the tokenizer
+
+        Returns:
+            A list of `TokenClassResult`s
+        """
+
+        ann_ids, _, input_ids, attention_mask, token_type_ids, tokens = self._validate_encodings(**encodings)
+        results = predict_token_classes_from_lm(ann_ids, input_ids, attention_mask, token_type_ids, tokens, self.model)
+        return self._map_category_names(results)
+
+    @staticmethod
+    def get_wrapped_model(
+        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
+    ) -> XLMRobertaForTokenClassification:
+        """
+        Get the inner (wrapped) model.
+
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+
+        Returns:
+            `XLMRobertaForTokenClassification`
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
+        return XLMRobertaForTokenClassification.from_pretrained(
+            pretrained_model_name_or_path=os.fspath(path_weights), config=config
+        )
+
+    def clear_model(self) -> None:
+        self.model = None
+
+
 class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
     """
     Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
@@ -208,10 +513,11 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
         use_xlm_tokenizer: bool = True,
     ):
         super().__init__(path_config_json, path_weights, categories, device)
-        self.name = self.get_name(path_weights, "bert-like")
+        self.name = self.get_name(path_weights, "bert-like-sequence-classification")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
             self.model.__class__.__name__, use_xlm_tokenizer
         )
@@ -219,7 +525,7 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
         input_ids, attention_mask, token_type_ids = self._validate_encodings(**encodings)
 
-        result = predict_sequence_classes(
+        result = predict_sequence_classes_from_lm(
            input_ids,
            attention_mask,
            token_type_ids,
@@ -262,3 +568,122 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
 
     def clear_model(self) -> None:
         self.model = None
+
+
+class HFLmLanguageDetector(LanguageDetector):
+    """
+    Language detector using HuggingFace's `XLMRobertaForSequenceClassification`.
+
+    This class wraps a multilingual sequence classification model (`XLMRobertaForSequenceClassification`)
+    for language detection tasks. Input text is tokenized and truncated/padded to a maximum length of 512 tokens.
+    The prediction returns a `DetectionResult` containing the detected language code and its confidence score.
+    """
+
+    def __init__(
+        self,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories: Mapping[int, TypeOrStr],
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = True,
+    ):
+        super().__init__()
+        self.path_config = Path(path_config_json)
+        self.path_weights = Path(path_weights)
+        self.categories = ModelCategories(init_categories=categories)
+        self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        self.model.to(self.device)
+        self.tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
+        self.name = self.get_name(path_weights, "bert-like-language-detection")
+        self.model_id = self.get_model_id()
+
+    def predict(self, text_string: str) -> DetectionResult:
+        """
+        Predict the language of the input sequence.
+
+        Args:
+            text_string: The input text sequence to classify.
+
+        Returns:
+            DetectionResult: The detected language and its confidence score.
+        """
+        encoding = self.tokenizer(
+            text_string,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=512,
+        )
+        input_ids = encoding["input_ids"].to(self.device)
+        attention_mask = encoding["attention_mask"].to(self.device)
+        token_type_ids = encoding.get("token_type_ids")
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.to(self.device)
+        else:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        self.model.eval()
+        with torch.no_grad():
+            outputs = self.model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+            )
+        probs = torch.softmax(outputs.logits, dim=-1)
+        score, class_id_tensor = torch.max(probs, dim=-1)
+        class_id = int(class_id_tensor.item() + 1)
+        lang = self.categories.categories[class_id]
+
+        return DetectionResult(class_name=lang, score=float(score.item()))
+
+    def clear_model(self) -> None:
+        self.model = None
+
+    @classmethod
+    def get_requirements(cls) -> list[Requirement]:
+        return [get_pytorch_requirement(), get_transformers_requirement()]
+
+    @staticmethod
+    def get_wrapped_model(
+        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
+    ) -> XLMRobertaForSequenceClassification:
+        """
+        Get the inner (wrapped) model.
+
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+
+        Returns:
+            `XLMRobertaForSequenceClassification`
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return XLMRobertaForSequenceClassification.from_pretrained(
+            pretrained_model_name_or_path=path_weights, config=config
+        )
+
+    def clone(self) -> HFLmLanguageDetector:
+        return self.__class__(
+            self.path_config, self.path_weights, self.categories.get_categories(), self.device, self.use_xlm_tokenizer
+        )
+
+    @staticmethod
+    def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
+        """
+        Returns the name of the model
+
+        Args:
+            path_weights: Path to model weights
+            architecture: Architecture name
+
+        Returns:
+            str: Model name
+        """
+        return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+
+if TYPE_CHECKING:
+    LmTokenModels: TypeAlias = Union[HFLmTokenClassifier,]
+    LmSequenceModels: TypeAlias = Union[HFLmSequenceClassifier,]
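
To see how the new detector is meant to be called, here is a hypothetical setup; the paths, the category mapping and the sample sentence are placeholders, not artifacts shipped with the release:

```python
from deepdoctection.extern.hflm import HFLmLanguageDetector  # module path assumed from this diff

# Placeholder paths and categories; a real setup would take these from the ModelCatalog.
detector = HFLmLanguageDetector(
    path_config_json="path/to/config.json",
    path_weights="path/to/model.safetensors",
    categories={1: "eng", 2: "deu", 3: "fra"},
    device="cpu",
)

result = detector.predict("Dies ist ein deutscher Satz.")
print(result.class_name, result.score)  # a DetectionResult, as consumed by the service below
```
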
@@ -806,17 +806,17 @@ def image_to_raw_lm_features(
     raw_features["image_id"] = page.image_id
     raw_features["width"] = page.width
     raw_features["height"] = page.height
-    raw_features["ann_ids"] = text_["ann_ids"]
-    raw_features["words"] = text_["words"]
+    raw_features["ann_ids"] = text_.ann_ids
+    raw_features["words"] = text_.words
     # We use a dummy bounding box for all bounding boxes so that we can pass the raw features to
     # raw_features_to_layoutlm_features
-    raw_features["bbox"] = [_CLS_BOX] * len(text_["words"])
+    raw_features["bbox"] = [_CLS_BOX] * len(text_.words)
     raw_features["dataset_type"] = dataset_type
 
-    if use_token_tag and text_["token_tags"]:
-        raw_features["labels"] = text_["token_tags"]
-    elif text_["token_classes"]:
-        raw_features["labels"] = text_["token_classes"]
+    if use_token_tag and text_.token_tags:
+        raw_features["labels"] = text_.token_tags
+    elif text_.token_classes:
+        raw_features["labels"] = text_.token_classes
     elif page.document_type is not None:
         document_type_id = page.image_orig.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id - 1
         raw_features["labels"] = [document_type_id]
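
Note that `text_` is now read via attributes (`text_.words`) instead of keys (`text_["words"]`); together with the removal of the `Text_` dict alias at the end of this diff, this suggests the text summary became a small attribute container. A sketch of the assumed shape — the class name and defaults below are inferred for illustration, not taken from the package:

```python
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class TextSummary:  # hypothetical name for the container behind `text_`
    ann_ids: list[str] = field(default_factory=list)
    words: list[str] = field(default_factory=list)
    token_tags: Optional[list[str]] = None
    token_classes: Optional[list[str]] = None

text_ = TextSummary(ann_ids=["a1", "a2"], words=["hello", "world"])
print(len(text_.words), text_.token_tags)  # attribute access, as in the updated mapper
```
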
@@ -21,7 +21,7 @@ Module for language detection pipeline component
 from typing import Optional, Sequence
 
 from ..datapoint.image import Image, MetaAnnotation
-from ..datapoint.view import ImageDefaults, Page
+from ..datapoint.view import IMAGE_DEFAULTS, Page
 from ..extern.base import LanguageDetector, ObjectDetector
 from ..utils.error import ImageError
 from ..utils.settings import PageType, TypeOrStr, get_type
@@ -75,11 +75,11 @@ class LanguageDetectionService(PipelineComponent):
 
         self.predictor = language_detector
         self.text_detector = text_detector
-        self.text_container = get_type(text_container) if text_container is not None else ImageDefaults.TEXT_CONTAINER
+        self.text_container = get_type(text_container) if text_container is not None else IMAGE_DEFAULTS.TEXT_CONTAINER
         self.floating_text_block_categories = (
            tuple(get_type(text_block) for text_block in floating_text_block_categories)
            if (floating_text_block_categories is not None)
-            else ()
+            else IMAGE_DEFAULTS.FLOATING_TEXT_BLOCK_CATEGORIES
        )
 
        super().__init__(self._get_name(self.predictor.name))
@@ -109,7 +109,7 @@ class LanguageDetectionService(PipelineComponent):
         text = " ".join((result.text for result in detect_result_list if result.text is not None))
         predict_result = self.predictor.predict(text)
         self.dp_manager.set_summary_annotation(
-            PageType.LANGUAGE, PageType.LANGUAGE, 1, predict_result.text, predict_result.score
+            PageType.LANGUAGE, PageType.LANGUAGE, 1, predict_result.class_name, predict_result.score
         )
 
     def clone(self) -> PipelineComponent:
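
The one-line change above tracks the detector API: a language predictor now returns a `DetectionResult` whose language code is stored in `class_name` (the old code read `predict_result.text`). A toy stand-in showing the contract the service relies on (`DummyDetector` is illustrative only, not part of deepdoctection):

```python
from dataclasses import dataclass

@dataclass
class DetectionResult:  # simplified stand-in for deepdoctection's DetectionResult
    class_name: str
    score: float

class DummyDetector:
    def predict(self, text: str) -> DetectionResult:
        return DetectionResult(class_name="eng", score=0.99)

predict_result = DummyDetector().predict("some page text")
# The service forwards predict_result.class_name (formerly predict_result.text):
print(predict_result.class_name, predict_result.score)
```
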
deepdoctection/pipe/lm.py CHANGED
@@ -20,6 +20,7 @@ Module for token classification pipeline
 """
 from __future__ import annotations
 
+import inspect
 from copy import copy
 from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union
 
@@ -32,6 +33,7 @@ from .registry import pipeline_component_registry
 
 if TYPE_CHECKING:
     from ..extern.hflayoutlm import LayoutSequenceModels, LayoutTokenModels
+    from ..extern.hflm import LmSequenceModels, LmTokenModels
 
 
 @pipeline_component_registry.register("LMTokenClassifierService")
@@ -70,7 +72,7 @@ class LMTokenClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model: LayoutTokenModels,
+        language_model: Union[LayoutTokenModels, LmTokenModels],
        padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
        truncation: bool = True,
        return_overflowing_tokens: bool = False,
@@ -124,7 +126,7 @@ class LMTokenClassifierService(PipelineComponent):
                                might not get sent to the model because they are categorized as not
                                eligible token (e.g. empty string). If set to `True` it will assign all
                                words without token the `BioTag.outside` token.
-            segment_positions: Using bounding boxes of segment instead of words improves model accuracy
+            segment_positions: Using bounding boxes of segments instead of words improves model accuracy
                                significantly for models that have been trained on segments rather than words.
                                Choose a single or a sequence of layout segments to use their bounding boxes. Note,
                                that the layout segments need to have a child-relationship with words. If a word
@@ -271,6 +273,8 @@ class LMTokenClassifierService(PipelineComponent):
                 f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
+        func_params = inspect.signature(self.mapping_to_lm_input_func).parameters
+        self.required_kwargs = {k: v for k, v in self.required_kwargs.items() if k in func_params}
 
     @staticmethod
     def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
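
The two added lines filter `required_kwargs` down to the parameters the mapping function actually accepts, so plain language models are not handed layout-specific arguments such as bounding boxes. The trick in isolation, with a toy mapping function (all names below are illustrative):

```python
import inspect

def mapping_to_lm_input_func(dataset_type, tokenizer, padding):
    # Toy mapping function that accepts only a subset of the prepared kwargs.
    return {"dataset_type": dataset_type, "padding": padding}

required_kwargs = {
    "dataset_type": "token_classification",
    "tokenizer": None,
    "padding": "max_length",
    "segment_positions": [],  # layout-only argument the function cannot take
}

func_params = inspect.signature(mapping_to_lm_input_func).parameters
required_kwargs = {k: v for k, v in required_kwargs.items() if k in func_params}
print(sorted(required_kwargs))  # ['dataset_type', 'padding', 'tokenizer']
```
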
@@ -318,7 +322,7 @@ class LMSequenceClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model: LayoutSequenceModels,
+        language_model: Union[LayoutSequenceModels, LmSequenceModels],
        padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
        truncation: bool = True,
        return_overflowing_tokens: bool = False,
@@ -18,6 +18,7 @@ from types import ModuleType
 from typing import Any, Union, no_type_check
 
 import importlib_metadata
+import numpy as np
 from packaging import version
 
 from .error import DependencyError
@@ -249,6 +250,39 @@ def get_distance_requirement() -> Requirement:
     return "distance", distance_available(), _DISTANCE_ERR_MSG
 
 
+_NUMPY_V1_ERR_MSG = "numpy v1 must be installed."
+
+
+def numpy_v1_available() -> bool:
+    """
+    Check if the installed NumPy version is version 1.
+
+    This helper function determines whether the currently installed version
+    of NumPy is version 1 by inspecting its major version number.
+
+    Returns:
+        True if the installed NumPy version is 1, otherwise False
+    """
+    major_version = np.__version__.split(".", maxsplit=1)[0]
+    print(f"major version: {major_version}")
+    if major_version in (1, "1"):
+        return True
+    return False
+
+
+def get_numpy_v1_requirement() -> Requirement:
+    """
+    Retrieves the requirement details for numpy version 1.
+
+    Returns:
+        A tuple containing three elements:
+        - The requirement name for numpy version 1.
+        - A Boolean value indicating whether numpy version 1 is available.
+        - An error message in case numpy version 1 is not available.
+    """
+    return "numpy v1", numpy_v1_available(), _NUMPY_V1_ERR_MSG
+
+
 # Transformers
 _TRANSFORMERS_AVAILABLE = importlib.util.find_spec("transformers") is not None
 _TRANSFORMERS_ERR_MSG = f"transformers must be installed. {_GENERIC_ERR_MSG}"
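
Like the other helpers in this module, `get_numpy_v1_requirement` returns a `(name, available, error_message)` tuple. A standalone re-implementation of the check, to show how such a tuple is consumed (written locally for illustration, not imported from the package):

```python
import numpy as np

def numpy_v1_available() -> bool:
    # np.__version__ is a string such as "1.26.4"; compare the major component.
    return np.__version__.split(".", maxsplit=1)[0] == "1"

requirement = ("numpy v1", numpy_v1_available(), "numpy v1 must be installed.")

name, available, err_msg = requirement
if not available:
    print(f"{name}: {err_msg}")
else:
    print(f"{name} satisfied (numpy {np.__version__})")
```
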
@@ -108,6 +108,7 @@ class DocumentType(ObjectTypes):
     GOVERNMENT_TENDERS = "government_tenders"
     MANUALS = "manuals"
     PATENTS = "patents"
+    BANK_STATEMENT = "bank_statement"
 
 
 @object_types_registry.register("LayoutType")
@@ -296,6 +297,7 @@ class Languages(ObjectTypes):
     BOSNIAN = "bos"
     NORWEGIAN_NOVOSIBIRSK = "nno"
     URDU = "urd"
+    SWAHILI = "swa"
     NOT_DEFINED = "nn"
 
 
@@ -70,7 +70,6 @@ AnnotationDict: TypeAlias = dict[str, Any]
 ImageDict: TypeAlias = dict[str, Any]
 
 # We use these types for output types of the Page object
-Text_: TypeAlias = dict[str, Any]
 HTML: TypeAlias = str
 csv: TypeAlias = list[list[str]]
 Chunks: TypeAlias = list[tuple[str, str, int, str, str, str, str]]