deepdoctection 0.37.2__py3-none-any.whl → 0.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoctection/__init__.py +1 -1
- deepdoctection/analyzer/_config.py +2 -1
- deepdoctection/analyzer/factory.py +9 -4
- deepdoctection/configs/conf_dd_one.yaml +126 -85
- deepdoctection/datapoint/box.py +2 -4
- deepdoctection/datapoint/image.py +11 -4
- deepdoctection/datapoint/view.py +124 -36
- deepdoctection/extern/hfdetr.py +4 -3
- deepdoctection/extern/hflayoutlm.py +51 -34
- deepdoctection/pipe/doctectionpipe.py +1 -1
- deepdoctection/pipe/lm.py +7 -7
- deepdoctection/pipe/refine.py +6 -13
- deepdoctection/pipe/segment.py +229 -46
- deepdoctection/pipe/sub_layout.py +40 -22
- {deepdoctection-0.37.2.dist-info → deepdoctection-0.38.dist-info}/METADATA +12 -2
- {deepdoctection-0.37.2.dist-info → deepdoctection-0.38.dist-info}/RECORD +19 -19
- {deepdoctection-0.37.2.dist-info → deepdoctection-0.38.dist-info}/WHEEL +1 -1
- {deepdoctection-0.37.2.dist-info → deepdoctection-0.38.dist-info}/LICENSE +0 -0
- {deepdoctection-0.37.2.dist-info → deepdoctection-0.38.dist-info}/top_level.txt +0 -0
deepdoctection/extern/hfdetr.py
CHANGED
@@ -41,6 +41,7 @@ with try_import() as tr_import_guard:
     from transformers import (  # pylint: disable=W0611
         AutoFeatureExtractor,
         DetrFeatureExtractor,
+        DetrImageProcessor,
         PretrainedConfig,
         TableTransformerForObjectDetection,
     )
@@ -55,7 +56,7 @@ def _detr_post_processing(
 def detr_predict_image(
     np_img: PixelValues,
     predictor: TableTransformerForObjectDetection,
-    feature_extractor: DetrFeatureExtractor,
+    feature_extractor: DetrImageProcessor,
     device: torch.device,
     threshold: float,
     nms_threshold: float,
@@ -224,13 +225,13 @@ class HFDetrDerivedDetector(HFDetrDerivedDetectorMixin):
         )

     @staticmethod
-    def get_pre_processor(path_feature_extractor_config: PathLikeOrStr) -> DetrFeatureExtractor:
+    def get_pre_processor(path_feature_extractor_config: PathLikeOrStr) -> DetrImageProcessor:
         """
         Builds the feature extractor

         :return: DetrFeatureExtractor
         """
-        return DetrFeatureExtractor.from_pretrained(
+        return DetrImageProcessor.from_pretrained(
             pretrained_model_name_or_path=os.fspath(path_feature_extractor_config)
         )

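Note: the hunks above swap the deprecated transformers class DetrFeatureExtractor for its replacement, DetrImageProcessor. A minimal sketch of the replacement API, assuming a standard transformers installation (the checkpoint and file names are illustrative, not taken from this package):

    from PIL import Image
    from transformers import DetrImageProcessor

    # DetrImageProcessor exposes the same from_pretrained/__call__ contract as the
    # deprecated DetrFeatureExtractor, which is why the swap above is a drop-in change.
    processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
    image = Image.open("page.png").convert("RGB")
    encoding = processor(images=image, return_tensors="pt")  # contains pixel_values (and pixel_mask)

The docstring of get_pre_processor still advertises DetrFeatureExtractor as the return type; only the annotation and the constructed object were updated.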
deepdoctection/extern/hflayoutlm.py
CHANGED
@@ -48,7 +48,7 @@ with try_import() as pt_import_guard:
     import torch.nn.functional as F

 with try_import() as tr_import_guard:
-    from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+    from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
     from transformers import (
         LayoutLMForSequenceClassification,
         LayoutLMForTokenClassification,
@@ -67,20 +67,6 @@ with try_import() as tr_import_guard:
     )

 if TYPE_CHECKING:
-    LayoutTokenModels: TypeAlias = Union[
-        LayoutLMForTokenClassification,
-        LayoutLMv2ForTokenClassification,
-        LayoutLMv3ForTokenClassification,
-        LiltForTokenClassification,
-    ]
-
-    LayoutSequenceModels: TypeAlias = Union[
-        LayoutLMForSequenceClassification,
-        LayoutLMv2ForSequenceClassification,
-        LayoutLMv3ForSequenceClassification,
-        LiltForSequenceClassification,
-    ]
-
     HfLayoutTokenModels: TypeAlias = Union[
         LayoutLMForTokenClassification,
         LayoutLMv2ForTokenClassification,
@@ -147,7 +133,7 @@ def predict_token_classes(
     token_type_ids: torch.Tensor,
     boxes: torch.Tensor,
     tokens: list[list[str]],
-    model: LayoutTokenModels,
+    model: HfLayoutTokenModels,
     images: Optional[torch.Tensor] = None,
 ) -> list[TokenClassResult]:
     """
@@ -205,7 +191,7 @@ def predict_sequence_classes(
     attention_mask: torch.Tensor,
     token_type_ids: torch.Tensor,
     boxes: torch.Tensor,
-    model: LayoutSequenceModels,
+    model: HfLayoutSequenceModels,
     images: Optional[torch.Tensor] = None,
 ) -> SequenceClassResult:
     """
@@ -254,6 +240,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
         categories: Optional[Mapping[int, TypeOrStr]] = None,
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -281,6 +268,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
             init_categories=categories, categories_semantics=categories_semantics, categories_bio=categories_bio
         )
         self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer

     @classmethod
     def get_requirements(cls) -> list[Requirement]:
@@ -342,6 +330,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
             self.categories.categories_bio,
             self.categories.get_categories(),
             self.device,
+            self.use_xlm_tokenizer,
         )

     @staticmethod
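The three hunks above thread a use_xlm_tokenizer flag through the token-classifier base class and persist it on the instance. A minimal sketch of the pattern with simplified names (this is not the package's actual class):

    class ClassifierBase:
        def __init__(self, use_xlm_tokenizer: bool = False) -> None:
            # stored on the instance so clone() and subclasses can read it later
            self.use_xlm_tokenizer = use_xlm_tokenizer

        def clone(self) -> "ClassifierBase":
            # without the stored attribute, the flag would be lost on every clone
            return self.__class__(self.use_xlm_tokenizer)

The subclass hunks that follow apply the same idea: they pass the flag to super().__init__ and read self.use_xlm_tokenizer instead of the local constructor parameter.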
@@ -427,13 +416,15 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
         :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
                                   Tokenizer.
         """
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LayoutLM")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -540,13 +531,15 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
         :param use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a LayoutLMv2 model keep the
                                   default value.
         """
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LayoutLMv2")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -666,13 +659,15 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
         :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLMv3 model with a different
                                   tokenizer.
         """
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LayoutLMv3")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -746,19 +741,23 @@ class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
         path_weights: PathLikeOrStr,
         categories: Mapping[int, TypeOrStr],
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         self.path_config = Path(path_config_json)
         self.path_weights = Path(path_weights)
         self.categories = ModelCategories(init_categories=categories)

         self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer

     @classmethod
     def get_requirements(cls) -> list[Requirement]:
         return [get_pytorch_requirement(), get_transformers_requirement()]

     def clone(self) -> HFLayoutLmSequenceClassifierBase:
-        return self.__class__(self.path_config, self.path_weights, self.categories.get_categories(), self.device)
+        return self.__class__(
+            self.path_config, self.path_weights, self.categories.get_categories(), self.device, self.use_xlm_tokenizer
+        )

     def _validate_encodings(
         self, **encodings: Union[list[list[str]], torch.Tensor]
@@ -856,13 +855,13 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLM")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -939,13 +938,13 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLMv2")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -1029,13 +1028,13 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLMv3")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -1142,13 +1141,15 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
         :param device: The device (cpu,"cuda"), where to place the model.
         """

-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LiLT")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -1232,13 +1233,13 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LiLT")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -1270,3 +1271,19 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):

     def clear_model(self) -> None:
         self.model = None
+
+
+if TYPE_CHECKING:
+    LayoutTokenModels: TypeAlias = Union[
+        HFLayoutLmTokenClassifier,
+        HFLayoutLmv2TokenClassifier,
+        HFLayoutLmv3TokenClassifier,
+        HFLiltTokenClassifier,
+    ]
+
+    LayoutSequenceModels: TypeAlias = Union[
+        HFLayoutLmSequenceClassifier,
+        HFLayoutLmv2SequenceClassifier,
+        HFLayoutLmv3SequenceClassifier,
+        HFLiltSequenceClassifier,
+    ]
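The aliases removed near the top of the file unioned raw transformers models; the re-added LayoutTokenModels and LayoutSequenceModels union deepdoctection's own wrapper classes, which is why the block now sits at the end of the file, after those classes are defined. A sketch of how an annotation-only importer consumes such aliases (module path as in this diff; the function itself is illustrative):

    from __future__ import annotations  # annotations stay strings at runtime

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # resolved by static type checkers only, never imported at runtime
        from deepdoctection.extern.hflayoutlm import LayoutTokenModels

    def run_token_classifier(model: LayoutTokenModels) -> None:
        ...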
deepdoctection/pipe/doctectionpipe.py
CHANGED
@@ -188,7 +188,7 @@ class DoctectionPipe(Pipeline):

         df = MapData(df, _proto_process(path, doc_path))
         if dataset_dataflow is None:
-            df = MapData(df, _to_image(dpi=os.environ.get("DPI", 300)))  # pylint: disable=E1120
+            df = MapData(df, _to_image(dpi=int(os.environ.get("DPI", 300))))  # pylint: disable=E1120
         return df

     @staticmethod
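The single change above wraps the environment lookup in int(). Why that matters (illustrative snippet, not package code): os.environ values are always strings, so a user-set DPI would otherwise reach _to_image as a str while the fallback 300 is an int:

    import os

    os.environ["DPI"] = "200"
    dpi = os.environ.get("DPI", 300)       # "200" (str) whenever the variable is set
    dpi = int(os.environ.get("DPI", 300))  # 200 (int) either way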
deepdoctection/pipe/lm.py
CHANGED
@@ -30,7 +30,7 @@ from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry

 if TYPE_CHECKING:
-    from ..extern.hflayoutlm import
+    from ..extern.hflayoutlm import LayoutSequenceModels, LayoutTokenModels


 @pipeline_component_registry.register("LMTokenClassifierService")
@@ -66,7 +66,7 @@ class LMTokenClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model:
+        language_model: LayoutTokenModels,
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -155,11 +155,11 @@ class LMTokenClassifierService(PipelineComponent):
             else:
                 token_class_name_id = None
             self.dp_manager.set_category_annotation(
-                token.semantic_name, token_class_name_id, WordType.TOKEN_CLASS, token.uuid
+                token.semantic_name, token_class_name_id, WordType.TOKEN_CLASS, token.uuid, token.score
             )
             self.dp_manager.set_category_annotation(token.bio_tag, None, WordType.TAG, token.uuid)
             self.dp_manager.set_category_annotation(
-                token.class_name, token.class_id, WordType.TOKEN_TAG, token.uuid
+                token.class_name, token.class_id, WordType.TOKEN_TAG, token.uuid, token.score
             )
             words_populated.append(token.uuid)

@@ -188,7 +188,7 @@ class LMTokenClassifierService(PipelineComponent):
         # multiple threads
         return self.__class__(
             copy(self.tokenizer),
-            self.language_model.clone(),
+            self.language_model.clone(),  # type: ignore
             self.padding,
             self.truncation,
             self.return_overflowing_tokens,
@@ -260,7 +260,7 @@ class LMSequenceClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model:
+        language_model: LayoutSequenceModels,
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -309,7 +309,7 @@ class LMSequenceClassifierService(PipelineComponent):
     def clone(self) -> LMSequenceClassifierService:
         return self.__class__(
             copy(self.tokenizer),
-            self.language_model.clone(),
+            self.language_model.clone(),  # type: ignore
             self.padding,
             self.truncation,
             self.return_overflowing_tokens,
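Two things change above: the TYPE_CHECKING import and the language_model annotations pick up the wrapper-class aliases now defined at the end of hflayoutlm.py (with # type: ignore where clone() is typed on a base class), and set_category_annotation additionally receives token.score, so the prediction confidence is stored with the TOKEN_CLASS and TOKEN_TAG sub-categories instead of being dropped. A simplified sketch of the effect (not the package's API):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class CategoryAnnotation:
        category_name: str
        category_id: Optional[int]
        score: Optional[float] = None  # now populated from token.score

    ann = CategoryAnnotation("header", 1, score=0.97)  # the score survives into the annotation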
deepdoctection/pipe/refine.py
CHANGED
@@ -295,28 +295,21 @@ def _html_table(
     return html


-def generate_html_string(table: ImageAnnotation) -> list[str]:
+def generate_html_string(table: ImageAnnotation, cell_names: Sequence[ObjectTypes]) -> list[str]:
     """
     Takes the table segmentation by using table cells row number, column numbers etc. and generates a html
     representation.

     :param table: An annotation that has a not None image and fully segmented cell annotation.
+    :param cell_names: List of cell names that are used for the table segmentation. Note: It must be ensured that
+        that all cells have a row number, column number, row span and column span and that the dissection
+        by rows and columns is completely covered by cells.
     :return: HTML representation of the table
     """
     if table.image is None:
         raise ImageError("table.image cannot be None")
     table_image = table.image
-    cells = table_image.get_annotation(
-        category_names=[
-            LayoutType.CELL,
-            CellType.HEADER,
-            CellType.BODY,
-            CellType.SPANNING,
-            CellType.ROW_HEADER,
-            CellType.COLUMN_HEADER,
-            CellType.PROJECTED_ROW_HEADER,
-        ]
-    )
+    cells = table_image.get_annotation(category_names=cell_names)
     number_of_rows = table_image.summary.get_sub_category(TableType.NUMBER_OF_ROWS).category_id
     number_of_cols = table_image.summary.get_sub_category(TableType.NUMBER_OF_COLUMNS).category_id
     table_list = []
@@ -485,7 +478,7 @@ class TableSegmentationRefinementService(PipelineComponent):
         self.dp_manager.set_summary_annotation(
             TableType.MAX_COL_SPAN, TableType.MAX_COL_SPAN, max_col_span, annotation_id=table.annotation_id
         )
-        html = generate_html_string(table)
+        html = generate_html_string(table, self.cell_names)
         self.dp_manager.set_container_annotation(TableType.HTML, -1, TableType.HTML, table.annotation_id, html)

     def clone(self) -> TableSegmentationRefinementService:
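generate_html_string now takes its cell category names from the caller (the refinement service passes its own self.cell_names) instead of the hard-coded list that was removed. A hedged sketch of the new call shape, reusing the previously hard-coded names (the import path follows deepdoctection's settings module; table stands for a fully segmented table ImageAnnotation):

    from deepdoctection.utils.settings import CellType, LayoutType

    cell_names = [
        LayoutType.CELL,
        CellType.HEADER,
        CellType.BODY,
        CellType.SPANNING,
        CellType.ROW_HEADER,
        CellType.COLUMN_HEADER,
        CellType.PROJECTED_ROW_HEADER,
    ]
    html = generate_html_string(table, cell_names)

Parameterizing the names keeps the HTML generation consistent with whatever cell types the segmentation step actually produced, rather than assuming one fixed set.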