deepdoctection 0.39__py3-none-any.whl → 0.39.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +1 -1
- deepdoctection/pipe/base.py +38 -5
- deepdoctection/pipe/common.py +3 -3
- deepdoctection/train/hf_detr_train.py +1 -1
- deepdoctection/utils/settings.py +1 -1
- {deepdoctection-0.39.dist-info → deepdoctection-0.39.1.dist-info}/METADATA +5 -5
- {deepdoctection-0.39.dist-info → deepdoctection-0.39.1.dist-info}/RECORD +10 -10
- {deepdoctection-0.39.dist-info → deepdoctection-0.39.1.dist-info}/LICENSE +0 -0
- {deepdoctection-0.39.dist-info → deepdoctection-0.39.1.dist-info}/WHEEL +0 -0
- {deepdoctection-0.39.dist-info → deepdoctection-0.39.1.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py
CHANGED
deepdoctection/pipe/base.py
CHANGED
|
@@ -24,7 +24,7 @@ from __future__ import annotations
|
|
|
24
24
|
from abc import ABC, abstractmethod
|
|
25
25
|
from collections import defaultdict
|
|
26
26
|
from dataclasses import dataclass, field
|
|
27
|
-
from typing import Any, Mapping, Optional, Union
|
|
27
|
+
from typing import Any, Mapping, Optional, Union, Callable
|
|
28
28
|
from uuid import uuid1
|
|
29
29
|
|
|
30
30
|
from ..dataflow import DataFlow, MapData
|
|
@@ -33,6 +33,7 @@ from ..mapper.misc import curry
|
|
|
33
33
|
from ..utils.context import timed_operation
|
|
34
34
|
from ..utils.identifier import get_uuid_from_str
|
|
35
35
|
from ..utils.settings import ObjectTypes
|
|
36
|
+
from ..utils.types import DP
|
|
36
37
|
from .anngen import DatapointManager
|
|
37
38
|
|
|
38
39
|
|
|
@@ -76,6 +77,30 @@ class PipelineComponent(ABC):
|
|
|
76
77
|
self.service_id = self.get_service_id()
|
|
77
78
|
self.dp_manager = DatapointManager(self.service_id, model_id)
|
|
78
79
|
self.timer_on = False
|
|
80
|
+
self.filter_func: Callable[[DP], bool] = lambda dp: False
|
|
81
|
+
|
|
82
|
+
def set_inbound_filter(self, filter_func: Callable[[DP], bool]) -> None:
|
|
83
|
+
"""
|
|
84
|
+
Set a filter function to decide, if an image of the inbound dataflow should be passed to self.serve.
|
|
85
|
+
The filter function should return a boolean value. If the function returns True, the image will not be processed
|
|
86
|
+
by this pipeline component.
|
|
87
|
+
|
|
88
|
+
**Example:**
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
def do_not_process_tables(dp: Image) -> bool:
|
|
92
|
+
if "table" not in dp.get_categories_from_current_state():
|
|
93
|
+
return True
|
|
94
|
+
return False
|
|
95
|
+
|
|
96
|
+
layout_component = ImageLayoutService(...)
|
|
97
|
+
layout_component.set_inbound_filter(do_not_process_tables)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
:param filter_func: A function that takes an image datapoint and returns a boolean value
|
|
102
|
+
"""
|
|
103
|
+
self.filter_func = filter_func # type: ignore
|
|
79
104
|
|
|
80
105
|
@abstractmethod
|
|
81
106
|
def serve(self, dp: Image) -> None:
|
|
@@ -92,6 +117,12 @@ class PipelineComponent(ABC):
|
|
|
92
117
|
"""
|
|
93
118
|
raise NotImplementedError()
|
|
94
119
|
|
|
120
|
+
def _pass_datapoint(self, dp: Image) -> None:
|
|
121
|
+
self.dp_manager.datapoint = dp
|
|
122
|
+
if not self.filter_func(dp):
|
|
123
|
+
self.serve(dp)
|
|
124
|
+
|
|
125
|
+
|
|
95
126
|
def pass_datapoint(self, dp: Image) -> Image:
|
|
96
127
|
"""
|
|
97
128
|
Acceptance, handover to dp_manager, transformation and forwarding of dp. To measure the time, use
|
|
@@ -103,11 +134,9 @@ class PipelineComponent(ABC):
|
|
|
103
134
|
"""
|
|
104
135
|
if self.timer_on:
|
|
105
136
|
with timed_operation(self.__class__.__name__):
|
|
106
|
-
self.dp_manager.datapoint = dp
|
|
107
|
-
self.serve(dp)
|
|
137
|
+
self._pass_datapoint(dp)
|
|
108
138
|
else:
|
|
109
|
-
self.dp_manager.datapoint = dp
|
|
110
|
-
self.serve(dp)
|
|
139
|
+
self._pass_datapoint(dp)
|
|
111
140
|
return self.dp_manager.datapoint
|
|
112
141
|
|
|
113
142
|
def predict_dataflow(self, df: DataFlow) -> DataFlow:
|
|
@@ -205,6 +234,7 @@ class Pipeline(ABC):
|
|
|
205
234
|
|
|
206
235
|
**Example:**
|
|
207
236
|
|
|
237
|
+
```python
|
|
208
238
|
layout = LayoutPipeComponent(layout_detector ...)
|
|
209
239
|
text = TextExtractPipeComponent(text_detector ...)
|
|
210
240
|
simple_pipe = MyPipeline(pipeline_component = [layout, text])
|
|
@@ -212,6 +242,7 @@ class Pipeline(ABC):
|
|
|
212
242
|
|
|
213
243
|
for page in doc_dataflow:
|
|
214
244
|
print(page)
|
|
245
|
+
```
|
|
215
246
|
|
|
216
247
|
In doing so, page contains all document structures determined via the pipeline (either directly from the Image core
|
|
217
248
|
model or already processed further).
|
|
@@ -225,10 +256,12 @@ class Pipeline(ABC):
|
|
|
225
256
|
|
|
226
257
|
**Example:**
|
|
227
258
|
|
|
259
|
+
```python
|
|
228
260
|
pipe = MyPipeline(pipeline_component = [layout, text])
|
|
229
261
|
pipe.set_session_id = True
|
|
230
262
|
|
|
231
263
|
df = pipe.analyze(input = "path/to/dir") # session_id is generated automatically
|
|
264
|
+
```
|
|
232
265
|
"""
|
|
233
266
|
|
|
234
267
|
def __init__(self, pipeline_component_list: list[PipelineComponent]) -> None:
|
deepdoctection/pipe/common.py
CHANGED
|
@@ -349,8 +349,8 @@ class AnnotationNmsService(PipelineComponent):
|
|
|
349
349
|
def __init__(
|
|
350
350
|
self,
|
|
351
351
|
nms_pairs: Sequence[Sequence[TypeOrStr]],
|
|
352
|
-
thresholds: Union[float,
|
|
353
|
-
priority: Optional[
|
|
352
|
+
thresholds: Union[float, Sequence[float]],
|
|
353
|
+
priority: Optional[Sequence[Union[Optional[TypeOrStr]]]] = None,
|
|
354
354
|
):
|
|
355
355
|
"""
|
|
356
356
|
:param nms_pairs: Groups of categories, either as string or by `ObjectType`.
|
|
@@ -362,7 +362,7 @@ class AnnotationNmsService(PipelineComponent):
|
|
|
362
362
|
self.threshold = [thresholds for _ in self.nms_pairs]
|
|
363
363
|
else:
|
|
364
364
|
assert len(self.nms_pairs) == len(thresholds), "Sequences of nms_pairs and thresholds must have same length"
|
|
365
|
-
self.threshold = thresholds
|
|
365
|
+
self.threshold = thresholds # type: ignore
|
|
366
366
|
if priority:
|
|
367
367
|
assert len(self.nms_pairs) == len(priority), "Sequences of nms_pairs and priority must have same length"
|
|
368
368
|
|
|
@@ -73,7 +73,7 @@ class DetrDerivedTrainer(Trainer):
|
|
|
73
73
|
model: Union[PreTrainedModel, nn.Module],
|
|
74
74
|
args: TrainingArguments,
|
|
75
75
|
data_collator: DetrDataCollator,
|
|
76
|
-
train_dataset:
|
|
76
|
+
train_dataset: DatasetAdapter,
|
|
77
77
|
):
|
|
78
78
|
self.evaluator: Optional[Evaluator] = None
|
|
79
79
|
self.build_eval_kwargs: Optional[dict[str, Any]] = None
|
deepdoctection/utils/settings.py
CHANGED
|
@@ -101,7 +101,6 @@ class DocumentType(ObjectTypes):
|
|
|
101
101
|
GOVERNMENT_TENDERS = "government_tenders"
|
|
102
102
|
MANUALS = "manuals"
|
|
103
103
|
PATENTS = "patents"
|
|
104
|
-
MARK = "mark"
|
|
105
104
|
|
|
106
105
|
|
|
107
106
|
@object_types_registry.register("LayoutType")
|
|
@@ -132,6 +131,7 @@ class LayoutType(ObjectTypes):
|
|
|
132
131
|
PAGE_NUMBER = "page_number"
|
|
133
132
|
KEY_VALUE_AREA = "key_value_area"
|
|
134
133
|
LIST_ITEM = "list_item"
|
|
134
|
+
MARK = "mark"
|
|
135
135
|
|
|
136
136
|
|
|
137
137
|
@object_types_registry.register("TableType")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.39
|
|
3
|
+
Version: 0.39.1
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -61,7 +61,7 @@ Requires-Dist: python-doctr==0.8.1; extra == "tf"
|
|
|
61
61
|
Requires-Dist: pycocotools>=2.0.2; extra == "tf"
|
|
62
62
|
Requires-Dist: boto3==1.34.102; extra == "tf"
|
|
63
63
|
Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
|
|
64
|
-
Requires-Dist: fasttext
|
|
64
|
+
Requires-Dist: fasttext-wheel; extra == "tf"
|
|
65
65
|
Requires-Dist: jdeskew>=0.2.2; extra == "tf"
|
|
66
66
|
Requires-Dist: apted==1.0.3; extra == "tf"
|
|
67
67
|
Requires-Dist: distance==0.1.3; extra == "tf"
|
|
@@ -86,12 +86,12 @@ Requires-Dist: termcolor>=1.1; extra == "pt"
|
|
|
86
86
|
Requires-Dist: tabulate>=0.7.7; extra == "pt"
|
|
87
87
|
Requires-Dist: tqdm==4.64.0; extra == "pt"
|
|
88
88
|
Requires-Dist: timm>=0.9.16; extra == "pt"
|
|
89
|
-
Requires-Dist: transformers>=4.
|
|
89
|
+
Requires-Dist: transformers>=4.48.0; extra == "pt"
|
|
90
90
|
Requires-Dist: accelerate>=0.29.1; extra == "pt"
|
|
91
91
|
Requires-Dist: python-doctr==0.8.1; extra == "pt"
|
|
92
92
|
Requires-Dist: boto3==1.34.102; extra == "pt"
|
|
93
93
|
Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
|
|
94
|
-
Requires-Dist: fasttext
|
|
94
|
+
Requires-Dist: fasttext-wheel; extra == "pt"
|
|
95
95
|
Requires-Dist: jdeskew>=0.2.2; extra == "pt"
|
|
96
96
|
Requires-Dist: apted==1.0.3; extra == "pt"
|
|
97
97
|
Requires-Dist: distance==0.1.3; extra == "pt"
|
|
@@ -99,7 +99,7 @@ Requires-Dist: lxml>=4.9.1; extra == "pt"
|
|
|
99
99
|
Provides-Extra: docs
|
|
100
100
|
Requires-Dist: tensorpack==0.11; extra == "docs"
|
|
101
101
|
Requires-Dist: boto3==1.34.102; extra == "docs"
|
|
102
|
-
Requires-Dist: transformers>=4.
|
|
102
|
+
Requires-Dist: transformers>=4.48.0; extra == "docs"
|
|
103
103
|
Requires-Dist: accelerate>=0.29.1; extra == "docs"
|
|
104
104
|
Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
|
|
105
105
|
Requires-Dist: lxml>=4.9.1; extra == "docs"
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
deepdoctection/__init__.py,sha256=
|
|
1
|
+
deepdoctection/__init__.py,sha256=uDowNayqaYZGYaqnGzPSz6pVuHQhtDVRAN_bvPq85Ko,12754
|
|
2
2
|
deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
deepdoctection/analyzer/__init__.py,sha256=icClxrd20XutD6LxLgEPIWceSs4j_QfI3szCE-9BL2w,729
|
|
4
4
|
deepdoctection/analyzer/_config.py,sha256=OZMOPlyFv4gcyabPG6KO08EYx-0tUH82Ehs9YDv2B1Q,5027
|
|
@@ -103,8 +103,8 @@ deepdoctection/mapper/tpstruct.py,sha256=YNABRibvcISD5Lavg3jouoE4FMdqXEJoM-hNoB_
|
|
|
103
103
|
deepdoctection/mapper/xfundstruct.py,sha256=_3r3c0K82fnF2h1HxA85h-9ETYrHwcERa6MNc6Ko6Z8,8807
|
|
104
104
|
deepdoctection/pipe/__init__.py,sha256=ywTVoetftdL6plXg2YlBzMfmqBZupq7yXblSVyvvkcQ,1127
|
|
105
105
|
deepdoctection/pipe/anngen.py,sha256=3319l4aaXzcY4w6ItVBNPX8LGS5fHFDVtyVY9KMefac,16393
|
|
106
|
-
deepdoctection/pipe/base.py,sha256=
|
|
107
|
-
deepdoctection/pipe/common.py,sha256=
|
|
106
|
+
deepdoctection/pipe/base.py,sha256=F4NusbZ-xYc6wuO-XAngmC8uzahT2ubsu2g9NO8PpVw,15390
|
|
107
|
+
deepdoctection/pipe/common.py,sha256=vlWzvwn8wl7baPbK-917HUWujEGJEkHur_-ilkweKjk,17751
|
|
108
108
|
deepdoctection/pipe/concurrency.py,sha256=AAKRsVgaBEYNluntbDa46SBF1JZ_XqnWLDSWrNvAzEo,9657
|
|
109
109
|
deepdoctection/pipe/doctectionpipe.py,sha256=bGW3ugky-fb-nEe-3bvO6Oc_4_6w82cQboGM_6p2eIo,12530
|
|
110
110
|
deepdoctection/pipe/language.py,sha256=5zI0UQC6Fh12_r2pfVL42HoCGz2hpHrOhpXAn5m-rYw,5451
|
|
@@ -119,7 +119,7 @@ deepdoctection/pipe/text.py,sha256=h9q6d3HFOs7LOg-iwdLUPiQxrPqgunBVNmtYMBrfRQE,1
|
|
|
119
119
|
deepdoctection/pipe/transform.py,sha256=9Om7X7hJeL4jgUwHM1CHa4sb5v7Qo1PtVG0ls_3nI7w,3798
|
|
120
120
|
deepdoctection/train/__init__.py,sha256=YFTRAZF1F7cEAKTdAIi1BLyYb6rSRcwq09Ui5Lu8d6E,1071
|
|
121
121
|
deepdoctection/train/d2_frcnn_train.py,sha256=sFc_G-mEpaM8d1CCE0_6Gl4nBh11X2RYRBA3p_ylFJQ,16000
|
|
122
|
-
deepdoctection/train/hf_detr_train.py,sha256=
|
|
122
|
+
deepdoctection/train/hf_detr_train.py,sha256=eHSdI11U8oGy93noZxAISfukhRBElj4dBerJ4Xcercw,10785
|
|
123
123
|
deepdoctection/train/hf_layoutlm_train.py,sha256=irSg-IpbVoSlaw1-vZCej2mCZcctONtXr5Z2NQAc_a4,22680
|
|
124
124
|
deepdoctection/train/tp_frcnn_train.py,sha256=pEpXokSVGveqo82pRnhnAmHPmjQ_8wQWpqM4ZyNHJgs,13049
|
|
125
125
|
deepdoctection/utils/__init__.py,sha256=brBceRWeov9WXMiJTjyJOF2rHMP8trGGRRjhMdZ61nI,2371
|
|
@@ -135,14 +135,14 @@ deepdoctection/utils/logger.py,sha256=J0OVKiXP_2A82MWbbJoOeMEJ-75aZu5npgaS_yI6mV
|
|
|
135
135
|
deepdoctection/utils/metacfg.py,sha256=hD76KQ_RnD_5B02qLI2Zxf3WfnsnXhEI_KUTKpw91RI,5711
|
|
136
136
|
deepdoctection/utils/mocks.py,sha256=IkN3-IzAl4eX0ibgKIHg8IY7ykVw6BnpF6XnxKnKaZI,2389
|
|
137
137
|
deepdoctection/utils/pdf_utils.py,sha256=Fi0eZ2GbnO7N61Rd8b8YRKRff4dalHAzkcn3zpGPoic,13119
|
|
138
|
-
deepdoctection/utils/settings.py,sha256=
|
|
138
|
+
deepdoctection/utils/settings.py,sha256=hDD6yDX_4pQXwR5ILVwJIj6hb7NXA0-ifnC25ldcUjA,12464
|
|
139
139
|
deepdoctection/utils/tqdm.py,sha256=cBUtR0L1x0KMeYrLP2rrzyzCamCjpQAKroHXLv81_pk,1820
|
|
140
140
|
deepdoctection/utils/transform.py,sha256=3kCgsEeRkG1efCdkfvj7tUFMs-e2jbjbflq826F2GPU,8502
|
|
141
141
|
deepdoctection/utils/types.py,sha256=_3dmPdCIZNLbgU5QP5k_c5phDf18xLe1kYL6t2nM45s,2953
|
|
142
142
|
deepdoctection/utils/utils.py,sha256=csVs_VvCq4QBETPoE2JdTTL4MFYnD4xh-Js5vRb612g,6492
|
|
143
143
|
deepdoctection/utils/viz.py,sha256=Jf8ePNYWlpuyaS6SeTYQ4OyA3eNhtgjvAQZnGNdgHC0,27051
|
|
144
|
-
deepdoctection-0.39.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
|
|
145
|
-
deepdoctection-0.39.dist-info/METADATA,sha256=
|
|
146
|
-
deepdoctection-0.39.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
147
|
-
deepdoctection-0.39.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
|
|
148
|
-
deepdoctection-0.39.dist-info/RECORD,,
|
|
144
|
+
deepdoctection-0.39.1.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
|
|
145
|
+
deepdoctection-0.39.1.dist-info/METADATA,sha256=NBN2dqFMUiXkcJ28xJDwyN6eNP-MmFw64F7dm3kUWTA,19741
|
|
146
|
+
deepdoctection-0.39.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
147
|
+
deepdoctection-0.39.1.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
|
|
148
|
+
deepdoctection-0.39.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|