deepdoctection-0.39-py3-none-any.whl → deepdoctection-0.39.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. More details are available via the link on the original page.

@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
25
25
 
26
26
  # pylint: enable=wrong-import-position
27
27
 
28
- __version__ = "0.39"
28
+ __version__ = "0.39.1"
29
29
 
30
30
  _IMPORT_STRUCTURE = {
31
31
  "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
@@ -24,7 +24,7 @@ from __future__ import annotations
24
24
  from abc import ABC, abstractmethod
25
25
  from collections import defaultdict
26
26
  from dataclasses import dataclass, field
27
- from typing import Any, Mapping, Optional, Union
27
+ from typing import Any, Mapping, Optional, Union, Callable
28
28
  from uuid import uuid1
29
29
 
30
30
  from ..dataflow import DataFlow, MapData
@@ -33,6 +33,7 @@ from ..mapper.misc import curry
33
33
  from ..utils.context import timed_operation
34
34
  from ..utils.identifier import get_uuid_from_str
35
35
  from ..utils.settings import ObjectTypes
36
+ from ..utils.types import DP
36
37
  from .anngen import DatapointManager
37
38
 
38
39
 
@@ -76,6 +77,30 @@ class PipelineComponent(ABC):
76
77
  self.service_id = self.get_service_id()
77
78
  self.dp_manager = DatapointManager(self.service_id, model_id)
78
79
  self.timer_on = False
80
+ self.filter_func: Callable[[DP], bool] = lambda dp: False
81
+
82
+ def set_inbound_filter(self, filter_func: Callable[[DP], bool]) -> None:
83
+ """
84
+ Set a filter function to decide, if an image of the inbound dataflow should be passed to self.serve.
85
+ The filter function should return a boolean value. If the function returns True, the image will not be processed
86
+ by this pipeline component.
87
+
88
+ **Example:**
89
+
90
+ ```python
91
+ def do_not_process_tables(dp: Image) -> bool:
92
+ if "table" not in dp.get_categories_from_current_state():
93
+ return True
94
+ return False
95
+
96
+ layout_component = ImageLayoutService(...)
97
+ layout_component.set_inbound_filter(do_not_process_tables)
98
+ ```
99
+
100
+
101
+ :param filter_func: A function that takes an image datapoint and returns a boolean value
102
+ """
103
+ self.filter_func = filter_func # type: ignore
79
104
 
80
105
  @abstractmethod
81
106
  def serve(self, dp: Image) -> None:
@@ -92,6 +117,12 @@ class PipelineComponent(ABC):
92
117
  """
93
118
  raise NotImplementedError()
94
119
 
120
+ def _pass_datapoint(self, dp: Image) -> None:
121
+ self.dp_manager.datapoint = dp
122
+ if not self.filter_func(dp):
123
+ self.serve(dp)
124
+
125
+
95
126
  def pass_datapoint(self, dp: Image) -> Image:
96
127
  """
97
128
  Acceptance, handover to dp_manager, transformation and forwarding of dp. To measure the time, use
@@ -103,11 +134,9 @@ class PipelineComponent(ABC):
103
134
  """
104
135
  if self.timer_on:
105
136
  with timed_operation(self.__class__.__name__):
106
- self.dp_manager.datapoint = dp
107
- self.serve(dp)
137
+ self._pass_datapoint(dp)
108
138
  else:
109
- self.dp_manager.datapoint = dp
110
- self.serve(dp)
139
+ self._pass_datapoint(dp)
111
140
  return self.dp_manager.datapoint
112
141
 
113
142
  def predict_dataflow(self, df: DataFlow) -> DataFlow:
@@ -205,6 +234,7 @@ class Pipeline(ABC):
205
234
 
206
235
  **Example:**
207
236
 
237
+ ```python
208
238
  layout = LayoutPipeComponent(layout_detector ...)
209
239
  text = TextExtractPipeComponent(text_detector ...)
210
240
  simple_pipe = MyPipeline(pipeline_component = [layout, text])
@@ -212,6 +242,7 @@ class Pipeline(ABC):
212
242
 
213
243
  for page in doc_dataflow:
214
244
  print(page)
245
+ ```
215
246
 
216
247
  In doing so, page contains all document structures determined via the pipeline (either directly from the Image core
217
248
  model or already processed further).
@@ -225,10 +256,12 @@ class Pipeline(ABC):
225
256
 
226
257
  **Example:**
227
258
 
259
+ ```python
228
260
  pipe = MyPipeline(pipeline_component = [layout, text])
229
261
  pipe.set_session_id = True
230
262
 
231
263
  df = pipe.analyze(input = "path/to/dir") # session_id is generated automatically
264
+ ```
232
265
  """
233
266
 
234
267
  def __init__(self, pipeline_component_list: list[PipelineComponent]) -> None:
@@ -349,8 +349,8 @@ class AnnotationNmsService(PipelineComponent):
349
349
  def __init__(
350
350
  self,
351
351
  nms_pairs: Sequence[Sequence[TypeOrStr]],
352
- thresholds: Union[float, list[float]],
353
- priority: Optional[list[Union[Optional[TypeOrStr]]]] = None,
352
+ thresholds: Union[float, Sequence[float]],
353
+ priority: Optional[Sequence[Union[Optional[TypeOrStr]]]] = None,
354
354
  ):
355
355
  """
356
356
  :param nms_pairs: Groups of categories, either as string or by `ObjectType`.
@@ -362,7 +362,7 @@ class AnnotationNmsService(PipelineComponent):
362
362
  self.threshold = [thresholds for _ in self.nms_pairs]
363
363
  else:
364
364
  assert len(self.nms_pairs) == len(thresholds), "Sequences of nms_pairs and thresholds must have same length"
365
- self.threshold = thresholds
365
+ self.threshold = thresholds # type: ignore
366
366
  if priority:
367
367
  assert len(self.nms_pairs) == len(priority), "Sequences of nms_pairs and priority must have same length"
368
368
 
@@ -73,7 +73,7 @@ class DetrDerivedTrainer(Trainer):
73
73
  model: Union[PreTrainedModel, nn.Module],
74
74
  args: TrainingArguments,
75
75
  data_collator: DetrDataCollator,
76
- train_dataset: Dataset[Any],
76
+ train_dataset: DatasetAdapter,
77
77
  ):
78
78
  self.evaluator: Optional[Evaluator] = None
79
79
  self.build_eval_kwargs: Optional[dict[str, Any]] = None
@@ -101,7 +101,6 @@ class DocumentType(ObjectTypes):
101
101
  GOVERNMENT_TENDERS = "government_tenders"
102
102
  MANUALS = "manuals"
103
103
  PATENTS = "patents"
104
- MARK = "mark"
105
104
 
106
105
 
107
106
  @object_types_registry.register("LayoutType")
@@ -132,6 +131,7 @@ class LayoutType(ObjectTypes):
132
131
  PAGE_NUMBER = "page_number"
133
132
  KEY_VALUE_AREA = "key_value_area"
134
133
  LIST_ITEM = "list_item"
134
+ MARK = "mark"
135
135
 
136
136
 
137
137
  @object_types_registry.register("TableType")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: deepdoctection
3
- Version: 0.39
3
+ Version: 0.39.1
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -61,7 +61,7 @@ Requires-Dist: python-doctr==0.8.1; extra == "tf"
61
61
  Requires-Dist: pycocotools>=2.0.2; extra == "tf"
62
62
  Requires-Dist: boto3==1.34.102; extra == "tf"
63
63
  Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
64
- Requires-Dist: fasttext==0.9.2; extra == "tf"
64
+ Requires-Dist: fasttext-wheel; extra == "tf"
65
65
  Requires-Dist: jdeskew>=0.2.2; extra == "tf"
66
66
  Requires-Dist: apted==1.0.3; extra == "tf"
67
67
  Requires-Dist: distance==0.1.3; extra == "tf"
@@ -86,12 +86,12 @@ Requires-Dist: termcolor>=1.1; extra == "pt"
86
86
  Requires-Dist: tabulate>=0.7.7; extra == "pt"
87
87
  Requires-Dist: tqdm==4.64.0; extra == "pt"
88
88
  Requires-Dist: timm>=0.9.16; extra == "pt"
89
- Requires-Dist: transformers>=4.36.0; extra == "pt"
89
+ Requires-Dist: transformers>=4.48.0; extra == "pt"
90
90
  Requires-Dist: accelerate>=0.29.1; extra == "pt"
91
91
  Requires-Dist: python-doctr==0.8.1; extra == "pt"
92
92
  Requires-Dist: boto3==1.34.102; extra == "pt"
93
93
  Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
94
- Requires-Dist: fasttext==0.9.2; extra == "pt"
94
+ Requires-Dist: fasttext-wheel; extra == "pt"
95
95
  Requires-Dist: jdeskew>=0.2.2; extra == "pt"
96
96
  Requires-Dist: apted==1.0.3; extra == "pt"
97
97
  Requires-Dist: distance==0.1.3; extra == "pt"
@@ -99,7 +99,7 @@ Requires-Dist: lxml>=4.9.1; extra == "pt"
99
99
  Provides-Extra: docs
100
100
  Requires-Dist: tensorpack==0.11; extra == "docs"
101
101
  Requires-Dist: boto3==1.34.102; extra == "docs"
102
- Requires-Dist: transformers>=4.36.0; extra == "docs"
102
+ Requires-Dist: transformers>=4.48.0; extra == "docs"
103
103
  Requires-Dist: accelerate>=0.29.1; extra == "docs"
104
104
  Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
105
105
  Requires-Dist: lxml>=4.9.1; extra == "docs"
@@ -1,4 +1,4 @@
1
- deepdoctection/__init__.py,sha256=0nxfBTu-aeg3DYu9g2kEAnt3Y-lCnHSgP31qvCnsLOs,12752
1
+ deepdoctection/__init__.py,sha256=uDowNayqaYZGYaqnGzPSz6pVuHQhtDVRAN_bvPq85Ko,12754
2
2
  deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  deepdoctection/analyzer/__init__.py,sha256=icClxrd20XutD6LxLgEPIWceSs4j_QfI3szCE-9BL2w,729
4
4
  deepdoctection/analyzer/_config.py,sha256=OZMOPlyFv4gcyabPG6KO08EYx-0tUH82Ehs9YDv2B1Q,5027
@@ -103,8 +103,8 @@ deepdoctection/mapper/tpstruct.py,sha256=YNABRibvcISD5Lavg3jouoE4FMdqXEJoM-hNoB_
103
103
  deepdoctection/mapper/xfundstruct.py,sha256=_3r3c0K82fnF2h1HxA85h-9ETYrHwcERa6MNc6Ko6Z8,8807
104
104
  deepdoctection/pipe/__init__.py,sha256=ywTVoetftdL6plXg2YlBzMfmqBZupq7yXblSVyvvkcQ,1127
105
105
  deepdoctection/pipe/anngen.py,sha256=3319l4aaXzcY4w6ItVBNPX8LGS5fHFDVtyVY9KMefac,16393
106
- deepdoctection/pipe/base.py,sha256=ynNg5SSRuUVxN69VWOO3Oi7WSeGrYwn3A56NQMBJDvw,14222
107
- deepdoctection/pipe/common.py,sha256=haOb4v0jLX3r41BSC8cVseX2E320_HkSrGlZsQiKE2g,17728
106
+ deepdoctection/pipe/base.py,sha256=F4NusbZ-xYc6wuO-XAngmC8uzahT2ubsu2g9NO8PpVw,15390
107
+ deepdoctection/pipe/common.py,sha256=vlWzvwn8wl7baPbK-917HUWujEGJEkHur_-ilkweKjk,17751
108
108
  deepdoctection/pipe/concurrency.py,sha256=AAKRsVgaBEYNluntbDa46SBF1JZ_XqnWLDSWrNvAzEo,9657
109
109
  deepdoctection/pipe/doctectionpipe.py,sha256=bGW3ugky-fb-nEe-3bvO6Oc_4_6w82cQboGM_6p2eIo,12530
110
110
  deepdoctection/pipe/language.py,sha256=5zI0UQC6Fh12_r2pfVL42HoCGz2hpHrOhpXAn5m-rYw,5451
@@ -119,7 +119,7 @@ deepdoctection/pipe/text.py,sha256=h9q6d3HFOs7LOg-iwdLUPiQxrPqgunBVNmtYMBrfRQE,1
119
119
  deepdoctection/pipe/transform.py,sha256=9Om7X7hJeL4jgUwHM1CHa4sb5v7Qo1PtVG0ls_3nI7w,3798
120
120
  deepdoctection/train/__init__.py,sha256=YFTRAZF1F7cEAKTdAIi1BLyYb6rSRcwq09Ui5Lu8d6E,1071
121
121
  deepdoctection/train/d2_frcnn_train.py,sha256=sFc_G-mEpaM8d1CCE0_6Gl4nBh11X2RYRBA3p_ylFJQ,16000
122
- deepdoctection/train/hf_detr_train.py,sha256=8ydysxzOPE_IPoNFGaHb7PbKr9Nbl41rcY4lbylQavU,10783
122
+ deepdoctection/train/hf_detr_train.py,sha256=eHSdI11U8oGy93noZxAISfukhRBElj4dBerJ4Xcercw,10785
123
123
  deepdoctection/train/hf_layoutlm_train.py,sha256=irSg-IpbVoSlaw1-vZCej2mCZcctONtXr5Z2NQAc_a4,22680
124
124
  deepdoctection/train/tp_frcnn_train.py,sha256=pEpXokSVGveqo82pRnhnAmHPmjQ_8wQWpqM4ZyNHJgs,13049
125
125
  deepdoctection/utils/__init__.py,sha256=brBceRWeov9WXMiJTjyJOF2rHMP8trGGRRjhMdZ61nI,2371
@@ -135,14 +135,14 @@ deepdoctection/utils/logger.py,sha256=J0OVKiXP_2A82MWbbJoOeMEJ-75aZu5npgaS_yI6mV
135
135
  deepdoctection/utils/metacfg.py,sha256=hD76KQ_RnD_5B02qLI2Zxf3WfnsnXhEI_KUTKpw91RI,5711
136
136
  deepdoctection/utils/mocks.py,sha256=IkN3-IzAl4eX0ibgKIHg8IY7ykVw6BnpF6XnxKnKaZI,2389
137
137
  deepdoctection/utils/pdf_utils.py,sha256=Fi0eZ2GbnO7N61Rd8b8YRKRff4dalHAzkcn3zpGPoic,13119
138
- deepdoctection/utils/settings.py,sha256=k6OyuWbj-IPeaO9zT9RZ-5Yad1wNhWGYqGLZdtgXAZY,12464
138
+ deepdoctection/utils/settings.py,sha256=hDD6yDX_4pQXwR5ILVwJIj6hb7NXA0-ifnC25ldcUjA,12464
139
139
  deepdoctection/utils/tqdm.py,sha256=cBUtR0L1x0KMeYrLP2rrzyzCamCjpQAKroHXLv81_pk,1820
140
140
  deepdoctection/utils/transform.py,sha256=3kCgsEeRkG1efCdkfvj7tUFMs-e2jbjbflq826F2GPU,8502
141
141
  deepdoctection/utils/types.py,sha256=_3dmPdCIZNLbgU5QP5k_c5phDf18xLe1kYL6t2nM45s,2953
142
142
  deepdoctection/utils/utils.py,sha256=csVs_VvCq4QBETPoE2JdTTL4MFYnD4xh-Js5vRb612g,6492
143
143
  deepdoctection/utils/viz.py,sha256=Jf8ePNYWlpuyaS6SeTYQ4OyA3eNhtgjvAQZnGNdgHC0,27051
144
- deepdoctection-0.39.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
145
- deepdoctection-0.39.dist-info/METADATA,sha256=0OcWmWb8bssiwC2_Xnb6hUKyY0ISv6Bc5qVGosbrn3c,19741
146
- deepdoctection-0.39.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
147
- deepdoctection-0.39.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
148
- deepdoctection-0.39.dist-info/RECORD,,
144
+ deepdoctection-0.39.1.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
145
+ deepdoctection-0.39.1.dist-info/METADATA,sha256=NBN2dqFMUiXkcJ28xJDwyN6eNP-MmFw64F7dm3kUWTA,19741
146
+ deepdoctection-0.39.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
147
+ deepdoctection-0.39.1.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
148
+ deepdoctection-0.39.1.dist-info/RECORD,,