deepdoctection 0.37__py3-none-any.whl → 0.37.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

@@ -24,7 +24,7 @@ from .utils.logger import LoggingRecord, logger
24
24
 
25
25
  # pylint: enable=wrong-import-position
26
26
 
27
- __version__ = 0.37
27
+ __version__ = "0.37.2"
28
28
 
29
29
  _IMPORT_STRUCTURE = {
30
30
  "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
@@ -327,9 +327,9 @@ class ServiceFactory:
327
327
  )
328
328
  if config.OCR.USE_TEXTRACT:
329
329
  credentials_kwargs = {
330
- "aws_access_key_id": environ.get("ACCESS_KEY", None),
331
- "aws_secret_access_key": environ.get("SECRET_KEY", None),
332
- "config": Config(region_name=environ.get("REGION", None)),
330
+ "aws_access_key_id": environ.get("AWS_ACCESS_KEY", None),
331
+ "aws_secret_access_key": environ.get("AWS_SECRET_KEY", None),
332
+ "config": Config(region_name=environ.get("AWS_REGION", None)),
333
333
  }
334
334
  return TextractOcrDetector(**credentials_kwargs)
335
335
  raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
@@ -32,6 +32,7 @@ from numpy import uint8
32
32
 
33
33
  from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDError
34
34
  from ..utils.identifier import get_uuid, is_uuid_like
35
+ from ..utils.logger import LoggingRecord, logger
35
36
  from ..utils.settings import ObjectTypes, SummaryType, get_type
36
37
  from ..utils.types import ImageDict, PathLikeOrStr, PixelValues
37
38
  from .annotation import Annotation, AnnotationMap, BoundingBox, CategoryAnnotation, ImageAnnotation
@@ -474,8 +475,13 @@ class Image:
474
475
 
475
476
  for service_id in service_ids:
476
477
  if service_id not in service_id_to_annotation_id:
477
- raise ImageError(f"Service id {service_id} not found")
478
- annotation_ids = service_id_to_annotation_id[service_id]
478
+ logger.info(
479
+ LoggingRecord(
480
+ f"Service_id {service_id} for image_id: {self.image_id} not found. Skipping removal."
481
+ )
482
+ )
483
+
484
+ annotation_ids = service_id_to_annotation_id.get(service_id, [])
479
485
 
480
486
  for ann_id in annotation_ids:
481
487
  if ann_id not in ann_id_to_annotation_maps:
@@ -747,7 +753,7 @@ class Image:
747
753
  if sub_cat.service_id:
748
754
  service_id_dict[sub_cat.service_id].append(sub_cat.annotation_id)
749
755
  if ann.image is not None:
750
- for summary_cat_key in ann.image.summary:
756
+ for summary_cat_key in ann.image.summary.sub_categories:
751
757
  summary_cat = ann.get_summary(summary_cat_key)
752
758
  if summary_cat.service_id:
753
759
  service_id_dict[summary_cat.service_id].append(summary_cat.annotation_id)
@@ -48,7 +48,7 @@ with try_import() as pt_import_guard:
48
48
  import torch.nn.functional as F
49
49
 
50
50
  with try_import() as tr_import_guard:
51
- from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD # type: ignore
51
+ from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD # type:ignore
52
52
  from transformers import (
53
53
  LayoutLMForSequenceClassification,
54
54
  LayoutLMForTokenClassification,
@@ -29,6 +29,7 @@ from uuid import uuid1
29
29
 
30
30
  from ..dataflow import DataFlow, MapData
31
31
  from ..datapoint.image import Image
32
+ from ..mapper.misc import curry
32
33
  from ..utils.context import timed_operation
33
34
  from ..utils.identifier import get_uuid_from_str
34
35
  from ..utils.settings import ObjectTypes
@@ -247,17 +248,24 @@ class Pipeline(ABC):
247
248
  """
248
249
  raise NotImplementedError()
249
250
 
250
- def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
251
+ @staticmethod
252
+ @curry
253
+ def _undo(dp: Image, service_ids: Optional[list[str]] = None) -> Image:
251
254
  """
252
- Composition of the backbone
255
+ Remove annotations from a datapoint
253
256
  """
254
- if session_id is None and self.set_session_id:
255
- session_id = self.get_session_id()
256
- for component in self.pipe_component_list:
257
- component.timer_on = True
258
- component.dp_manager.session_id = session_id
259
- df = component.predict_dataflow(df)
260
- return df
257
+ dp.remove(service_ids=service_ids)
258
+ return dp
259
+
260
+ def undo(self, df: DataFlow, service_ids: Optional[set[str]] = None) -> DataFlow:
261
+ """
262
+ Mapping a datapoint via `_undo` within a dataflow pipeline
263
+
264
+ :param df: An input dataflow of Images
265
+ :param service_ids: A set of service ids to remove
266
+ :return: A output dataflow of Images
267
+ """
268
+ return MapData(df, self._undo(service_ids=service_ids))
261
269
 
262
270
  @abstractmethod
263
271
  def analyze(self, **kwargs: Any) -> DataFlow:
@@ -273,6 +281,18 @@ class Pipeline(ABC):
273
281
  """
274
282
  raise NotImplementedError()
275
283
 
284
+ def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
285
+ """
286
+ Composition of the backbone
287
+ """
288
+ if session_id is None and self.set_session_id:
289
+ session_id = self.get_session_id()
290
+ for component in self.pipe_component_list:
291
+ component.timer_on = True
292
+ component.dp_manager.session_id = session_id
293
+ df = component.predict_dataflow(df)
294
+ return df
295
+
276
296
  def get_meta_annotation(self) -> MetaAnnotation:
277
297
  """
278
298
  Collects meta annotations from all pipeline components and summarizes the returned results
@@ -161,16 +161,17 @@ class DoctectionPipe(Pipeline):
161
161
 
162
162
  super().__init__(pipeline_component_list)
163
163
 
164
- def _entry(self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) \
165
- -> DataFlow:
164
+ def _entry(
165
+ self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
166
+ ) -> DataFlow:
166
167
  path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes = _collect_from_kwargs(**kwargs)
167
168
 
168
169
  df: DataFlow
169
170
 
170
171
  if isinstance(b_bytes, bytes):
171
- df = DoctectionPipe.bytes_to_dataflow(path=doc_path if path is None else path,
172
- b_bytes=b_bytes,
173
- file_type=file_type)
172
+ df = DoctectionPipe.bytes_to_dataflow(
173
+ path=doc_path if path is None else path, b_bytes=b_bytes, file_type=file_type
174
+ )
174
175
 
175
176
  elif isinstance(path, (str, Path)):
176
177
  if not isinstance(file_type, (str, list)):
@@ -161,11 +161,12 @@ class LayoutLMTrainer(Trainer):
161
161
  model: Union[PreTrainedModel, nn.Module],
162
162
  args: TrainingArguments,
163
163
  data_collator: LayoutLMDataCollator,
164
- train_dataset: Dataset[Any],
164
+ train_dataset: DatasetAdapter,
165
+ eval_dataset: Optional[DatasetBase] = None,
165
166
  ):
166
167
  self.evaluator: Optional[Evaluator] = None
167
168
  self.build_eval_kwargs: Optional[dict[str, Any]] = None
168
- super().__init__(model, args, data_collator, train_dataset)
169
+ super().__init__(model, args, data_collator, train_dataset, eval_dataset=eval_dataset)
169
170
 
170
171
  def setup_evaluator(
171
172
  self,
@@ -472,7 +473,7 @@ def train_hf_layoutlm(
472
473
  max_batch_size=max_batch_size, # type: ignore
473
474
  remove_bounding_box_features=remove_box_features,
474
475
  )
475
- trainer = LayoutLMTrainer(model, arguments, data_collator, dataset)
476
+ trainer = LayoutLMTrainer(model, arguments, data_collator, dataset, eval_dataset=dataset_val)
476
477
 
477
478
  if arguments.evaluation_strategy in (IntervalStrategy.STEPS,):
478
479
  assert metric is not None # silence mypy
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepdoctection
3
- Version: 0.37
3
+ Version: 0.37.2
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -1,9 +1,9 @@
1
- deepdoctection/__init__.py,sha256=7VELexCFRaBTCXHQpBoKhVi4hqUUgpcsLTqvHXHjufQ,12651
1
+ deepdoctection/__init__.py,sha256=T2sHOc6ZPpx44hWbarp0i_QlAqm0dEmzs7HVg2mL_nM,12655
2
2
  deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  deepdoctection/analyzer/__init__.py,sha256=icClxrd20XutD6LxLgEPIWceSs4j_QfI3szCE-9BL2w,729
4
4
  deepdoctection/analyzer/_config.py,sha256=NZl_REM8Ge2xfxvHN-mZR5KURcHfZii3xfMlKQwckbA,4864
5
5
  deepdoctection/analyzer/dd.py,sha256=DUOhOtwipHw5nabYqn3WGR9aZcgP0ma_bi_tjf9xscw,5973
6
- deepdoctection/analyzer/factory.py,sha256=xmo5F9X7I6lp0ZWJv8QavpMyG8UWYLvMi4qogsZV1_s,31507
6
+ deepdoctection/analyzer/factory.py,sha256=dEUOtdBS3yQGLqMqLR_kq5EYCR3IE30DjHNzE0spoQE,31519
7
7
  deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
8
8
  deepdoctection/configs/conf_dd_one.yaml,sha256=td7XsyVhdXkhh5Pie7sT_WNjGTaxBOWgpxhkobHd1H0,2325
9
9
  deepdoctection/configs/conf_tesseract.yaml,sha256=oF6szDyoi15FHvq7yFUNIEjfA_jNLhGxoowiRsz_zY4,35
@@ -19,7 +19,7 @@ deepdoctection/datapoint/__init__.py,sha256=3K406GbOPhoEp8koVaSbMocmSsmWifnSZ1SP
19
19
  deepdoctection/datapoint/annotation.py,sha256=FEgz4COxVDfjic0gG7kS6iHnWLBIgFnquQ63Cbj2a4Y,22531
20
20
  deepdoctection/datapoint/box.py,sha256=tkFuVM6xfx2jL7W4UED4qHXV572LSRdIsVJbrEiyIxI,23524
21
21
  deepdoctection/datapoint/convert.py,sha256=O7920pIomyEkzXwxpFsrzfhn7Pl6UzVGhNzv90VcuKU,7099
22
- deepdoctection/datapoint/image.py,sha256=DIXXXD2yKsacg47Wt_GEYEIe1MQkrd06Yr5xAWv_n64,33047
22
+ deepdoctection/datapoint/image.py,sha256=UDBKXJJpuKAUx0J-DGjvLGqrMV4N3kLpksJYxoVkong,33279
23
23
  deepdoctection/datapoint/view.py,sha256=1rVMuqucCrI5zlwyXMADJQBV38V_zSNFqFyBi3cMA1E,44914
24
24
  deepdoctection/datasets/__init__.py,sha256=-A3aR90aDsHPmVM35JavfnQ2itYSCn3ujl4krRni1QU,1076
25
25
  deepdoctection/datasets/adapter.py,sha256=Ly_vbOAgVI73V41FUccnSX1ECTOyesW_qsuvQuvOZbw,7796
@@ -56,7 +56,7 @@ deepdoctection/extern/deskew.py,sha256=sPoixu8S9he-0wbs-jgxtPE2V9BiP4-3uZlb6F5Y1
56
56
  deepdoctection/extern/doctrocr.py,sha256=T3_tvlih22_dVCBZypS1Y8tjQQB1fkAxIbGdUGHIapQ,24473
57
57
  deepdoctection/extern/fastlang.py,sha256=F4gK-SEwcCujjxH327ZDzMGWToJ49xS_dCKcePQ9IlY,4780
58
58
  deepdoctection/extern/hfdetr.py,sha256=1NPW_u5eH2tP3ixZ91l4WR-O-wLVcrFsLWA7BqID0oM,12055
59
- deepdoctection/extern/hflayoutlm.py,sha256=KfoWx9_Rpa1Y2L51HLrYvenfWaTB4SVTmVJH00Cqb-s,56510
59
+ deepdoctection/extern/hflayoutlm.py,sha256=T1IBm3C8CtG97-tauo03YqhUac6xdFc2y345BWVMajQ,56509
60
60
  deepdoctection/extern/hflm.py,sha256=kwS6kcSlY_2m9u0RzBLTRq-UMM7c1PhyUaDTvSdejus,9217
61
61
  deepdoctection/extern/model.py,sha256=ViHHKPvbGmLCPw7ZESv_rmjlkA90UiBU6oZiHOMqNSw,59869
62
62
  deepdoctection/extern/pdftext.py,sha256=KS_t27SUiYn_IOS_J2lF9lSSo22vLagxmxvYCY3CqXA,7228
@@ -103,10 +103,10 @@ deepdoctection/mapper/tpstruct.py,sha256=YNABRibvcISD5Lavg3jouoE4FMdqXEJoM-hNoB_
103
103
  deepdoctection/mapper/xfundstruct.py,sha256=_3r3c0K82fnF2h1HxA85h-9ETYrHwcERa6MNc6Ko6Z8,8807
104
104
  deepdoctection/pipe/__init__.py,sha256=ywTVoetftdL6plXg2YlBzMfmqBZupq7yXblSVyvvkcQ,1127
105
105
  deepdoctection/pipe/anngen.py,sha256=3319l4aaXzcY4w6ItVBNPX8LGS5fHFDVtyVY9KMefac,16393
106
- deepdoctection/pipe/base.py,sha256=Davjkf3D837y9AIITcx7yXdebmVaz6Moyw_5Wi3nfmg,13561
106
+ deepdoctection/pipe/base.py,sha256=ynNg5SSRuUVxN69VWOO3Oi7WSeGrYwn3A56NQMBJDvw,14222
107
107
  deepdoctection/pipe/common.py,sha256=haOb4v0jLX3r41BSC8cVseX2E320_HkSrGlZsQiKE2g,17728
108
108
  deepdoctection/pipe/concurrency.py,sha256=AAKRsVgaBEYNluntbDa46SBF1JZ_XqnWLDSWrNvAzEo,9657
109
- deepdoctection/pipe/doctectionpipe.py,sha256=uhsrSuwaHcOMj8b8i6wCpPaZlSxCTaeHVhMokJ8vRSI,11835
109
+ deepdoctection/pipe/doctectionpipe.py,sha256=wCg96P9Pb54i5AVgG02b4FljobM64_qEML_GxiULy-4,11765
110
110
  deepdoctection/pipe/language.py,sha256=5zI0UQC6Fh12_r2pfVL42HoCGz2hpHrOhpXAn5m-rYw,5451
111
111
  deepdoctection/pipe/layout.py,sha256=xIhnJpyUSbvLbhTXyAKXY1hmG9352jihGYFSclTH_1g,5567
112
112
  deepdoctection/pipe/lm.py,sha256=Sp-b7smeslNDyioEfNjuNBUxAuFKn3-OKpCZkGXri_c,16643
@@ -120,7 +120,7 @@ deepdoctection/pipe/transform.py,sha256=9Om7X7hJeL4jgUwHM1CHa4sb5v7Qo1PtVG0ls_3n
120
120
  deepdoctection/train/__init__.py,sha256=YFTRAZF1F7cEAKTdAIi1BLyYb6rSRcwq09Ui5Lu8d6E,1071
121
121
  deepdoctection/train/d2_frcnn_train.py,sha256=sFc_G-mEpaM8d1CCE0_6Gl4nBh11X2RYRBA3p_ylFJQ,16000
122
122
  deepdoctection/train/hf_detr_train.py,sha256=8ydysxzOPE_IPoNFGaHb7PbKr9Nbl41rcY4lbylQavU,10783
123
- deepdoctection/train/hf_layoutlm_train.py,sha256=e3pekLfe2KeYAI04COiTTL3KKiLDaXxTj0A2vwTvYZo,22425
123
+ deepdoctection/train/hf_layoutlm_train.py,sha256=BNjPgPAvxm4beHULqzo58u-gW7GcTGiZAk2rF6TootM,22532
124
124
  deepdoctection/train/tp_frcnn_train.py,sha256=pEpXokSVGveqo82pRnhnAmHPmjQ_8wQWpqM4ZyNHJgs,13049
125
125
  deepdoctection/utils/__init__.py,sha256=brBceRWeov9WXMiJTjyJOF2rHMP8trGGRRjhMdZ61nI,2371
126
126
  deepdoctection/utils/concurrency.py,sha256=nIhpkSncmv0LBB8PtcOLY-BsRGlfcDpz7foVdgzZd20,4598
@@ -141,8 +141,8 @@ deepdoctection/utils/transform.py,sha256=3kCgsEeRkG1efCdkfvj7tUFMs-e2jbjbflq826F
141
141
  deepdoctection/utils/types.py,sha256=_3dmPdCIZNLbgU5QP5k_c5phDf18xLe1kYL6t2nM45s,2953
142
142
  deepdoctection/utils/utils.py,sha256=csVs_VvCq4QBETPoE2JdTTL4MFYnD4xh-Js5vRb612g,6492
143
143
  deepdoctection/utils/viz.py,sha256=Jf8ePNYWlpuyaS6SeTYQ4OyA3eNhtgjvAQZnGNdgHC0,27051
144
- deepdoctection-0.37.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
145
- deepdoctection-0.37.dist-info/METADATA,sha256=0qGgmf07xmNRJx55yfMagHcfAoQG6GO9KTw6b0tv0uA,19543
146
- deepdoctection-0.37.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
147
- deepdoctection-0.37.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
148
- deepdoctection-0.37.dist-info/RECORD,,
144
+ deepdoctection-0.37.2.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
145
+ deepdoctection-0.37.2.dist-info/METADATA,sha256=XLOCkFBWynZhyZmKpDDRaomDIxPnVpy07WdUkodRF3Y,19545
146
+ deepdoctection-0.37.2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
147
+ deepdoctection-0.37.2.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
148
+ deepdoctection-0.37.2.dist-info/RECORD,,