deepdoctection 0.37__py3-none-any.whl → 0.37.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +1 -1
- deepdoctection/analyzer/factory.py +3 -3
- deepdoctection/datapoint/image.py +9 -3
- deepdoctection/extern/hflayoutlm.py +1 -1
- deepdoctection/pipe/base.py +29 -9
- deepdoctection/pipe/doctectionpipe.py +6 -5
- deepdoctection/train/hf_layoutlm_train.py +4 -3
- {deepdoctection-0.37.dist-info → deepdoctection-0.37.2.dist-info}/METADATA +1 -1
- {deepdoctection-0.37.dist-info → deepdoctection-0.37.2.dist-info}/RECORD +12 -12
- {deepdoctection-0.37.dist-info → deepdoctection-0.37.2.dist-info}/LICENSE +0 -0
- {deepdoctection-0.37.dist-info → deepdoctection-0.37.2.dist-info}/WHEEL +0 -0
- {deepdoctection-0.37.dist-info → deepdoctection-0.37.2.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py
CHANGED
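|
[the +1 -1 hunk for deepdoctection/__init__.py is not present in this extract]
|
deepdoctection/analyzer/factory.py
CHANGED
|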
@@ -327,9 +327,9 @@ class ServiceFactory:
|
|
|
327
327
|
)
|
|
328
328
|
if config.OCR.USE_TEXTRACT:
|
|
329
329
|
credentials_kwargs = {
|
|
330
|
-
"aws_access_key_id": environ.get("
|
|
331
|
-
"aws_secret_access_key": environ.get("
|
|
332
|
-
"config": Config(region_name=environ.get("
|
|
330
|
+
"aws_access_key_id": environ.get("AWS_ACCESS_KEY", None),
|
|
331
|
+
"aws_secret_access_key": environ.get("AWS_SECRET_KEY", None),
|
|
332
|
+
"config": Config(region_name=environ.get("AWS_REGION", None)),
|
|
333
333
|
}
|
|
334
334
|
return TextractOcrDetector(**credentials_kwargs)
|
|
335
335
|
raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
|
deepdoctection/datapoint/image.py
CHANGED
|
@@ -32,6 +32,7 @@ from numpy import uint8
|
|
|
32
32
|
|
|
33
33
|
from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDError
|
|
34
34
|
from ..utils.identifier import get_uuid, is_uuid_like
|
|
35
|
+
from ..utils.logger import LoggingRecord, logger
|
|
35
36
|
from ..utils.settings import ObjectTypes, SummaryType, get_type
|
|
36
37
|
from ..utils.types import ImageDict, PathLikeOrStr, PixelValues
|
|
37
38
|
from .annotation import Annotation, AnnotationMap, BoundingBox, CategoryAnnotation, ImageAnnotation
|
|
@@ -474,8 +475,13 @@ class Image:
|
|
|
474
475
|
|
|
475
476
|
for service_id in service_ids:
|
|
476
477
|
if service_id not in service_id_to_annotation_id:
|
|
477
|
-
|
|
478
|
-
|
|
478
|
+
logger.info(
|
|
479
|
+
LoggingRecord(
|
|
480
|
+
f"Service_id {service_id} for image_id: {self.image_id} not found. Skipping removal."
|
|
481
|
+
)
|
|
482
|
+
)
|
|
483
|
+
|
|
484
|
+
annotation_ids = service_id_to_annotation_id.get(service_id, [])
|
|
479
485
|
|
|
480
486
|
for ann_id in annotation_ids:
|
|
481
487
|
if ann_id not in ann_id_to_annotation_maps:
|
|
@@ -747,7 +753,7 @@ class Image:
|
|
|
747
753
|
if sub_cat.service_id:
|
|
748
754
|
service_id_dict[sub_cat.service_id].append(sub_cat.annotation_id)
|
|
749
755
|
if ann.image is not None:
|
|
750
|
-
for summary_cat_key in ann.image.summary:
|
|
756
|
+
for summary_cat_key in ann.image.summary.sub_categories:
|
|
751
757
|
summary_cat = ann.get_summary(summary_cat_key)
|
|
752
758
|
if summary_cat.service_id:
|
|
753
759
|
service_id_dict[summary_cat.service_id].append(summary_cat.annotation_id)
|
deepdoctection/extern/hflayoutlm.py
CHANGED
|
@@ -48,7 +48,7 @@ with try_import() as pt_import_guard:
|
|
|
48
48
|
import torch.nn.functional as F
|
|
49
49
|
|
|
50
50
|
with try_import() as tr_import_guard:
|
|
51
|
-
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD # type:
|
|
51
|
+
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD # type:ignore
|
|
52
52
|
from transformers import (
|
|
53
53
|
LayoutLMForSequenceClassification,
|
|
54
54
|
LayoutLMForTokenClassification,
|
deepdoctection/pipe/base.py
CHANGED
|
@@ -29,6 +29,7 @@ from uuid import uuid1
|
|
|
29
29
|
|
|
30
30
|
from ..dataflow import DataFlow, MapData
|
|
31
31
|
from ..datapoint.image import Image
|
|
32
|
+
from ..mapper.misc import curry
|
|
32
33
|
from ..utils.context import timed_operation
|
|
33
34
|
from ..utils.identifier import get_uuid_from_str
|
|
34
35
|
from ..utils.settings import ObjectTypes
|
|
@@ -247,17 +248,24 @@ class Pipeline(ABC):
|
|
|
247
248
|
"""
|
|
248
249
|
raise NotImplementedError()
|
|
249
250
|
|
|
250
|
-
|
|
251
|
+
@staticmethod
|
|
252
|
+
@curry
|
|
253
|
+
def _undo(dp: Image, service_ids: Optional[list[str]] = None) -> Image:
|
|
251
254
|
"""
|
|
252
|
-
|
|
255
|
+
Remove annotations from a datapoint
|
|
253
256
|
"""
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
257
|
+
dp.remove(service_ids=service_ids)
|
|
258
|
+
return dp
|
|
259
|
+
|
|
260
|
+
def undo(self, df: DataFlow, service_ids: Optional[set[str]] = None) -> DataFlow:
|
|
261
|
+
"""
|
|
262
|
+
Mapping a datapoint via `_undo` within a dataflow pipeline
|
|
263
|
+
|
|
264
|
+
:param df: An input dataflow of Images
|
|
265
|
+
:param service_ids: A set of service ids to remove
|
|
266
|
+
:return: A output dataflow of Images
|
|
267
|
+
"""
|
|
268
|
+
return MapData(df, self._undo(service_ids=service_ids))
|
|
261
269
|
|
|
262
270
|
@abstractmethod
|
|
263
271
|
def analyze(self, **kwargs: Any) -> DataFlow:
|
|
@@ -273,6 +281,18 @@ class Pipeline(ABC):
|
|
|
273
281
|
"""
|
|
274
282
|
raise NotImplementedError()
|
|
275
283
|
|
|
284
|
+
def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
|
|
285
|
+
"""
|
|
286
|
+
Composition of the backbone
|
|
287
|
+
"""
|
|
288
|
+
if session_id is None and self.set_session_id:
|
|
289
|
+
session_id = self.get_session_id()
|
|
290
|
+
for component in self.pipe_component_list:
|
|
291
|
+
component.timer_on = True
|
|
292
|
+
component.dp_manager.session_id = session_id
|
|
293
|
+
df = component.predict_dataflow(df)
|
|
294
|
+
return df
|
|
295
|
+
|
|
276
296
|
def get_meta_annotation(self) -> MetaAnnotation:
|
|
277
297
|
"""
|
|
278
298
|
Collects meta annotations from all pipeline components and summarizes the returned results
|
deepdoctection/pipe/doctectionpipe.py
CHANGED
|
@@ -161,16 +161,17 @@ class DoctectionPipe(Pipeline):
|
|
|
161
161
|
|
|
162
162
|
super().__init__(pipeline_component_list)
|
|
163
163
|
|
|
164
|
-
def _entry(
|
|
165
|
-
|
|
164
|
+
def _entry(
|
|
165
|
+
self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
|
|
166
|
+
) -> DataFlow:
|
|
166
167
|
path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes = _collect_from_kwargs(**kwargs)
|
|
167
168
|
|
|
168
169
|
df: DataFlow
|
|
169
170
|
|
|
170
171
|
if isinstance(b_bytes, bytes):
|
|
171
|
-
df = DoctectionPipe.bytes_to_dataflow(
|
|
172
|
-
|
|
173
|
-
|
|
172
|
+
df = DoctectionPipe.bytes_to_dataflow(
|
|
173
|
+
path=doc_path if path is None else path, b_bytes=b_bytes, file_type=file_type
|
|
174
|
+
)
|
|
174
175
|
|
|
175
176
|
elif isinstance(path, (str, Path)):
|
|
176
177
|
if not isinstance(file_type, (str, list)):
|
deepdoctection/train/hf_layoutlm_train.py
CHANGED
|
@@ -161,11 +161,12 @@ class LayoutLMTrainer(Trainer):
|
|
|
161
161
|
model: Union[PreTrainedModel, nn.Module],
|
|
162
162
|
args: TrainingArguments,
|
|
163
163
|
data_collator: LayoutLMDataCollator,
|
|
164
|
-
train_dataset:
|
|
164
|
+
train_dataset: DatasetAdapter,
|
|
165
|
+
eval_dataset: Optional[DatasetBase] = None,
|
|
165
166
|
):
|
|
166
167
|
self.evaluator: Optional[Evaluator] = None
|
|
167
168
|
self.build_eval_kwargs: Optional[dict[str, Any]] = None
|
|
168
|
-
super().__init__(model, args, data_collator, train_dataset)
|
|
169
|
+
super().__init__(model, args, data_collator, train_dataset, eval_dataset=eval_dataset)
|
|
169
170
|
|
|
170
171
|
def setup_evaluator(
|
|
171
172
|
self,
|
|
@@ -472,7 +473,7 @@ def train_hf_layoutlm(
|
|
|
472
473
|
max_batch_size=max_batch_size, # type: ignore
|
|
473
474
|
remove_bounding_box_features=remove_box_features,
|
|
474
475
|
)
|
|
475
|
-
trainer = LayoutLMTrainer(model, arguments, data_collator, dataset)
|
|
476
|
+
trainer = LayoutLMTrainer(model, arguments, data_collator, dataset, eval_dataset=dataset_val)
|
|
476
477
|
|
|
477
478
|
if arguments.evaluation_strategy in (IntervalStrategy.STEPS,):
|
|
478
479
|
assert metric is not None # silence mypy
|
{deepdoctection-0.37.dist-info → deepdoctection-0.37.2.dist-info}/RECORD
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
deepdoctection/__init__.py,sha256=
|
|
1
|
+
deepdoctection/__init__.py,sha256=T2sHOc6ZPpx44hWbarp0i_QlAqm0dEmzs7HVg2mL_nM,12655
|
|
2
2
|
deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
deepdoctection/analyzer/__init__.py,sha256=icClxrd20XutD6LxLgEPIWceSs4j_QfI3szCE-9BL2w,729
|
|
4
4
|
deepdoctection/analyzer/_config.py,sha256=NZl_REM8Ge2xfxvHN-mZR5KURcHfZii3xfMlKQwckbA,4864
|
|
5
5
|
deepdoctection/analyzer/dd.py,sha256=DUOhOtwipHw5nabYqn3WGR9aZcgP0ma_bi_tjf9xscw,5973
|
|
6
|
-
deepdoctection/analyzer/factory.py,sha256=
|
|
6
|
+
deepdoctection/analyzer/factory.py,sha256=dEUOtdBS3yQGLqMqLR_kq5EYCR3IE30DjHNzE0spoQE,31519
|
|
7
7
|
deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
|
|
8
8
|
deepdoctection/configs/conf_dd_one.yaml,sha256=td7XsyVhdXkhh5Pie7sT_WNjGTaxBOWgpxhkobHd1H0,2325
|
|
9
9
|
deepdoctection/configs/conf_tesseract.yaml,sha256=oF6szDyoi15FHvq7yFUNIEjfA_jNLhGxoowiRsz_zY4,35
|
|
@@ -19,7 +19,7 @@ deepdoctection/datapoint/__init__.py,sha256=3K406GbOPhoEp8koVaSbMocmSsmWifnSZ1SP
|
|
|
19
19
|
deepdoctection/datapoint/annotation.py,sha256=FEgz4COxVDfjic0gG7kS6iHnWLBIgFnquQ63Cbj2a4Y,22531
|
|
20
20
|
deepdoctection/datapoint/box.py,sha256=tkFuVM6xfx2jL7W4UED4qHXV572LSRdIsVJbrEiyIxI,23524
|
|
21
21
|
deepdoctection/datapoint/convert.py,sha256=O7920pIomyEkzXwxpFsrzfhn7Pl6UzVGhNzv90VcuKU,7099
|
|
22
|
-
deepdoctection/datapoint/image.py,sha256=
|
|
22
|
+
deepdoctection/datapoint/image.py,sha256=UDBKXJJpuKAUx0J-DGjvLGqrMV4N3kLpksJYxoVkong,33279
|
|
23
23
|
deepdoctection/datapoint/view.py,sha256=1rVMuqucCrI5zlwyXMADJQBV38V_zSNFqFyBi3cMA1E,44914
|
|
24
24
|
deepdoctection/datasets/__init__.py,sha256=-A3aR90aDsHPmVM35JavfnQ2itYSCn3ujl4krRni1QU,1076
|
|
25
25
|
deepdoctection/datasets/adapter.py,sha256=Ly_vbOAgVI73V41FUccnSX1ECTOyesW_qsuvQuvOZbw,7796
|
|
@@ -56,7 +56,7 @@ deepdoctection/extern/deskew.py,sha256=sPoixu8S9he-0wbs-jgxtPE2V9BiP4-3uZlb6F5Y1
|
|
|
56
56
|
deepdoctection/extern/doctrocr.py,sha256=T3_tvlih22_dVCBZypS1Y8tjQQB1fkAxIbGdUGHIapQ,24473
|
|
57
57
|
deepdoctection/extern/fastlang.py,sha256=F4gK-SEwcCujjxH327ZDzMGWToJ49xS_dCKcePQ9IlY,4780
|
|
58
58
|
deepdoctection/extern/hfdetr.py,sha256=1NPW_u5eH2tP3ixZ91l4WR-O-wLVcrFsLWA7BqID0oM,12055
|
|
59
|
-
deepdoctection/extern/hflayoutlm.py,sha256=
|
|
59
|
+
deepdoctection/extern/hflayoutlm.py,sha256=T1IBm3C8CtG97-tauo03YqhUac6xdFc2y345BWVMajQ,56509
|
|
60
60
|
deepdoctection/extern/hflm.py,sha256=kwS6kcSlY_2m9u0RzBLTRq-UMM7c1PhyUaDTvSdejus,9217
|
|
61
61
|
deepdoctection/extern/model.py,sha256=ViHHKPvbGmLCPw7ZESv_rmjlkA90UiBU6oZiHOMqNSw,59869
|
|
62
62
|
deepdoctection/extern/pdftext.py,sha256=KS_t27SUiYn_IOS_J2lF9lSSo22vLagxmxvYCY3CqXA,7228
|
|
@@ -103,10 +103,10 @@ deepdoctection/mapper/tpstruct.py,sha256=YNABRibvcISD5Lavg3jouoE4FMdqXEJoM-hNoB_
|
|
|
103
103
|
deepdoctection/mapper/xfundstruct.py,sha256=_3r3c0K82fnF2h1HxA85h-9ETYrHwcERa6MNc6Ko6Z8,8807
|
|
104
104
|
deepdoctection/pipe/__init__.py,sha256=ywTVoetftdL6plXg2YlBzMfmqBZupq7yXblSVyvvkcQ,1127
|
|
105
105
|
deepdoctection/pipe/anngen.py,sha256=3319l4aaXzcY4w6ItVBNPX8LGS5fHFDVtyVY9KMefac,16393
|
|
106
|
-
deepdoctection/pipe/base.py,sha256=
|
|
106
|
+
deepdoctection/pipe/base.py,sha256=ynNg5SSRuUVxN69VWOO3Oi7WSeGrYwn3A56NQMBJDvw,14222
|
|
107
107
|
deepdoctection/pipe/common.py,sha256=haOb4v0jLX3r41BSC8cVseX2E320_HkSrGlZsQiKE2g,17728
|
|
108
108
|
deepdoctection/pipe/concurrency.py,sha256=AAKRsVgaBEYNluntbDa46SBF1JZ_XqnWLDSWrNvAzEo,9657
|
|
109
|
-
deepdoctection/pipe/doctectionpipe.py,sha256=
|
|
109
|
+
deepdoctection/pipe/doctectionpipe.py,sha256=wCg96P9Pb54i5AVgG02b4FljobM64_qEML_GxiULy-4,11765
|
|
110
110
|
deepdoctection/pipe/language.py,sha256=5zI0UQC6Fh12_r2pfVL42HoCGz2hpHrOhpXAn5m-rYw,5451
|
|
111
111
|
deepdoctection/pipe/layout.py,sha256=xIhnJpyUSbvLbhTXyAKXY1hmG9352jihGYFSclTH_1g,5567
|
|
112
112
|
deepdoctection/pipe/lm.py,sha256=Sp-b7smeslNDyioEfNjuNBUxAuFKn3-OKpCZkGXri_c,16643
|
|
@@ -120,7 +120,7 @@ deepdoctection/pipe/transform.py,sha256=9Om7X7hJeL4jgUwHM1CHa4sb5v7Qo1PtVG0ls_3n
|
|
|
120
120
|
deepdoctection/train/__init__.py,sha256=YFTRAZF1F7cEAKTdAIi1BLyYb6rSRcwq09Ui5Lu8d6E,1071
|
|
121
121
|
deepdoctection/train/d2_frcnn_train.py,sha256=sFc_G-mEpaM8d1CCE0_6Gl4nBh11X2RYRBA3p_ylFJQ,16000
|
|
122
122
|
deepdoctection/train/hf_detr_train.py,sha256=8ydysxzOPE_IPoNFGaHb7PbKr9Nbl41rcY4lbylQavU,10783
|
|
123
|
-
deepdoctection/train/hf_layoutlm_train.py,sha256=
|
|
123
|
+
deepdoctection/train/hf_layoutlm_train.py,sha256=BNjPgPAvxm4beHULqzo58u-gW7GcTGiZAk2rF6TootM,22532
|
|
124
124
|
deepdoctection/train/tp_frcnn_train.py,sha256=pEpXokSVGveqo82pRnhnAmHPmjQ_8wQWpqM4ZyNHJgs,13049
|
|
125
125
|
deepdoctection/utils/__init__.py,sha256=brBceRWeov9WXMiJTjyJOF2rHMP8trGGRRjhMdZ61nI,2371
|
|
126
126
|
deepdoctection/utils/concurrency.py,sha256=nIhpkSncmv0LBB8PtcOLY-BsRGlfcDpz7foVdgzZd20,4598
|
|
@@ -141,8 +141,8 @@ deepdoctection/utils/transform.py,sha256=3kCgsEeRkG1efCdkfvj7tUFMs-e2jbjbflq826F
|
|
|
141
141
|
deepdoctection/utils/types.py,sha256=_3dmPdCIZNLbgU5QP5k_c5phDf18xLe1kYL6t2nM45s,2953
|
|
142
142
|
deepdoctection/utils/utils.py,sha256=csVs_VvCq4QBETPoE2JdTTL4MFYnD4xh-Js5vRb612g,6492
|
|
143
143
|
deepdoctection/utils/viz.py,sha256=Jf8ePNYWlpuyaS6SeTYQ4OyA3eNhtgjvAQZnGNdgHC0,27051
|
|
144
|
-
deepdoctection-0.37.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
|
|
145
|
-
deepdoctection-0.37.dist-info/METADATA,sha256=
|
|
146
|
-
deepdoctection-0.37.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
|
147
|
-
deepdoctection-0.37.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
|
|
148
|
-
deepdoctection-0.37.dist-info/RECORD,,
|
|
144
|
+
deepdoctection-0.37.2.dist-info/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
|
|
145
|
+
deepdoctection-0.37.2.dist-info/METADATA,sha256=XLOCkFBWynZhyZmKpDDRaomDIxPnVpy07WdUkodRF3Y,19545
|
|
146
|
+
deepdoctection-0.37.2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
|
147
|
+
deepdoctection-0.37.2.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
|
|
148
|
+
deepdoctection-0.37.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|