deepdoctection 0.44.1__tar.gz → 0.46__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of deepdoctection might be problematic.
- {deepdoctection-0.44.1 → deepdoctection-0.46}/PKG-INFO +16 -21
- {deepdoctection-0.44.1 → deepdoctection-0.46}/README.md +6 -4
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/__init__.py +7 -3
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/analyzer/config.py +44 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/analyzer/factory.py +264 -7
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/configs/profiles.jsonl +2 -1
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/dataflow/parallel_map.py +7 -1
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datapoint/box.py +5 -5
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datapoint/image.py +5 -5
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datapoint/view.py +73 -52
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/eval/cocometric.py +1 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/__init__.py +1 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/base.py +8 -1
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/d2detect.py +1 -1
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/doctrocr.py +18 -2
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/fastlang.py +2 -2
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/hflayoutlm.py +17 -10
- deepdoctection-0.46/deepdoctection/extern/hflm.py +689 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tessocr.py +17 -1
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/language.py +4 -4
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/lm.py +7 -3
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/order.py +12 -6
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/refine.py +10 -1
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/text.py +6 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/transform.py +3 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/file_utils.py +34 -5
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/logger.py +38 -1
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/settings.py +2 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/transform.py +43 -18
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/viz.py +24 -15
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection.egg-info/PKG-INFO +16 -21
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection.egg-info/requires.txt +9 -16
- {deepdoctection-0.44.1 → deepdoctection-0.46}/setup.py +7 -8
- deepdoctection-0.44.1/deepdoctection/extern/hflm.py +0 -264
- {deepdoctection-0.44.1 → deepdoctection-0.46}/LICENSE +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/analyzer/dd.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/configs/conf_dd_one.yaml +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/dataflow/common.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/dataflow/custom.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/dataflow/custom_serialize.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/dataflow/serialize.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/dataflow/stats.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datapoint/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datapoint/annotation.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datapoint/convert.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/adapter.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/base.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/dataflow_builder.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/info.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/instances/doclaynet.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/instances/fintabnet.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/instances/funsd.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/instances/layouttest.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/instances/publaynet.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/instances/xfund.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/datasets/save.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/eval/base.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/eval/eval.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/eval/tedsmetric.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/eval/tp_eval_callback.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/deskew.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/hfdetr.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/model.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/pdftext.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/pt/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/pt/nms.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/pt/ptutils.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/texocr.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tfutils.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpcompat.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/extern/tpdetect.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/cats.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/cocostruct.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/d2struct.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/hfstruct.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/laylmstruct.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/maputils.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/match.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/misc.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/pascalstruct.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/prodigystruct.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/pubstruct.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/tpstruct.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/mapper/xfundstruct.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/anngen.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/base.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/common.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/concurrency.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/doctectionpipe.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/layout.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/segment.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/pipe/sub_layout.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/train/d2_frcnn_train.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/train/hf_detr_train.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/train/hf_layoutlm_train.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/train/tp_frcnn_train.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/concurrency.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/context.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/develop.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/env_info.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/fs.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/identifier.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/metacfg.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/mocks.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/pdf_utils.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/tqdm.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/types.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection/utils/utils.py +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection.egg-info/SOURCES.txt +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/deepdoctection.egg-info/top_level.txt +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/setup.cfg +0 -0
- {deepdoctection-0.44.1 → deepdoctection-0.46}/tests/test_utils.py +0 -0

PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deepdoctection
-Version: 0.44.1
+Version: 0.46
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer
@@ -19,18 +19,15 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: catalogue==2.0.10
 Requires-Dist: huggingface_hub>=0.26.0
-Requires-Dist: importlib-metadata>=5.0.0
 Requires-Dist: jsonlines==3.1.0
 Requires-Dist: lazy-imports==0.3.1
 Requires-Dist: mock==4.0.3
-Requires-Dist:
-Requires-Dist: numpy<2.0,>=1.21
+Requires-Dist: numpy>2.0
 Requires-Dist: packaging>=20.0
 Requires-Dist: Pillow>=10.0.0
 Requires-Dist: pypdf>=6.0.0
 Requires-Dist: pypdfium2>=4.30.0
 Requires-Dist: pyyaml>=6.0.1
-Requires-Dist: pyzmq>=16
 Requires-Dist: scipy>=1.13.1
 Requires-Dist: termcolor>=1.1
 Requires-Dist: tabulate>=0.7.7
@@ -38,18 +35,15 @@ Requires-Dist: tqdm>=4.64.0
 Provides-Extra: tf
 Requires-Dist: catalogue==2.0.10; extra == "tf"
 Requires-Dist: huggingface_hub>=0.26.0; extra == "tf"
-Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
 Requires-Dist: jsonlines==3.1.0; extra == "tf"
 Requires-Dist: lazy-imports==0.3.1; extra == "tf"
 Requires-Dist: mock==4.0.3; extra == "tf"
-Requires-Dist:
-Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
+Requires-Dist: numpy>2.0; extra == "tf"
 Requires-Dist: packaging>=20.0; extra == "tf"
 Requires-Dist: Pillow>=10.0.0; extra == "tf"
 Requires-Dist: pypdf>=6.0.0; extra == "tf"
 Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
 Requires-Dist: pyyaml>=6.0.1; extra == "tf"
-Requires-Dist: pyzmq>=16; extra == "tf"
 Requires-Dist: scipy>=1.13.1; extra == "tf"
 Requires-Dist: termcolor>=1.1; extra == "tf"
 Requires-Dist: tabulate>=0.7.7; extra == "tf"
@@ -58,30 +52,28 @@ Requires-Dist: tensorpack==0.11; extra == "tf"
 Requires-Dist: protobuf==3.20.1; extra == "tf"
 Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
 Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
-Requires-Dist: python-doctr==0.
+Requires-Dist: python-doctr==0.10.0; extra == "tf"
 Requires-Dist: pycocotools>=2.0.2; extra == "tf"
 Requires-Dist: boto3==1.34.102; extra == "tf"
 Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
-Requires-Dist:
+Requires-Dist: pyzmq>=16; extra == "tf"
 Requires-Dist: jdeskew>=0.2.2; extra == "tf"
 Requires-Dist: apted==1.0.3; extra == "tf"
 Requires-Dist: distance==0.1.3; extra == "tf"
 Requires-Dist: lxml>=4.9.1; extra == "tf"
+Requires-Dist: networkx>=2.7.1; extra == "tf"
 Provides-Extra: pt
 Requires-Dist: catalogue==2.0.10; extra == "pt"
 Requires-Dist: huggingface_hub>=0.26.0; extra == "pt"
-Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
 Requires-Dist: jsonlines==3.1.0; extra == "pt"
 Requires-Dist: lazy-imports==0.3.1; extra == "pt"
 Requires-Dist: mock==4.0.3; extra == "pt"
-Requires-Dist:
-Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
+Requires-Dist: numpy>2.0; extra == "pt"
 Requires-Dist: packaging>=20.0; extra == "pt"
 Requires-Dist: Pillow>=10.0.0; extra == "pt"
 Requires-Dist: pypdf>=6.0.0; extra == "pt"
 Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
 Requires-Dist: pyyaml>=6.0.1; extra == "pt"
-Requires-Dist: pyzmq>=16; extra == "pt"
 Requires-Dist: scipy>=1.13.1; extra == "pt"
 Requires-Dist: termcolor>=1.1; extra == "pt"
 Requires-Dist: tabulate>=0.7.7; extra == "pt"
@@ -89,15 +81,16 @@ Requires-Dist: tqdm>=4.64.0; extra == "pt"
 Requires-Dist: timm>=0.9.16; extra == "pt"
 Requires-Dist: transformers>=4.48.0; extra == "pt"
 Requires-Dist: accelerate>=0.29.1; extra == "pt"
-Requires-Dist: python-doctr==0.
+Requires-Dist: python-doctr==0.10.0; extra == "pt"
 Requires-Dist: pycocotools>=2.0.2; extra == "pt"
 Requires-Dist: boto3==1.34.102; extra == "pt"
 Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
-Requires-Dist:
+Requires-Dist: pyzmq>=16; extra == "pt"
 Requires-Dist: jdeskew>=0.2.2; extra == "pt"
 Requires-Dist: apted==1.0.3; extra == "pt"
 Requires-Dist: distance==0.1.3; extra == "pt"
 Requires-Dist: lxml>=4.9.1; extra == "pt"
+Requires-Dist: networkx>=2.7.1; extra == "pt"
 Provides-Extra: docs
 Requires-Dist: tensorpack==0.11; extra == "docs"
 Requires-Dist: boto3==1.34.102; extra == "docs"
@@ -183,7 +176,8 @@ It also provides a framework for training, evaluating and inferencing Document A
 [**LiLT**](https://github.com/jpWang/LiLT) and selected
 [**Bert**](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)-style including features like sliding windows.
 - Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
-- Language detection with [**fastText**](https://github.com/facebookresearch/fastText)
+- Language detection with `papluca/xlm-roberta-base-language-detection`. [**fastText**](https://github.com/facebookresearch/fastText) is still available
+  but will be removed in a future version.
 - Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
 - Fine-tuning and evaluation tools.
 - Lot's of [tutorials](https://github.com/deepdoctection/notebooks)
@@ -294,7 +288,7 @@ alt="text" width="40%">
 
 - Linux or macOS. Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available.
 - Python >= 3.9
-- 2.
+- 2.6 \<= PyTorch **or** 2.11 \<= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
   Tensorflow support will be stopped from Python 3.11 onwards.
 - To fine-tune models, a GPU is recommended.
 
@@ -321,7 +315,7 @@ For a simple setup which is enough to parse documents with the default setting,
 
 ```
 pip install transformers
-pip install python-doctr==0.
+pip install python-doctr==0.10.0 # If you use Python 3.10 or higher you can use the latest version.
 pip install deepdoctection
 ```
 
@@ -329,8 +323,9 @@ pip install deepdoctection
 
 ```
 pip install tensorpack
-pip install python-doctr==0.9.0
 pip install deepdoctection
+pip install "numpy>=1.21,<2.0" --upgrade --force-reinstall # because TF 2.11 does not support numpy 2.0
+pip install "python-doctr==0.9.0"
 ```
 
 Both setups are sufficient to run the [**introduction notebook**](https://github.com/deepdoctection/notebooks/blob/main/Get_Started.ipynb).
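The most consequential packaging changes above: the numpy pin flips from `numpy<2.0,>=1.21` to `numpy>2.0`, `importlib-metadata` is dropped, `pyzmq` moves from the core requirements into the `tf`/`pt` extras, and `networkx>=2.7.1` is added to both extras. A minimal post-upgrade sanity check (a sketch, not part of the package; `packaging` is already a core dependency):

```
# Sketch: verify the environment matches the new 0.46 pins after upgrading.
from importlib.metadata import version

from packaging.specifiers import SpecifierSet

assert version("deepdoctection") == "0.46"
# 0.46 requires numpy>2.0; environments still on numpy 1.x must be upgraded,
# except for the TF 2.11 setup described in the README below.
assert version("numpy") in SpecifierSet(">2.0")
```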

README.md

@@ -40,7 +40,8 @@ It also provides a framework for training, evaluating and inferencing Document A
 [**LiLT**](https://github.com/jpWang/LiLT) and selected
 [**Bert**](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)-style including features like sliding windows.
 - Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
-- Language detection with [**fastText**](https://github.com/facebookresearch/fastText)
+- Language detection with `papluca/xlm-roberta-base-language-detection`. [**fastText**](https://github.com/facebookresearch/fastText) is still available
+  but will be removed in a future version.
 - Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
 - Fine-tuning and evaluation tools.
 - Lot's of [tutorials](https://github.com/deepdoctection/notebooks)
@@ -151,7 +152,7 @@ alt="text" width="40%">
 
 - Linux or macOS. Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available.
 - Python >= 3.9
-- 2.
+- 2.6 \<= PyTorch **or** 2.11 \<= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
   Tensorflow support will be stopped from Python 3.11 onwards.
 - To fine-tune models, a GPU is recommended.
 
@@ -178,7 +179,7 @@ For a simple setup which is enough to parse documents with the default setting,
 
 ```
 pip install transformers
-pip install python-doctr==0.
+pip install python-doctr==0.10.0 # If you use Python 3.10 or higher you can use the latest version.
 pip install deepdoctection
 ```
 
@@ -186,8 +187,9 @@ pip install deepdoctection
 
 ```
 pip install tensorpack
-pip install python-doctr==0.9.0
 pip install deepdoctection
+pip install "numpy>=1.21,<2.0" --upgrade --force-reinstall # because TF 2.11 does not support numpy 2.0
+pip install "python-doctr==0.9.0"
 ```
 
 Both setups are sufficient to run the [**introduction notebook**](https://github.com/deepdoctection/notebooks/blob/main/Get_Started.ipynb).

deepdoctection/__init__.py

@@ -25,11 +25,10 @@ from .utils.logger import LoggingRecord, logger
 
 # pylint: enable=wrong-import-position
 
-__version__ = "0.44.1"
+__version__ = "0.46"
 
 _IMPORT_STRUCTURE = {
-    "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
-    "configs": ["update_cfg_from_defaults"],
+    "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory", "update_cfg_from_defaults"],
     "dataflow": [
         "DataFlowTerminated",
         "DataFlowResetStateNotCalled",
@@ -186,7 +185,9 @@ _IMPORT_STRUCTURE = {
         "HFLayoutLmv3SequenceClassifier",
         "HFLiltTokenClassifier",
         "HFLiltSequenceClassifier",
+        "HFLmTokenClassifier",
         "HFLmSequenceClassifier",
+        "HFLmLanguageDetector",
         "ModelProfile",
         "ModelCatalog",
         "print_model_infos",
@@ -270,6 +271,7 @@ _IMPORT_STRUCTURE = {
         "MultiThreadPipelineComponent",
         "DoctectionPipe",
         "LanguageDetectionService",
+        "skip_if_category_or_service_extracted",
         "ImageLayoutService",
         "LMTokenClassifierService",
         "LMSequenceClassifierService",
@@ -309,12 +311,14 @@ _IMPORT_STRUCTURE = {
         "get_tensorpack_requirement",
         "pytorch_available",
         "get_pytorch_requirement",
+        "pyzmq_available",
         "lxml_available",
         "get_lxml_requirement",
         "apted_available",
         "get_apted_requirement",
         "distance_available",
         "get_distance_requirement",
+        "networkx_available",
        "numpy_v1_available",
         "get_numpy_v1_requirement",
         "transformers_available",
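Because `_IMPORT_STRUCTURE` drives the package's lazy top-level exports, the names registered above become importable from the package root. A sketch of what 0.46 newly exposes (names taken from this diff):

```
# Sketch: newly registered lazy exports in 0.46.
from deepdoctection import (
    HFLmLanguageDetector,                   # new transformer-based language detector
    HFLmTokenClassifier,                    # new Bert-style token classifier wrapper
    pyzmq_available,                        # pyzmq is optional from 0.46 on
    skip_if_category_or_service_extracted,
)

print(pyzmq_available())  # False is fine; parallel dataflows raise only at use time
```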

deepdoctection/analyzer/config.py

@@ -520,6 +520,16 @@ cfg.USE_LAYOUT_LINK = False
 # (e.g., by grouping orphan text containers). Only applicable if list items were previously grouped.
 cfg.USE_LINE_MATCHER = False
 
+# Enables a sequence classification pipeline component, e.g. a LayoutLM or a Bert-like model.
+cfg.USE_LM_SEQUENCE_CLASS = False
+
+# Enables a token classification pipeline component, e.g. a LayoutLM or Bert-like model
+cfg.USE_LM_TOKEN_CLASS = False
+
+# Specifies the selection of the rotation model. There are two models available: A rotation estimator
+# based on Tesseract ('tesseract'), and a rotation estimator based on DocTr ('doctr').
+cfg.ROTATOR.MODEL = "tesseract"
+
 # Relevant when LIB = TF. Specifies the layout detection model.
 # This model should detect multiple or single objects across an entire page.
 # Currently, only one default model is supported.
@@ -899,6 +909,40 @@ cfg.LAYOUT_LINK.PARENTAL_CATEGORIES = [LayoutType.FIGURE, LayoutType.TABLE]
 # These are typically smaller or subordinate elements (e.g., captions).
 cfg.LAYOUT_LINK.CHILD_CATEGORIES = [LayoutType.CAPTION]
 
+
+# Weights configuration for sequence classifier. This will be a fine-tuned version of a LayoutLM, LayoutLMv2,
+# LayoutXLM, LayoutLMv3, LiLT or Roberta base model for sequence classification.
+cfg.LM_SEQUENCE_CLASS.WEIGHTS = None
+
+# When predicting document classes, it might be possible that some pages are empty or do not contain any text, in
+# which case the model will be unable to predict anything. If set to `True` it will
+# assign images with no features the category `TokenClasses.OTHER`.
+cfg.LM_SEQUENCE_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY = False
+
+# Weights configuration for token classifier. This will be a fine-tuned version of a LayoutLM, LayoutLMv2,
+# LayoutXLM, LayoutLMv3, LiLT or Roberta base model for token classification.
+cfg.LM_TOKEN_CLASS.WEIGHTS = None
+
+# When predicting token classes, it might be possible that some words might not get sent to the model because they are
+# categorized as not eligible token (e.g. empty string). If set to `True` it will assign all words without token
+# as `TokenClasses.OTHER`.
+cfg.LM_TOKEN_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY = False
+
+# Using bounding boxes of segments instead of words might improve model accuracy
+# for models that have been trained on segments rather than words (e.g. LiLT, LayoutLMv3).
+# Choose a single or a sequence of layout segments to use their bounding boxes. Note,
+# that the layout segments need to have a child-relationship with words. If a word
+# does not appear as child, it will use the word bounding box.
+cfg.LM_TOKEN_CLASS.SEGMENT_POSITIONS = None
+
+# If the output of the `tokenizer` exceeds the `max_length` sequence length, a
+# sliding window will be created with each window having `max_length` sequence
+# input. When using `SLIDING_WINDOW_STRIDE=0` no strides will be created,
+# otherwise it will create slides with windows shifted `SLIDING_WINDOW_STRIDE` to
+# the right.
+cfg.LM_TOKEN_CLASS.SLIDING_WINDOW_STRIDE = 0
+
+
 # Freezes the configuration to make it immutable.
 # This prevents accidental modification at runtime.
 cfg.freeze()
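These flags are consumed by `ServiceFactory.build_analyzer` (see the factory diff below). A hedged usage sketch via `get_dd_analyzer`'s `config_overwrite` mechanism; the weights value is a placeholder, since `LM_SEQUENCE_CLASS.WEIGHTS` ships with no default:

```
import deepdoctection as dd

# Sketch: enabling the new 0.46 options. "<repo_id>/model.safetensors" is a
# placeholder for a fine-tuned sequence classifier registered in the
# ModelCatalog, not a shipped default.
analyzer = dd.get_dd_analyzer(
    config_overwrite=[
        "USE_ROTATOR=True",
        "ROTATOR.MODEL=doctr",  # 'tesseract' (default) or 'doctr'
        "USE_LM_SEQUENCE_CLASS=True",
        "LM_SEQUENCE_CLASS.WEIGHTS=<repo_id>/model.safetensors",
    ]
)
```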

deepdoctection/analyzer/factory.py

@@ -19,16 +19,29 @@
 `ServiceFactory` for building analyzers
 """
 
+from __future__ import annotations
 
 from os import environ
-from typing import Union
+from typing import TYPE_CHECKING, Literal, Union
 
 from lazy_imports import try_import
 
 from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner
 from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
-from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
+from ..extern.doctrocr import DocTrRotationTransformer, DoctrTextlineDetector, DoctrTextRecognizer
 from ..extern.hfdetr import HFDetrDerivedDetector
+from ..extern.hflayoutlm import (
+    HFLayoutLmSequenceClassifier,
+    HFLayoutLmTokenClassifier,
+    HFLayoutLmv2SequenceClassifier,
+    HFLayoutLmv2TokenClassifier,
+    HFLayoutLmv3SequenceClassifier,
+    HFLayoutLmv3TokenClassifier,
+    HFLiltSequenceClassifier,
+    HFLiltTokenClassifier,
+    get_tokenizer_from_model_class,
+)
+from ..extern.hflm import HFLmSequenceClassifier, HFLmTokenClassifier
 from ..extern.model import ModelCatalog, ModelDownloadManager
 from ..extern.pdftext import PdfPlumberTextDetector
 from ..extern.tessocr import TesseractOcrDetector, TesseractRotationTransformer
@@ -45,6 +58,7 @@ from ..pipe.common (
 )
 from ..pipe.doctectionpipe import DoctectionPipe
 from ..pipe.layout import ImageLayoutService, skip_if_category_or_service_extracted
+from ..pipe.lm import LMSequenceClassifierService, LMTokenClassifierService
 from ..pipe.order import TextOrderService
 from ..pipe.refine import TableSegmentationRefinementService
 from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
@@ -60,6 +74,11 @@ from ..utils.transform import PadTransform
 with try_import() as image_guard:
     from botocore.config import Config  # type: ignore
 
+if TYPE_CHECKING:
+    from ..extern.hflayoutlm import LayoutSequenceModels, LayoutTokenModels
+    from ..extern.hflm import LmSequenceModels, LmTokenModels
+
+RotationTransformer = Union[TesseractRotationTransformer, DocTrRotationTransformer]
 
 __all__ = [
     "ServiceFactory",
@@ -172,24 +191,32 @@ class ServiceFactory:
         return ServiceFactory._build_layout_detector(config, mode)
 
     @staticmethod
-    def _build_rotation_detector() -> TesseractRotationTransformer:
+    def _build_rotation_detector(rotator_name: Literal["tesseract", "doctr"]) -> RotationTransformer:
         """
         Building a rotation detector.
 
         Returns:
             TesseractRotationTransformer: Rotation detector instance.
         """
-        return TesseractRotationTransformer()
+
+        if rotator_name == "tesseract":
+            return TesseractRotationTransformer()
+        if rotator_name == "doctr":
+            return DocTrRotationTransformer()
+        raise ValueError(
+            f"You have chosen rotator_name: {rotator_name} which is not allowed. Only tesseract or "
+            f"doctr are allowed."
+        )
 
     @staticmethod
-    def build_rotation_detector() -> TesseractRotationTransformer:
+    def build_rotation_detector(rotator_name: Literal["tesseract", "doctr"]) -> RotationTransformer:
         """
         Building a rotation detector.
 
         Returns:
             TesseractRotationTransformer: Rotation detector instance.
         """
-        return ServiceFactory._build_rotation_detector()
+        return ServiceFactory._build_rotation_detector(rotator_name)
 
     @staticmethod
     def _build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
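The rotator used by `USE_ROTATOR` is now selectable via `cfg.ROTATOR.MODEL`; anything other than the two literals raises. A short sketch of calling the new builder directly:

```
from deepdoctection.analyzer.factory import ServiceFactory

rotator = ServiceFactory.build_rotation_detector("doctr")      # DocTrRotationTransformer
rotator = ServiceFactory.build_rotation_detector("tesseract")  # TesseractRotationTransformer
ServiceFactory.build_rotation_detector("foo")                  # raises ValueError
```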
@@ -841,6 +868,226 @@ class ServiceFactory:
         """
         return ServiceFactory._build_text_order_service(config)
 
+    @staticmethod
+    def _build_sequence_classifier(config: AttrDict) -> Union[LayoutSequenceModels, LmSequenceModels]:
+        """
+        Builds and returns a sequence classifier instance.
+
+        Args:
+            config: Configuration object that determines the type of sequence classifier to construct.
+
+        Returns:
+            A sequence classifier instance constructed according to the specified configuration.
+        """
+        config_path = ModelCatalog.get_full_path_configs(config.LM_SEQUENCE_CLASS.WEIGHTS)
+        weights_path = ModelDownloadManager.maybe_download_weights_and_configs(config.LM_SEQUENCE_CLASS.WEIGHTS)
+        profile = ModelCatalog.get_profile(config.LM_SEQUENCE_CLASS.WEIGHTS)
+        categories = profile.categories if profile.categories is not None else {}
+        use_xlm_tokenizer = "xlm_tokenizer" == profile.architecture
+
+        if profile.model_wrapper in ("HFLayoutLmSequenceClassifier",):
+            return HFLayoutLmSequenceClassifier(
+                path_config_json=config_path,
+                path_weights=weights_path,
+                categories=categories,
+                device=config.DEVICE,
+                use_xlm_tokenizer=use_xlm_tokenizer,
+            )
+        if profile.model_wrapper in ("HFLayoutLmv2SequenceClassifier",):
+            return HFLayoutLmv2SequenceClassifier(
+                path_config_json=config_path,
+                path_weights=weights_path,
+                categories=categories,
+                device=config.DEVICE,
+                use_xlm_tokenizer=use_xlm_tokenizer,
+            )
+        if profile.model_wrapper in ("HFLayoutLmv3SequenceClassifier",):
+            return HFLayoutLmv3SequenceClassifier(
+                path_config_json=config_path,
+                path_weights=weights_path,
+                categories=categories,
+                device=config.DEVICE,
+                use_xlm_tokenizer=use_xlm_tokenizer,
+            )
+        if profile.model_wrapper in ("HFLiltSequenceClassifier",):
+            return HFLiltSequenceClassifier(
+                path_config_json=config_path,
+                path_weights=weights_path,
+                categories=categories,
+                device=config.DEVICE,
+                use_xlm_tokenizer=use_xlm_tokenizer,
+            )
+        if profile.model_wrapper in ("HFLmSequenceClassifier",):
+            return HFLmSequenceClassifier(
+                path_config_json=config_path,
+                path_weights=weights_path,
+                categories=categories,
+                device=config.DEVICE,
+                use_xlm_tokenizer=use_xlm_tokenizer,
+            )
+        raise ValueError(f"Unsupported model wrapper: {profile.model_wrapper}")
+
+    @staticmethod
+    def build_sequence_classifier(config: AttrDict) -> Union[LayoutSequenceModels, LmSequenceModels]:
+        """
+        Builds and returns a sequence classifier instance.
+
+        Args:
+            config: Configuration object that determines the type of sequence classifier to construct.
+
+        Returns:
+            A sequence classifier instance constructed according to the specified configuration.
+        """
+        return ServiceFactory._build_sequence_classifier(config)
+
+    @staticmethod
+    def _build_sequence_classifier_service(
+        config: AttrDict, sequence_classifier: Union[LayoutSequenceModels, LmSequenceModels]
+    ) -> LMSequenceClassifierService:
+        """
+        Building a sequence classifier service.
+
+        Args:
+            config: Configuration object.
+            sequence_classifier: Sequence classifier instance.
+
+        Returns:
+            LMSequenceClassifierService: Sequence classifier service instance.
+        """
+        tokenizer_fast = get_tokenizer_from_model_class(
+            sequence_classifier.model.__class__.__name__, sequence_classifier.use_xlm_tokenizer
+        )
+
+        return LMSequenceClassifierService(
+            tokenizer=tokenizer_fast,
+            language_model=sequence_classifier,
+            use_other_as_default_category=config.LM_SEQUENCE_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY,
+        )
+
+    @staticmethod
+    def build_sequence_classifier_service(
+        config: AttrDict, sequence_classifier: Union[LayoutSequenceModels, LmSequenceModels]
+    ) -> LMSequenceClassifierService:
+        """
+        Building a sequence classifier service.
+
+        Args:
+            config: Configuration object.
+            sequence_classifier: Sequence classifier instance.
+
+        Returns:
+            LMSequenceClassifierService: Sequence classifier service instance.
+        """
+        return ServiceFactory._build_sequence_classifier_service(config, sequence_classifier)
+
+    @staticmethod
+    def _build_token_classifier(config: AttrDict) -> Union[LayoutTokenModels, LmTokenModels]:
+        """
+        Builds and returns a token classifier model.
+
+        Args:
+            config: Configuration object.
+
+        Returns:
+            The instantiated token classifier model.
+        """
+        config_path = ModelCatalog.get_full_path_configs(config.LM_TOKEN_CLASS.WEIGHTS)
+        weights_path = ModelDownloadManager.maybe_download_weights_and_configs(config.LM_TOKEN_CLASS.WEIGHTS)
+        profile = ModelCatalog.get_profile(config.LM_TOKEN_CLASS.WEIGHTS)
+        categories = profile.categories if profile.categories is not None else {}
+        use_xlm_tokenizer = "xlm_tokenizer" == profile.architecture
+        if profile.model_wrapper in ("HFLayoutLmTokenClassifier",):
+            return HFLayoutLmTokenClassifier(
+                path_config_json=config_path,
+                path_weights=weights_path,
+                categories=categories,
+                device=config.DEVICE,
+                use_xlm_tokenizer=use_xlm_tokenizer,
+            )
+        if profile.model_wrapper in ("HFLayoutLmv2TokenClassifier",):
+            return HFLayoutLmv2TokenClassifier(
+                path_config_json=config_path,
+                path_weights=weights_path,
+                categories=categories,
+                device=config.DEVICE,
+            )
+        if profile.model_wrapper in ("HFLayoutLmv3TokenClassifier",):
+            return HFLayoutLmv3TokenClassifier(
+                path_config_json=config_path,
+                path_weights=weights_path,
+                categories=categories,
+                device=config.DEVICE,
+            )
+        if profile.model_wrapper in ("HFLiltTokenClassifier",):
+            return HFLiltTokenClassifier(
+                path_config_json=config_path,
+                path_weights=weights_path,
+                categories=categories,
+                device=config.DEVICE,
+            )
+        if profile.model_wrapper in ("HFLmTokenClassifier",):
+            return HFLmTokenClassifier(
+                path_config_json=config_path,
+                path_weights=weights_path,
+                categories=categories,
+            )
+        raise ValueError(f"Unsupported model wrapper: {profile.model_wrapper}")
+
+    @staticmethod
+    def build_token_classifier(config: AttrDict) -> Union[LayoutTokenModels, LmTokenModels]:
+        """
+        Builds and returns a token classifier model.
+
+        Args:
+            config: Configuration object.
+
+        Returns:
+            The instantiated token classifier model.
+        """
+        return ServiceFactory._build_token_classifier(config)
+
+    @staticmethod
+    def _build_token_classifier_service(
+        config: AttrDict, token_classifier: Union[LayoutTokenModels, LmTokenModels]
+    ) -> LMTokenClassifierService:
+        """
+        Building a token classifier service.
+
+        Args:
+            config: Configuration object.
+            token_classifier: Token classifier instance.
+
+        Returns:
+            A LMTokenClassifierService instance.
+        """
+        tokenizer_fast = get_tokenizer_from_model_class(
+            token_classifier.model.__class__.__name__, token_classifier.use_xlm_tokenizer
+        )
+
+        return LMTokenClassifierService(
+            tokenizer=tokenizer_fast,
+            language_model=token_classifier,
+            use_other_as_default_category=config.LM_TOKEN_CLASS.USE_OTHER_AS_DEFAULT_CATEGORY,
+            segment_positions=config.LM_TOKEN_CLASS.SEGMENT_POSITIONS,
+            sliding_window_stride=config.LM_TOKEN_CLASS.SLIDING_WINDOW_STRIDE,
+        )
+
+    @staticmethod
+    def build_token_classifier_service(
+        config: AttrDict, token_classifier: Union[LayoutTokenModels, LmTokenModels]
+    ) -> LMTokenClassifierService:
+        """
+        Building a token classifier service.
+
+        Args:
+            config: Configuration object.
+            token_classifier: Token classifier instance.
+
+        Returns:
+            A LMTokenClassifierService instance.
+        """
+        return ServiceFactory._build_token_classifier_service(config, token_classifier)
+
     @staticmethod
     def _build_page_parsing_service(config: AttrDict) -> PageParsingService:
         """
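Both builder pairs follow the same catalog-driven dispatch: `ModelCatalog.get_profile` resolves the `WEIGHTS` key to a model profile, the profile's `model_wrapper` string selects the wrapper class, and the service then pairs the model with a fast tokenizer via `get_tokenizer_from_model_class`. A condensed sketch of that dispatch, with a hypothetical catalog entry:

```
from deepdoctection.extern.hflm import HFLmTokenClassifier
from deepdoctection.extern.model import ModelCatalog, ModelDownloadManager

# Sketch of the dispatch above. "<repo_id>/model.safetensors" is hypothetical;
# 0.46 ships no default token-classification weights.
weights = "<repo_id>/model.safetensors"
profile = ModelCatalog.get_profile(weights)
if profile.model_wrapper == "HFLmTokenClassifier":
    classifier = HFLmTokenClassifier(
        path_config_json=ModelCatalog.get_full_path_configs(weights),
        path_weights=ModelDownloadManager.maybe_download_weights_and_configs(weights),
        categories=profile.categories or {},
    )
```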
@@ -885,7 +1132,7 @@ class ServiceFactory:
         pipe_component_list: list[PipelineComponent] = []
 
         if config.USE_ROTATOR:
-            rotation_detector = ServiceFactory.build_rotation_detector()
+            rotation_detector = ServiceFactory.build_rotation_detector(config.ROTATOR.MODEL)
             transform_service = ServiceFactory.build_transform_service(transform_predictor=rotation_detector)
             pipe_component_list.append(transform_service)
 
@@ -955,6 +1202,16 @@
         line_list_matching_service = ServiceFactory.build_line_matching_service(config)
         pipe_component_list.append(line_list_matching_service)
 
+        if config.USE_LM_SEQUENCE_CLASS:
+            sequence_classifier = ServiceFactory.build_sequence_classifier(config)
+            sequence_classifier_service = ServiceFactory.build_sequence_classifier_service(config, sequence_classifier)
+            pipe_component_list.append(sequence_classifier_service)
+
+        if config.USE_LM_TOKEN_CLASS:
+            token_classifier = ServiceFactory.build_token_classifier(config)
+            token_classifier_service = ServiceFactory.build_token_classifier_service(config, token_classifier)
+            pipe_component_list.append(token_classifier_service)
+
         page_parsing_service = ServiceFactory.build_page_parsing_service(config)
 
         return DoctectionPipe(pipeline_component_list=pipe_component_list, page_parsing_service=page_parsing_service)

deepdoctection/configs/profiles.jsonl

@@ -30,4 +30,5 @@
 {"name": "Felix92/doctr-torch-parseq-multilingual-v1/pytorch_model.bin", "description": "", "size": [63286381], "tp_model": false, "config": "Felix92/doctr-torch-parseq-multilingual-v1/config.json", "preprocessor_config": null, "hf_repo_id": "Felix92/doctr-torch-parseq-multilingual-v1", "hf_model_name": "pytorch_model.bin", "hf_config_file": ["config.json"], "urls": null, "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "parseq", "padding": null}
 {"name": "doctr/crnn_vgg16_bn/pt/master-fde31e4a.pt", "description": "MASTER", "size": [63286381], "tp_model": false, "config": null, "preprocessor_config": null, "hf_repo_id": null, "hf_model_name": null, "hf_config_file": null, "urls": ["https://doctr-static.mindee.com/models?id=v0.7.0/master-fde31e4a.pt&src=0"], "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "master", "padding": null}
 {"name": "Aryn/deformable-detr-DocLayNet/model.safetensors", "description": "Deformable DEtection TRansformer (DETR), trained on DocLayNet (including 80k annotated pages in 11 classes).", "size": [115511753], "tp_model": false, "config": "Aryn/deformable-detr-DocLayNet/config.json", "preprocessor_config": "Aryn/deformable-detr-DocLayNet/preprocessor_config.json", "hf_repo_id": "Aryn/deformable-detr-DocLayNet", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "default_type", "2": "caption", "11": "text", "12": "title", "3": "footnote", "4": "formula", "5": "list_item", "6": "page_footer", "7": "page_header", "8": "figure", "9": "section_header", "10": "table"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
-{"name": "deepdoctection/tatr_tab_struct_v2/model.safetensors", "description": "Table Transformer (DETR) model trained on PubTables1M. It was introduced in the paper Aligning benchmark datasets for table structure recognition by Smock et al. This model is devoted to table structure recognition and assumes to receive a slightly croppedtable as input. It will predict rows, column and spanning cells. Use a padding of around 5 pixels. This artefact has been converted from deepdoctection/tatr_tab_struct_v2/pytorch_model.bin and should be used to reduce security issues", "size": [115511753], "tp_model": false, "config": "deepdoctection/tatr_tab_struct_v2/config.json", "preprocessor_config": "deepdoctection/tatr_tab_struct_v2/preprocessor_config.json", "hf_repo_id": "deepdoctection/tatr_tab_struct_v2", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "table", "2": "column", "3": "row", "4": "column_header", "5": "projected_row_header", "6": "spanning"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
+{"name": "deepdoctection/tatr_tab_struct_v2/model.safetensors", "description": "Table Transformer (DETR) model trained on PubTables1M. It was introduced in the paper Aligning benchmark datasets for table structure recognition by Smock et al. This model is devoted to table structure recognition and assumes to receive a slightly croppedtable as input. It will predict rows, column and spanning cells. Use a padding of around 5 pixels. This artefact has been converted from deepdoctection/tatr_tab_struct_v2/pytorch_model.bin and should be used to reduce security issues", "size": [115511753], "tp_model": false, "config": "deepdoctection/tatr_tab_struct_v2/config.json", "preprocessor_config": "deepdoctection/tatr_tab_struct_v2/preprocessor_config.json", "hf_repo_id": "deepdoctection/tatr_tab_struct_v2", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "table", "2": "column", "3": "row", "4": "column_header", "5": "projected_row_header", "6": "spanning"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
+{"name": "papluca/xlm-roberta-base-language-detection/model.safetensors", "description": "This model is an XLM-RoBERTa transformer model with a classification head on top (i.e. a linear layer on top of the pooled output). For additional information please refer to the xlm-roberta-base model card or to the paper Unsupervised Cross-lingual Representation Learning at Scale by Conneau et al.", "size": [101971449], "tp_model": false, "config": "papluca/xlm-roberta-base-language-detection/config.json", "preprocessor_config": null, "hf_repo_id": "papluca/xlm-roberta-base-language-detection", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json"], "urls": null, "categories": {"1": "jpn", "2": "dut", "3": "ara", "4": "pol", "5": "deu", "6": "ita", "7": "por", "8": "tur", "9": "spa", "10": "hin", "11": "gre", "12": "urd", "13": "bul", "14": "eng", "15": "fre", "16": "chi", "17": "rus", "18": "tha", "19": "swa", "20": "vie"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFLmLanguageDetector", "architecture": null, "padding": null}
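The one new catalog entry is the `papluca/xlm-roberta-base-language-detection` profile backing the new language detector. It resolves like any other profile; a sketch (the download call fetches `model.safetensors` from the Hugging Face Hub on first use):

```
from deepdoctection.extern.model import ModelCatalog, ModelDownloadManager

name = "papluca/xlm-roberta-base-language-detection/model.safetensors"
profile = ModelCatalog.get_profile(name)
assert profile.model_wrapper == "HFLmLanguageDetector"
weights_path = ModelDownloadManager.maybe_download_weights_and_configs(name)
```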

deepdoctection/dataflow/parallel_map.py

@@ -24,15 +24,19 @@ from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from typing import Any, Callable, Iterator, no_type_check
 
-import zmq
+from lazy_imports import try_import
 
 from ..utils.concurrency import StoppableThread, enable_death_signal, start_proc_mask_signal
 from ..utils.error import DataFlowTerminatedError
+from ..utils.file_utils import pyzmq_available
 from ..utils.logger import LoggingRecord, logger
 from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
 from .common import RepeatedData
 from .serialize import PickleSerializer
 
+with try_import() as import_guard:
+    import zmq
+
 
 @no_type_check
 def del_weakref(x):
@@ -77,6 +81,8 @@ def _get_pipe_name(name):
 
 class _ParallelMapData(ProxyDataFlow, ABC):
     def __init__(self, df: DataFlow, buffer_size: int, strict: bool = False) -> None:
+        if not pyzmq_available():
+            raise ModuleNotFoundError("pyzmq is required for running parallel dataflows (multiprocess/multithread).")
         super().__init__(df)
         if buffer_size <= 0:
            raise ValueError(f"buffer_size must be a positive number, got {buffer_size}")
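Net effect of this change: the module now imports cleanly without pyzmq and only fails when a parallel dataflow is actually constructed. A sketch of the new failure mode, assuming pyzmq is absent (class names and signatures assumed from the tensorpack-derived dataflow API):

```
from deepdoctection.dataflow import CustomDataFromList, MultiProcessMapData

df = CustomDataFromList(lst=[{"x": 1}, {"x": 2}])
try:
    # _ParallelMapData.__init__ now guards on pyzmq_available()
    df = MultiProcessMapData(df, num_proc=2, map_func=lambda dp: dp)
except ModuleNotFoundError as err:
    print(err)  # pyzmq is required for running parallel dataflows ...
```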