deepdoctection-0.43.5-py3-none-any.whl → deepdoctection-0.43.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +1 -1
- deepdoctection/analyzer/config.py +1 -1
- deepdoctection/configs/profiles.jsonl +1 -0
- deepdoctection/datapoint/view.py +25 -13
- deepdoctection/extern/model.py +1 -1
- {deepdoctection-0.43.5.dist-info → deepdoctection-0.43.6.dist-info}/METADATA +3 -3
- {deepdoctection-0.43.5.dist-info → deepdoctection-0.43.6.dist-info}/RECORD +10 -10
- {deepdoctection-0.43.5.dist-info → deepdoctection-0.43.6.dist-info}/WHEEL +0 -0
- {deepdoctection-0.43.5.dist-info → deepdoctection-0.43.6.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.43.5.dist-info → deepdoctection-0.43.6.dist-info}/top_level.txt +0 -0
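deepdoctection/__init__.py
CHANGED
(hunk not captured in this extract; the file list above reports +1 -1)
deepdoctection/analyzer/config.py
CHANGED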
|
@@ -629,7 +629,7 @@ cfg.PT.ENFORCE_WEIGHTS.ITEM = True
|
|
|
629
629
|
|
|
630
630
|
# Specifies the PyTorch model weights for item detection.
|
|
631
631
|
# Use either .pt or .safetensors files.
|
|
632
|
-
cfg.PT.ITEM.WEIGHTS = "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin"
|
|
632
|
+
cfg.PT.ITEM.WEIGHTS = "deepdoctection/tatr_tab_struct_v2/model.safetensors"
|
|
633
633
|
|
|
634
634
|
# Specifies the TorchScript model for item detection.
|
|
635
635
|
# Use .ts files for deployment without model implementation dependencies.
|
|
deepdoctection/configs/profiles.jsonl
CHANGED
|
@@ -30,3 +30,4 @@
|
|
|
30
30
|
{"name": "Felix92/doctr-torch-parseq-multilingual-v1/pytorch_model.bin", "description": "", "size": [63286381], "tp_model": false, "config": "Felix92/doctr-torch-parseq-multilingual-v1/config.json", "preprocessor_config": null, "hf_repo_id": "Felix92/doctr-torch-parseq-multilingual-v1", "hf_model_name": "pytorch_model.bin", "hf_config_file": ["config.json"], "urls": null, "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "parseq", "padding": null}
|
|
31
31
|
{"name": "doctr/crnn_vgg16_bn/pt/master-fde31e4a.pt", "description": "MASTER", "size": [63286381], "tp_model": false, "config": null, "preprocessor_config": null, "hf_repo_id": null, "hf_model_name": null, "hf_config_file": null, "urls": ["https://doctr-static.mindee.com/models?id=v0.7.0/master-fde31e4a.pt&src=0"], "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "master", "padding": null}
|
|
32
32
|
{"name": "Aryn/deformable-detr-DocLayNet/model.safetensors", "description": "Deformable DEtection TRansformer (DETR), trained on DocLayNet (including 80k annotated pages in 11 classes).", "size": [115511753], "tp_model": false, "config": "Aryn/deformable-detr-DocLayNet/config.json", "preprocessor_config": "Aryn/deformable-detr-DocLayNet/preprocessor_config.json", "hf_repo_id": "Aryn/deformable-detr-DocLayNet", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "default_type", "2": "caption", "11": "text", "12": "title", "3": "footnote", "4": "formula", "5": "list_item", "6": "page_footer", "7": "page_header", "8": "figure", "9": "section_header", "10": "table"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
|
|
33
|
+
{"name": "deepdoctection/tatr_tab_struct_v2/model.safetensors", "description": "Table Transformer (DETR) model trained on PubTables1M. It was introduced in the paper Aligning benchmark datasets for table structure recognition by Smock et al. This model is devoted to table structure recognition and assumes to receive a slightly croppedtable as input. It will predict rows, column and spanning cells. Use a padding of around 5 pixels. This artefact has been converted from deepdoctection/tatr_tab_struct_v2/pytorch_model.bin and should be used to reduce security issues", "size": [115511753], "tp_model": false, "config": "deepdoctection/tatr_tab_struct_v2/config.json", "preprocessor_config": "deepdoctection/tatr_tab_struct_v2/preprocessor_config.json", "hf_repo_id": "deepdoctection/tatr_tab_struct_v2", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "table", "2": "column", "3": "row", "4": "column_header", "5": "projected_row_header", "6": "spanning"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
|
deepdoctection/datapoint/view.py
CHANGED
|
@@ -195,7 +195,9 @@ class Word(ImageAnnotationBaseView):
|
|
|
195
195
|
attr_names = (
|
|
196
196
|
set(WordType)
|
|
197
197
|
.union(super().get_attribute_names())
|
|
198
|
-
.union(
|
|
198
|
+
.union(
|
|
199
|
+
{Relationships.READING_ORDER, Relationships.LAYOUT_LINK, Relationships.LINK, Relationships.SUCCESSOR}
|
|
200
|
+
)
|
|
199
201
|
)
|
|
200
202
|
return {attr_name.value if isinstance(attr_name, ObjectTypes) else attr_name for attr_name in attr_names}
|
|
201
203
|
|
|
@@ -384,16 +386,10 @@ class Table(Layout):
|
|
|
384
386
|
Returns:
|
|
385
387
|
A list of a table cells.
|
|
386
388
|
"""
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
LayoutType.CELL,
|
|
392
|
-
CellType.HEADER,
|
|
393
|
-
CellType.BODY,
|
|
394
|
-
CellType.SPANNING,
|
|
395
|
-
],
|
|
396
|
-
)
|
|
389
|
+
cell_anns: list[Cell] = []
|
|
390
|
+
for row_number in range(1, self.number_of_rows + 1): # type: ignore
|
|
391
|
+
cell_anns.extend(self.row(row_number)) # type: ignore
|
|
392
|
+
|
|
397
393
|
return cell_anns
|
|
398
394
|
|
|
399
395
|
@property
|
|
@@ -592,6 +588,16 @@ class Table(Layout):
|
|
|
592
588
|
)
|
|
593
589
|
return table_list
|
|
594
590
|
|
|
591
|
+
@property
|
|
592
|
+
def csv_(self) -> list[list[list[Text_]]]:
|
|
593
|
+
cells = self.cells
|
|
594
|
+
table_list = [[[] for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)] # type: ignore
|
|
595
|
+
for cell in cells:
|
|
596
|
+
table_list[cell.row_number - 1][cell.column_number - 1].append(cell.text_) # type: ignore
|
|
597
|
+
return table_list
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
|
|
595
601
|
def __str__(self) -> str:
|
|
596
602
|
out = " ".join([" ".join(row + ["\n"]) for row in self.csv])
|
|
597
603
|
return out
|
|
@@ -599,7 +605,13 @@ class Table(Layout):
|
|
|
599
605
|
@property
|
|
600
606
|
def text(self) -> str:
|
|
601
607
|
try:
|
|
602
|
-
|
|
608
|
+
cells = self.cells
|
|
609
|
+
if not cells:
|
|
610
|
+
return super().text
|
|
611
|
+
text_list: list[str] = []
|
|
612
|
+
for cell in cells:
|
|
613
|
+
text_list.append(cell.text)
|
|
614
|
+
return " ".join(text_list)
|
|
603
615
|
except (TypeError, AnnotationError):
|
|
604
616
|
return super().text
|
|
605
617
|
|
|
@@ -616,7 +628,7 @@ class Table(Layout):
|
|
|
616
628
|
token_class_ids: list[str] = []
|
|
617
629
|
token_tag_ids: list[str] = []
|
|
618
630
|
for cell in cells:
|
|
619
|
-
text.
|
|
631
|
+
text.append(cell.text_["text"])
|
|
620
632
|
words.extend(cell.text_["words"])
|
|
621
633
|
ann_ids.extend(cell.text_["ann_ids"])
|
|
622
634
|
token_classes.extend(cell.text_["token_classes"])
|
deepdoctection/extern/model.py
CHANGED
|
@@ -306,7 +306,7 @@ class ModelCatalog:
|
|
|
306
306
|
|
|
307
307
|
# Loading default profiles
|
|
308
308
|
dd_profile_path = maybe_copy_config_to_cache(
|
|
309
|
-
get_package_path(), get_cache_dir_path(), "deepdoctection/configs/profiles.jsonl",
|
|
309
|
+
get_package_path(), get_cache_dir_path(), "deepdoctection/configs/profiles.jsonl", True
|
|
310
310
|
)
|
|
311
311
|
ModelCatalog.load_profiles_from_file(dd_profile_path)
|
|
312
312
|
# Additional profiles can be added
|
|
{deepdoctection-0.43.5.dist-info → deepdoctection-0.43.6.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.43.5
|
|
3
|
+
Version: 0.43.6
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -321,7 +321,7 @@ For a simple setup which is enough to parse documents with the default setting,
|
|
|
321
321
|
|
|
322
322
|
```
|
|
323
323
|
pip install transformers
|
|
324
|
-
pip install python-doctr
|
|
324
|
+
pip install python-doctr==0.9.0
|
|
325
325
|
pip install deepdoctection
|
|
326
326
|
```
|
|
327
327
|
|
|
@@ -329,7 +329,7 @@ pip install deepdoctection
|
|
|
329
329
|
|
|
330
330
|
```
|
|
331
331
|
pip install tensorpack
|
|
332
|
-
pip install python-doctr
|
|
332
|
+
pip install python-doctr==0.9.0
|
|
333
333
|
pip install deepdoctection
|
|
334
334
|
```
|
|
335
335
|
|
|
{deepdoctection-0.43.5.dist-info → deepdoctection-0.43.6.dist-info}/RECORD
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
deepdoctection/__init__.py,sha256=
|
|
1
|
+
deepdoctection/__init__.py,sha256=AwVtfVry6NA7FttBvL3GdHV-05TK3fcvhx87bjHwWko,12964
|
|
2
2
|
deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
deepdoctection/analyzer/__init__.py,sha256=wg0BcFwdCeREwzZfa--Yx8HUJ9LPv5z5PmLwtkZdPH8,772
|
|
4
|
-
deepdoctection/analyzer/config.py,sha256=
|
|
4
|
+
deepdoctection/analyzer/config.py,sha256=lTfBKwzm9iVKCZoq7-FcoYUfrcgWmRknwYUzD5Jx-0U,41762
|
|
5
5
|
deepdoctection/analyzer/dd.py,sha256=2BGvZpl9o9khcaOV52-DPHMrs0DsqUO8cpdqFVHHzDQ,5176
|
|
6
6
|
deepdoctection/analyzer/factory.py,sha256=DI0S38KAG2sIROrSximsWJsMbem91a9zXaeWsDNvkGg,37574
|
|
7
7
|
deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
|
|
8
8
|
deepdoctection/configs/conf_dd_one.yaml,sha256=DHqAIKH3jRam54QO7qib2zutmpyFA8TqdV5UvIV191A,3688
|
|
9
9
|
deepdoctection/configs/conf_tesseract.yaml,sha256=oF6szDyoi15FHvq7yFUNIEjfA_jNLhGxoowiRsz_zY4,35
|
|
10
|
-
deepdoctection/configs/profiles.jsonl,sha256=
|
|
10
|
+
deepdoctection/configs/profiles.jsonl,sha256=8O1WTnsD0vhrdiY3RXzYoPe4mU_5C8TMafEjk0zHw9g,31438
|
|
11
11
|
deepdoctection/dataflow/__init__.py,sha256=pY4lhjTes2BU-0AdIIRMnRqo9Sv6TopVE_SNfLmpgnc,828
|
|
12
12
|
deepdoctection/dataflow/base.py,sha256=ZLRijyHI1J7tBfnE-q7eqUieYMMERjtK-c1oK40dBkk,6556
|
|
13
13
|
deepdoctection/dataflow/common.py,sha256=DKD_pRZBCt2vO3oNZcOvdoC3jThabTNcNbTS16mpVR0,10351
|
|
@@ -21,7 +21,7 @@ deepdoctection/datapoint/annotation.py,sha256=f32BNmzUGJoNMeGst2RGC2jmjJpzzjxyBR
|
|
|
21
21
|
deepdoctection/datapoint/box.py,sha256=QAS8sK2Ge4_ysW6zOYkLlzNwhSyw_mhYcYsxscClEno,31453
|
|
22
22
|
deepdoctection/datapoint/convert.py,sha256=6ENXX3tBdY8ogb2NBPxsOsQMGnQux8ol5nrUfWS5tYE,7352
|
|
23
23
|
deepdoctection/datapoint/image.py,sha256=kqwCz8DSc19hQpkl_4L1_Ek7_2KrH5KsV9e0S-R4n5w,35147
|
|
24
|
-
deepdoctection/datapoint/view.py,sha256=
|
|
24
|
+
deepdoctection/datapoint/view.py,sha256=cc-6WekGht1cU9Cgwfp7crwDlK71sUg_cCR48SmHrVY,58339
|
|
25
25
|
deepdoctection/datasets/__init__.py,sha256=4ifjIwWCPYiS31GzUlVDScrkNOrb1eo5xHlRXNyg_58,994
|
|
26
26
|
deepdoctection/datasets/adapter.py,sha256=VSLM_980aHi4TpgOxfxiBHiF_fUXyh348PXet6zTo-4,7779
|
|
27
27
|
deepdoctection/datasets/base.py,sha256=Qfh52aVtBd2df2ZY0hjLz4D1jrExnPuu_8uYpolVNks,23181
|
|
@@ -59,7 +59,7 @@ deepdoctection/extern/fastlang.py,sha256=4D9A-_hTXUcvXG6IJJknX34LrD71v08XtNdWgvX
|
|
|
59
59
|
deepdoctection/extern/hfdetr.py,sha256=N3eLNI5BsQS9_7YZyBeWndSgUydJij7ugZA9p4V1xaQ,14316
|
|
60
60
|
deepdoctection/extern/hflayoutlm.py,sha256=3mZZ3byn00jSrLWO2vZFas9j4VrhbYQNmF1mwPG2ElQ,59642
|
|
61
61
|
deepdoctection/extern/hflm.py,sha256=y-9brzmT2NYtFoNcWHABNg2ZZQXSOP9CyqtT1OoeV9U,9754
|
|
62
|
-
deepdoctection/extern/model.py,sha256
|
|
62
|
+
deepdoctection/extern/model.py,sha256=kMIlx07_kdwZHLYB3QUG0DT_VSv2aZuKIIbv3fs0WqA,18233
|
|
63
63
|
deepdoctection/extern/pdftext.py,sha256=ljzPQn3yYAlS6MoZqzixD-fO2GlHwu1aMiOQ6qMIzbg,7513
|
|
64
64
|
deepdoctection/extern/tessocr.py,sha256=SuPmngsJg38riL4b09z6_FIzJH6H3RIwoighG2GPMYM,17457
|
|
65
65
|
deepdoctection/extern/texocr.py,sha256=wVOuu6eUGao0mUbC8vrgdCsKfY1GqA1Am9560YgWyXU,5915
|
|
@@ -142,8 +142,8 @@ deepdoctection/utils/transform.py,sha256=jgeCyQWLN9q79jCGW7jysyKUKcJ1AVMk8OslF-3
|
|
|
142
142
|
deepdoctection/utils/types.py,sha256=ti4WdtIJSg3TGK_YPkkoY9PYGMnR2tTX6Xfik8U1pNk,2986
|
|
143
143
|
deepdoctection/utils/utils.py,sha256=NBUb1qbx8Jm-AvYN1Sdbk0huXhbAKxZ-ZtOcMespsMM,7064
|
|
144
144
|
deepdoctection/utils/viz.py,sha256=bujRIujvX317rPz4jBrj0yd3WP8wPjDUiI5GUrw9MzQ,27339
|
|
145
|
-
deepdoctection-0.43.
|
|
146
|
-
deepdoctection-0.43.
|
|
147
|
-
deepdoctection-0.43.
|
|
148
|
-
deepdoctection-0.43.
|
|
149
|
-
deepdoctection-0.43.
|
|
145
|
+
deepdoctection-0.43.6.dist-info/licenses/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
|
|
146
|
+
deepdoctection-0.43.6.dist-info/METADATA,sha256=tOH9kBOEUncGe0sI1hU-h28u4gIHcqVjzbBYV3PiJ8I,14796
|
|
147
|
+
deepdoctection-0.43.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
148
|
+
deepdoctection-0.43.6.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
|
|
149
|
+
deepdoctection-0.43.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|