PyPI - deepdoctection - Versions diffs - 0.43.4__py3-none-any.whl → 0.43.6__py3-none-any.whl - Mend

deepdoctection 0.43.4__py3-none-any.whl → 0.43.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (11)

deepdoctection/__init__.py CHANGED Viewed

@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
 # pylint: enable=wrong-import-position
-__version__ = "0.43.4"
+__version__ = "0.43.6"
 _IMPORT_STRUCTURE = {
     "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],

deepdoctection/analyzer/config.py CHANGED Viewed

@@ -629,7 +629,7 @@ cfg.PT.ENFORCE_WEIGHTS.ITEM = True
 # Specifies the PyTorch model weights for item detection.
 # Use either .pt or .safetensors files.
-cfg.PT.ITEM.WEIGHTS = "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin"
+cfg.PT.ITEM.WEIGHTS = "deepdoctection/tatr_tab_struct_v2/model.safetensors"
 # Specifies the TorchScript model for item detection.
 # Use .ts files for deployment without model implementation dependencies.

deepdoctection/configs/profiles.jsonl CHANGED Viewed

@@ -30,3 +30,4 @@
 {"name": "Felix92/doctr-torch-parseq-multilingual-v1/pytorch_model.bin", "description": "", "size": [63286381], "tp_model": false, "config": "Felix92/doctr-torch-parseq-multilingual-v1/config.json", "preprocessor_config": null, "hf_repo_id": "Felix92/doctr-torch-parseq-multilingual-v1", "hf_model_name": "pytorch_model.bin", "hf_config_file": ["config.json"], "urls": null, "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "parseq", "padding": null}
 {"name": "doctr/crnn_vgg16_bn/pt/master-fde31e4a.pt", "description": "MASTER", "size": [63286381], "tp_model": false, "config": null, "preprocessor_config": null, "hf_repo_id": null, "hf_model_name": null, "hf_config_file": null, "urls": ["https://doctr-static.mindee.com/models?id=v0.7.0/master-fde31e4a.pt&src=0"], "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "master", "padding": null}
 {"name": "Aryn/deformable-detr-DocLayNet/model.safetensors", "description": "Deformable DEtection TRansformer (DETR), trained on DocLayNet (including 80k annotated pages in 11 classes).", "size": [115511753], "tp_model": false, "config": "Aryn/deformable-detr-DocLayNet/config.json", "preprocessor_config": "Aryn/deformable-detr-DocLayNet/preprocessor_config.json", "hf_repo_id": "Aryn/deformable-detr-DocLayNet", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "default_type", "2": "caption", "11": "text", "12": "title", "3": "footnote", "4": "formula", "5": "list_item", "6": "page_footer", "7": "page_header", "8": "figure", "9": "section_header", "10": "table"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
+{"name": "deepdoctection/tatr_tab_struct_v2/model.safetensors", "description": "Table Transformer (DETR) model trained on PubTables1M. It was introduced in the paper Aligning benchmark datasets for table structure recognition by Smock et al. This model is devoted to table structure recognition and assumes to receive a slightly croppedtable as input. It will predict rows, column and spanning cells. Use a padding of around 5 pixels. This artefact has been converted from deepdoctection/tatr_tab_struct_v2/pytorch_model.bin and should be used to reduce security issues", "size": [115511753], "tp_model": false, "config": "deepdoctection/tatr_tab_struct_v2/config.json", "preprocessor_config": "deepdoctection/tatr_tab_struct_v2/preprocessor_config.json", "hf_repo_id": "deepdoctection/tatr_tab_struct_v2", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "table", "2": "column", "3": "row", "4": "column_header", "5": "projected_row_header", "6": "spanning"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}

deepdoctection/datapoint/view.py CHANGED Viewed

@@ -195,7 +195,9 @@ class Word(ImageAnnotationBaseView):
         attr_names = (
             set(WordType)
             .union(super().get_attribute_names())
-            .union({Relationships.READING_ORDER, Relationships.LAYOUT_LINK, Relationships.LINK})
+            .union(
+                {Relationships.READING_ORDER, Relationships.LAYOUT_LINK, Relationships.LINK, Relationships.SUCCESSOR}
+            )
         )
         return {attr_name.value if isinstance(attr_name, ObjectTypes) else attr_name for attr_name in attr_names}
@@ -384,16 +386,10 @@ class Table(Layout):
         Returns:
             A list of a table cells.
         """
-        all_relation_ids = self.get_relationship(Relationships.CHILD)
-        cell_anns: list[Cell] = self.base_page.get_annotation(  # type: ignore
-            annotation_ids=all_relation_ids,
-            category_names=[
-                LayoutType.CELL,
-                CellType.HEADER,
-                CellType.BODY,
-                CellType.SPANNING,
-            ],
-        )
+        cell_anns: list[Cell] = []
+        for row_number in range(1, self.number_of_rows + 1):  # type: ignore
+            cell_anns.extend(self.row(row_number))  # type: ignore
         return cell_anns
     @property
@@ -592,6 +588,16 @@ class Table(Layout):
             )
         return table_list
+    @property
+    def csv_(self) -> list[list[list[Text_]]]:
+        cells = self.cells
+        table_list = [[[] for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)]  # type: ignore
+        for cell in cells:
+            table_list[cell.row_number - 1][cell.column_number - 1].append(cell.text_)  # type: ignore
+        return table_list
     def __str__(self) -> str:
         out = " ".join([" ".join(row + ["\n"]) for row in self.csv])
         return out
@@ -599,7 +605,13 @@ class Table(Layout):
     @property
     def text(self) -> str:
         try:
-            return str(self)
+            cells = self.cells
+            if not cells:
+                return super().text
+            text_list: list[str] = []
+            for cell in cells:
+                text_list.append(cell.text)
+            return " ".join(text_list)
         except (TypeError, AnnotationError):
             return super().text
@@ -616,7 +628,7 @@ class Table(Layout):
         token_class_ids: list[str] = []
         token_tag_ids: list[str] = []
         for cell in cells:
-            text.extend(cell.text_["text"])
+            text.append(cell.text_["text"])
             words.extend(cell.text_["words"])
             ann_ids.extend(cell.text_["ann_ids"])
             token_classes.extend(cell.text_["token_classes"])

deepdoctection/datasets/base.py CHANGED Viewed

@@ -484,7 +484,7 @@ class CustomDataset(DatasetBase):
         return DatasetInfo(
             name=self.name,
             type=self.type,
-            description=self.description if self.description is not None else "",
+            short_description=self.description if self.description is not None else "",
             license="",
             url="",
             splits={},

deepdoctection/extern/model.py CHANGED Viewed

@@ -306,7 +306,7 @@ class ModelCatalog:
 # Loading default profiles
 dd_profile_path = maybe_copy_config_to_cache(
-    get_package_path(), get_cache_dir_path(), "deepdoctection/configs/profiles.jsonl", False
+    get_package_path(), get_cache_dir_path(), "deepdoctection/configs/profiles.jsonl", True
 )
 ModelCatalog.load_profiles_from_file(dd_profile_path)
 # Additional profiles can be added

{deepdoctection-0.43.4.dist-info → deepdoctection-0.43.6.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deepdoctection
-Version: 0.43.4
+Version: 0.43.6
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer
@@ -168,13 +168,9 @@ Version `v.0.43` includes a significant redesign of the Analyzer's default confi
 </p>
 **deep**doctection is a Python library that orchestrates Scan and PDF document layout analysis and extraction for RAG.
 It also provides a framework for training, evaluating and inferencing Document AI models.
-Check the demo of a document layout analysis pipeline with OCR on 🤗
-[**Hugging Face spaces**](https://huggingface.co/spaces/deepdoctection/deepdoctection).
 # Overview
 - Document layout analysis and table recognition in PyTorch with
@@ -197,6 +193,54 @@ for an easy start.
 Check the [**release notes**](https://github.com/deepdoctection/deepdoctection/releases) for recent updates.
+----------------------------------------------------------------------------------------
+# Hugging Face Space Demo
+Check the demo of a document layout analysis pipeline with OCR on 🤗
+[**Hugging Face spaces**](https://huggingface.co/spaces/deepdoctection/deepdoctection) or use the gradio client.
+```
+pip install gradio_client   # requires Python >= 3.10
+```
+To process a single image:
+```python
+from gradio_client import Client, handle_file
+if __name__ == "__main__":
+    client = Client("deepdoctection/deepdoctection")
+    result = client.predict(
+        img=handle_file('/local_path/to/dir/file_name.jpeg'),  # accepts image files, e.g. JPEG, PNG
+        pdf=None,
+        max_datapoints = 2,
+        api_name = "/analyze_image"
+    )
+    print(result)
+```
+To process a PDF document:
+```python
+from gradio_client import Client, handle_file
+if __name__ == "__main__":
+    client = Client("deepdoctection/deepdoctection")
+    result = client.predict(
+        img=None,
+        pdf=handle_file("/local_path/to/dir/your_doc.pdf"),
+        max_datapoints = 2, # increase to process up to 9 pages
+        api_name = "/analyze_image"
+    )
+    print(result)
+```
+--------------------------------------------------------------------------------------------------------
 # Example
 ```python
@@ -242,8 +286,9 @@ alt="text" width="40%">
 </p>
+-----------------------------------------------------------------------------------------
-## Requirements
+# Requirements
 ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/install_01.png)
@@ -262,11 +307,13 @@ alt="text" width="40%">
 | DocTr | ✅ | ❌ | ✅ |
 | LayoutLM (v1, v2, v3, XLM) via Transformers | ✅ | ❌ | ❌ |
-## Installation
+------------------------------------------------------------------------------------------
+# Installation
 We recommend using a virtual environment.
-#### Get started installation
+## Get started installation
 For a simple setup which is enough to parse documents with the default setting, install the following:
@@ -274,7 +321,7 @@ For a simple setup which is enough to parse documents with the default setting,
 ```
 pip install transformers
-pip install python-doctr
+pip install python-doctr==0.9.0
 pip install deepdoctection
 ```
@@ -282,13 +329,13 @@ pip install deepdoctection
 ```
 pip install tensorpack
-pip install python-doctr
+pip install python-doctr==0.9.0
 pip install deepdoctection
 ```
 Both setups are sufficient to run the [**introduction notebook**](https://github.com/deepdoctection/notebooks/blob/main/Get_Started.ipynb).
-#### Full installation
+### Full installation
 The following installation will give you ALL models available within the Deep Learning framework as well as all models
 that are independent of Tensorflow/PyTorch.
@@ -318,7 +365,7 @@ pip install deepdoctection[tf]
 For further information, please consult the [**full installation instructions**](https://deepdoctection.readthedocs.io/en/latest/install/).
-### Installation from source
+## Installation from source
 Download the repository or clone via
@@ -341,8 +388,7 @@ pip install ".[tf]" # or "pip install -e .[tf]"
 ```
-### Running a Docker container from Docker hub
+## Running a Docker container from Docker hub
 Pre-existing Docker images can be downloaded from the [Docker hub](https://hub.docker.com/r/deepdoctection/deepdoctection).
@@ -360,16 +406,18 @@ docker compose up -d
 will start the container. There is no endpoint exposed, though.
-## Credits
+-----------------------------------------------------------------------------------------------
+# Credits
 We thank all libraries that provide high quality code and pre-trained models. Without, it would have been impossible
 to develop this framework.
-## If you like **deep**doctection ...
+# If you like **deep**doctection ...
 ...you can easily support the project by making it more visible. Leaving a star or a recommendation will help.
-## License
+# License
 Distributed under the Apache 2.0 License. Check [LICENSE](https://github.com/deepdoctection/deepdoctection/blob/master/LICENSE) for additional information.

{deepdoctection-0.43.4.dist-info → deepdoctection-0.43.6.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,13 @@
-deepdoctection/__init__.py,sha256=UftLKUS4Z03F_LCcON51Gx0XEKfCLp7VAw9MKysFLxQ,12964
+deepdoctection/__init__.py,sha256=AwVtfVry6NA7FttBvL3GdHV-05TK3fcvhx87bjHwWko,12964
 deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deepdoctection/analyzer/__init__.py,sha256=wg0BcFwdCeREwzZfa--Yx8HUJ9LPv5z5PmLwtkZdPH8,772
-deepdoctection/analyzer/config.py,sha256=DToaXs59w7SpEi2vkeBEyrBwyyGiXUST_N99wL9nHoI,41762
+deepdoctection/analyzer/config.py,sha256=lTfBKwzm9iVKCZoq7-FcoYUfrcgWmRknwYUzD5Jx-0U,41762
 deepdoctection/analyzer/dd.py,sha256=2BGvZpl9o9khcaOV52-DPHMrs0DsqUO8cpdqFVHHzDQ,5176
 deepdoctection/analyzer/factory.py,sha256=DI0S38KAG2sIROrSximsWJsMbem91a9zXaeWsDNvkGg,37574
 deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
 deepdoctection/configs/conf_dd_one.yaml,sha256=DHqAIKH3jRam54QO7qib2zutmpyFA8TqdV5UvIV191A,3688
 deepdoctection/configs/conf_tesseract.yaml,sha256=oF6szDyoi15FHvq7yFUNIEjfA_jNLhGxoowiRsz_zY4,35
-deepdoctection/configs/profiles.jsonl,sha256=zhMpsJWdfeSj2oq2J0BbiKhHnE7PIq47PA8-I1Th0pA,30266
+deepdoctection/configs/profiles.jsonl,sha256=8O1WTnsD0vhrdiY3RXzYoPe4mU_5C8TMafEjk0zHw9g,31438
 deepdoctection/dataflow/__init__.py,sha256=pY4lhjTes2BU-0AdIIRMnRqo9Sv6TopVE_SNfLmpgnc,828
 deepdoctection/dataflow/base.py,sha256=ZLRijyHI1J7tBfnE-q7eqUieYMMERjtK-c1oK40dBkk,6556
 deepdoctection/dataflow/common.py,sha256=DKD_pRZBCt2vO3oNZcOvdoC3jThabTNcNbTS16mpVR0,10351
@@ -21,10 +21,10 @@ deepdoctection/datapoint/annotation.py,sha256=f32BNmzUGJoNMeGst2RGC2jmjJpzzjxyBR
 deepdoctection/datapoint/box.py,sha256=QAS8sK2Ge4_ysW6zOYkLlzNwhSyw_mhYcYsxscClEno,31453
 deepdoctection/datapoint/convert.py,sha256=6ENXX3tBdY8ogb2NBPxsOsQMGnQux8ol5nrUfWS5tYE,7352
 deepdoctection/datapoint/image.py,sha256=kqwCz8DSc19hQpkl_4L1_Ek7_2KrH5KsV9e0S-R4n5w,35147
-deepdoctection/datapoint/view.py,sha256=YtoqafStrHqbfyD628-W1HOA2Gb0kUI2oaEiteBHjbA,57902
+deepdoctection/datapoint/view.py,sha256=cc-6WekGht1cU9Cgwfp7crwDlK71sUg_cCR48SmHrVY,58339
 deepdoctection/datasets/__init__.py,sha256=4ifjIwWCPYiS31GzUlVDScrkNOrb1eo5xHlRXNyg_58,994
 deepdoctection/datasets/adapter.py,sha256=VSLM_980aHi4TpgOxfxiBHiF_fUXyh348PXet6zTo-4,7779
-deepdoctection/datasets/base.py,sha256=HTIquJir2BZRTLl1HSQM0ICfvjIaWAjJeyz3BEHgdb0,23175
+deepdoctection/datasets/base.py,sha256=Qfh52aVtBd2df2ZY0hjLz4D1jrExnPuu_8uYpolVNks,23181
 deepdoctection/datasets/dataflow_builder.py,sha256=0vwkItr0wVbKPtTXoS6uJLO9QQNWbS0Ri7CySuywWxU,4186
 deepdoctection/datasets/info.py,sha256=DLRYq3cHp3L34CcSXPUJ8j8wguJp2aVdoH-AhODNLBA,20814
 deepdoctection/datasets/registry.py,sha256=qYRVycNYFeAzWB7jENGYzokgyzIEvTRb49he2UmPUe8,3451
@@ -59,7 +59,7 @@ deepdoctection/extern/fastlang.py,sha256=4D9A-_hTXUcvXG6IJJknX34LrD71v08XtNdWgvX
 deepdoctection/extern/hfdetr.py,sha256=N3eLNI5BsQS9_7YZyBeWndSgUydJij7ugZA9p4V1xaQ,14316
 deepdoctection/extern/hflayoutlm.py,sha256=3mZZ3byn00jSrLWO2vZFas9j4VrhbYQNmF1mwPG2ElQ,59642
 deepdoctection/extern/hflm.py,sha256=y-9brzmT2NYtFoNcWHABNg2ZZQXSOP9CyqtT1OoeV9U,9754
-deepdoctection/extern/model.py,sha256=-GbnuhLFq7jpBOvtpJe6IhGXxQdqwiM8epEd7IRELoU,18234
+deepdoctection/extern/model.py,sha256=kMIlx07_kdwZHLYB3QUG0DT_VSv2aZuKIIbv3fs0WqA,18233
 deepdoctection/extern/pdftext.py,sha256=ljzPQn3yYAlS6MoZqzixD-fO2GlHwu1aMiOQ6qMIzbg,7513
 deepdoctection/extern/tessocr.py,sha256=SuPmngsJg38riL4b09z6_FIzJH6H3RIwoighG2GPMYM,17457
 deepdoctection/extern/texocr.py,sha256=wVOuu6eUGao0mUbC8vrgdCsKfY1GqA1Am9560YgWyXU,5915
@@ -142,8 +142,8 @@ deepdoctection/utils/transform.py,sha256=jgeCyQWLN9q79jCGW7jysyKUKcJ1AVMk8OslF-3
 deepdoctection/utils/types.py,sha256=ti4WdtIJSg3TGK_YPkkoY9PYGMnR2tTX6Xfik8U1pNk,2986
 deepdoctection/utils/utils.py,sha256=NBUb1qbx8Jm-AvYN1Sdbk0huXhbAKxZ-ZtOcMespsMM,7064
 deepdoctection/utils/viz.py,sha256=bujRIujvX317rPz4jBrj0yd3WP8wPjDUiI5GUrw9MzQ,27339
-deepdoctection-0.43.4.dist-info/licenses/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
-deepdoctection-0.43.4.dist-info/METADATA,sha256=Rq3g8AYO5ClbhHXAHJVh7YkpeP22PBwpB_TN57TbSOI,13389
-deepdoctection-0.43.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-deepdoctection-0.43.4.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
-deepdoctection-0.43.4.dist-info/RECORD,,
+deepdoctection-0.43.6.dist-info/licenses/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
+deepdoctection-0.43.6.dist-info/METADATA,sha256=tOH9kBOEUncGe0sI1hU-h28u4gIHcqVjzbBYV3PiJ8I,14796
+deepdoctection-0.43.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+deepdoctection-0.43.6.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
+deepdoctection-0.43.6.dist-info/RECORD,,

{deepdoctection-0.43.4.dist-info → deepdoctection-0.43.6.dist-info}/WHEEL RENAMED Viewed

File without changes

{deepdoctection-0.43.4.dist-info → deepdoctection-0.43.6.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{deepdoctection-0.43.4.dist-info → deepdoctection-0.43.6.dist-info}/top_level.txt RENAMED Viewed

File without changes

deepdoctection 0.43.4__py3-none-any.whl → 0.43.6__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.43.4__py3-none-any.whl → 0.43.6__py3-none-any.whl