deepdoctection 0.43.5__py3-none-any.whl → 0.43.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
25
25
 
26
26
  # pylint: enable=wrong-import-position
27
27
 
28
- __version__ = "0.43.5"
28
+ __version__ = "0.43.6"
29
29
 
30
30
  _IMPORT_STRUCTURE = {
31
31
  "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
@@ -629,7 +629,7 @@ cfg.PT.ENFORCE_WEIGHTS.ITEM = True
629
629
 
630
630
  # Specifies the PyTorch model weights for item detection.
631
631
  # Use either .pt or .safetensors files.
632
- cfg.PT.ITEM.WEIGHTS = "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin"
632
+ cfg.PT.ITEM.WEIGHTS = "deepdoctection/tatr_tab_struct_v2/model.safetensors"
633
633
 
634
634
  # Specifies the TorchScript model for item detection.
635
635
  # Use .ts files for deployment without model implementation dependencies.
@@ -30,3 +30,4 @@
30
30
  {"name": "Felix92/doctr-torch-parseq-multilingual-v1/pytorch_model.bin", "description": "", "size": [63286381], "tp_model": false, "config": "Felix92/doctr-torch-parseq-multilingual-v1/config.json", "preprocessor_config": null, "hf_repo_id": "Felix92/doctr-torch-parseq-multilingual-v1", "hf_model_name": "pytorch_model.bin", "hf_config_file": ["config.json"], "urls": null, "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "parseq", "padding": null}
31
31
  {"name": "doctr/crnn_vgg16_bn/pt/master-fde31e4a.pt", "description": "MASTER", "size": [63286381], "tp_model": false, "config": null, "preprocessor_config": null, "hf_repo_id": null, "hf_model_name": null, "hf_config_file": null, "urls": ["https://doctr-static.mindee.com/models?id=v0.7.0/master-fde31e4a.pt&src=0"], "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "master", "padding": null}
32
32
  {"name": "Aryn/deformable-detr-DocLayNet/model.safetensors", "description": "Deformable DEtection TRansformer (DETR), trained on DocLayNet (including 80k annotated pages in 11 classes).", "size": [115511753], "tp_model": false, "config": "Aryn/deformable-detr-DocLayNet/config.json", "preprocessor_config": "Aryn/deformable-detr-DocLayNet/preprocessor_config.json", "hf_repo_id": "Aryn/deformable-detr-DocLayNet", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "default_type", "2": "caption", "11": "text", "12": "title", "3": "footnote", "4": "formula", "5": "list_item", "6": "page_footer", "7": "page_header", "8": "figure", "9": "section_header", "10": "table"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
33
+ {"name": "deepdoctection/tatr_tab_struct_v2/model.safetensors", "description": "Table Transformer (DETR) model trained on PubTables1M. It was introduced in the paper Aligning benchmark datasets for table structure recognition by Smock et al. This model is devoted to table structure recognition and assumes to receive a slightly croppedtable as input. It will predict rows, column and spanning cells. Use a padding of around 5 pixels. This artefact has been converted from deepdoctection/tatr_tab_struct_v2/pytorch_model.bin and should be used to reduce security issues", "size": [115511753], "tp_model": false, "config": "deepdoctection/tatr_tab_struct_v2/config.json", "preprocessor_config": "deepdoctection/tatr_tab_struct_v2/preprocessor_config.json", "hf_repo_id": "deepdoctection/tatr_tab_struct_v2", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "table", "2": "column", "3": "row", "4": "column_header", "5": "projected_row_header", "6": "spanning"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
@@ -195,7 +195,9 @@ class Word(ImageAnnotationBaseView):
195
195
  attr_names = (
196
196
  set(WordType)
197
197
  .union(super().get_attribute_names())
198
- .union({Relationships.READING_ORDER, Relationships.LAYOUT_LINK, Relationships.LINK})
198
+ .union(
199
+ {Relationships.READING_ORDER, Relationships.LAYOUT_LINK, Relationships.LINK, Relationships.SUCCESSOR}
200
+ )
199
201
  )
200
202
  return {attr_name.value if isinstance(attr_name, ObjectTypes) else attr_name for attr_name in attr_names}
201
203
 
@@ -384,16 +386,10 @@ class Table(Layout):
384
386
  Returns:
385
387
  A list of a table cells.
386
388
  """
387
- all_relation_ids = self.get_relationship(Relationships.CHILD)
388
- cell_anns: list[Cell] = self.base_page.get_annotation( # type: ignore
389
- annotation_ids=all_relation_ids,
390
- category_names=[
391
- LayoutType.CELL,
392
- CellType.HEADER,
393
- CellType.BODY,
394
- CellType.SPANNING,
395
- ],
396
- )
389
+ cell_anns: list[Cell] = []
390
+ for row_number in range(1, self.number_of_rows + 1): # type: ignore
391
+ cell_anns.extend(self.row(row_number)) # type: ignore
392
+
397
393
  return cell_anns
398
394
 
399
395
  @property
@@ -592,6 +588,16 @@ class Table(Layout):
592
588
  )
593
589
  return table_list
594
590
 
591
+ @property
592
+ def csv_(self) -> list[list[list[Text_]]]:
593
+ cells = self.cells
594
+ table_list = [[[] for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)] # type: ignore
595
+ for cell in cells:
596
+ table_list[cell.row_number - 1][cell.column_number - 1].append(cell.text_) # type: ignore
597
+ return table_list
598
+
599
+
600
+
595
601
  def __str__(self) -> str:
596
602
  out = " ".join([" ".join(row + ["\n"]) for row in self.csv])
597
603
  return out
@@ -599,7 +605,13 @@ class Table(Layout):
599
605
  @property
600
606
  def text(self) -> str:
601
607
  try:
602
- return str(self)
608
+ cells = self.cells
609
+ if not cells:
610
+ return super().text
611
+ text_list: list[str] = []
612
+ for cell in cells:
613
+ text_list.append(cell.text)
614
+ return " ".join(text_list)
603
615
  except (TypeError, AnnotationError):
604
616
  return super().text
605
617
 
@@ -616,7 +628,7 @@ class Table(Layout):
616
628
  token_class_ids: list[str] = []
617
629
  token_tag_ids: list[str] = []
618
630
  for cell in cells:
619
- text.extend(cell.text_["text"])
631
+ text.append(cell.text_["text"])
620
632
  words.extend(cell.text_["words"])
621
633
  ann_ids.extend(cell.text_["ann_ids"])
622
634
  token_classes.extend(cell.text_["token_classes"])
@@ -306,7 +306,7 @@ class ModelCatalog:
306
306
 
307
307
  # Loading default profiles
308
308
  dd_profile_path = maybe_copy_config_to_cache(
309
- get_package_path(), get_cache_dir_path(), "deepdoctection/configs/profiles.jsonl", False
309
+ get_package_path(), get_cache_dir_path(), "deepdoctection/configs/profiles.jsonl", True
310
310
  )
311
311
  ModelCatalog.load_profiles_from_file(dd_profile_path)
312
312
  # Additional profiles can be added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deepdoctection
3
- Version: 0.43.5
3
+ Version: 0.43.6
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -321,7 +321,7 @@ For a simple setup which is enough to parse documents with the default setting,
321
321
 
322
322
  ```
323
323
  pip install transformers
324
- pip install python-doctr
324
+ pip install python-doctr==0.9.0
325
325
  pip install deepdoctection
326
326
  ```
327
327
 
@@ -329,7 +329,7 @@ pip install deepdoctection
329
329
 
330
330
  ```
331
331
  pip install tensorpack
332
- pip install python-doctr
332
+ pip install python-doctr==0.9.0
333
333
  pip install deepdoctection
334
334
  ```
335
335
 
@@ -1,13 +1,13 @@
1
- deepdoctection/__init__.py,sha256=ackdlzoLcoHJAmWtxrkE2PnYgG-3q9bL2c1d3kqFgTY,12964
1
+ deepdoctection/__init__.py,sha256=AwVtfVry6NA7FttBvL3GdHV-05TK3fcvhx87bjHwWko,12964
2
2
  deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  deepdoctection/analyzer/__init__.py,sha256=wg0BcFwdCeREwzZfa--Yx8HUJ9LPv5z5PmLwtkZdPH8,772
4
- deepdoctection/analyzer/config.py,sha256=DToaXs59w7SpEi2vkeBEyrBwyyGiXUST_N99wL9nHoI,41762
4
+ deepdoctection/analyzer/config.py,sha256=lTfBKwzm9iVKCZoq7-FcoYUfrcgWmRknwYUzD5Jx-0U,41762
5
5
  deepdoctection/analyzer/dd.py,sha256=2BGvZpl9o9khcaOV52-DPHMrs0DsqUO8cpdqFVHHzDQ,5176
6
6
  deepdoctection/analyzer/factory.py,sha256=DI0S38KAG2sIROrSximsWJsMbem91a9zXaeWsDNvkGg,37574
7
7
  deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
8
8
  deepdoctection/configs/conf_dd_one.yaml,sha256=DHqAIKH3jRam54QO7qib2zutmpyFA8TqdV5UvIV191A,3688
9
9
  deepdoctection/configs/conf_tesseract.yaml,sha256=oF6szDyoi15FHvq7yFUNIEjfA_jNLhGxoowiRsz_zY4,35
10
- deepdoctection/configs/profiles.jsonl,sha256=zhMpsJWdfeSj2oq2J0BbiKhHnE7PIq47PA8-I1Th0pA,30266
10
+ deepdoctection/configs/profiles.jsonl,sha256=8O1WTnsD0vhrdiY3RXzYoPe4mU_5C8TMafEjk0zHw9g,31438
11
11
  deepdoctection/dataflow/__init__.py,sha256=pY4lhjTes2BU-0AdIIRMnRqo9Sv6TopVE_SNfLmpgnc,828
12
12
  deepdoctection/dataflow/base.py,sha256=ZLRijyHI1J7tBfnE-q7eqUieYMMERjtK-c1oK40dBkk,6556
13
13
  deepdoctection/dataflow/common.py,sha256=DKD_pRZBCt2vO3oNZcOvdoC3jThabTNcNbTS16mpVR0,10351
@@ -21,7 +21,7 @@ deepdoctection/datapoint/annotation.py,sha256=f32BNmzUGJoNMeGst2RGC2jmjJpzzjxyBR
21
21
  deepdoctection/datapoint/box.py,sha256=QAS8sK2Ge4_ysW6zOYkLlzNwhSyw_mhYcYsxscClEno,31453
22
22
  deepdoctection/datapoint/convert.py,sha256=6ENXX3tBdY8ogb2NBPxsOsQMGnQux8ol5nrUfWS5tYE,7352
23
23
  deepdoctection/datapoint/image.py,sha256=kqwCz8DSc19hQpkl_4L1_Ek7_2KrH5KsV9e0S-R4n5w,35147
24
- deepdoctection/datapoint/view.py,sha256=YtoqafStrHqbfyD628-W1HOA2Gb0kUI2oaEiteBHjbA,57902
24
+ deepdoctection/datapoint/view.py,sha256=cc-6WekGht1cU9Cgwfp7crwDlK71sUg_cCR48SmHrVY,58339
25
25
  deepdoctection/datasets/__init__.py,sha256=4ifjIwWCPYiS31GzUlVDScrkNOrb1eo5xHlRXNyg_58,994
26
26
  deepdoctection/datasets/adapter.py,sha256=VSLM_980aHi4TpgOxfxiBHiF_fUXyh348PXet6zTo-4,7779
27
27
  deepdoctection/datasets/base.py,sha256=Qfh52aVtBd2df2ZY0hjLz4D1jrExnPuu_8uYpolVNks,23181
@@ -59,7 +59,7 @@ deepdoctection/extern/fastlang.py,sha256=4D9A-_hTXUcvXG6IJJknX34LrD71v08XtNdWgvX
59
59
  deepdoctection/extern/hfdetr.py,sha256=N3eLNI5BsQS9_7YZyBeWndSgUydJij7ugZA9p4V1xaQ,14316
60
60
  deepdoctection/extern/hflayoutlm.py,sha256=3mZZ3byn00jSrLWO2vZFas9j4VrhbYQNmF1mwPG2ElQ,59642
61
61
  deepdoctection/extern/hflm.py,sha256=y-9brzmT2NYtFoNcWHABNg2ZZQXSOP9CyqtT1OoeV9U,9754
62
- deepdoctection/extern/model.py,sha256=-GbnuhLFq7jpBOvtpJe6IhGXxQdqwiM8epEd7IRELoU,18234
62
+ deepdoctection/extern/model.py,sha256=kMIlx07_kdwZHLYB3QUG0DT_VSv2aZuKIIbv3fs0WqA,18233
63
63
  deepdoctection/extern/pdftext.py,sha256=ljzPQn3yYAlS6MoZqzixD-fO2GlHwu1aMiOQ6qMIzbg,7513
64
64
  deepdoctection/extern/tessocr.py,sha256=SuPmngsJg38riL4b09z6_FIzJH6H3RIwoighG2GPMYM,17457
65
65
  deepdoctection/extern/texocr.py,sha256=wVOuu6eUGao0mUbC8vrgdCsKfY1GqA1Am9560YgWyXU,5915
@@ -142,8 +142,8 @@ deepdoctection/utils/transform.py,sha256=jgeCyQWLN9q79jCGW7jysyKUKcJ1AVMk8OslF-3
142
142
  deepdoctection/utils/types.py,sha256=ti4WdtIJSg3TGK_YPkkoY9PYGMnR2tTX6Xfik8U1pNk,2986
143
143
  deepdoctection/utils/utils.py,sha256=NBUb1qbx8Jm-AvYN1Sdbk0huXhbAKxZ-ZtOcMespsMM,7064
144
144
  deepdoctection/utils/viz.py,sha256=bujRIujvX317rPz4jBrj0yd3WP8wPjDUiI5GUrw9MzQ,27339
145
- deepdoctection-0.43.5.dist-info/licenses/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
146
- deepdoctection-0.43.5.dist-info/METADATA,sha256=EeIEGN2SrcUzSny93poDmEM7Fogqqiep_XDHOs--XpM,14782
147
- deepdoctection-0.43.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
148
- deepdoctection-0.43.5.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
149
- deepdoctection-0.43.5.dist-info/RECORD,,
145
+ deepdoctection-0.43.6.dist-info/licenses/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
146
+ deepdoctection-0.43.6.dist-info/METADATA,sha256=tOH9kBOEUncGe0sI1hU-h28u4gIHcqVjzbBYV3PiJ8I,14796
147
+ deepdoctection-0.43.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
148
+ deepdoctection-0.43.6.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
149
+ deepdoctection-0.43.6.dist-info/RECORD,,