PyPI - doc-page-extractor - Versions diffs - 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl - Mend

doc-page-extractor 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of doc-page-extractor might be problematic. Click here for more details.

Files changed (18) hide show

doc_page_extractor/__init__.py CHANGED Viewed

@@ -2,6 +2,7 @@ from .extractor import DocExtractor
 from .clipper import clip, clip_from_image
 from .plot import plot
 from .rectangle import Point, Rectangle
+from .model import Model, HuggingfaceModel
 from .types import (
   ExtractedResult,
   OCRFragment,

doc_page_extractor/extractor.py CHANGED Viewed

@@ -52,6 +52,13 @@ class DocExtractor:
     self._latex: LaTeX = LaTeX(device, model)
     self._layout_order: LayoutOrder = LayoutOrder(device, model)
+  def prepare_models(self):
+    self._model.get_onnx_ocr_path()
+    self._model.get_yolo_path()
+    self._model.get_layoutreader_path()
+    self._model.get_struct_eqtable_path()
+    self._model.get_latex_path()
   def extract(
       self,
       image: Image,

doc_page_extractor/model.py CHANGED Viewed

@@ -1,10 +1,14 @@
 from os import PathLike
+from time import sleep
 from typing import runtime_checkable, Protocol
 from pathlib import Path
 from threading import Lock
 from huggingface_hub import hf_hub_download, snapshot_download, try_to_load_from_cache
+_RETRY_TIMES = 6
+_RETRY_SLEEP = 3.5
 @runtime_checkable
 class Model(Protocol):
   def get_onnx_ocr_path(self) -> Path:
@@ -31,9 +35,10 @@ class HuggingfaceModel(Model):
   def get_onnx_ocr_path(self) -> Path:
     return self._get_model_path(
       repo_id="moskize/OnnxOCR",
-      filename=None,
+      filename="README.md",
       repo_type=None,
-      is_snapshot=True
+      is_snapshot=True,
+      wanna_dir_path=True,
     )
   def get_yolo_path(self) -> Path:
@@ -42,14 +47,16 @@ class HuggingfaceModel(Model):
       filename="models/Layout/YOLO/doclayout_yolo_ft.pt",
       repo_type=None,
       is_snapshot=False,
+      wanna_dir_path=False,
     )
   def get_layoutreader_path(self) -> Path:
     return self._get_model_path(
       repo_id="hantian/layoutreader",
-      filename=None,
+      filename="model.safetensors",
       repo_type=None,
       is_snapshot=True,
+      wanna_dir_path=True,
     )
   def get_struct_eqtable_path(self) -> Path:
@@ -58,6 +65,7 @@ class HuggingfaceModel(Model):
       filename="model.safetensors",
       repo_type=None,
       is_snapshot=True,
+      wanna_dir_path=True,
     )
   def get_latex_path(self) -> Path:
@@ -66,38 +74,60 @@ class HuggingfaceModel(Model):
       filename="checkpoints/weights.pth",
       repo_type="space",
       is_snapshot=True,
+      wanna_dir_path=True,
     )
   def _get_model_path(
         self,
         repo_id: str,
-        filename: str | None,
+        filename: str,
         repo_type: str | None,
         is_snapshot: bool,
+        wanna_dir_path: bool,
       ) -> Path:
     with self._lock:
-      cache_filename = "README.md"
-      if filename is not None:
-        cache_filename = filename
       model_path = try_to_load_from_cache(
         repo_id=repo_id,
-        filename=cache_filename,
+        filename=filename,
         repo_type=repo_type,
         cache_dir=self._model_cache_dir
       )
       if isinstance(model_path, str):
-        if filename is None:
-          model_path = Path(model_path).parent
-      elif is_snapshot:
-        model_path = snapshot_download(
-          cache_dir=self._model_cache_dir,
-          repo_id=repo_id,
-        )
+        model_path = Path(model_path)
+        if wanna_dir_path:
+          for _ in Path(filename).parts:
+            model_path = model_path.parent
       else:
-        model_path = hf_hub_download(
-          cache_dir=self._model_cache_dir,
-          repo_id=repo_id,
-          filename=filename,
-        )
-      return Path(model_path)
+        # https://github.com/huggingface/huggingface_hub/issues/1542#issuecomment-1630465844
+        latest_error: ConnectionError | None = None
+        for i in range(_RETRY_TIMES + 1):
+          if latest_error is not None:
+            print(f"Retrying to download {repo_id} model, attempt {i + 1}/{_RETRY_TIMES}...")
+            sleep(_RETRY_SLEEP)
+          try:
+            if is_snapshot:
+              model_path = snapshot_download(
+                cache_dir=self._model_cache_dir,
+                repo_id=repo_id,
+                repo_type=repo_type,
+                resume_download=True,
+              )
+            else:
+              model_path = hf_hub_download(
+                cache_dir=self._model_cache_dir,
+                repo_id=repo_id,
+                repo_type=repo_type,
+                filename=filename,
+                resume_download=True,
+              )
+            latest_error = None
+          except ConnectionError as err:
+            latest_error = err
+        if latest_error is not None:
+          raise latest_error
+        model_path = Path(model_path)
+      return model_path

{doc_page_extractor-0.2.1.dist-info → doc_page_extractor-0.2.3.dist-info}/METADATA RENAMED Viewed

@@ -1,30 +1,33 @@
-Metadata-Version: 2.4
+Metadata-Version: 2.3
 Name: doc-page-extractor
-Version: 0.2.1
-Summary: doc page extractor can identify text and format in images and return structured data.
-Home-page: https://github.com/Moskize91/doc-page-extractor
+Version: 0.2.3
+Summary:
+License: AGPL-3.0
 Author: Tao Zeyu
 Author-email: i@taozeyu.com
+Maintainer: Tao Zeyu
+Maintainer-email: i@taozeyu.com
+Requires-Python: >=3.10,<3.13
+Classifier: Development Status :: 2 - Pre-Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: GNU Affero General Public License v3
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: accelerate (>=1.6.0,<2.0)
+Requires-Dist: doclayout_yolo (>=0.0.3)
+Requires-Dist: huggingface_hub (>=0.33.0,<1.0)
+Requires-Dist: numpy (>=1.24.0,<2.0)
+Requires-Dist: opencv-python (>=4.10.0,<5.0)
+Requires-Dist: pillow (>=10.3,<11.0)
+Requires-Dist: pix2tex (>=0.1.4,<=0.2.0)
+Requires-Dist: pyclipper (>=1.2.0,<2.0)
+Requires-Dist: shapely (>=2.0.0,<3.0)
+Requires-Dist: transformers (>=4.42.4,<=4.47)
+Project-URL: Repository, https://github.com/moskize91/doc-page-extractor
 Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: opencv-python<5.0,>=4.10.0
-Requires-Dist: pillow<11.0,>=10.3
-Requires-Dist: pyclipper<2.0,>=1.2.0
-Requires-Dist: numpy<2.0,>=1.24.0
-Requires-Dist: shapely<3.0,>=2.0.0
-Requires-Dist: transformers<=4.47,>=4.42.4
-Requires-Dist: doclayout_yolo>=0.0.3
-Requires-Dist: pix2tex<=0.2.0,>=0.1.4
-Requires-Dist: accelerate<2.0,>=1.6.0
-Requires-Dist: huggingface_hub<1.0,>=0.30.2
-Dynamic: author
-Dynamic: author-email
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license-file
-Dynamic: requires-dist
-Dynamic: summary
 # doc page extractor

{doc_page_extractor-0.2.1.dist-info → doc_page_extractor-0.2.3.dist-info}/RECORD RENAMED Viewed

@@ -1,21 +1,13 @@
-doc_page_extractor/__init__.py,sha256=Q3cZxT1wpjb1kGI0fJ9YvL4Fh_rBN3NdLqaZh4ATFQM,312
+doc_page_extractor/__init__.py,sha256=rt_XALcqNNg3iVkMTUHltWxvdweH2FY6Y_olU2TkVBY,355
 doc_page_extractor/clipper.py,sha256=5S1TI0aqMebwlPv_Ih4Nxpp6MchEjOih-CiZfMWUAhI,3201
 doc_page_extractor/downloader.py,sha256=NbGN9ARnER8-gd4T1uc3W98WMEClVxMrqnShq8HibTw,455
-doc_page_extractor/extractor.py,sha256=SGDPEIjmp5HiAYq_nSVsKFTt0IvqJJXXDZRfFFRlfzU,6993
+doc_page_extractor/extractor.py,sha256=7dJ1N4d2_9tB49XAfKBIJjEUvNtjf0CGqcm2uf6BMPg,7205
 doc_page_extractor/latex.py,sha256=kD3NIzZTEGUFIAqqyHYmNcfyTrlu77GB1YSFgzbFb7A,1024
 doc_page_extractor/layout_order.py,sha256=lm_rXzRZ3AtufaRwxqZPfM9vplwaOlE2pnTNc7Z_oLM,7435
 doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
-doc_page_extractor/model.py,sha256=BBrsY2iQXCIsvqemjL37pKRPV3EtKkXsl7yA5_SI8RE,2610
+doc_page_extractor/model.py,sha256=0J8Nt9Xl6mJu5VaFQ0nyUJrM3UmFg48_lXv4mDJSEpQ,3617
 doc_page_extractor/ocr.py,sha256=niZY4ZgbbbV-IeTrtF3Y8LPPrynU2wvZKRVYbBP1Dog,5586
 doc_page_extractor/ocr_corrector.py,sha256=RfRA1jESEuqC8_a2kUEvHblT_B4xBjE0OApLMl1JiRg,3917
-doc_page_extractor/overlap.py,sha256=z1DF4_2OPvauDHwmz1SC1WosULkE84HKaRfNEgexPzc,5337
-doc_page_extractor/plot.py,sha256=4uibjS_x1SyEyjaJJd0YsBbzkgldDOCct4Ry2cOhdXU,2556
-doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
-doc_page_extractor/rectangle.py,sha256=yeW6srdrsxaJg1eb3nn8oxtY0sfgeBk3hMiuJGaRXwY,1678
-doc_page_extractor/rotation.py,sha256=QCZ-HqfDxIhnQw8KRHki2myj6-UusvNY7Mpjsu-wI-4,4334
-doc_page_extractor/table.py,sha256=3esYqvHS9btINF9htmLrnDgTec2q7VvljaMXzDLj34w,1690
-doc_page_extractor/types.py,sha256=XtNfGuG6ZEwozFdk04mOD77m34zxr8aD9-fR9hbrrNU,1313
-doc_page_extractor/utils.py,sha256=ZlQVOLPUg_v5J8u6SoD8XtMG_JkF-ERgjubc4LO5_Lg,688
 doc_page_extractor/onnxocr/__init__.py,sha256=BK4YpX4pU0nRxbcI5f5cbIVfdBEsx4W980QYmpNQaH0,38
 doc_page_extractor/onnxocr/cls_postprocess.py,sha256=o879Ned0RMUERYLviuToZ0xTvhn2UsYAb-yPC5gj8h4,822
 doc_page_extractor/onnxocr/db_postprocess.py,sha256=R3yXXfReiQgLaYIvvfnrFfshI202LjHMvcZwcLpjmTY,7913
@@ -28,18 +20,15 @@ doc_page_extractor/onnxocr/predict_rec.py,sha256=UsgPhl6X3frx5u-LzIEPITOM3WJ1iAm
 doc_page_extractor/onnxocr/predict_system.py,sha256=yoqXunAsoboPsWe7qQjvQf2_SMW1T1QMriEoiGdX3BM,2721
 doc_page_extractor/onnxocr/rec_postprocess.py,sha256=qZt5Ripal7z9hniKq5e7azOkD9e6NR1ylWpRpznhweg,29556
 doc_page_extractor/onnxocr/utils.py,sha256=AQoHgQyv-jpPo4BsVzq3r7_ze698EZ-a7LJobm2fwUI,1864
-doc_page_extractor/struct_eqtable/__init__.py,sha256=QoTsNuJfpNSrMIMd6Cot1jJqWk88_lDqFP_C2rcVJO4,1329
-doc_page_extractor/struct_eqtable/internvl/__init__.py,sha256=2aOsU-aHkFv_gjdP8LeUXjj_9-0d4x79iyxh4cCzaEw,79
-doc_page_extractor/struct_eqtable/internvl/conversation.py,sha256=s7DceRlM6JtHmxgyuE6vqu5XVT1fHzhzCL_I6r8MI1c,15129
-doc_page_extractor/struct_eqtable/internvl/internvl.py,sha256=ovVZG-PuBrsj_9lEoNPOygJ-2en3v6gPzRfWjDpWNOM,7678
-doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py,sha256=ACHxFntxS38G43PzE955Nv4fjKk_-Oz4y_o9JEjQwlg,2608
-doc_page_extractor/struct_eqtable/pix2s/__init__.py,sha256=cXRo4eg6u1-TXktZ8rQf0HIzLmmScIwYQhbxMKl-MyA,76
-doc_page_extractor/struct_eqtable/pix2s/pix2s.py,sha256=fCNve8PNeJ3-AWJIhSeGtp-mYKoMXfW0CIpszkQnAaA,2535
-doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py,sha256=zSGw45JhWdZ3iuJel5Chsy-NzsOHx9QyPQIUAzzCjFE,43880
-doc_page_extractor-0.2.1.dist-info/licenses/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
-tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_history_bus.py,sha256=g-bpDIiebyEHKDH0YS5OHF2ONfhZt3-EFLZhWJn94WE,2534
-doc_page_extractor-0.2.1.dist-info/METADATA,sha256=lAM7CCs34AN2TPf9dvNQQZu2EsGFVLMXPKqSaRddZQQ,2480
-doc_page_extractor-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-doc_page_extractor-0.2.1.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
-doc_page_extractor-0.2.1.dist-info/RECORD,,
+doc_page_extractor/overlap.py,sha256=z1DF4_2OPvauDHwmz1SC1WosULkE84HKaRfNEgexPzc,5337
+doc_page_extractor/plot.py,sha256=4uibjS_x1SyEyjaJJd0YsBbzkgldDOCct4Ry2cOhdXU,2556
+doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
+doc_page_extractor/rectangle.py,sha256=yeW6srdrsxaJg1eb3nn8oxtY0sfgeBk3hMiuJGaRXwY,1678
+doc_page_extractor/rotation.py,sha256=QCZ-HqfDxIhnQw8KRHki2myj6-UusvNY7Mpjsu-wI-4,4334
+doc_page_extractor/table.py,sha256=3esYqvHS9btINF9htmLrnDgTec2q7VvljaMXzDLj34w,1690
+doc_page_extractor/types.py,sha256=XtNfGuG6ZEwozFdk04mOD77m34zxr8aD9-fR9hbrrNU,1313
+doc_page_extractor/utils.py,sha256=ZlQVOLPUg_v5J8u6SoD8XtMG_JkF-ERgjubc4LO5_Lg,688
+doc_page_extractor-0.2.3.dist-info/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
+doc_page_extractor-0.2.3.dist-info/METADATA,sha256=KYcecqR5aLCGkepWf-gDLur-oRrIHrsd3DNB0BeI8EU,2756
+doc_page_extractor-0.2.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+doc_page_extractor-0.2.3.dist-info/RECORD,,

{doc_page_extractor-0.2.1.dist-info → doc_page_extractor-0.2.3.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,4 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: poetry-core 2.1.3
 Root-Is-Purelib: true
 Tag: py3-none-any

doc_page_extractor/struct_eqtable/__init__.py DELETED Viewed

@@ -1,49 +0,0 @@
-from .pix2s import Pix2Struct, Pix2StructTensorRT
-from .internvl import InternVL, InternVL_LMDeploy
-from transformers import AutoConfig
-__ALL_MODELS__ = {
-    'Pix2Struct': Pix2Struct,
-    'Pix2StructTensorRT': Pix2StructTensorRT,
-    'InternVL': InternVL,
-    'InternVL_LMDeploy': InternVL_LMDeploy,
-}
-def get_model_name(model_path):
-    model_config = AutoConfig.from_pretrained(
-        model_path,
-        trust_remote_code=True,
-    )
-    if 'Pix2Struct' in model_config.architectures[0]:
-        model_name = 'Pix2Struct'
-    elif 'InternVL' in model_config.architectures[0]:
-        model_name = 'InternVL'
-    else:
-        raise ValueError(f"Unsupported model type: {model_config.architectures[0]}")
-    return model_name
-def build_model(
-        model_ckpt='U4R/StructTable-InternVL2-1B',
-        cache_dir=None,
-        local_files_only=None,
-        **kwargs,
-    ):
-    model_name = get_model_name(model_ckpt)
-    if model_name == 'InternVL' and kwargs.get('lmdeploy', False):
-        model_name = 'InternVL_LMDeploy'
-    elif model_name == 'Pix2Struct' and kwargs.get('tensorrt_path', None):
-        model_name = 'Pix2StructTensorRT'
-    model = __ALL_MODELS__[model_name](
-        model_ckpt,
-        cache_dir=cache_dir,
-        local_files_only=local_files_only,
-        **kwargs
-    )
-    return model

doc_page_extractor/struct_eqtable/internvl/__init__.py DELETED Viewed

@@ -1,2 +0,0 @@
-from .internvl import InternVL
-from .internvl_lmdeploy import InternVL_LMDeploy

doc-page-extractor 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

Potentially problematic release.

doc-page-extractor 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl