PyPI - doc-page-extractor - Versions diffs - 0.2.1__tar.gz → 0.2.3__tar.gz - Mend

doc-page-extractor 0.2.1.tar.gz → 0.2.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of doc-page-extractor might be problematic. Click here for more details.

Files changed (51)

{doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,30 +1,33 @@
-Metadata-Version: 2.4
+Metadata-Version: 2.3
 Name: doc-page-extractor
-Version: 0.2.1
-Summary: doc page extractor can identify text and format in images and return structured data.
-Home-page: https://github.com/Moskize91/doc-page-extractor
+Version: 0.2.3
+Summary:
+License: AGPL-3.0
 Author: Tao Zeyu
 Author-email: i@taozeyu.com
+Maintainer: Tao Zeyu
+Maintainer-email: i@taozeyu.com
+Requires-Python: >=3.10,<3.13
+Classifier: Development Status :: 2 - Pre-Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: GNU Affero General Public License v3
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: accelerate (>=1.6.0,<2.0)
+Requires-Dist: doclayout_yolo (>=0.0.3)
+Requires-Dist: huggingface_hub (>=0.33.0,<1.0)
+Requires-Dist: numpy (>=1.24.0,<2.0)
+Requires-Dist: opencv-python (>=4.10.0,<5.0)
+Requires-Dist: pillow (>=10.3,<11.0)
+Requires-Dist: pix2tex (>=0.1.4,<=0.2.0)
+Requires-Dist: pyclipper (>=1.2.0,<2.0)
+Requires-Dist: shapely (>=2.0.0,<3.0)
+Requires-Dist: transformers (>=4.42.4,<=4.47)
+Project-URL: Repository, https://github.com/moskize91/doc-page-extractor
 Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: opencv-python<5.0,>=4.10.0
-Requires-Dist: pillow<11.0,>=10.3
-Requires-Dist: pyclipper<2.0,>=1.2.0
-Requires-Dist: numpy<2.0,>=1.24.0
-Requires-Dist: shapely<3.0,>=2.0.0
-Requires-Dist: transformers<=4.47,>=4.42.4
-Requires-Dist: doclayout_yolo>=0.0.3
-Requires-Dist: pix2tex<=0.2.0,>=0.1.4
-Requires-Dist: accelerate<2.0,>=1.6.0
-Requires-Dist: huggingface_hub<1.0,>=0.30.2
-Dynamic: author
-Dynamic: author-email
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: license-file
-Dynamic: requires-dist
-Dynamic: summary
 # doc page extractor

{doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/__init__.py RENAMED Viewed

@@ -2,6 +2,7 @@ from .extractor import DocExtractor
 from .clipper import clip, clip_from_image
 from .plot import plot
 from .rectangle import Point, Rectangle
+from .model import Model, HuggingfaceModel
 from .types import (
   ExtractedResult,
   OCRFragment,

{doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/extractor.py RENAMED Viewed

@@ -52,6 +52,13 @@ class DocExtractor:
     self._latex: LaTeX = LaTeX(device, model)
     self._layout_order: LayoutOrder = LayoutOrder(device, model)
+  def prepare_models(self):
+    self._model.get_onnx_ocr_path()
+    self._model.get_yolo_path()
+    self._model.get_layoutreader_path()
+    self._model.get_struct_eqtable_path()
+    self._model.get_latex_path()
   def extract(
       self,
       image: Image,

{doc_page_extractor-0.2.1 → doc_page_extractor-0.2.3}/doc_page_extractor/model.py RENAMED Viewed

@@ -1,10 +1,14 @@
 from os import PathLike
+from time import sleep
 from typing import runtime_checkable, Protocol
 from pathlib import Path
 from threading import Lock
 from huggingface_hub import hf_hub_download, snapshot_download, try_to_load_from_cache
+_RETRY_TIMES = 6
+_RETRY_SLEEP = 3.5
 @runtime_checkable
 class Model(Protocol):
   def get_onnx_ocr_path(self) -> Path:
@@ -31,9 +35,10 @@ class HuggingfaceModel(Model):
   def get_onnx_ocr_path(self) -> Path:
     return self._get_model_path(
       repo_id="moskize/OnnxOCR",
-      filename=None,
+      filename="README.md",
       repo_type=None,
-      is_snapshot=True
+      is_snapshot=True,
+      wanna_dir_path=True,
     )
   def get_yolo_path(self) -> Path:
@@ -42,14 +47,16 @@ class HuggingfaceModel(Model):
       filename="models/Layout/YOLO/doclayout_yolo_ft.pt",
       repo_type=None,
       is_snapshot=False,
+      wanna_dir_path=False,
     )
   def get_layoutreader_path(self) -> Path:
     return self._get_model_path(
       repo_id="hantian/layoutreader",
-      filename=None,
+      filename="model.safetensors",
       repo_type=None,
       is_snapshot=True,
+      wanna_dir_path=True,
     )
   def get_struct_eqtable_path(self) -> Path:
@@ -58,6 +65,7 @@ class HuggingfaceModel(Model):
       filename="model.safetensors",
       repo_type=None,
       is_snapshot=True,
+      wanna_dir_path=True,
     )
   def get_latex_path(self) -> Path:
@@ -66,38 +74,60 @@ class HuggingfaceModel(Model):
       filename="checkpoints/weights.pth",
       repo_type="space",
       is_snapshot=True,
+      wanna_dir_path=True,
     )
   def _get_model_path(
         self,
         repo_id: str,
-        filename: str | None,
+        filename: str,
         repo_type: str | None,
         is_snapshot: bool,
+        wanna_dir_path: bool,
       ) -> Path:
     with self._lock:
-      cache_filename = "README.md"
-      if filename is not None:
-        cache_filename = filename
       model_path = try_to_load_from_cache(
         repo_id=repo_id,
-        filename=cache_filename,
+        filename=filename,
         repo_type=repo_type,
         cache_dir=self._model_cache_dir
       )
       if isinstance(model_path, str):
-        if filename is None:
-          model_path = Path(model_path).parent
-      elif is_snapshot:
-        model_path = snapshot_download(
-          cache_dir=self._model_cache_dir,
-          repo_id=repo_id,
-        )
+        model_path = Path(model_path)
+        if wanna_dir_path:
+          for _ in Path(filename).parts:
+            model_path = model_path.parent
       else:
-        model_path = hf_hub_download(
-          cache_dir=self._model_cache_dir,
-          repo_id=repo_id,
-          filename=filename,
-        )
-      return Path(model_path)
+        # https://github.com/huggingface/huggingface_hub/issues/1542#issuecomment-1630465844
+        latest_error: ConnectionError | None = None
+        for i in range(_RETRY_TIMES + 1):
+          if latest_error is not None:
+            print(f"Retrying to download {repo_id} model, attempt {i + 1}/{_RETRY_TIMES}...")
+            sleep(_RETRY_SLEEP)
+          try:
+            if is_snapshot:
+              model_path = snapshot_download(
+                cache_dir=self._model_cache_dir,
+                repo_id=repo_id,
+                repo_type=repo_type,
+                resume_download=True,
+              )
+            else:
+              model_path = hf_hub_download(
+                cache_dir=self._model_cache_dir,
+                repo_id=repo_id,
+                repo_type=repo_type,
+                filename=filename,
+                resume_download=True,
+              )
+            latest_error = None
+          except ConnectionError as err:
+            latest_error = err
+        if latest_error is not None:
+          raise latest_error
+        model_path = Path(model_path)
+      return model_path

doc_page_extractor-0.2.3/pyproject.toml ADDED Viewed

@@ -0,0 +1,48 @@
+[project]
+name = "doc-page-extractor"
+version = "0.2.3"
+description = ""
+authors = [
+    {name = "Tao Zeyu",email = "i@taozeyu.com"}
+]
+maintainers = [
+    {name = "Tao Zeyu", email = "i@taozeyu.com"}
+]
+license = {text = "AGPL-3.0"}
+readme = "README.md"
+requires-python = ">=3.10,<3.13"
+dependencies = [
+    "opencv-python>=4.10.0,<5.0",
+    "pillow>=10.3,<11.0",
+    "pyclipper>=1.2.0,<2.0",
+    "numpy>=1.24.0,<2.0",
+    "shapely>=2.0.0,<3.0",
+    "transformers>=4.42.4,<=4.47",
+    "doclayout_yolo>=0.0.3",
+    "pix2tex>=0.1.4,<=0.2.0",
+    "accelerate>=1.6.0,<2.0",
+    "huggingface_hub>=0.33.0,<1.0",
+]
+[build-system]
+requires = ["poetry-core>=2.0.0,<3.0.0"]
+build-backend = "poetry.core.masonry.api"
+[tool.poetry]
+license = "AGPL-3.0"
+readme = "README.md"
+repository = "https://github.com/moskize91/doc-page-extractor"
+packages = [
+    {include = "doc_page_extractor" }
+]
+classifiers=[
+    "Development Status :: 2 - Pre-Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: GNU Affero General Public License v3",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3.10",
+]
+[tool.poetry.group.dev.dependencies]
+pylint = "^3.3.7"

doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/__init__.py DELETED Viewed

@@ -1,49 +0,0 @@
-from .pix2s import Pix2Struct, Pix2StructTensorRT
-from .internvl import InternVL, InternVL_LMDeploy
-from transformers import AutoConfig
-__ALL_MODELS__ = {
-    'Pix2Struct': Pix2Struct,
-    'Pix2StructTensorRT': Pix2StructTensorRT,
-    'InternVL': InternVL,
-    'InternVL_LMDeploy': InternVL_LMDeploy,
-}
-def get_model_name(model_path):
-    model_config = AutoConfig.from_pretrained(
-        model_path,
-        trust_remote_code=True,
-    )
-    if 'Pix2Struct' in model_config.architectures[0]:
-        model_name = 'Pix2Struct'
-    elif 'InternVL' in model_config.architectures[0]:
-        model_name = 'InternVL'
-    else:
-        raise ValueError(f"Unsupported model type: {model_config.architectures[0]}")
-    return model_name
-def build_model(
-        model_ckpt='U4R/StructTable-InternVL2-1B',
-        cache_dir=None,
-        local_files_only=None,
-        **kwargs,
-    ):
-    model_name = get_model_name(model_ckpt)
-    if model_name == 'InternVL' and kwargs.get('lmdeploy', False):
-        model_name = 'InternVL_LMDeploy'
-    elif model_name == 'Pix2Struct' and kwargs.get('tensorrt_path', None):
-        model_name = 'Pix2StructTensorRT'
-    model = __ALL_MODELS__[model_name](
-        model_ckpt,
-        cache_dir=cache_dir,
-        local_files_only=local_files_only,
-        **kwargs
-    )
-    return model

doc_page_extractor-0.2.1/doc_page_extractor/struct_eqtable/internvl/__init__.py DELETED Viewed

@@ -1,2 +0,0 @@
-from .internvl import InternVL
-from .internvl_lmdeploy import InternVL_LMDeploy

doc-page-extractor 0.2.1__tar.gz → 0.2.3__tar.gz

Potentially problematic release.

doc-page-extractor 0.2.1.tar.gz → 0.2.3.tar.gz