PyPI - doc-page-extractor - Versions diffs - 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl - Mend

doc-page-extractor 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of doc-page-extractor might be problematic. Click here for more details.

Files changed (8) hide show

doc_page_extractor/__init__.py CHANGED Viewed

@@ -2,6 +2,7 @@ from .extractor import DocExtractor
 from .clipper import clip, clip_from_image
 from .plot import plot
 from .rectangle import Point, Rectangle
+from .model import Model, HuggingfaceModel
 from .types import (
   ExtractedResult,
   OCRFragment,

doc_page_extractor/extractor.py CHANGED Viewed

@@ -52,6 +52,13 @@ class DocExtractor:
     self._latex: LaTeX = LaTeX(device, model)
     self._layout_order: LayoutOrder = LayoutOrder(device, model)
+  def prepare_models(self):
+    self._model.get_onnx_ocr_path()
+    self._model.get_yolo_path()
+    self._model.get_layoutreader_path()
+    self._model.get_struct_eqtable_path()
+    self._model.get_latex_path()
   def extract(
       self,
       image: Image,

doc_page_extractor/model.py CHANGED Viewed

@@ -31,9 +31,10 @@ class HuggingfaceModel(Model):
   def get_onnx_ocr_path(self) -> Path:
     return self._get_model_path(
       repo_id="moskize/OnnxOCR",
-      filename=None,
+      filename="README.md",
       repo_type=None,
-      is_snapshot=True
+      is_snapshot=True,
+      wanna_dir_path=True,
     )
   def get_yolo_path(self) -> Path:
@@ -42,14 +43,16 @@ class HuggingfaceModel(Model):
       filename="models/Layout/YOLO/doclayout_yolo_ft.pt",
       repo_type=None,
       is_snapshot=False,
+      wanna_dir_path=False,
     )
   def get_layoutreader_path(self) -> Path:
     return self._get_model_path(
       repo_id="hantian/layoutreader",
-      filename=None,
+      filename="model.safetensors",
       repo_type=None,
       is_snapshot=True,
+      wanna_dir_path=True,
     )
   def get_struct_eqtable_path(self) -> Path:
@@ -58,6 +61,7 @@ class HuggingfaceModel(Model):
       filename="model.safetensors",
       repo_type=None,
       is_snapshot=True,
+      wanna_dir_path=True,
     )
   def get_latex_path(self) -> Path:
@@ -66,38 +70,45 @@ class HuggingfaceModel(Model):
       filename="checkpoints/weights.pth",
       repo_type="space",
       is_snapshot=True,
+      wanna_dir_path=True,
     )
   def _get_model_path(
         self,
         repo_id: str,
-        filename: str | None,
+        filename: str,
         repo_type: str | None,
         is_snapshot: bool,
+        wanna_dir_path: bool,
       ) -> Path:
     with self._lock:
-      cache_filename = "README.md"
-      if filename is not None:
-        cache_filename = filename
       model_path = try_to_load_from_cache(
         repo_id=repo_id,
-        filename=cache_filename,
+        filename=filename,
         repo_type=repo_type,
         cache_dir=self._model_cache_dir
       )
       if isinstance(model_path, str):
-        if filename is None:
-          model_path = Path(model_path).parent
-      elif is_snapshot:
-        model_path = snapshot_download(
-          cache_dir=self._model_cache_dir,
-          repo_id=repo_id,
-        )
+        model_path = Path(model_path)
+        if wanna_dir_path:
+          for _ in Path(filename).parts:
+            model_path = model_path.parent
       else:
-        model_path = hf_hub_download(
-          cache_dir=self._model_cache_dir,
-          repo_id=repo_id,
-          filename=filename,
-        )
-      return Path(model_path)
+        if is_snapshot:
+          model_path = snapshot_download(
+            cache_dir=self._model_cache_dir,
+            repo_id=repo_id,
+            repo_type=repo_type,
+          )
+        else:
+          model_path = hf_hub_download(
+            cache_dir=self._model_cache_dir,
+            repo_id=repo_id,
+            repo_type=repo_type,
+            filename=filename,
+          )
+        model_path = Path(model_path)
+      return model_path

{doc_page_extractor-0.2.1.dist-info → doc_page_extractor-0.2.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: doc-page-extractor
-Version: 0.2.1
+Version: 0.2.2
 Summary: doc page extractor can identify text and format in images and return structured data.
 Home-page: https://github.com/Moskize91/doc-page-extractor
 Author: Tao Zeyu

{doc_page_extractor-0.2.1.dist-info → doc_page_extractor-0.2.2.dist-info}/RECORD RENAMED Viewed

@@ -1,11 +1,11 @@
-doc_page_extractor/__init__.py,sha256=Q3cZxT1wpjb1kGI0fJ9YvL4Fh_rBN3NdLqaZh4ATFQM,312
+doc_page_extractor/__init__.py,sha256=rt_XALcqNNg3iVkMTUHltWxvdweH2FY6Y_olU2TkVBY,355
 doc_page_extractor/clipper.py,sha256=5S1TI0aqMebwlPv_Ih4Nxpp6MchEjOih-CiZfMWUAhI,3201
 doc_page_extractor/downloader.py,sha256=NbGN9ARnER8-gd4T1uc3W98WMEClVxMrqnShq8HibTw,455
-doc_page_extractor/extractor.py,sha256=SGDPEIjmp5HiAYq_nSVsKFTt0IvqJJXXDZRfFFRlfzU,6993
+doc_page_extractor/extractor.py,sha256=7dJ1N4d2_9tB49XAfKBIJjEUvNtjf0CGqcm2uf6BMPg,7205
 doc_page_extractor/latex.py,sha256=kD3NIzZTEGUFIAqqyHYmNcfyTrlu77GB1YSFgzbFb7A,1024
 doc_page_extractor/layout_order.py,sha256=lm_rXzRZ3AtufaRwxqZPfM9vplwaOlE2pnTNc7Z_oLM,7435
 doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
-doc_page_extractor/model.py,sha256=BBrsY2iQXCIsvqemjL37pKRPV3EtKkXsl7yA5_SI8RE,2610
+doc_page_extractor/model.py,sha256=yQhLz28rQ0aCHKs1AQgKdUIT9UkJtpA0HP2cpWN-rls,2891
 doc_page_extractor/ocr.py,sha256=niZY4ZgbbbV-IeTrtF3Y8LPPrynU2wvZKRVYbBP1Dog,5586
 doc_page_extractor/ocr_corrector.py,sha256=RfRA1jESEuqC8_a2kUEvHblT_B4xBjE0OApLMl1JiRg,3917
 doc_page_extractor/overlap.py,sha256=z1DF4_2OPvauDHwmz1SC1WosULkE84HKaRfNEgexPzc,5337
@@ -36,10 +36,10 @@ doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py,sha256=ACHxFntxS
 doc_page_extractor/struct_eqtable/pix2s/__init__.py,sha256=cXRo4eg6u1-TXktZ8rQf0HIzLmmScIwYQhbxMKl-MyA,76
 doc_page_extractor/struct_eqtable/pix2s/pix2s.py,sha256=fCNve8PNeJ3-AWJIhSeGtp-mYKoMXfW0CIpszkQnAaA,2535
 doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py,sha256=zSGw45JhWdZ3iuJel5Chsy-NzsOHx9QyPQIUAzzCjFE,43880
-doc_page_extractor-0.2.1.dist-info/licenses/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
+doc_page_extractor-0.2.2.dist-info/licenses/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_history_bus.py,sha256=g-bpDIiebyEHKDH0YS5OHF2ONfhZt3-EFLZhWJn94WE,2534
-doc_page_extractor-0.2.1.dist-info/METADATA,sha256=lAM7CCs34AN2TPf9dvNQQZu2EsGFVLMXPKqSaRddZQQ,2480
-doc_page_extractor-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-doc_page_extractor-0.2.1.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
-doc_page_extractor-0.2.1.dist-info/RECORD,,
+doc_page_extractor-0.2.2.dist-info/METADATA,sha256=SATSntedRznSoTzLUsKWBHNwiD3CDujgc9FEvK_Fbg4,2480
+doc_page_extractor-0.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+doc_page_extractor-0.2.2.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
+doc_page_extractor-0.2.2.dist-info/RECORD,,

{doc_page_extractor-0.2.1.dist-info → doc_page_extractor-0.2.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{doc_page_extractor-0.2.1.dist-info → doc_page_extractor-0.2.2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{doc_page_extractor-0.2.1.dist-info → doc_page_extractor-0.2.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

doc-page-extractor 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

Potentially problematic release.

doc-page-extractor 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl