PyPI - pdfprep - Versions diffs - 0.2.0__tar.gz → 0.2.1__tar.gz - Mend

pdfprep 0.2.0.tar.gz → 0.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

{pdfprep-0.2.0 → pdfprep-0.2.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pdfprep
-Version: 0.2.0
+Version: 0.2.1
 Summary: PDF 전처리 통합 도구 — 메타데이터, 텍스트 파싱, OCR, 표 추출
 Author-email: uwpark <uwpark@simplatform.com>
 License: MIT

{pdfprep-0.2.0 → pdfprep-0.2.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "pdfprep"
-version = "0.2.0"
+version = "0.2.1"
 description = "PDF 전처리 통합 도구 — 메타데이터, 텍스트 파싱, OCR, 표 추출"
 readme = "README.md"
 requires-python = ">=3.12"

{pdfprep-0.2.0 → pdfprep-0.2.1}/src/pdfprep/__init__.py RENAMED Viewed

@@ -8,7 +8,7 @@ pdfprep — PDF 전처리 도구 모음
     table     : 3종 라이브러리(camelot / tabula / layoutparser)로 표 추출
 """
-__version__ = "0.2.0"
+__version__ = "0.2.1"
 from . import metadata, parsing, ocr, table

{pdfprep-0.2.0 → pdfprep-0.2.1}/src/pdfprep/ocr.py RENAMED Viewed

@@ -71,8 +71,18 @@ def ocr_with_paddleocr(pdf_path, lang="korean", dpi=200):
     PaddleOCR — Baidu의 딥러닝 기반 OCR.
     검출(detection) + 인식(recognition) 단계를 거쳐 박스 + 텍스트 + 신뢰도 반환.
     """
-    import numpy as np
-    from paddleocr import PaddleOCR
+    try:
+        import numpy as np
+        from paddleocr import PaddleOCR
+    except ImportError as e:
+        raise RuntimeError(
+            "paddleocr 엔진을 사용할 수 없습니다 — paddleocr / paddlepaddle 가 설치되어 있지 않습니다.\n"
+            f"  현재 Python {sys.version_info.major}.{sys.version_info.minor} 환경입니다. "
+            "paddleocr 2.7.3 / paddlepaddle 2.6.2 는 Python 3.12 이하 휠만 제공하므로 "
+            "3.13+ 에서는 설치 대상에서 자동 제외됩니다.\n"
+            "  → Python 3.12 환경에서 `pip install \"pdfprep[ocr]\"` 로 설치하거나, "
+            "tesseract 엔진(engine='tesseract')을 사용하세요."
+        ) from e
     ocr = PaddleOCR(use_angle_cls=True, lang=lang, show_log=False)

{pdfprep-0.2.0 → pdfprep-0.2.1}/src/pdfprep/table.py RENAMED Viewed

@@ -128,7 +128,17 @@ def extract_with_layoutparser(pdf_path, dpi=200, model_name=None):
     if not hasattr(Image, "CUBIC"):
         Image.CUBIC = Image.BICUBIC
-    import layoutparser as lp
+    try:
+        import layoutparser as lp
+    except ImportError as e:
+        raise RuntimeError(
+            "layoutparser 엔진을 사용할 수 없습니다 — layoutparser 가 설치되어 있지 않습니다.\n"
+            f"  현재 Python {sys.version_info.major}.{sys.version_info.minor} 환경입니다. "
+            "layoutparser + paddlepaddle 백엔드는 Python 3.12 이하 휠만 제공하므로 "
+            "3.13+ 에서는 설치 대상에서 자동 제외됩니다.\n"
+            "  → Python 3.12 환경에서 `pip install \"pdfprep[table]\"` 로 설치하거나, "
+            "camelot / tabula 엔진을 사용하세요."
+        ) from e
     # PaddleDetection 백엔드 + TableBank 모델 (표 전용 학습)
     try:

{pdfprep-0.2.0 → pdfprep-0.2.1}/src/pdfprep.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pdfprep
-Version: 0.2.0
+Version: 0.2.1
 Summary: PDF 전처리 통합 도구 — 메타데이터, 텍스트 파싱, OCR, 표 추출
 Author-email: uwpark <uwpark@simplatform.com>
 License: MIT