PyPI - ocr-poetry-pkg - Versions diffs - 0.1.0__tar.gz - Mend

ocr-poetry-pkg 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

ocr_poetry_pkg-0.1.0/PKG-INFO +16 -0
ocr_poetry_pkg-0.1.0/README.md +2 -0
ocr_poetry_pkg-0.1.0/pyproject.toml +19 -0
ocr_poetry_pkg-0.1.0/src/ocr_poetry_pkg/__init__.py +1 -0
ocr_poetry_pkg-0.1.0/src/ocr_poetry_pkg/cli.py +110 -0
ocr_poetry_pkg-0.1.0/src/ocr_poetry_pkg/layout.py +43 -0
ocr_poetry_pkg-0.1.0/src/ocr_poetry_pkg/pdf2img.py +15 -0

ocr_poetry_pkg-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,16 @@
+Metadata-Version: 2.1
+Name: ocr-poetry-pkg
+Version: 0.1.0
+Summary:  OCR CLI for Arabic images using unsloth/surya
+Author: alka gupta
+Author-email: alka.gupta@ksolves.com
+Requires-Python: >=3.12,<4.0
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: click (>=8.3.0,<9.0.0)
+Requires-Dist: pillow (>=12.0.0,<13.0.0)
+Description-Content-Type: text/markdown
+OCR CLI for Arabic images using unsloth/surya

ocr_poetry_pkg-0.1.0/README.md ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ OCR CLI for Arabic images using unsloth/surya
2	+

ocr_poetry_pkg-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,19 @@
+[tool.poetry]
+name = "ocr-poetry-pkg"
+version = "0.1.0"
+description = " OCR CLI for Arabic images using unsloth/surya"
+authors = ["alka gupta <alka.gupta@ksolves.com>"]
+readme = "README.md"
+packages = [{ include = "ocr_poetry_pkg", from = "src" }]
+[tool.poetry.dependencies]  # NOTE(review): cli.py imports torch and unsloth, and pdf2img.py imports pdf2image, but none of them is declared here
+python = "^3.12"
+pillow = "^12.0.0"
+click = "^8.3.0"  # NOTE(review): none of the modules shipped in this release imports click
+[tool.poetry.scripts]
+ocr-batch = "ocr_poetry_pkg.cli:main"  # console entry point; importing cli.py also runs its module-level model loading
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"

ocr_poetry_pkg-0.1.0/src/ocr_poetry_pkg/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.1.0"

ocr_poetry_pkg-0.1.0/src/ocr_poetry_pkg/cli.py ADDED Viewed

@@ -0,0 +1,110 @@
+import json
+import math
+import traceback
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from PIL import Image
+import torch
+from unsloth import FastVisionModel
+# ---------------- Model Loading (executed when this module is imported) ----------------
+model, tokenizer = FastVisionModel.from_pretrained(
+    "AhmedZaky1/DIMI-Arabic-OCR-v2",
+    load_in_4bit=True,
+    device_map={"": 0},  # NOTE(review): presumably pins the whole model to device 0 - confirm against the loader's documentation
+)
+FastVisionModel.for_inference(model)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # "cuda" when available, otherwise "cpu"
+model.to(device)  # NOTE(review): the CPU fallback looks at odds with device_map={"": 0} above - confirm it is reachable
+# ---------------- Config ----------------
+IMAGES_DIR = Path("/home/alkaks1309/Desktop/ocr_poetry_pkg/images")  # folder containing page images
+JSON_BASE = Path("/home/alkaks1309/Desktop/ocr_poetry_pkg/detections/results")  # base folder for detection jsons
+OUT_DIR = Path("/home/alkaks1309/Desktop/ocr_poetry_pkg/output")  # one <image stem>.json result file per page is written here
+OUT_DIR.mkdir(parents=True, exist_ok=True)  # created at import time if it does not exist yet
+        # NOTE(review): "shrink from 2048 for speed" - no setting follows this comment; generate() still uses max_new_tokens=2048
+# ---------------- Helpers ----------------
+def clamp(v, lo, hi): return max(lo, min(v, hi))
+def axis_aligned_from_polygon(poly, img_w, img_h):
+    xs = [p[0] for p in poly]; ys = [p[1] for p in poly]
+    x0, y0 = clamp(int(min(xs)), 0, img_w - 1), clamp(int(min(ys)), 0, img_h - 1)
+    x1, y1 = clamp(int(max(xs)), 0, img_w), clamp(int(max(ys)), 0, img_h)
+    return [x0, y0, x1, y1]
+INSTRUCTION = (  # prompt text sent to the model together with every cropped region
+    "Extract only the Arabic text visible in this image. Ignore Urdu or Persian. "
+    "Preserve diacritics and punctuation exactly. Do not translate. Output Arabic only."
+)
+# ---------------- Core Function ----------------
+def process_image(image_path: Path):
+    try:
+        img = Image.open(image_path).convert("RGB")
+        img_w, img_h = img.size
+        # Construct the detection JSON path like:
+        # /kaggle/input/json-folder/results/surya/{stem}/results.json
+        json_path = JSON_BASE / image_path.stem / "results.json"
+        if not json_path.exists():
+            return {"image": str(image_path), "error": f"missing detection JSON: {json_path}"}
+        result_data = json.loads(json_path.read_text(encoding="utf-8"))
+        if isinstance(result_data, dict):
+            page_entries = next(iter(result_data.values()))
+        elif isinstance(result_data, list):
+            page_entries = result_data
+        else:
+            return {"image": str(image_path), "error": "invalid JSON structure"}
+        metadata = []
+        for p in page_entries:
+            for bb in p.get("bboxes", []):
+                poly = bb.get("polygon")
+                if not poly:
+                    continue
+                x0, y0, x1, y1 = axis_aligned_from_polygon(poly[:4], img_w, img_h)
+                crop = img.crop((x0, y0, x1, y1))
+                messages = [
+                    {"role": "user", "content": [
+                        {"type": "image", "image": crop},
+                        {"type": "text", "text": INSTRUCTION},
+                    ]}
+                ]
+                text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+                inputs = tokenizer(text=[text], images=[crop], return_tensors="pt", padding=True)
+                inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}
+                with torch.inference_mode():
+                    outputs = model.generate(**inputs, max_new_tokens=2048)
+                input_len = inputs["input_ids"].shape[1]
+                preds = outputs[:, input_len:]
+                decoded = tokenizer.batch_decode(preds, skip_special_tokens=True)
+                prediction = decoded[0] if decoded else ""
+                metadata.append({
+                    "bbox": bb,
+                    "prediction": prediction,
+                })
+        out_file = OUT_DIR / f"{image_path.stem}.json"
+        out_file.write_text(json.dumps({"image": str(image_path), "results": metadata}, ensure_ascii=False, indent=2))
+        return {"image": str(image_path), "status": "done"}
+    except Exception as e:
+        return {"image": str(image_path), "error": str(e), "traceback": traceback.format_exc()}
+# ---------------- Runner (sequential) ----------------
+def main():
+    images = sorted([p for p in IMAGES_DIR.iterdir() if p.suffix.lower() in {".jpg", ".jpeg", ".png"}])
+    for img in images:
+        result = process_image(img)
+        print(json.dumps(result, ensure_ascii=False))
+if __name__ == "__main__":
+    main()

ocr_poetry_pkg-0.1.0/src/ocr_poetry_pkg/layout.py ADDED Viewed

@@ -0,0 +1,43 @@
+import os
+import subprocess
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+IMAGES_DIR = Path("/home/alkaks1309/Desktop/ocr_poetry_pkg/images")  # folder scanned for input images
+VALID_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff"}  # lower-case suffixes accepted by get_images
+# Size of the thread pool in main(); each worker runs one surya_layout subprocess at a time.
+MAX_WORKERS = min(8, (os.cpu_count() or 4))  # at most 8; falls back to 4 when the CPU count is unknown
+def run_layout(image_path: Path):
+    cmd = ["surya_layout", str(image_path)]
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
+        return (image_path.name, True, proc.stdout, proc.stderr)
+    except subprocess.CalledProcessError as e:
+        return (image_path.name, False, e.stdout or "", e.stderr or str(e))
+def get_images(folder: Path):
+    return [p for p in folder.iterdir() if p.suffix.lower() in VALID_EXTS]
+def main():
+    images = get_images(IMAGES_DIR)
+    if not images:
+        print("No images found.")
+        return
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
+        futures = {ex.submit(run_layout, img): img for img in images}
+        for fut in as_completed(futures):
+            name, ok, out, err = fut.result()
+            if ok:
+                print(f"{name} finished.")
+                if out.strip():
+                    print(f"  stdout: {out.strip()}")
+            else:
+                print(f"{name} failed.")
+                if err:
+                    print(f"  stderr: {err.strip()}")
+if __name__ == "__main__":
+    main()

ocr_poetry_pkg-0.1.0/src/ocr_poetry_pkg/pdf2img.py ADDED Viewed

@@ -0,0 +1,15 @@
+from pdf2image import convert_from_path
+from pathlib import Path
+PDF = "/path/to/input.pdf"  # placeholder: replace with the PDF to convert
+OUT_DIR = Path("out_images")  # relative to the current working directory
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+# Convert all pages, then save each one as a numbered JPEG.
+pages = convert_from_path(PDF, dpi=300)  # increase dpi for better quality
+for i, page in enumerate(pages, start=1):
+    out_path = OUT_DIR / f"page_{i:03d}.jpg"  # page_001.jpg, page_002.jpg, ...
+    page.save(out_path, "JPEG", quality=95)
+# To convert only a page range (or a single page), pass first_page/last_page:
+# pages = convert_from_path(PDF, dpi=300, first_page=2, last_page=5)