ocr-poetry-pkg 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
ocr_poetry_pkg/cli.py ADDED
@@ -0,0 +1,110 @@
1
+ import json
2
+ import math
3
+ import traceback
4
+ from pathlib import Path
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from PIL import Image
7
+ import torch
8
+ from unsloth import FastVisionModel
9
+
10
+ # ---------------- Model Loading ----------------
11
# Load the OCR vision model and its tokenizer once, at import time.
# load_in_4bit requests 4-bit quantised weights; device_map={"": 0} places the
# whole model on GPU index 0 while it is being loaded.
model, tokenizer = FastVisionModel.from_pretrained(
    "AhmedZaky1/DIMI-Arabic-OCR-v2",
    load_in_4bit=True,
    device_map={"": 0},
)
FastVisionModel.for_inference(model)

# Device that the per-crop input tensors are moved to before generation.
# The model itself is deliberately not moved with model.to(device): device_map
# has already placed it on the GPU, so the call is redundant, and .to() on a
# bitsandbytes 4-bit model raises ValueError on some transformers/bitsandbytes
# versions.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
+
21
+ # ---------------- Config ----------------
22
+ IMAGES_DIR = Path("/home/alkaks1309/Desktop/ocr_poetry_pkg/images") # folder containing page images
23
+ JSON_BASE = Path("/home/alkaks1309/Desktop/ocr_poetry_pkg/detections/results") # base folder for detection jsons
24
+ OUT_DIR = Path("/home/alkaks1309/Desktop/ocr_poetry_pkg/output")
25
+ OUT_DIR.mkdir(parents=True, exist_ok=True)
26
+
27
+ # TODO: max_new_tokens is hard-coded to 2048 in the generate() call below; shrink it for speed
28
+
29
+
30
+ # ---------------- Helpers ----------------
31
def clamp(v, lo, hi):
    """Return ``v`` limited to the closed interval ``[lo, hi]``.

    The lower bound is applied last, so ``lo`` wins if ``lo > hi``.
    """
    return max(lo, min(v, hi))


def axis_aligned_from_polygon(poly, img_w, img_h):
    """Return the axis-aligned pixel box ``[x0, y0, x1, y1]`` enclosing ``poly``.

    The minimum corner is rounded down and the maximum corner is rounded up,
    so fractional polygon coordinates never cut pixels off the far edge. The
    box is clipped to the image and is always at least one pixel wide and
    tall, which keeps a later crop from being empty.

    Args:
        poly: Non-empty sequence of points; element 0 of each point is x and
            element 1 is y.
        img_w: Image width in pixels.
        img_h: Image height in pixels.

    Returns:
        ``[x0, y0, x1, y1]`` as ints with ``0 <= x0 < x1`` and ``0 <= y0 < y1``.

    Raises:
        ValueError: If ``poly`` is empty.
    """
    if not poly:
        raise ValueError("polygon must contain at least one point")

    xs = [pt[0] for pt in poly]
    ys = [pt[1] for pt in poly]

    # Top-left corner: round down, keep it on a valid pixel index.
    x0 = clamp(math.floor(min(xs)), 0, img_w - 1)
    y0 = clamp(math.floor(min(ys)), 0, img_h - 1)
    # Bottom-right corner is exclusive: round up, and force it past the
    # top-left corner so degenerate polygons still give a 1 px box.
    x1 = clamp(math.ceil(max(xs)), x0 + 1, img_w)
    y1 = clamp(math.ceil(max(ys)), y0 + 1, img_h)
    return [x0, y0, x1, y1]
38
+
39
+ INSTRUCTION = (
40
+ "Extract only the Arabic text visible in this image. Ignore Urdu or Persian. "
41
+ "Preserve diacritics and punctuation exactly. Do not translate. Output Arabic only."
42
+ )
43
+
44
+ # ---------------- Core Function ----------------
45
def _recognize_crop(crop):
    """Run the OCR model on one cropped image and return the decoded text."""
    messages = [
        {"role": "user", "content": [
            {"type": "image", "image": crop},
            {"type": "text", "text": INSTRUCTION},
        ]}
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text=[text], images=[crop], return_tensors="pt", padding=True)
    # Move tensor-like values to the inference device; leave anything else untouched.
    inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}

    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=2048)

    # Skip the first input_len positions (the prompt) so only the newly
    # generated tokens are decoded.
    input_len = inputs["input_ids"].shape[1]
    decoded = tokenizer.batch_decode(outputs[:, input_len:], skip_special_tokens=True)
    return decoded[0] if decoded else ""


def process_image(image_path: Path):
    """OCR every detected box of one page image and write the results as JSON.

    Detections are read from ``JSON_BASE/<image stem>/results.json``. Each
    bounding-box polygon is cropped out of the page and passed to the model.
    The predictions are written to ``OUT_DIR/<image stem>.json`` as UTF-8.

    Args:
        image_path: Path of the page image to process.

    Returns:
        A dict that always has ``"image"``. On success it also has
        ``"status": "done"``; on failure it has ``"error"``, plus
        ``"traceback"`` when the failure was an unexpected exception.
        Exceptions are reported this way instead of being raised.
    """
    try:
        # convert() returns an independent in-memory image, so the file handle
        # can be closed as soon as the with-block ends.
        with Image.open(image_path) as source:
            img = source.convert("RGB")
        img_w, img_h = img.size

        # Detection JSON is expected at JSON_BASE/<image stem>/results.json.
        json_path = JSON_BASE / image_path.stem / "results.json"
        if not json_path.exists():
            return {"image": str(image_path), "error": f"missing detection JSON: {json_path}"}

        result_data = json.loads(json_path.read_text(encoding="utf-8"))
        if isinstance(result_data, dict):
            if not result_data:
                # Report an empty mapping explicitly instead of letting
                # next() fail with a message-less StopIteration.
                return {"image": str(image_path), "error": f"empty detection JSON: {json_path}"}
            # Only the first value of the mapping is used as the page entries.
            page_entries = next(iter(result_data.values()))
        elif isinstance(result_data, list):
            page_entries = result_data
        else:
            return {"image": str(image_path), "error": "invalid JSON structure"}

        metadata = []
        for p in page_entries:
            for bb in p.get("bboxes", []):
                poly = bb.get("polygon")
                if not poly:
                    continue
                x0, y0, x1, y1 = axis_aligned_from_polygon(poly[:4], img_w, img_h)
                if x1 <= x0 or y1 <= y0:
                    # A zero-area crop has nothing to read; record an empty
                    # prediction rather than failing the whole page.
                    prediction = ""
                else:
                    prediction = _recognize_crop(img.crop((x0, y0, x1, y1)))

                metadata.append({
                    "bbox": bb,
                    "prediction": prediction,
                })

        out_file = OUT_DIR / f"{image_path.stem}.json"
        # ensure_ascii=False emits raw Arabic characters, so the file must be
        # written as UTF-8 regardless of the platform's default encoding.
        out_file.write_text(
            json.dumps({"image": str(image_path), "results": metadata}, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        return {"image": str(image_path), "status": "done"}

    except Exception as e:
        # Batch boundary: report the failure for this image and let the caller continue.
        return {"image": str(image_path), "error": str(e), "traceback": traceback.format_exc()}
101
+
102
+ # ---------------- Batch Runner (sequential) ----------------
103
def main():
    """Run OCR over every image in ``IMAGES_DIR``, one image at a time.

    This is the target of the ``ocr-batch`` console script. Images are
    processed in sorted path order, and the result dict of each one is
    printed as a single JSON line.
    """
    # Same extensions the layout-detection step accepts, so an image that got
    # layout detections is not silently skipped here.
    image_suffixes = {".jpg", ".jpeg", ".png", ".tif", ".tiff"}
    images = sorted(
        p for p in IMAGES_DIR.iterdir()
        if p.is_file() and p.suffix.lower() in image_suffixes
    )
    for image_path in images:
        result = process_image(image_path)
        print(json.dumps(result, ensure_ascii=False))
108
+
109
+ if __name__ == "__main__":
110
+ main()
@@ -0,0 +1,43 @@
1
+ import os
2
+ import subprocess
3
+ from pathlib import Path
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+
6
+ IMAGES_DIR = Path("/home/alkaks1309/Desktop/ocr_poetry_pkg/images")
7
+ VALID_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff"}
8
+
9
+ # tune this: number of parallel processes to run
10
+ MAX_WORKERS = min(8, (os.cpu_count() or 4)) # example: up to 8 or CPU count
11
+
12
def run_layout(image_path: Path):
    """Run the ``surya_layout`` command on one image and capture its output.

    Args:
        image_path: Image file passed to ``surya_layout`` as its only argument.

    Returns:
        A tuple ``(image file name, ok, stdout, stderr)``. ``ok`` is False
        when the command exits non-zero or cannot be started at all. In that
        case the last element carries the captured stderr or, if there is
        none, a description of the error.
    """
    cmd = ["surya_layout", str(image_path)]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        return (image_path.name, False, e.stdout or "", e.stderr or str(e))
    except OSError as e:
        # The executable is missing or not runnable; report it as a failed
        # image instead of letting the exception escape the worker.
        return (image_path.name, False, "", str(e))
    return (image_path.name, True, proc.stdout, proc.stderr)
19
+
20
def get_images(folder: Path):
    """Return the image files directly inside ``folder``, sorted by path.

    Only regular files whose lower-cased suffix is in ``VALID_EXTS`` are
    returned. Sub-directories are ignored even if their name ends in an image
    extension, and sorting makes the order independent of the filesystem.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A list of ``Path`` objects.
    """
    return sorted(
        entry for entry in folder.iterdir()
        if entry.is_file() and entry.suffix.lower() in VALID_EXTS
    )
22
+
23
def main():
    """Run layout detection on every image in ``IMAGES_DIR`` concurrently.

    Each image is handed to ``run_layout`` on a thread pool of
    ``MAX_WORKERS`` threads. Results are printed as they complete, so the
    output order is not the submission order.
    """
    images = get_images(IMAGES_DIR)
    if not images:
        print("No images found.")
        return

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        # Map each future back to its image so a crashed worker can still be
        # reported by name.
        futures = {ex.submit(run_layout, img): img for img in images}
        for fut in as_completed(futures):
            try:
                name, ok, out, err = fut.result()
            except Exception as exc:
                # The worker raised instead of returning a result tuple;
                # report it as a failure and keep going with the others.
                name, ok, out, err = futures[fut].name, False, "", str(exc)

            if ok:
                print(f"{name} finished.")
                if out.strip():
                    print(f" stdout: {out.strip()}")
            else:
                print(f"{name} failed.")
                if err:
                    print(f" stderr: {err.strip()}")
41
+
42
+ if __name__ == "__main__":
43
+ main()
@@ -0,0 +1,15 @@
1
+ from pdf2image import convert_from_path
2
+ from pathlib import Path
3
+
4
# Source PDF to rasterise, and the folder that receives one JPEG per page.
PDF = "/path/to/input.pdf"
OUT_DIR = Path("out_images")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Render every page at 300 dpi (raise dpi for better quality), then save the
# pages as page_001.jpg, page_002.jpg, ... in document order.
pages = convert_from_path(PDF, dpi=300)
for page_number, page_image in enumerate(pages, 1):
    target = OUT_DIR / f"page_{page_number:03d}.jpg"
    page_image.save(target, "JPEG", quality=95)

# To convert only a page range (or a single page), pass first_page/last_page:
# pages = convert_from_path(PDF, dpi=300, first_page=2, last_page=5)
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.1
2
+ Name: ocr-poetry-pkg
3
+ Version: 0.1.0
4
+ Summary: OCR CLI for Arabic images using unsloth/surya
5
+ Author: alka gupta
6
+ Author-email: alka.gupta@ksolves.com
7
+ Requires-Python: >=3.12,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Requires-Dist: click (>=8.3.0,<9.0.0)
11
+ Requires-Dist: pillow (>=12.0.0,<13.0.0)
12
+ Description-Content-Type: text/markdown
13
+
14
+ OCR CLI for Arabic images using unsloth/surya
15
+
16
+
@@ -0,0 +1,8 @@
1
+ ocr_poetry_pkg/__init__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
2
+ ocr_poetry_pkg/cli.py,sha256=DiIhyvvmeiJQJKDn7LPs6I5Wd2T72Wd5drW-RGqLaFw,4417
3
+ ocr_poetry_pkg/layout.py,sha256=FPrXLid1jon8_Ymcp_ncfH0ccYwgM-CXpaxQ4fsN5CQ,1490
4
+ ocr_poetry_pkg/pdf2img.py,sha256=M_94-l3kyY52JB7wsJJkU1fMNZmtOjnNhyp14BPeM6A,501
5
+ ocr_poetry_pkg-0.1.0.dist-info/METADATA,sha256=1fe9VPz9rJqu618guJSczmqPRu66qRsxw6It4qkk75w,466
6
+ ocr_poetry_pkg-0.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
7
+ ocr_poetry_pkg-0.1.0.dist-info/entry_points.txt,sha256=DokfqNNU6gzryAbzrW_lOYanz0k2ZeiPPyTlDTaWXzE,53
8
+ ocr_poetry_pkg-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 1.9.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ ocr-batch=ocr_poetry_pkg.cli:main
3
+