ocr-poetry-pkg 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocr_poetry_pkg-0.1.0/PKG-INFO +16 -0
- ocr_poetry_pkg-0.1.0/README.md +2 -0
- ocr_poetry_pkg-0.1.0/pyproject.toml +19 -0
- ocr_poetry_pkg-0.1.0/src/ocr_poetry_pkg/__init__.py +1 -0
- ocr_poetry_pkg-0.1.0/src/ocr_poetry_pkg/cli.py +110 -0
- ocr_poetry_pkg-0.1.0/src/ocr_poetry_pkg/layout.py +43 -0
- ocr_poetry_pkg-0.1.0/src/ocr_poetry_pkg/pdf2img.py +15 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: ocr-poetry-pkg
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: OCR CLI for Arabic images using unsloth/surya
|
|
5
|
+
Author: alka gupta
|
|
6
|
+
Author-email: alka.gupta@ksolves.com
|
|
7
|
+
Requires-Python: >=3.12,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Requires-Dist: click (>=8.3.0,<9.0.0)
|
|
11
|
+
Requires-Dist: pillow (>=12.0.0,<13.0.0)
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
OCR CLI for Arabic images using unsloth/surya
|
|
15
|
+
|
|
16
|
+
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "ocr-poetry-pkg"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "OCR CLI for Arabic images using unsloth/surya"
|
|
5
|
+
authors = ["alka gupta <alka.gupta@ksolves.com>"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
packages = [{ include = "ocr_poetry_pkg", from = "src" }]
|
|
8
|
+
|
|
9
|
+
[tool.poetry.dependencies]
|
|
10
|
+
python = "^3.12"
|
|
11
|
+
pillow = "^12.0.0"
|
|
12
|
+
click = "^8.3.0"
|
|
13
|
+
|
|
14
|
+
[tool.poetry.scripts]
|
|
15
|
+
ocr-batch = "ocr_poetry_pkg.cli:main"
|
|
16
|
+
|
|
17
|
+
[build-system]
|
|
18
|
+
requires = ["poetry-core>=1.0.0"]
|
|
19
|
+
build-backend = "poetry.core.masonry.api"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import json
import math
import os
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import torch
from PIL import Image
from unsloth import FastVisionModel
|
|
9
|
+
|
|
10
|
+
# ---------------- Model Loading ----------------
# NOTE: this runs at import time, so importing the module loads the checkpoint.
# ``device_map={"": 0}` places the whole model on device index 0.
model, tokenizer = FastVisionModel.from_pretrained(
    "AhmedZaky1/DIMI-Arabic-OCR-v2",
    load_in_4bit=True,
    device_map={"": 0},
)
FastVisionModel.for_inference(model)

# Device that the input tensors are moved to before generation.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The weights are already placed by ``device_map`` above, so the model is not
# moved again: calling ``.to()`` on a 4-bit (bitsandbytes) quantized model is
# redundant here and is rejected by some transformers/bitsandbytes versions.
|
|
20
|
+
|
|
21
|
+
# ---------------- Config ----------------
# Each location may be overridden with an environment variable; when the
# variable is unset or empty the original hard-coded path is used.
IMAGES_DIR = Path(
    os.environ.get("OCR_IMAGES_DIR")
    or "/home/alkaks1309/Desktop/ocr_poetry_pkg/images"
)  # folder containing page images
JSON_BASE = Path(
    os.environ.get("OCR_JSON_BASE")
    or "/home/alkaks1309/Desktop/ocr_poetry_pkg/detections/results"
)  # base folder for detection jsons
OUT_DIR = Path(
    os.environ.get("OCR_OUT_DIR")
    or "/home/alkaks1309/Desktop/ocr_poetry_pkg/output"
)  # folder that receives one <stem>.json per processed image
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
26
|
+
|
|
27
|
+
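# NOTE(review): orphaned comment ("shrink from 2048 for speed") - no setting follows it; presumably it refers to max_new_tokens, which is still 2048 in process_image. Confirm or remove.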
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ---------------- Helpers ----------------
|
|
31
|
+
def clamp(v, lo, hi):
    """Limit ``v`` to the range ``[lo, hi]``; ``lo`` takes precedence if ``lo > hi``."""
    # Apply the upper bound first, then the lower bound, so the lower bound
    # wins when the two bounds are inconsistent.
    capped = hi if hi < v else v
    return capped if capped > lo else lo
|
|
32
|
+
|
|
33
|
+
def axis_aligned_from_polygon(poly, img_w, img_h):
    """Return the axis-aligned box ``[x0, y0, x1, y1]`` enclosing ``poly``.

    Args:
        poly: Non-empty sequence of points; index 0 of each point is x and
            index 1 is y.
        img_w: Image width in pixels.
        img_h: Image height in pixels.

    Returns:
        A list ``[x0, y0, x1, y1]`` of ints clipped to the image, suitable for
        ``Image.crop``. The box is always at least one pixel wide and high.

    Raises:
        ValueError: If ``poly`` is empty.
    """
    xs = [pt[0] for pt in poly]
    ys = [pt[1] for pt in poly]

    # Round the top-left corner down and the bottom-right corner up so that
    # fractional coordinates never cut pixels off the detected region.
    x0 = max(0, min(math.floor(min(xs)), img_w - 1))
    y0 = max(0, min(math.floor(min(ys)), img_h - 1))

    # Force x1 > x0 and y1 > y0: a degenerate polygon would otherwise yield a
    # zero-area crop.
    x1 = max(x0 + 1, min(math.ceil(max(xs)), img_w))
    y1 = max(y0 + 1, min(math.ceil(max(ys)), img_h))
    return [x0, y0, x1, y1]
|
|
38
|
+
|
|
39
|
+
INSTRUCTION = (
    "Extract only the Arabic text visible in this image. Ignore Urdu or Persian. "
    "Preserve diacritics and punctuation exactly. Do not translate. Output Arabic only."
)


# ---------------- Core Function ----------------
def _recognize_crop(crop):
    """Run the vision model on one cropped region and return the decoded text."""
    messages = [
        {"role": "user", "content": [
            {"type": "image", "image": crop},
            {"type": "text", "text": INSTRUCTION},
        ]}
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text=[text], images=[crop], return_tensors="pt", padding=True)
    # Only values that expose ``.to`` are moved to the target device.
    inputs = {k: v.to(device) if hasattr(v, "to") else v for k, v in inputs.items()}

    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=2048)

    # Drop the prompt tokens so that only the newly generated ones are decoded.
    input_len = inputs["input_ids"].shape[1]
    preds = outputs[:, input_len:]
    decoded = tokenizer.batch_decode(preds, skip_special_tokens=True)
    return decoded[0] if decoded else ""


def process_image(image_path: Path):
    """OCR every detected region of one page image and write the results.

    The detection file is read from ``JSON_BASE/<image stem>/results.json``.
    It may be a list of page entries, or a dict whose first value is such a
    list. Each page entry is read with ``.get("bboxes", [])`` and each bbox
    with ``.get("polygon")``; bboxes without a polygon are skipped.

    The output is written to ``OUT_DIR/<image stem>.json`` as UTF-8.

    Args:
        image_path: Path of the page image to process.

    Returns:
        ``{"image": ..., "status": "done"}`` on success, otherwise a dict with
        an ``"error"`` key (plus ``"traceback"`` for unexpected exceptions).
        This function does not raise; failures are reported in the result.
    """
    try:
        # Close the file handle as soon as the pixel data has been converted.
        with Image.open(image_path) as src:
            img = src.convert("RGB")
        img_w, img_h = img.size

        json_path = JSON_BASE / image_path.stem / "results.json"
        if not json_path.exists():
            return {"image": str(image_path), "error": f"missing detection JSON: {json_path}"}

        result_data = json.loads(json_path.read_text(encoding="utf-8"))
        if isinstance(result_data, dict):
            # An empty dict means "no detections" rather than an error.
            page_entries = next(iter(result_data.values()), [])
        elif isinstance(result_data, list):
            page_entries = result_data
        else:
            return {"image": str(image_path), "error": "invalid JSON structure"}

        metadata = []
        for p in page_entries:
            for bb in p.get("bboxes", []):
                poly = bb.get("polygon")
                if not poly:
                    continue
                x0, y0, x1, y1 = axis_aligned_from_polygon(poly[:4], img_w, img_h)
                if x1 <= x0 or y1 <= y0:
                    # A zero-area region cannot be recognised; keep the entry
                    # so the output still has one item per polygon.
                    prediction = ""
                else:
                    prediction = _recognize_crop(img.crop((x0, y0, x1, y1)))

                metadata.append({
                    "bbox": bb,
                    "prediction": prediction,
                })

        out_file = OUT_DIR / f"{image_path.stem}.json"
        # ensure_ascii=False emits raw Arabic characters, so the encoding must
        # be explicit instead of depending on the platform locale.
        out_file.write_text(
            json.dumps({"image": str(image_path), "results": metadata}, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        return {"image": str(image_path), "status": "done"}

    except Exception as e:
        # Per-image boundary: report the failure instead of aborting the batch.
        return {"image": str(image_path), "error": str(e), "traceback": traceback.format_exc()}
|
|
101
|
+
|
|
102
|
+
# ---------------- Runner ----------------
def main():
    """Process every image in ``IMAGES_DIR`` sequentially, printing one JSON line each.

    Returns:
        ``None`` after processing, or ``1`` when the images directory is
        missing or contains no supported image (a JSON error line is printed
        in that case so the output stays machine-readable).
    """
    # Same extensions as the layout step, so every page that received a
    # detection JSON is also OCR'd.
    exts = {".jpg", ".jpeg", ".png", ".tif", ".tiff"}

    if not IMAGES_DIR.is_dir():
        print(json.dumps({"error": f"images directory not found: {IMAGES_DIR}"}, ensure_ascii=False))
        return 1

    images = sorted(p for p in IMAGES_DIR.iterdir() if p.suffix.lower() in exts)
    if not images:
        print(json.dumps({"error": f"no images found in: {IMAGES_DIR}"}, ensure_ascii=False))
        return 1

    for img in images:
        result = process_image(img)
        print(json.dumps(result, ensure_ascii=False))


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import subprocess
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
5
|
+
|
|
6
|
+
# Folder scanned for page images. Set the OCR_IMAGES_DIR environment variable
# to override it; when unset or empty the original hard-coded path is used.
IMAGES_DIR = Path(
    os.environ.get("OCR_IMAGES_DIR")
    or "/home/alkaks1309/Desktop/ocr_poetry_pkg/images"
)
# File suffixes (lower-case, with the dot) that are treated as images.
VALID_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff"}

# tune this: number of layout commands to run in parallel
MAX_WORKERS = min(8, (os.cpu_count() or 4))  # example: up to 8 or CPU count
|
|
11
|
+
|
|
12
|
+
def run_layout(image_path: Path):
    """Run the ``surya_layout`` command on one image.

    Args:
        image_path: Path of the image passed to the command.

    Returns:
        A tuple ``(file name, succeeded, stdout, stderr)``. ``succeeded`` is
        ``False`` when the command exits with a non-zero status or cannot be
        started at all; this function does not raise for either case.
    """
    cmd = ["surya_layout", str(image_path)]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        return (image_path.name, False, e.stdout or "", e.stderr or str(e))
    except OSError as e:
        # Raised when the executable is missing or not runnable; report it as
        # a failure instead of letting it abort the whole batch.
        return (image_path.name, False, "", str(e))
    return (image_path.name, True, proc.stdout, proc.stderr)
|
|
19
|
+
|
|
20
|
+
def get_images(folder: Path):
    """Return the image files directly inside ``folder``, sorted by path.

    A file counts as an image when its lower-cased suffix is in
    ``VALID_EXTS``. Sub-directories are ignored even if their name ends in an
    image suffix. A missing ``folder`` yields an empty list.
    """
    if not folder.is_dir():
        return []
    # Sorted so that the processing order is deterministic across runs.
    return sorted(
        p for p in folder.iterdir()
        if p.is_file() and p.suffix.lower() in VALID_EXTS
    )
|
|
22
|
+
|
|
23
|
+
def main():
    """Run layout detection on every image in ``IMAGES_DIR`` using a thread pool.

    One ``run_layout`` call is submitted per image and a status line is
    printed as each call completes. A worker that raises is reported as a
    failure for its image and does not stop the remaining results from being
    reported.
    """
    images = get_images(IMAGES_DIR)
    if not images:
        print("No images found.")
        return

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        # Map each future back to its image so failures can be attributed.
        futures = {ex.submit(run_layout, img): img for img in images}
        for fut in as_completed(futures):
            try:
                name, ok, out, err = fut.result()
            except Exception as exc:
                # Keep draining the other futures instead of aborting.
                print(f"{futures[fut].name} failed.")
                print(f" stderr: {exc}")
                continue

            if ok:
                print(f"{name} finished.")
                if out.strip():
                    print(f" stdout: {out.strip()}")
            else:
                print(f"{name} failed.")
                if err.strip():
                    print(f" stderr: {err.strip()}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from pdf2image import convert_from_path
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
# Input PDF and the folder that receives one JPEG per page.
PDF = "/path/to/input.pdf"
OUT_DIR = Path("out_images")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Render all pages; raise dpi for better quality.
pages = convert_from_path(PDF, dpi=300)

# Pages are numbered from 1 and zero-padded to three digits in the file name.
for page_no, page_img in enumerate(pages, 1):
    page_img.save(OUT_DIR.joinpath(f"page_{page_no:03d}.jpg"), "JPEG", quality=95)

# To convert a page range or a single page instead:
# pages = convert_from_path(PDF, dpi=300, first_page=2, last_page=5)
|