ocrany 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ from src.light_ocr import LightWeightOCR
2
+ from src.hard_ocr import HardWeightOCR
3
+ from src.extractor import Extractor
4
+
5
# Names exported by `from <package> import *`; each one is imported above.
__all__ = ["LightWeightOCR", "HardWeightOCR", "Extractor"]
extractor.py ADDED
@@ -0,0 +1,26 @@
1
+ import pypdfium2 as pdfium
2
+ from chandra.input import load_pdf_images
3
+
4
+ from ocr.src.light_ocr import LightWeightOCR
5
+ from ocr.src.hard_ocr import HardWeightOCR
6
+
7
+
8
class Extractor:
    """Route a PDF to the light or the hard OCR backend based on page count.

    A document with exactly one page is handed, as a page object, to
    ``LightWeightOCR``. Any other document is loaded as images through
    ``load_pdf_images`` and handed to ``HardWeightOCR``.
    """

    # Only the first MAX_HARD_OCR_PAGES pages are sent to the hard backend;
    # later pages are not processed.
    MAX_HARD_OCR_PAGES = 40

    def __init__(
        self,
        light_model_name_or_path: str,
        light_vllm_api_url: str,
        hard_ocr_mode: str = "vllm",
        hard_vllm_api_url: str = None,
    ):
        """Build both OCR backends.

        Args:
            light_model_name_or_path: Forwarded to ``LightWeightOCR``.
            light_vllm_api_url: Forwarded to ``LightWeightOCR``.
            hard_ocr_mode: Forwarded to ``HardWeightOCR``.
            hard_vllm_api_url: Forwarded to ``HardWeightOCR``; may be ``None``.
        """
        self.light_ocr = LightWeightOCR(light_model_name_or_path, light_vllm_api_url)
        self.hard_ocr = HardWeightOCR(hard_ocr_mode, hard_vllm_api_url)

    def extract(self, file_path: str, max_tokens: int, temperature: float) -> str:
        """Run OCR on the PDF at ``file_path`` and return the extracted text.

        Args:
            file_path: Path of the PDF to process.
            max_tokens: Passed to the light backend; only used when the
                document has exactly one page.
            temperature: Passed to the light backend; only used when the
                document has exactly one page.

        Returns:
            The string produced by whichever backend handled the document.
        """
        pdf = pdfium.PdfDocument(file_path)
        try:
            page_count = len(pdf)
            if page_count == 1:
                # The page must be consumed while the document is still open,
                # so the light backend runs inside the try block.
                return self.light_ocr.extract(pdf[0], max_tokens, temperature)
        finally:
            # Release the document handle on every path, including errors.
            pdf.close()

        # load_pdf_images works from the file path, so the handle opened above
        # is not needed past this point.
        page_range = list(range(min(self.MAX_HARD_OCR_PAGES, page_count)))
        images = load_pdf_images(file_path, page_range=page_range)
        return self.hard_ocr.extract(images)
hard_ocr.py ADDED
@@ -0,0 +1,23 @@
1
+ from chandra.model import InferenceManager
2
+ from chandra.model.schema import BatchInputItem
3
+
4
+
5
class HardWeightOCR:
    """OCR backend that batches page images through an ``InferenceManager``."""

    # Inference modes accepted by the constructor.
    SUPPORTED_MODES = ("vllm", "hf")

    def __init__(self, hard_ocr_mode: str = "vllm", hard_vllm_api_url: str = None):
        """Validate the configuration and create the inference manager.

        Args:
            hard_ocr_mode: Either ``"vllm"`` or ``"hf"``.
            hard_vllm_api_url: API URL used for inference; required (non-empty)
                when ``hard_ocr_mode`` is ``"vllm"``.

        Raises:
            ValueError: If the mode is unsupported, or if ``"vllm"`` mode is
                selected without an API URL.
        """
        if hard_ocr_mode not in self.SUPPORTED_MODES:
            raise ValueError("Only supports `vllm` or `hf` inference mode.")

        if hard_ocr_mode == "vllm" and not hard_vllm_api_url:
            raise ValueError("Need to provide vllm API url for vllm mode.")

        self.ocr = InferenceManager(method=hard_ocr_mode)
        self.hard_vllm_api_url = hard_vllm_api_url

    def process_images(self, images: list) -> str:
        """OCR every image and join the per-image markdown.

        Args:
            images: Images to process, one ``BatchInputItem`` is built per entry.

        Returns:
            The stripped markdown of each response, separated by blank lines.
            An empty ``images`` list yields ``""`` without calling the backend.
        """
        if not images:
            # Nothing to OCR: joining zero responses would be "" anyway, so
            # skip the inference call instead of sending an empty batch.
            return ""

        batch_images = [
            BatchInputItem(image=image, prompt_type="ocr_layout") for image in images
        ]
        responses = self.ocr.generate(batch_images, vllm_api_base=self.hard_vllm_api_url)

        return "\n\n".join(res.markdown.strip() for res in responses)

    def extract(self, images: list) -> str:
        """Return the OCR text for ``images``; thin alias of ``process_images``."""
        return self.process_images(images)
light_ocr.py ADDED
@@ -0,0 +1,39 @@
1
+ import base64
2
+ import io
3
+ import requests
4
+ from PIL import Image
5
+
6
+
7
class LightWeightOCR:
    """Send a single page image to a chat-style HTTP endpoint and return its reply.

    The request body carries ``model``, ``messages``, ``max_tokens`` and
    ``temperature``; the reply text is read from
    ``choices[0].message.content`` of the JSON response.
    """

    # Scale factor passed to ``render`` when the input is a page object.
    RENDER_SCALE = 2.77

    def __init__(
        self,
        light_model_name_or_path: str,
        light_vllm_api_url: str,
    ) -> None:
        """Store the model identifier and the endpoint URL.

        Args:
            light_model_name_or_path: Value sent as ``model`` in each request.
            light_vllm_api_url: URL every request is POSTed to.
        """
        self.light_model_name_or_path = light_model_name_or_path
        self.light_vllm_api_url = light_vllm_api_url

    def extract(self, image: Image.Image, max_tokens: int, temperature: float) -> str:
        """OCR one page and return the text produced by the model.

        Args:
            image: Either a PIL image, used as-is, or an object exposing
                ``render(scale=...).to_pil()`` (such as the page object
                ``Extractor`` passes in), which is rendered first.
            max_tokens: Sent as ``max_tokens`` in the request.
            temperature: Sent as ``temperature`` in the request.

        Returns:
            The ``content`` of the first choice in the response.

        Raises:
            requests.HTTPError: If the endpoint answers with an error status.
        """
        if isinstance(image, Image.Image):
            # Already a bitmap: matches the declared parameter type.
            pil_image = image
        else:
            pil_image = image.render(scale=self.RENDER_SCALE).to_pil()

        # Encode the page as a base64 PNG so it can travel inside a data URL.
        with io.BytesIO() as buffer:
            pil_image.save(buffer, format="PNG")
            image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

        payload = {
            "model": self.light_model_name_or_path,
            "messages": [{
                "role": "user",
                "content": [{
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                }],
            }],
            "max_tokens": max_tokens,
            "temperature": temperature,
        }

        response = requests.post(self.light_vllm_api_url, json=payload)
        # Fail with the HTTP status instead of a KeyError / JSON decode error
        # when the server rejects the request.
        response.raise_for_status()

        return response.json()["choices"][0]["message"]["content"]
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: ocrany
3
+ Version: 0.1.0
4
+ Summary: A lightweight and hardware-accelerated OCR pipeline
5
+ Author-email: Đặng Phương Nam <phuongnamdpn2k2@gmail.com>
6
+ Project-URL: Homepage, https://github.com/phuongnam2002/OCRANY
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: chandra_ocr
10
+ Requires-Dist: json_repair
11
+ Requires-Dist: openai
12
+ Requires-Dist: Pillow
13
+ Requires-Dist: pypdfium2
14
+ Requires-Dist: python-dotenv
15
+ Requires-Dist: tqdm
@@ -0,0 +1,8 @@
1
+ __init__.py,sha256=ZzLRUfW4GTKHOBvkUDuydIzOK0d_zpgCmsEacq4DpoQ,199
2
+ extractor.py,sha256=vtYz43aFVtdeiEda_2j1ry0sh81EKHEGUFSYsc8J0xw,937
3
+ hard_ocr.py,sha256=-ySU7hQvZRzxM75RjitDw2DhNq2FhJWyX25gULgaZR8,995
4
+ light_ocr.py,sha256=nQYiNxt4NqtzaFUmZ40TffBzr7XLlZpH85OFhCpt3lI,1244
5
+ ocrany-0.1.0.dist-info/METADATA,sha256=VlPh1MRe9wZAh-dgU_PD67j1znq_gaFXmjaIcS68lqA,472
6
+ ocrany-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
7
+ ocrany-0.1.0.dist-info/top_level.txt,sha256=CM2uG3Xs3bwLOo1dogbg1QSKVa056xkkYbmxbdrCKO0,38
8
+ ocrany-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,4 @@
1
+ __init__
2
+ extractor
3
+ hard_ocr
4
+ light_ocr