ocrany 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +9 -0
- extractor.py +26 -0
- hard_ocr.py +23 -0
- light_ocr.py +39 -0
- ocrany-0.1.0.dist-info/METADATA +15 -0
- ocrany-0.1.0.dist-info/RECORD +8 -0
- ocrany-0.1.0.dist-info/WHEEL +5 -0
- ocrany-0.1.0.dist-info/top_level.txt +4 -0
__init__.py
ADDED
extractor.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import pypdfium2 as pdfium
|
|
2
|
+
from chandra.input import load_pdf_images
|
|
3
|
+
|
|
4
|
+
from ocr.src.light_ocr import LightWeightOCR
|
|
5
|
+
from ocr.src.hard_ocr import HardWeightOCR
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Extractor:
    """Route a PDF to the light or the hard OCR backend based on page count.

    A document with exactly one page is handed to ``LightWeightOCR``. Any
    other document has its leading pages loaded with ``load_pdf_images`` and
    is handed to ``HardWeightOCR``.
    """

    # Upper bound on the number of leading pages sent to the hard OCR backend.
    MAX_HARD_OCR_PAGES = 40

    def __init__(
        self,
        light_model_name_or_path: str,
        light_vllm_api_url: str,
        hard_ocr_mode: str = "vllm",
        hard_vllm_api_url: str = None,
    ):
        """Build both OCR backends.

        Args:
            light_model_name_or_path: Forwarded to ``LightWeightOCR``.
            light_vllm_api_url: Forwarded to ``LightWeightOCR``.
            hard_ocr_mode: Forwarded to ``HardWeightOCR``.
            hard_vllm_api_url: Forwarded to ``HardWeightOCR``; may be ``None``.
        """
        self.light_ocr = LightWeightOCR(light_model_name_or_path, light_vllm_api_url)
        self.hard_ocr = HardWeightOCR(hard_ocr_mode, hard_vllm_api_url)

    def extract(self, file_path: str, max_tokens: int, temperature: float) -> str:
        """Run OCR on the PDF at ``file_path`` and return the extracted text.

        Args:
            file_path: Path of the PDF to process.
            max_tokens: Forwarded to the light backend; only used when the
                document has exactly one page.
            temperature: Forwarded to the light backend; only used when the
                document has exactly one page.

        Returns:
            The string produced by whichever backend handled the document.
        """
        pdf = pdfium.PdfDocument(file_path)
        try:
            page_count = len(pdf)
            if page_count == 1:
                # The page is rendered and encoded inside the light backend
                # before this returns, so closing the document afterwards is safe.
                return self.light_ocr.extract(pdf[0], max_tokens, temperature)
        finally:
            # Release the document handle on every path; the multi-page branch
            # below reopens the file by path through load_pdf_images.
            pdf.close()

        page_range = list(range(min(self.MAX_HARD_OCR_PAGES, page_count)))
        images = load_pdf_images(file_path, page_range=page_range)
        return self.hard_ocr.extract(images)
|
hard_ocr.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from chandra.model import InferenceManager
|
|
2
|
+
from chandra.model.schema import BatchInputItem
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class HardWeightOCR:
    """OCR a batch of page images through an ``InferenceManager``.

    Every image is submitted with the ``"ocr_layout"`` prompt type and the
    per-image markdown results are joined into one string.
    """

    # Inference modes accepted by the constructor.
    SUPPORTED_MODES = ("vllm", "hf")

    def __init__(self, hard_ocr_mode: str = "vllm", hard_vllm_api_url: str = None):
        """Validate the mode/URL combination and create the inference manager.

        Args:
            hard_ocr_mode: Either ``"vllm"`` or ``"hf"``.
            hard_vllm_api_url: API base URL; required when the mode is
                ``"vllm"``, may be ``None`` otherwise.

        Raises:
            ValueError: If the mode is not supported, or if ``"vllm"`` mode is
                requested without an API URL.
        """
        if hard_ocr_mode not in self.SUPPORTED_MODES:
            raise ValueError("Only supports `vllm` or `hf` inference mode.")

        if hard_ocr_mode == "vllm" and not hard_vllm_api_url:
            raise ValueError("Need to provide vllm API url for vllm mode.")

        self.ocr = InferenceManager(method=hard_ocr_mode)
        self.hard_vllm_api_url = hard_vllm_api_url

    def process_images(self, images: list) -> str:
        """Run OCR on ``images`` and return the combined markdown.

        Args:
            images: Page images to process, in output order.

        Returns:
            The stripped markdown of each result joined by blank lines, or an
            empty string when ``images`` is empty.
        """
        if not images:
            # Nothing to OCR: skip the backend call. Joining zero results
            # would produce the same empty string.
            return ""

        batch = [
            BatchInputItem(image=image, prompt_type="ocr_layout")
            for image in images
        ]
        results = self.ocr.generate(batch, vllm_api_base=self.hard_vllm_api_url)

        return "\n\n".join(result.markdown.strip() for result in results)

    def extract(self, images: list) -> str:
        """Return the OCR text for ``images``; delegates to ``process_images``."""
        return self.process_images(images)
|
light_ocr.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import io
|
|
3
|
+
import requests
|
|
4
|
+
from PIL import Image
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class LightWeightOCR:
    """OCR a single page by posting it as a base64 PNG to an HTTP endpoint.

    The request body uses the chat-completions message layout (one user
    message holding one ``image_url`` part) and the reply text is read from
    ``choices[0].message.content``.
    """

    # Scale factor passed to ``render`` when a page object has to be rasterised.
    # NOTE(review): presumably about 200 DPI if scale 1 is 72 DPI — confirm.
    RENDER_SCALE = 2.77

    # Passed to ``requests.post``. ``None`` keeps the previous behaviour of
    # waiting indefinitely; override on the class or instance to bound it.
    REQUEST_TIMEOUT = None

    def __init__(
        self,
        light_model_name_or_path: str,
        light_vllm_api_url: str,
    ) -> None:
        """Store the model identifier and the endpoint URL.

        Args:
            light_model_name_or_path: Sent as the ``"model"`` field of every request.
            light_vllm_api_url: URL that requests are POSTed to.
        """
        self.light_model_name_or_path = light_model_name_or_path
        self.light_vllm_api_url = light_vllm_api_url

    def extract(self, image: Image.Image, max_tokens: int, temperature: float) -> str:
        """Return the text the endpoint produces for ``image``.

        Args:
            image: Either a PIL image, or a page-like object exposing
                ``render(scale=...).to_pil()``.
            max_tokens: Sent as the ``"max_tokens"`` field.
            temperature: Sent as the ``"temperature"`` field.

        Returns:
            The ``content`` of the first choice in the JSON reply.

        Raises:
            requests.HTTPError: If the endpoint answers with an error status.
        """
        if isinstance(image, Image.Image):
            # Already rasterised; honours the declared parameter type.
            pil_image = image
        else:
            pil_image = image.render(scale=self.RENDER_SCALE).to_pil()

        png_buffer = io.BytesIO()
        pil_image.save(png_buffer, format="PNG")
        image_base64 = base64.b64encode(png_buffer.getvalue()).decode('utf-8')

        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{image_base64}"},
        }
        payload = {
            "model": self.light_model_name_or_path,
            "messages": [{"role": "user", "content": [image_part]}],
            "max_tokens": max_tokens,
            "temperature": temperature,
        }

        response = requests.post(
            self.light_vllm_api_url,
            json=payload,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail with the HTTP status instead of an obscure KeyError or JSON
        # decode error when the server rejects the request.
        response.raise_for_status()

        return response.json()['choices'][0]['message']['content']
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ocrany
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight and hardware-accelerated OCR pipeline
|
|
5
|
+
Author-email: Đặng Phương Nam <phuongnamdpn2k2@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/phuongnam2002/OCRANY
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: chandra_ocr
|
|
10
|
+
Requires-Dist: json_repair
|
|
11
|
+
Requires-Dist: openai
|
|
12
|
+
Requires-Dist: Pillow
|
|
13
|
+
Requires-Dist: pypdfium2
|
|
14
|
+
Requires-Dist: python-dotenv
|
|
15
|
+
Requires-Dist: tqdm
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
__init__.py,sha256=ZzLRUfW4GTKHOBvkUDuydIzOK0d_zpgCmsEacq4DpoQ,199
|
|
2
|
+
extractor.py,sha256=vtYz43aFVtdeiEda_2j1ry0sh81EKHEGUFSYsc8J0xw,937
|
|
3
|
+
hard_ocr.py,sha256=-ySU7hQvZRzxM75RjitDw2DhNq2FhJWyX25gULgaZR8,995
|
|
4
|
+
light_ocr.py,sha256=nQYiNxt4NqtzaFUmZ40TffBzr7XLlZpH85OFhCpt3lI,1244
|
|
5
|
+
ocrany-0.1.0.dist-info/METADATA,sha256=VlPh1MRe9wZAh-dgU_PD67j1znq_gaFXmjaIcS68lqA,472
|
|
6
|
+
ocrany-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
7
|
+
ocrany-0.1.0.dist-info/top_level.txt,sha256=CM2uG3Xs3bwLOo1dogbg1QSKVa056xkkYbmxbdrCKO0,38
|
|
8
|
+
ocrany-0.1.0.dist-info/RECORD,,
|