vlm4ocr 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlm4ocr/assets/default_prompt_templates/ocr_JSON_system_prompt.txt +1 -0
- vlm4ocr/data_types.py +3 -3
- vlm4ocr/ocr_engines.py +22 -5
- vlm4ocr/utils.py +57 -3
- {vlm4ocr-0.2.0.dist-info → vlm4ocr-0.3.0.dist-info}/METADATA +3 -1
- {vlm4ocr-0.2.0.dist-info → vlm4ocr-0.3.0.dist-info}/RECORD +8 -7
- {vlm4ocr-0.2.0.dist-info → vlm4ocr-0.3.0.dist-info}/WHEEL +0 -0
- {vlm4ocr-0.2.0.dist-info → vlm4ocr-0.3.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
You are a helpful assistant that can convert scanned documents into JSON format. Your output is accurate and well-formatted, starting with ```json and ending with ```. You will only output the JSON text without any additional explanations or comments. The JSON should include all text, tables, and lists with appropriate keys and values. You will ignore images, icons, or anything that can not be converted into text.
|
vlm4ocr/data_types.py
CHANGED
|
@@ -3,7 +3,7 @@ from typing import List, Literal
|
|
|
3
3
|
from dataclasses import dataclass, field
|
|
4
4
|
from vlm4ocr.utils import get_default_page_delimiter
|
|
5
5
|
|
|
6
|
-
OutputMode = Literal["markdown", "HTML", "text"]
|
|
6
|
+
OutputMode = Literal["markdown", "HTML", "text", "JSON"]
|
|
7
7
|
|
|
8
8
|
@dataclass
|
|
9
9
|
class OCRResult:
|
|
@@ -33,8 +33,8 @@ class OCRResult:
|
|
|
33
33
|
self.filename = os.path.basename(self.input_dir)
|
|
34
34
|
|
|
35
35
|
# output_mode validation
|
|
36
|
-
if self.output_mode not in ["markdown", "HTML", "text"]:
|
|
37
|
-
raise ValueError("output_mode must be 'markdown', 'HTML', or '
|
|
36
|
+
if self.output_mode not in ["markdown", "HTML", "text", "JSON"]:
|
|
37
|
+
raise ValueError("output_mode must be 'markdown', 'HTML', 'text', or 'JSON'")
|
|
38
38
|
|
|
39
39
|
# pages validation
|
|
40
40
|
if not isinstance(self.pages, list):
|
vlm4ocr/ocr_engines.py
CHANGED
|
@@ -3,8 +3,8 @@ from typing import Tuple, List, Dict, Union, Generator, AsyncGenerator, Iterable
|
|
|
3
3
|
import importlib
|
|
4
4
|
import asyncio
|
|
5
5
|
from colorama import Fore, Style
|
|
6
|
-
|
|
7
|
-
from vlm4ocr.utils import DataLoader, PDFDataLoader, TIFFDataLoader, ImageDataLoader, ImageProcessor, clean_markdown, get_default_page_delimiter
|
|
6
|
+
import json
|
|
7
|
+
from vlm4ocr.utils import DataLoader, PDFDataLoader, TIFFDataLoader, ImageDataLoader, ImageProcessor, clean_markdown, extract_json, get_default_page_delimiter
|
|
8
8
|
from vlm4ocr.data_types import OCRResult
|
|
9
9
|
from vlm4ocr.vlm_engines import VLMEngine
|
|
10
10
|
|
|
@@ -21,7 +21,7 @@ class OCREngine:
|
|
|
21
21
|
inference_engine : InferenceEngine
|
|
22
22
|
The inference engine to use for OCR.
|
|
23
23
|
output_mode : str, Optional
|
|
24
|
-
The output format. Must be 'markdown', 'HTML', or '
|
|
24
|
+
The output format. Must be 'markdown', 'HTML', 'text', or 'JSON'.
|
|
25
25
|
system_prompt : str, Optional
|
|
26
26
|
Custom system prompt. We recommend using the default system prompt by leaving this blank.
|
|
27
27
|
user_prompt : str, Optional
|
|
@@ -33,8 +33,8 @@ class OCREngine:
|
|
|
33
33
|
self.vlm_engine = vlm_engine
|
|
34
34
|
|
|
35
35
|
# Check output mode
|
|
36
|
-
if output_mode not in ["markdown", "HTML", "text"]:
|
|
37
|
-
raise ValueError("output_mode must be 'markdown', 'HTML', or '
|
|
36
|
+
if output_mode not in ["markdown", "HTML", "text", "JSON"]:
|
|
37
|
+
raise ValueError("output_mode must be 'markdown', 'HTML', 'text', or 'JSON'.")
|
|
38
38
|
self.output_mode = output_mode
|
|
39
39
|
|
|
40
40
|
# System prompt
|
|
@@ -49,6 +49,9 @@ class OCREngine:
|
|
|
49
49
|
if isinstance(user_prompt, str) and user_prompt:
|
|
50
50
|
self.user_prompt = user_prompt
|
|
51
51
|
else:
|
|
52
|
+
if self.output_mode == "JSON":
|
|
53
|
+
raise ValueError("user_prompt must be provided when output_mode is 'JSON' to define the JSON structure.")
|
|
54
|
+
|
|
52
55
|
prompt_template_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath(f'ocr_{self.output_mode}_user_prompt.txt')
|
|
53
56
|
with prompt_template_path.open('r', encoding='utf-8') as f:
|
|
54
57
|
self.user_prompt = f.read()
|
|
@@ -276,6 +279,12 @@ class OCREngine:
|
|
|
276
279
|
# Clean the response if output mode is markdown
|
|
277
280
|
if self.output_mode == "markdown":
|
|
278
281
|
response = clean_markdown(response)
|
|
282
|
+
|
|
283
|
+
# Parse the response if output mode is JSON
|
|
284
|
+
if self.output_mode == "JSON":
|
|
285
|
+
json_list = extract_json(response)
|
|
286
|
+
# Serialize the JSON list to a string
|
|
287
|
+
response = json.dumps(json_list, indent=4)
|
|
279
288
|
|
|
280
289
|
# Add the page to the OCR result
|
|
281
290
|
ocr_result.add_page(text=response,
|
|
@@ -470,6 +479,14 @@ class OCREngine:
|
|
|
470
479
|
ocr_text = await self.vlm_engine.chat_async(
|
|
471
480
|
messages,
|
|
472
481
|
)
|
|
482
|
+
# Clean the OCR text if output mode is markdown
|
|
473
483
|
if self.output_mode == "markdown":
|
|
474
484
|
ocr_text = clean_markdown(ocr_text)
|
|
485
|
+
|
|
486
|
+
# Parse the response if output mode is JSON
|
|
487
|
+
if self.output_mode == "JSON":
|
|
488
|
+
json_list = extract_json(ocr_text)
|
|
489
|
+
# Serialize the JSON list to a string
|
|
490
|
+
ocr_text = json.dumps(json_list, indent=4)
|
|
491
|
+
|
|
475
492
|
return ocr_text, image_processing_status
|
vlm4ocr/utils.py
CHANGED
|
@@ -2,11 +2,14 @@ import abc
|
|
|
2
2
|
import os
|
|
3
3
|
import io
|
|
4
4
|
import base64
|
|
5
|
-
from typing import
|
|
5
|
+
from typing import Dict, List, Tuple
|
|
6
|
+
import json
|
|
7
|
+
import json_repair
|
|
6
8
|
import importlib.util
|
|
7
9
|
from pdf2image import convert_from_path, pdfinfo_from_path
|
|
8
10
|
from PIL import Image
|
|
9
11
|
import asyncio
|
|
12
|
+
import warnings
|
|
10
13
|
|
|
11
14
|
|
|
12
15
|
class DataLoader(abc.ABC):
|
|
@@ -229,6 +232,55 @@ def clean_markdown(text:str) -> str:
|
|
|
229
232
|
cleaned_text = text.replace("```markdown", "").replace("```", "")
|
|
230
233
|
return cleaned_text
|
|
231
234
|
|
|
235
|
+
def _find_dict_strings( text: str) -> List[str]:
|
|
236
|
+
"""
|
|
237
|
+
Extracts balanced JSON-like dictionaries from a string, even if nested.
|
|
238
|
+
|
|
239
|
+
Parameters:
|
|
240
|
+
-----------
|
|
241
|
+
text : str
|
|
242
|
+
the input text containing JSON-like structures.
|
|
243
|
+
|
|
244
|
+
Returns : List[str]
|
|
245
|
+
A list of valid JSON-like strings representing dictionaries.
|
|
246
|
+
"""
|
|
247
|
+
open_brace = 0
|
|
248
|
+
start = -1
|
|
249
|
+
json_objects = []
|
|
250
|
+
|
|
251
|
+
for i, char in enumerate(text):
|
|
252
|
+
if char == '{':
|
|
253
|
+
if open_brace == 0:
|
|
254
|
+
# start of a new JSON object
|
|
255
|
+
start = i
|
|
256
|
+
open_brace += 1
|
|
257
|
+
elif char == '}':
|
|
258
|
+
open_brace -= 1
|
|
259
|
+
if open_brace == 0 and start != -1:
|
|
260
|
+
json_objects.append(text[start:i + 1])
|
|
261
|
+
start = -1
|
|
262
|
+
|
|
263
|
+
return json_objects
|
|
264
|
+
|
|
265
|
+
def extract_json(gen_text:str) -> List[Dict[str, str]]:
    """
    Parse the dictionary-like substrings of a generated text into Python objects.

    Candidates come from `_find_dict_strings`. Each one is first parsed with
    `json.loads`; on a `json.JSONDecodeError` it is passed to
    `json_repair.repair_json` instead. A `RuntimeWarning` is emitted whenever the
    repair path is taken, and a candidate whose repaired result is empty/falsy
    is left out of the output.

    Parameters:
    -----------
    gen_text : str
        the generated text to search.

    Returns : List[Dict[str, str]]
        The parsed (or repaired) objects, in the order they were found.
    """
    parsed = []
    for candidate in _find_dict_strings(gen_text):
        try:
            parsed.append(json.loads(candidate))
            continue
        except json.JSONDecodeError:
            repaired = json_repair.repair_json(candidate, skip_json_loads=True, return_objects=True)

        if not repaired:
            warnings.warn(f'JSONDecodeError could not be fixed:\n{candidate}', RuntimeWarning)
            continue

        warnings.warn(f'JSONDecodeError detected, fixed with repair_json:\n{candidate}', RuntimeWarning)
        parsed.append(repaired)
    return parsed
|
|
283
|
+
|
|
232
284
|
def get_default_page_delimiter(output_mode:str) -> str:
|
|
233
285
|
"""
|
|
234
286
|
Returns the default page delimiter based on the environment variable.
|
|
@@ -243,8 +295,8 @@ def get_default_page_delimiter(output_mode:str) -> str:
|
|
|
243
295
|
str
|
|
244
296
|
The default page delimiter.
|
|
245
297
|
"""
|
|
246
|
-
if output_mode not in ["markdown", "HTML", "text"]:
|
|
247
|
-
raise ValueError("output_mode must be 'markdown', 'HTML', or '
|
|
298
|
+
if output_mode not in ["markdown", "HTML", "text", "JSON"]:
|
|
299
|
+
raise ValueError("output_mode must be 'markdown', 'HTML', 'text', or 'JSON'")
|
|
248
300
|
|
|
249
301
|
if output_mode == "markdown":
|
|
250
302
|
return "\n\n---\n\n"
|
|
@@ -252,6 +304,8 @@ def get_default_page_delimiter(output_mode:str) -> str:
|
|
|
252
304
|
return "<br><br>"
|
|
253
305
|
elif output_mode == "text":
|
|
254
306
|
return "\n\n---\n\n"
|
|
307
|
+
elif output_mode == "JSON":
|
|
308
|
+
return "\n\n---\n\n"
|
|
255
309
|
|
|
256
310
|
|
|
257
311
|
class ImageProcessor:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: vlm4ocr
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Python package and Web App for OCR with vision language models.
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Enshuo (David) Hsu
|
|
@@ -10,6 +10,8 @@ Classifier: Programming Language :: Python :: 3
|
|
|
10
10
|
Classifier: Programming Language :: Python :: 3.11
|
|
11
11
|
Classifier: Programming Language :: Python :: 3.12
|
|
12
12
|
Provides-Extra: tesseract
|
|
13
|
+
Requires-Dist: colorama (>=0.4.4)
|
|
14
|
+
Requires-Dist: json-repair (>=0.30.0)
|
|
13
15
|
Requires-Dist: pdf2image (>=1.16.0)
|
|
14
16
|
Requires-Dist: pillow (>=10.0.0)
|
|
15
17
|
Requires-Dist: pytesseract (>=0.3.13) ; extra == "tesseract"
|
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
vlm4ocr/__init__.py,sha256=k5TZY0LmRnjGyjHD0H5AxJHJMw_cS2SzGxTJ0NQbQsc,315
|
|
2
2
|
vlm4ocr/assets/default_prompt_templates/ocr_HTML_system_prompt.txt,sha256=igPOntiLDZXTB71-QrTmMJveb6XC1TgArg1serPc9V8,547
|
|
3
3
|
vlm4ocr/assets/default_prompt_templates/ocr_HTML_user_prompt.txt,sha256=cVn538JojZfCtIhfrcOPWt0dO7dtDqgB9xdS_5VvAqo,41
|
|
4
|
+
vlm4ocr/assets/default_prompt_templates/ocr_JSON_system_prompt.txt,sha256=v-fUw53gkngc_dz9TMH2abALDsAEZfe-zJ2u3-SO4ck,417
|
|
4
5
|
vlm4ocr/assets/default_prompt_templates/ocr_markdown_system_prompt.txt,sha256=pIsYO2G3jkZ5EWg7MJixre3Itz1oPqJSduUZT34_RNY,436
|
|
5
6
|
vlm4ocr/assets/default_prompt_templates/ocr_markdown_user_prompt.txt,sha256=61EJv8POsQGIIUVwCjDU73lMXJE7F3qhPIYl6zSbl1Q,45
|
|
6
7
|
vlm4ocr/assets/default_prompt_templates/ocr_text_system_prompt.txt,sha256=WbLSOerqFjlYGaGWJ-w2enhky1WhnPl011s0fgRPgnQ,398
|
|
7
8
|
vlm4ocr/assets/default_prompt_templates/ocr_text_user_prompt.txt,sha256=ftgNAIPy_UlrcY6m7-IkH2ApHkCzRnymra1w2wg60Ks,47
|
|
8
9
|
vlm4ocr/cli.py,sha256=b13WswreFxTNLA7n2F2jPR7Wrb2Onb06zFnvf7MOLi0,20268
|
|
9
|
-
vlm4ocr/data_types.py,sha256=
|
|
10
|
-
vlm4ocr/ocr_engines.py,sha256=
|
|
11
|
-
vlm4ocr/utils.py,sha256=
|
|
10
|
+
vlm4ocr/data_types.py,sha256=IygbR6NWn1hMnfMc500pPz6s_odzqIjk-I_5Nz-djCs,3943
|
|
11
|
+
vlm4ocr/ocr_engines.py,sha256=xYTkT2DIbASlJtKMtyfWpuFl5PeSaaVDtGyiTWxCaJg,24429
|
|
12
|
+
vlm4ocr/utils.py,sha256=nQhUskOze99wCVMKmvsen0dhq-9NdN4EPC_bdYfkjgA,13611
|
|
12
13
|
vlm4ocr/vlm_engines.py,sha256=jQuRZ5HlJtTtJXESiFcoYQXwX-lYu0gc-KKOpRLuW6A,22331
|
|
13
|
-
vlm4ocr-0.
|
|
14
|
-
vlm4ocr-0.
|
|
15
|
-
vlm4ocr-0.
|
|
16
|
-
vlm4ocr-0.
|
|
14
|
+
vlm4ocr-0.3.0.dist-info/METADATA,sha256=fK3pR2tuInWeRLqZC4Mt86DKvjYhfYX-4GV09PjptEE,710
|
|
15
|
+
vlm4ocr-0.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
16
|
+
vlm4ocr-0.3.0.dist-info/entry_points.txt,sha256=qzWUk_QTZ12cH4DLjjfqce89EAlOydD85dreRRZF3K4,44
|
|
17
|
+
vlm4ocr-0.3.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|