vlm4ocr 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vlm4ocr
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Python package and Web App for OCR with vision language models.
5
5
  License: MIT
6
6
  Author: Enshuo (David) Hsu
@@ -10,6 +10,8 @@ Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Programming Language :: Python :: 3.11
11
11
  Classifier: Programming Language :: Python :: 3.12
12
12
  Provides-Extra: tesseract
13
+ Requires-Dist: colorama (>=0.4.4)
14
+ Requires-Dist: json-repair (>=0.30.0)
13
15
  Requires-Dist: pdf2image (>=1.16.0)
14
16
  Requires-Dist: pillow (>=10.0.0)
15
17
  Requires-Dist: pytesseract (>=0.3.13) ; extra == "tesseract"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "vlm4ocr"
3
- version = "0.2.0"
3
+ version = "0.3.0"
4
4
  description = "Python package and Web App for OCR with vision language models."
5
5
  authors = ["Enshuo (David) Hsu"]
6
6
  license = "MIT"
@@ -15,7 +15,9 @@ exclude = [
15
15
  [tool.poetry.dependencies]
16
16
  python = "^3.11"
17
17
  pdf2image = ">=1.16.0"
18
+ colorama = ">=0.4.4"
18
19
  pillow = ">=10.0.0"
20
+ json-repair = ">=0.30.0"
19
21
  pytesseract = { version = ">=0.3.13", optional = true }
20
22
 
21
23
  [tool.poetry.scripts]
@@ -0,0 +1 @@
1
+ You are a helpful assistant that can convert scanned documents into JSON format. Your output is accurate and well-formatted, starting with ```json and ending with ```. You will only output the JSON text without any additional explanations or comments. The JSON should include all text, tables, and lists with appropriate keys and values. You will ignore images, icons, or anything that can not be converted into text.
@@ -3,7 +3,7 @@ from typing import List, Literal
3
3
  from dataclasses import dataclass, field
4
4
  from vlm4ocr.utils import get_default_page_delimiter
5
5
 
6
- OutputMode = Literal["markdown", "HTML", "text"]
6
+ OutputMode = Literal["markdown", "HTML", "text", "JSON"]
7
7
 
8
8
  @dataclass
9
9
  class OCRResult:
@@ -33,8 +33,8 @@ class OCRResult:
33
33
  self.filename = os.path.basename(self.input_dir)
34
34
 
35
35
  # output_mode validation
36
- if self.output_mode not in ["markdown", "HTML", "text"]:
37
- raise ValueError("output_mode must be 'markdown', 'HTML', or 'text'")
36
+ if self.output_mode not in ["markdown", "HTML", "text", "JSON"]:
37
+ raise ValueError("output_mode must be 'markdown', 'HTML', 'text', or 'JSON'")
38
38
 
39
39
  # pages validation
40
40
  if not isinstance(self.pages, list):
@@ -3,8 +3,8 @@ from typing import Tuple, List, Dict, Union, Generator, AsyncGenerator, Iterable
3
3
  import importlib
4
4
  import asyncio
5
5
  from colorama import Fore, Style
6
- from PIL import Image
7
- from vlm4ocr.utils import DataLoader, PDFDataLoader, TIFFDataLoader, ImageDataLoader, ImageProcessor, clean_markdown, get_default_page_delimiter
6
+ import json
7
+ from vlm4ocr.utils import DataLoader, PDFDataLoader, TIFFDataLoader, ImageDataLoader, ImageProcessor, clean_markdown, extract_json, get_default_page_delimiter
8
8
  from vlm4ocr.data_types import OCRResult
9
9
  from vlm4ocr.vlm_engines import VLMEngine
10
10
 
@@ -21,7 +21,7 @@ class OCREngine:
21
21
  inference_engine : InferenceEngine
22
22
  The inference engine to use for OCR.
23
23
  output_mode : str, Optional
24
- The output format. Must be 'markdown', 'HTML', or 'text'.
24
+ The output format. Must be 'markdown', 'HTML', 'text', or 'JSON'.
25
25
  system_prompt : str, Optional
26
26
  Custom system prompt. We recommend using the default system prompt by leaving this blank.
27
27
  user_prompt : str, Optional
@@ -33,8 +33,8 @@ class OCREngine:
33
33
  self.vlm_engine = vlm_engine
34
34
 
35
35
  # Check output mode
36
- if output_mode not in ["markdown", "HTML", "text"]:
37
- raise ValueError("output_mode must be 'markdown', 'HTML', or 'text'")
36
+ if output_mode not in ["markdown", "HTML", "text", "JSON"]:
37
+ raise ValueError("output_mode must be 'markdown', 'HTML', 'text', or 'JSON'.")
38
38
  self.output_mode = output_mode
39
39
 
40
40
  # System prompt
@@ -49,6 +49,9 @@ class OCREngine:
49
49
  if isinstance(user_prompt, str) and user_prompt:
50
50
  self.user_prompt = user_prompt
51
51
  else:
52
+ if self.output_mode == "JSON":
53
+ raise ValueError("user_prompt must be provided when output_mode is 'JSON' to define the JSON structure.")
54
+
52
55
  prompt_template_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath(f'ocr_{self.output_mode}_user_prompt.txt')
53
56
  with prompt_template_path.open('r', encoding='utf-8') as f:
54
57
  self.user_prompt = f.read()
@@ -276,6 +279,12 @@ class OCREngine:
276
279
  # Clean the response if output mode is markdown
277
280
  if self.output_mode == "markdown":
278
281
  response = clean_markdown(response)
282
+
283
+ # Parse the response if output mode is JSON
284
+ if self.output_mode == "JSON":
285
+ json_list = extract_json(response)
286
+ # Serialize the JSON list to a string
287
+ response = json.dumps(json_list, indent=4)
279
288
 
280
289
  # Add the page to the OCR result
281
290
  ocr_result.add_page(text=response,
@@ -470,6 +479,14 @@ class OCREngine:
470
479
  ocr_text = await self.vlm_engine.chat_async(
471
480
  messages,
472
481
  )
482
+ # Clean the OCR text if output mode is markdown
473
483
  if self.output_mode == "markdown":
474
484
  ocr_text = clean_markdown(ocr_text)
485
+
486
+ # Parse the response if output mode is JSON
487
+ if self.output_mode == "JSON":
488
+ json_list = extract_json(ocr_text)
489
+ # Serialize the JSON list to a string
490
+ ocr_text = json.dumps(json_list, indent=4)
491
+
475
492
  return ocr_text, image_processing_status
@@ -2,11 +2,14 @@ import abc
2
2
  import os
3
3
  import io
4
4
  import base64
5
- from typing import Union, List, Tuple
5
+ from typing import Dict, List, Tuple
6
+ import json
7
+ import json_repair
6
8
  import importlib.util
7
9
  from pdf2image import convert_from_path, pdfinfo_from_path
8
10
  from PIL import Image
9
11
  import asyncio
12
+ import warnings
10
13
 
11
14
 
12
15
  class DataLoader(abc.ABC):
@@ -229,6 +232,55 @@ def clean_markdown(text:str) -> str:
229
232
  cleaned_text = text.replace("```markdown", "").replace("```", "")
230
233
  return cleaned_text
231
234
 
235
def _find_dict_strings(text: str) -> List[str]:
    """
    Extracts balanced JSON-like dictionaries from a string, even if nested.

    Only top-level objects are returned; a nested object stays part of the
    substring of the object that encloses it. Braces are counted literally,
    so a brace that appears inside a quoted string value still counts
    towards the nesting depth.

    Parameters:
    -----------
    text : str
        the input text containing JSON-like structures.

    Returns : List[str]
        A list of brace-balanced substrings, in order of appearance. An
        object that is opened but never closed is not returned.
    """
    depth = 0
    start = -1
    json_objects = []

    for i, char in enumerate(text):
        if char == '{':
            if depth == 0:
                # start of a new top-level JSON object
                start = i
            depth += 1
        elif char == '}':
            if depth == 0:
                # Stray closing brace outside any object. Ignore it so the
                # depth never goes negative; a negative depth would stop
                # every later '{' from being recognised as an object start.
                continue
            depth -= 1
            if depth == 0:
                json_objects.append(text[start:i + 1])
                start = -1

    return json_objects
264
+
265
def extract_json(gen_text:str) -> List[Dict[str, str]]:
    """
    Parses every brace-balanced object found in a generated text.

    Each candidate substring returned by _find_dict_strings is first parsed
    with json.loads. When that raises JSONDecodeError, the candidate is
    passed to json_repair.repair_json; a truthy repaired object is kept
    (with a RuntimeWarning), a falsy one is dropped (with a RuntimeWarning).

    Parameters:
    -----------
    gen_text : str
        the generated text that may contain one or more JSON objects.

    Returns : List[Dict[str, str]]
        The parsed objects, in order of appearance.
    """
    parsed_objects = []
    for candidate in _find_dict_strings(gen_text):
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            # Strict parsing failed; fall back to best-effort repair.
            parsed = json_repair.repair_json(candidate, skip_json_loads=True, return_objects=True)
            if not parsed:
                warnings.warn(f'JSONDecodeError could not be fixed:\n{candidate}', RuntimeWarning)
                continue
            warnings.warn(f'JSONDecodeError detected, fixed with repair_json:\n{candidate}', RuntimeWarning)
        parsed_objects.append(parsed)
    return parsed_objects
283
+
232
284
  def get_default_page_delimiter(output_mode:str) -> str:
233
285
  """
234
286
  Returns the default page delimiter based on the environment variable.
@@ -243,8 +295,8 @@ def get_default_page_delimiter(output_mode:str) -> str:
243
295
  str
244
296
  The default page delimiter.
245
297
  """
246
- if output_mode not in ["markdown", "HTML", "text"]:
247
- raise ValueError("output_mode must be 'markdown', 'HTML', or 'text'")
298
+ if output_mode not in ["markdown", "HTML", "text", "JSON"]:
299
+ raise ValueError("output_mode must be 'markdown', 'HTML', 'text', or 'JSON'")
248
300
 
249
301
  if output_mode == "markdown":
250
302
  return "\n\n---\n\n"
@@ -252,6 +304,8 @@ def get_default_page_delimiter(output_mode:str) -> str:
252
304
  return "<br><br>"
253
305
  elif output_mode == "text":
254
306
  return "\n\n---\n\n"
307
+ elif output_mode == "JSON":
308
+ return "\n\n---\n\n"
255
309
 
256
310
 
257
311
  class ImageProcessor:
File without changes
File without changes
File without changes
File without changes