vlm4ocr 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlm4ocr/__init__.py +3 -1
- vlm4ocr/assets/default_prompt_templates/ocr_JSON_system_prompt.txt +1 -0
- vlm4ocr/cli.py +276 -287
- vlm4ocr/data_types.py +109 -0
- vlm4ocr/ocr_engines.py +363 -195
- vlm4ocr/utils.py +386 -39
- vlm4ocr/vlm_engines.py +316 -190
- {vlm4ocr-0.1.0.dist-info → vlm4ocr-0.3.0.dist-info}/METADATA +5 -1
- vlm4ocr-0.3.0.dist-info/RECORD +17 -0
- vlm4ocr-0.1.0.dist-info/RECORD +0 -15
- {vlm4ocr-0.1.0.dist-info → vlm4ocr-0.3.0.dist-info}/WHEEL +0 -0
- {vlm4ocr-0.1.0.dist-info → vlm4ocr-0.3.0.dist-info}/entry_points.txt +0 -0
vlm4ocr/data_types.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import List, Literal
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from vlm4ocr.utils import get_default_page_delimiter
|
|
5
|
+
|
|
6
|
+
OutputMode = Literal["markdown", "HTML", "text", "JSON"]


@dataclass
class OCRResult:
    """
    This class represents the result of an OCR process.

    Parameters:
    ----------
    input_dir : str
        Path of the input (e.g., image, PDF, tiff). Its last path component
        is stored as `filename`.
    output_mode : str
        The output format. Must be 'markdown', 'HTML', 'text', or 'JSON'.
    pages : List[dict], Optional
        A list of dicts, each representing a page of the OCR result.
        Pages appended through `add_page` carry the keys 'text' and
        'image_processing_status'.

    Attributes:
    ----------
    filename : str
        Derived from `input_dir` in `__post_init__`; not a constructor argument.
    status : str
        Initialised to "processing"; not a constructor argument.
    """
    input_dir: str
    output_mode: OutputMode
    pages: List[dict] = field(default_factory=list)
    filename: str = field(init=False)
    status: str = field(init=False, default="processing")

    def __post_init__(self):
        """
        Called after the dataclass-generated __init__ method.
        Used for validation and initializing derived fields.

        Raises:
        ------
        ValueError
            If output_mode is not one of the supported modes, if pages is not
            a list, or if any page is not a dict.
        """
        name = os.path.basename(self.input_dir)
        if not name:
            # A path ending in a separator has an empty basename; fall back to
            # the last real component so filename is not silently ''.
            name = os.path.basename(os.path.dirname(self.input_dir))
        self.filename = name

        # output_mode validation
        if self.output_mode not in ["markdown", "HTML", "text", "JSON"]:
            raise ValueError("output_mode must be 'markdown', 'HTML', 'text', or 'JSON'")

        # pages validation
        if not isinstance(self.pages, list):
            raise ValueError("pages must be a list of dict")
        for i, page_content in enumerate(self.pages):
            if not isinstance(page_content, dict):
                raise ValueError(f"Each page must be a dict. Page at index {i} is not a dict.")

    def add_page(self, text: str, image_processing_status: dict):
        """
        This method adds a new page to the OCRResult object.

        Parameters:
        ----------
        text : str
            The OCR result text of the page.
        image_processing_status : dict
            A dictionary containing the image processing status for the page.
            It can include keys like 'rotate_correction', 'max_dimension_pixels', etc.

        Raises:
        ------
        ValueError
            If text is not a string or image_processing_status is not a dict.
        """
        if not isinstance(text, str):
            raise ValueError("text must be a string")
        if not isinstance(image_processing_status, dict):
            raise ValueError("image_processing_status must be a dict")

        page = {
            "text": text,
            "image_processing_status": image_processing_status,
        }
        self.pages.append(page)

    def __len__(self):
        """Return the number of pages."""
        return len(self.pages)

    def get_page(self, idx):
        """
        Return the page dict at a zero-based index.

        Parameters:
        ----------
        idx : int
            Zero-based page index. Negative indices are rejected.

        Raises:
        ------
        ValueError
            If idx is not an integer (bool is rejected as well).
        IndexError
            If idx is outside [0, number of pages).
        """
        # bool is a subclass of int; without the explicit check True/False
        # would be silently treated as page 1/0.
        if isinstance(idx, bool) or not isinstance(idx, int):
            raise ValueError("Index must be an integer")
        if idx < 0 or idx >= len(self.pages):
            raise IndexError(f"Index out of range. The OCRResult has {len(self.pages)} pages, but index {idx} was requested.")

        return self.pages[idx]

    def __iter__(self):
        """Iterate over the page dicts in order."""
        return iter(self.pages)

    def __repr__(self):
        return f"OCRResult(filename={self.filename}, output_mode={self.output_mode}, pages_count={len(self.pages)}, status={self.status})"

    def to_string(self, page_delimiter: str = "auto") -> str:
        """
        Convert the OCRResult object to a string representation by joining
        the 'text' of every page.

        Parameters:
        ----------
        page_delimiter : str, Optional
            The delimiter to insert between pages.
            if 'auto', it will be set to the default page delimiter for the output mode:
            'markdown' -> '\\n\\n---\\n\\n'
            'HTML' -> '<br><br>'
            'text' -> '\\n\\n---\\n\\n'
            For other modes the value is whatever get_default_page_delimiter
            returns (not visible from this file).

        Returns:
        -------
        str
            The page texts joined by the delimiter. Pages without a 'text'
            key contribute an empty string.

        Raises:
        ------
        ValueError
            If page_delimiter is not a string.
        """
        if not isinstance(page_delimiter, str):
            raise ValueError("page_delimiter must be a string")

        if page_delimiter == "auto":
            delimiter = get_default_page_delimiter(self.output_mode)
        else:
            delimiter = page_delimiter

        # The resolved delimiter is also kept on the instance, as before, in
        # case callers read it after the call.
        self.page_delimiter = delimiter

        return delimiter.join(page.get("text", "") for page in self.pages)
|