vlm4ocr 0.4.0__tar.gz → 0.4.1__tar.gz
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/PKG-INFO +2 -1
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/pyproject.toml +2 -1
- vlm4ocr-0.4.1/vlm4ocr/vlm_engines.py +276 -0
- vlm4ocr-0.4.0/vlm4ocr/vlm_engines.py +0 -1246
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/README.md +0 -0
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/vlm4ocr/__init__.py +0 -0
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/vlm4ocr/assets/default_prompt_templates/ocr_HTML_system_prompt.txt +0 -0
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/vlm4ocr/assets/default_prompt_templates/ocr_HTML_user_prompt.txt +0 -0
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/vlm4ocr/assets/default_prompt_templates/ocr_JSON_system_prompt.txt +0 -0
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/vlm4ocr/assets/default_prompt_templates/ocr_markdown_system_prompt.txt +0 -0
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/vlm4ocr/assets/default_prompt_templates/ocr_markdown_user_prompt.txt +0 -0
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/vlm4ocr/assets/default_prompt_templates/ocr_text_system_prompt.txt +0 -0
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/vlm4ocr/assets/default_prompt_templates/ocr_text_user_prompt.txt +0 -0
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/vlm4ocr/cli.py +0 -0
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/vlm4ocr/data_types.py +0 -0
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/vlm4ocr/ocr_engines.py +0 -0
- {vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/vlm4ocr/utils.py +0 -0

{vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vlm4ocr
-Version: 0.4.0
+Version: 0.4.1
 Summary: Python package and Web App for OCR with vision language models.
 License: MIT
 Author: Enshuo (David) Hsu
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.12
 Provides-Extra: tesseract
 Requires-Dist: colorama (>=0.4.4)
 Requires-Dist: json-repair (>=0.30.0)
+Requires-Dist: llm-inference-engine (>=0.1.1,<0.2.0)
 Requires-Dist: pdf2image (>=1.16.0)
 Requires-Dist: pillow (>=10.0.0)
 Requires-Dist: pytesseract (>=0.3.13) ; extra == "tesseract"

{vlm4ocr-0.4.0 → vlm4ocr-0.4.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "vlm4ocr"
-version = "0.4.0"
+version = "0.4.1"
 description = "Python package and Web App for OCR with vision language models."
 authors = ["Enshuo (David) Hsu"]
 license = "MIT"
@@ -18,6 +18,7 @@ pdf2image = ">=1.16.0"
 colorama = ">=0.4.4"
 pillow = ">=10.0.0"
 json-repair = ">=0.30.0"
+llm-inference-engine = "^0.1.1"
 pytesseract = { version = ">=0.3.13", optional = true }
 
 [tool.poetry.scripts]

vlm4ocr-0.4.1/vlm4ocr/vlm_engines.py (new file)

@@ -0,0 +1,276 @@
+import abc
+from typing import List, Dict
+from PIL import Image
+from vlm4ocr.utils import image_to_base64
+from vlm4ocr.data_types import FewShotExample
+from llm_inference_engine.llm_configs import (
+    LLMConfig as VLMConfig,
+    BasicLLMConfig as BasicVLMConfig,
+    ReasoningLLMConfig as ReasoningVLMConfig,
+    OpenAIReasoningLLMConfig as OpenAIReasoningVLMConfig
+)
+from llm_inference_engine.utils import MessagesLogger
+from llm_inference_engine.engines import (
+    InferenceEngine,
+    OllamaInferenceEngine,
+    OpenAICompatibleInferenceEngine,
+    VLLMInferenceEngine,
+    OpenRouterInferenceEngine,
+    OpenAIInferenceEngine,
+    AzureOpenAIInferenceEngine,
+)
+
+
+class VLMEngine(InferenceEngine):
+    @abc.abstractmethod
+    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
+        """
+        This method inputs an image and returns the corresponding chat messages for the inference engine.
+
+        Parameters:
+        ----------
+        system_prompt : str
+            the system prompt.
+        user_prompt : str
+            the user prompt.
+        image : Image.Image
+            the image for OCR.
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples.
+        """
+        return NotImplemented
+
+
+class OllamaVLMEngine(OllamaInferenceEngine, VLMEngine):
+    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
+        """
+        This method inputs an image and returns the corresponding chat messages for the inference engine.
+
+        Parameters:
+        ----------
+        system_prompt : str
+            the system prompt.
+        user_prompt : str
+            the user prompt.
+        image : Image.Image
+            the image for OCR.
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples.
+        """
+        base64_str = image_to_base64(image)
+        output_messages = []
+        # system message
+        system_message = {"role": "system", "content": system_prompt}
+        output_messages.append(system_message)
+
+        # few-shot examples
+        if few_shot_examples is not None:
+            for example in few_shot_examples:
+                if not isinstance(example, FewShotExample):
+                    raise ValueError("Few-shot example must be a FewShotExample object.")
+
+                example_image_b64 = image_to_base64(example.image)
+                example_user_message = {"role": "user", "content": user_prompt, "images": [example_image_b64]}
+                example_agent_message = {"role": "assistant", "content": example.text}
+                output_messages.append(example_user_message)
+                output_messages.append(example_agent_message)
+
+        # user message
+        user_message = {"role": "user", "content": user_prompt, "images": [base64_str]}
+        output_messages.append(user_message)
+
+        return output_messages
+
+
+class OpenAICompatibleVLMEngine(OpenAICompatibleInferenceEngine, VLMEngine):
+    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png',
+                         detail:str="high", few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
+        """
+        This method inputs an image and returns the corresponding chat messages for the inference engine.
+
+        Parameters:
+        ----------
+        system_prompt : str
+            the system prompt.
+        user_prompt : str
+            the user prompt.
+        image : Image.Image
+            the image for OCR.
+        format : str, Optional
+            the image format.
+        detail : str, Optional
+            the detail level of the image. Default is "high".
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples.
+        """
+        base64_str = image_to_base64(image)
+        output_messages = []
+        # system message
+        system_message = {"role": "system", "content": system_prompt}
+        output_messages.append(system_message)
+
+        # few-shot examples
+        if few_shot_examples is not None:
+            for example in few_shot_examples:
+                if not isinstance(example, FewShotExample):
+                    raise ValueError("Few-shot example must be a FewShotExample object.")
+
+                example_image_b64 = image_to_base64(example.image)
+                example_user_message = {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/{format};base64,{example_image_b64}",
+                                "detail": detail
+                            },
+                        },
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+                example_agent_message = {"role": "assistant", "content": example.text}
+                output_messages.append(example_user_message)
+                output_messages.append(example_agent_message)
+
+        # user message
+        user_message = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/{format};base64,{base64_str}",
+                        "detail": detail
+                    },
+                },
+                {"type": "text", "text": user_prompt},
+            ],
+        }
+        output_messages.append(user_message)
+        return output_messages
+
+
+class VLLMVLMEngine(VLLMInferenceEngine, OpenAICompatibleVLMEngine):
+    """
+    vLLM OpenAI-compatible server inference engine.
+    https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
+
+    For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction
+
+    Parameters:
+    ----------
+    model_name : str
+        model name as shown in the vLLM server
+    api_key : str, Optional
+        the API key for the vLLM server.
+    base_url : str, Optional
+        the base url for the vLLM server.
+    config : LLMConfig
+        the LLM configuration.
+    """
+    pass
+
+class OpenRouterVLMEngine(OpenRouterInferenceEngine, OpenAICompatibleVLMEngine):
+    """
+    OpenRouter OpenAI-compatible server inference engine.
+
+    Parameters:
+    ----------
+    model_name : str
+        model name as shown on OpenRouter
+    api_key : str, Optional
+        the API key for OpenRouter. If None, will use the key in os.environ['OPENROUTER_API_KEY'].
+    base_url : str, Optional
+        the base url for the OpenRouter API.
+    config : LLMConfig
+        the LLM configuration.
+    """
+    pass
+
+class OpenAIVLMEngine(OpenAIInferenceEngine, VLMEngine):
+    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png',
+                         detail:str="high", few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
+        """
+        This method inputs an image and returns the corresponding chat messages for the inference engine.
+
+        Parameters:
+        ----------
+        system_prompt : str
+            the system prompt.
+        user_prompt : str
+            the user prompt.
+        image : Image.Image
+            the image for OCR.
+        format : str, Optional
+            the image format.
+        detail : str, Optional
+            the detail level of the image. Default is "high".
+        few_shot_examples : List[FewShotExample], Optional
+            list of FewShotExample objects, each with an image (PIL.Image.Image) and text (str).
+        """
+        base64_str = image_to_base64(image)
+        output_messages = []
+        # system message
+        system_message = {"role": "system", "content": system_prompt}
+        output_messages.append(system_message)
+
+        # few-shot examples
+        if few_shot_examples is not None:
+            for example in few_shot_examples:
+                if not isinstance(example, FewShotExample):
+                    raise ValueError("Few-shot example must be a FewShotExample object.")
+
+                example_image_b64 = image_to_base64(example.image)
+                example_user_message = {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/{format};base64,{example_image_b64}",
+                                "detail": detail
+                            },
+                        },
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+                example_agent_message = {"role": "assistant", "content": example.text}
+                output_messages.append(example_user_message)
+                output_messages.append(example_agent_message)
+
+        # user message
+        user_message = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/{format};base64,{base64_str}",
+                        "detail": detail
+                    },
+                },
+                {"type": "text", "text": user_prompt},
+            ],
+        }
+        output_messages.append(user_message)
+        return output_messages
+
+
+class AzureOpenAIVLMEngine(AzureOpenAIInferenceEngine, OpenAIVLMEngine):
+    """
+    The Azure OpenAI API inference engine.
+    For parameters and documentation, refer to
+    - https://azure.microsoft.com/en-us/products/ai-services/openai-service
+    - https://learn.microsoft.com/en-us/azure/ai-services/openai/quickstart
+
+    Parameters:
+    ----------
+    model : str
+        model name as described in https://platform.openai.com/docs/models
+    api_version : str
+        the Azure OpenAI API version
+    config : LLMConfig
+        the LLM configuration.
+    """
+    pass
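
For orientation, here is a minimal usage sketch of the new `vlm_engines` module. Only `get_ocr_messages`, the `FewShotExample` import, and the `BasicVLMConfig` alias appear in the diff above; the engine constructor arguments and the `FewShotExample` keyword arguments are assumptions about the `llm-inference-engine` and `vlm4ocr.data_types` APIs, which this diff does not show.

```python
# Hedged usage sketch for vlm4ocr 0.4.1's vlm_engines module.
# Constructor and FewShotExample arguments are illustrative assumptions;
# only get_ocr_messages() and the imports are taken from the diff above.
from PIL import Image

from vlm4ocr.data_types import FewShotExample
from vlm4ocr.vlm_engines import BasicVLMConfig, OllamaVLMEngine

# Hypothetical constructor call: the real signature comes from
# llm_inference_engine.engines.OllamaInferenceEngine, not shown here.
engine = OllamaVLMEngine(model_name="llama3.2-vision", config=BasicVLMConfig())

page = Image.open("scanned_page.png")      # the page to OCR
example = FewShotExample(                  # optional few-shot pair (assumed signature)
    image=Image.open("example_page.png"),
    text="# Example Title\nTranscribed example text...",
)

messages = engine.get_ocr_messages(
    system_prompt="You are an OCR engine. Transcribe the page as Markdown.",
    user_prompt="Transcribe this page.",
    image=page,
    few_shot_examples=[example],
)
# `messages` is an Ollama-style chat list: one system message, a user/assistant
# pair per few-shot example (each user turn carrying a base64 image), and a
# final user message with the base64-encoded target page.
```

The OpenAI-style engines (`OpenAICompatibleVLMEngine`, `VLLMVLMEngine`, `OpenRouterVLMEngine`, `OpenAIVLMEngine`, `AzureOpenAIVLMEngine`) build the same conversation but embed each image as a `data:` URL inside an `image_url` content part, with an optional `detail` level.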