vlm4ocr 0.4.0__tar.gz → 0.4.2__tar.gz

This diff compares the contents of two publicly released versions of the package, as published to its public registry, and is provided for informational purposes only.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vlm4ocr
-Version: 0.4.0
+Version: 0.4.2
 Summary: Python package and Web App for OCR with vision language models.
 License: MIT
 Author: Enshuo (David) Hsu
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.12
 Provides-Extra: tesseract
 Requires-Dist: colorama (>=0.4.4)
 Requires-Dist: json-repair (>=0.30.0)
+Requires-Dist: llm-inference-engine (>=0.1.5)
 Requires-Dist: pdf2image (>=1.16.0)
 Requires-Dist: pillow (>=10.0.0)
 Requires-Dist: pytesseract (>=0.3.13) ; extra == "tesseract"
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "vlm4ocr"
-version = "0.4.0"
+version = "0.4.2"
 description = "Python package and Web App for OCR with vision language models."
 authors = ["Enshuo (David) Hsu"]
 license = "MIT"
@@ -18,6 +18,7 @@ pdf2image = ">=1.16.0"
 colorama = ">=0.4.4"
 pillow = ">=10.0.0"
 json-repair = ">=0.30.0"
+llm-inference-engine = ">=0.1.5"
 pytesseract = { version = ">=0.3.13", optional = true }

 [tool.poetry.scripts]
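
The only dependency change in 0.4.2 is the new hard requirement on llm-inference-engine (>=0.1.5), which supplies the inference engines that the new VLM module below builds on. A quick sanity check after upgrading, using only the standard library (package names as they appear in the metadata above):

from importlib.metadata import version

# Both are resolved by pip/Poetry when installing vlm4ocr 0.4.2.
print(version("vlm4ocr"))               # expected: 0.4.2
print(version("llm-inference-engine"))  # expected: >= 0.1.5
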
@@ -126,9 +126,8 @@ class OCREngine:
                                            few_shot_examples=few_shot_examples)

         # Stream response
-        response_stream = self.vlm_engine.chat(
-            messages,
-            stream=True
+        response_stream = self.vlm_engine.chat_stream(
+            messages
         )
         for chunk in response_stream:
             if chunk["type"] == "response":
@@ -163,9 +162,8 @@ class OCREngine:
                                            image=image,
                                            few_shot_examples=few_shot_examples)
         # Stream response
-        response_stream = self.vlm_engine.chat(
-            messages,
-            stream=True
+        response_stream = self.vlm_engine.chat_stream(
+            messages
         )
         for chunk in response_stream:
             if chunk["type"] == "response":
@@ -295,7 +293,6 @@ class OCREngine:
         response = self.vlm_engine.chat(
             messages,
             verbose=verbose,
-            stream=False,
             messages_logger=messages_logger
         )
         ocr_text = response["response"]
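
The three hunks above are one consistent migration: streaming now goes through a dedicated chat_stream method, and the stream flag disappears from chat entirely. A minimal sketch of the two call patterns against 0.4.2, assuming only the engine interface visible in this diff (the exact chunk payload layout is defined by llm-inference-engine and is not shown here):

def stream_ocr(vlm_engine, messages):
    # 0.4.2: chat_stream(messages) replaces chat(messages, stream=True)
    for chunk in vlm_engine.chat_stream(messages):
        if chunk["type"] == "response":
            yield chunk  # partial OCR output; payload fields per llm-inference-engine

def run_ocr(vlm_engine, messages, verbose=False, messages_logger=None):
    # 0.4.2: the stream=False argument is simply dropped
    response = vlm_engine.chat(messages, verbose=verbose, messages_logger=messages_logger)
    return response["response"]
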
@@ -0,0 +1,276 @@
+import abc
+from typing import List, Dict
+from PIL import Image
+from vlm4ocr.utils import image_to_base64
+from vlm4ocr.data_types import FewShotExample
+from llm_inference_engine.llm_configs import (
+    LLMConfig as VLMConfig,
+    BasicLLMConfig as BasicVLMConfig,
+    ReasoningLLMConfig as ReasoningVLMConfig,
+    OpenAIReasoningLLMConfig as OpenAIReasoningVLMConfig
+)
+from llm_inference_engine.utils import MessagesLogger
+from llm_inference_engine.engines import (
+    InferenceEngine,
+    OllamaInferenceEngine,
+    OpenAICompatibleInferenceEngine,
+    VLLMInferenceEngine,
+    OpenRouterInferenceEngine,
+    OpenAIInferenceEngine,
+    AzureOpenAIInferenceEngine,
+)
+
+
+class VLMEngine(InferenceEngine):
+    @abc.abstractmethod
+    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
+        """
+        This method inputs an image and returns the corresponding chat messages for the inference engine.
+
+        Parameters:
+        ----------
+        system_prompt : str
+            the system prompt.
+        user_prompt : str
+            the user prompt.
+        image : Image.Image
+            the image for OCR.
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples.
+        """
+        return NotImplemented
+
+
+class OllamaVLMEngine(OllamaInferenceEngine, VLMEngine):
+    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
+        """
+        This method inputs an image and returns the corresponding chat messages for the inference engine.
+
+        Parameters:
+        ----------
+        system_prompt : str
+            the system prompt.
+        user_prompt : str
+            the user prompt.
+        image : Image.Image
+            the image for OCR.
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples.
+        """
+        base64_str = image_to_base64(image)
+        output_messages = []
+        # system message
+        system_message = {"role": "system", "content": system_prompt}
+        output_messages.append(system_message)
+
+        # few-shot examples
+        if few_shot_examples is not None:
+            for example in few_shot_examples:
+                if not isinstance(example, FewShotExample):
+                    raise ValueError("Few-shot example must be a FewShotExample object.")
+
+                example_image_b64 = image_to_base64(example.image)
+                example_user_message = {"role": "user", "content": user_prompt, "images": [example_image_b64]}
+                example_agent_message = {"role": "assistant", "content": example.text}
+                output_messages.append(example_user_message)
+                output_messages.append(example_agent_message)
+
+        # user message
+        user_message = {"role": "user", "content": user_prompt, "images": [base64_str]}
+        output_messages.append(user_message)
+
+        return output_messages
+
+
+class OpenAICompatibleVLMEngine(OpenAICompatibleInferenceEngine, VLMEngine):
+    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png',
+                         detail:str="high", few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
+        """
+        This method inputs an image and returns the corresponding chat messages for the inference engine.
+
+        Parameters:
+        ----------
+        system_prompt : str
+            the system prompt.
+        user_prompt : str
+            the user prompt.
+        image : Image.Image
+            the image for OCR.
+        format : str, Optional
+            the image format.
+        detail : str, Optional
+            the detail level of the image. Default is "high".
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples.
+        """
+        base64_str = image_to_base64(image)
+        output_messages = []
+        # system message
+        system_message = {"role": "system", "content": system_prompt}
+        output_messages.append(system_message)
+
+        # few-shot examples
+        if few_shot_examples is not None:
+            for example in few_shot_examples:
+                if not isinstance(example, FewShotExample):
+                    raise ValueError("Few-shot example must be a FewShotExample object.")
+
+                example_image_b64 = image_to_base64(example.image)
+                example_user_message = {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/{format};base64,{example_image_b64}",
+                                "detail": detail
+                            },
+                        },
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+                example_agent_message = {"role": "assistant", "content": example.text}
+                output_messages.append(example_user_message)
+                output_messages.append(example_agent_message)
+
+        # user message
+        user_message = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/{format};base64,{base64_str}",
+                        "detail": detail
+                    },
+                },
+                {"type": "text", "text": user_prompt},
+            ],
+        }
+        output_messages.append(user_message)
+        return output_messages
+
+
+class VLLMVLMEngine(VLLMInferenceEngine, OpenAICompatibleVLMEngine):
+    """
+    vLLM OpenAI-compatible server inference engine.
+    https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
+
+    For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction
+
+    Parameters:
+    ----------
+    model_name : str
+        model name as shown in the vLLM server
+    api_key : str, Optional
+        the API key for the vLLM server.
+    base_url : str, Optional
+        the base url for the vLLM server.
+    config : LLMConfig
+        the LLM configuration.
+    """
+    pass
+
+class OpenRouterVLMEngine(OpenRouterInferenceEngine, OpenAICompatibleVLMEngine):
+    """
+    OpenRouter OpenAI-compatible server inference engine.
+
+    Parameters:
+    ----------
+    model_name : str
+        model name as shown on OpenRouter
+    api_key : str, Optional
+        the API key for OpenRouter. If None, will use the key in os.environ['OPENROUTER_API_KEY'].
+    base_url : str, Optional
+        the base url for the OpenRouter API.
+    config : LLMConfig
+        the LLM configuration.
+    """
+    pass
+
+class OpenAIVLMEngine(OpenAIInferenceEngine, VLMEngine):
+    def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png',
+                         detail:str="high", few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
+        """
+        This method inputs an image and returns the corresponding chat messages for the inference engine.
+
+        Parameters:
+        ----------
+        system_prompt : str
+            the system prompt.
+        user_prompt : str
+            the user prompt.
+        image : Image.Image
+            the image for OCR.
+        format : str, Optional
+            the image format.
+        detail : str, Optional
+            the detail level of the image. Default is "high".
+        few_shot_examples : List[FewShotExample], Optional
+            list of few-shot examples, each a FewShotExample holding an image and its expected text.
+        """
+        base64_str = image_to_base64(image)
+        output_messages = []
+        # system message
+        system_message = {"role": "system", "content": system_prompt}
+        output_messages.append(system_message)
+
+        # few-shot examples
+        if few_shot_examples is not None:
+            for example in few_shot_examples:
+                if not isinstance(example, FewShotExample):
+                    raise ValueError("Few-shot example must be a FewShotExample object.")
+
+                example_image_b64 = image_to_base64(example.image)
+                example_user_message = {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/{format};base64,{example_image_b64}",
+                                "detail": detail
+                            },
+                        },
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+                example_agent_message = {"role": "assistant", "content": example.text}
+                output_messages.append(example_user_message)
+                output_messages.append(example_agent_message)
+
+        # user message
+        user_message = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/{format};base64,{base64_str}",
+                        "detail": detail
+                    },
+                },
+                {"type": "text", "text": user_prompt},
+            ],
+        }
+        output_messages.append(user_message)
+        return output_messages
+
+
+class AzureOpenAIVLMEngine(AzureOpenAIInferenceEngine, OpenAIVLMEngine):
+    """
+    The Azure OpenAI API inference engine.
+    For parameters and documentation, refer to
+    - https://azure.microsoft.com/en-us/products/ai-services/openai-service
+    - https://learn.microsoft.com/en-us/azure/ai-services/openai/quickstart
+
+    Parameters:
+    ----------
+    model : str
+        model name as described in https://platform.openai.com/docs/models
+    api_version : str
+        the Azure OpenAI API version
+    config : LLMConfig
+        the LLM configuration.
+    """
+    pass
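
Taken together, the new module wires llm-inference-engine's chat engines to OCR-specific message builders: every concrete engine turns a PIL image into an OpenAI-style (or, for Ollama, images-field) chat history that OCREngine hands straight to chat()/chat_stream(). A minimal usage sketch, with hedges: the module's file name inside vlm4ocr is not shown in this diff, the constructor keywords are taken from the VLLMVLMEngine docstring above, and the model name, URL, and no-argument BasicVLMConfig() are placeholders:

from PIL import Image
from vlm4ocr.vlm_engines import VLLMVLMEngine, BasicVLMConfig  # module path assumed

# Hypothetical local vLLM server and model; parameters per the VLLMVLMEngine docstring.
engine = VLLMVLMEngine(
    model_name="Qwen/Qwen2-VL-7B-Instruct",
    base_url="http://localhost:8000/v1",
    api_key="EMPTY",
    config=BasicVLMConfig(),  # assuming the config class has usable defaults
)

page = Image.open("page.png")
messages = engine.get_ocr_messages(
    system_prompt="You are an OCR engine.",
    user_prompt="Transcribe all text in this image.",
    image=page,
)
# messages is a system message followed by a user message whose content pairs an
# image_url part (base64 data URI, detail="high") with a text part; this is exactly
# the structure OCREngine feeds to chat() / chat_stream().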