pyconverters-openai_vision 0.5.18.tar.gz → 0.5.22.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/PKG-INFO +2 -1
  2. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/pyconverters_openai_vision/__init__.py +1 -1
  3. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/pyconverters_openai_vision/openai_utils.py +86 -24
  4. pyconverters_openai_vision-0.5.22/pyconverters_openai_vision/openai_vision.py +466 -0
  5. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/pyproject.toml +4 -1
  6. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/setup.py +7 -2
  7. pyconverters_openai_vision-0.5.22/tests/data/ENG product fact files_general offer_2025_30pages.json +274 -0
  8. pyconverters_openai_vision-0.5.22/tests/data/ENG product fact files_general offer_2025_30pages_alts.json +214 -0
  9. pyconverters_openai_vision-0.5.22/tests/data/ENG product fact files_general offer_2025_30pages_descs.json +133 -0
  10. pyconverters_openai_vision-0.5.22/tests/data/PC_Kairntech_LLM_v1.md.json +16 -0
  11. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/tests/test_openai_vision.py +25 -10
  12. pyconverters_openai_vision-0.5.18/pyconverters_openai_vision/openai_vision.py +0 -251
  13. pyconverters_openai_vision-0.5.18/tests/data/Sodexo_URD_2023_FR - 4p.pdf +0 -0
  14. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/.dockerignore +0 -0
  15. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/.gitignore +0 -0
  16. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/Dockerfile +0 -0
  17. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/Jenkinsfile +0 -0
  18. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/README.md +0 -0
  19. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/bumpversion.py +0 -0
  20. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/tests/__init__.py +0 -0
  21. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/tests/data/colducoq.jpg +0 -0
  22. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/tests/data/webinar.png +0 -0
  23. {pyconverters_openai_vision-0.5.18 → pyconverters_openai_vision-0.5.22}/tox.ini +0 -0
--- pyconverters_openai_vision-0.5.18/PKG-INFO
+++ pyconverters_openai_vision-0.5.22/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pyconverters-openai_vision
-Version: 0.5.18
+Version: 0.5.22
 Summary: OpenAIVision converter
 Home-page: https://kairntech.com/
 Author: Olivier Terrier
@@ -27,6 +27,7 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Topic :: Internet :: WWW/HTTP :: HTTP Servers
 Classifier: Topic :: Internet :: WWW/HTTP
 Requires-Dist: pymultirole-plugins>=0.5.0,<0.6.0
+Requires-Dist: httpx<0.28
 Requires-Dist: openai==1.9.0
 Requires-Dist: Jinja2
 Requires-Dist: tenacity
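
Note on the new httpx<0.28 pin: this is most likely needed because the project also pins openai==1.9.0, and openai clients of that vintage still forward the deprecated `proxies` keyword to httpx, which httpx 0.28 removed. A minimal reproduction, illustrative only:

    import httpx
    try:
        # Accepted (with a deprecation warning) before httpx 0.28,
        # a TypeError from 0.28 onwards:
        httpx.Client(proxies=None)
    except TypeError as err:
        print(err)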
--- pyconverters_openai_vision-0.5.18/pyconverters_openai_vision/__init__.py
+++ pyconverters_openai_vision-0.5.22/pyconverters_openai_vision/__init__.py
@@ -1,2 +1,2 @@
 """OpenAIVision converter"""
-__version__ = "0.5.18"
+__version__ = "0.5.22"
--- pyconverters_openai_vision-0.5.18/pyconverters_openai_vision/openai_utils.py
+++ pyconverters_openai_vision-0.5.22/pyconverters_openai_vision/openai_utils.py
@@ -6,31 +6,63 @@ from openai import OpenAI
 from openai.lib.azure import AzureOpenAI
 from pymultirole_plugins.util import comma_separated_to_list
 from strenum import StrEnum
+import time
+from openai._base_client import SyncHttpxClientWrapper
+
+
+class OAuthToken:
+    access_token: str = None
+    token_expiry: float = None
+
 
 logger = Logger("pymultirole")
 DEFAULT_CHAT_GPT_MODEL = "gpt-4o-mini"
-DEEPINFRA_VISION_MODELS = [
-    "vision",
-    "llava",
-    "pixtral"
-]
+OPENAI_MAX_RETRIES = int(os.getenv("OPENAI_MAX_RETRIES", 2))
+
+
+def check_litellm_defined():
+    LITELLM_OPENAI_API_KEY = os.getenv("LITELLM_OPENAI_API_KEY", None)
+    if LITELLM_OPENAI_API_KEY:
+        os.environ["OPENAI_API_KEY"] = LITELLM_OPENAI_API_KEY
+    LITELLM_OPENAI_API_BASE = os.getenv("LITELLM_OPENAI_API_BASE", None)
+    if LITELLM_OPENAI_API_BASE:
+        os.environ["OPENAI_API_BASE"] = LITELLM_OPENAI_API_BASE
+
+
+def get_api_key(prefix, oauth_token):
+    if not prefix.startswith("APOLLO"):
+        api_key = os.getenv(prefix + "OPENAI_API_KEY")
+    elif oauth_token.access_token is None or time.time() + 100 > oauth_token.token_expiry:
+        client_id = os.getenv("APOLLO_CLIENT_ID")
+        client_secret = os.getenv("APOLLO_CLIENT_SECRET")
+        token_url = os.getenv("APOLLO_OAUTH")
+        if not client_id or not client_secret or not token_url:
+            raise ValueError("Environment variables for OAuth are not set properly.")
+        token_data = {
+            "grant_type": "client_credentials",
+            "client_id": client_id,
+            "client_secret": client_secret,
+        }
+        verify = not prefix.startswith("APOLLO")
+        response = requests.post(token_url, data=token_data, verify=verify)
+        response.raise_for_status()
+        json_response = response.json()
+        oauth_token.access_token = json_response['access_token']
+        oauth_token.token_expiry = time.time() + json_response.get('expires_in', 3600)
+        api_key = oauth_token.access_token
+    else:
+        api_key = oauth_token.access_token
+    return api_key
 
 
 # Now use default retry with backoff of openai api
-def openai_chat_completion(prefix, **kwargs):
-    client = set_openai(prefix)
+def openai_chat_completion(prefix, oauth_token, base_url, **kwargs):
+    client = set_openai(prefix, oauth_token, base_url)
     response = client.chat.completions.create(**kwargs)
     return response
 
 
-def is_vision_model(model):
-    for m in DEEPINFRA_VISION_MODELS:
-        if m in model.lower():
-            return True
-    return False
-
-
-def openai_list_models(prefix, **kwargs):
+def openai_list_models(prefix, oauth_token, base_url, **kwargs):
     def sort_by_created(x):
         if 'created' in x:
             return x['created']
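
The OAuthToken/get_api_key pair above adds support for an OAuth2 client-credentials gateway (the APOLLO prefix): non-APOLLO prefixes keep reading <PREFIX>OPENAI_API_KEY from the environment, while APOLLO configurations fetch a bearer token and cache it on the mutable OAuthToken holder, refreshing it about 100 seconds before expiry. A minimal usage sketch, assuming APOLLO_CLIENT_ID, APOLLO_CLIENT_SECRET and APOLLO_OAUTH are set:

    token = OAuthToken()
    key_first = get_api_key("APOLLO_", token)   # POSTs a client_credentials request
    key_again = get_api_key("APOLLO_", token)   # served from the cached token
    assert key_first == key_again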
@@ -42,7 +74,7 @@ def openai_list_models(prefix, **kwargs):
         return x.id
 
     models = []
-    client = set_openai(prefix)
+    client = set_openai(prefix, oauth_token, base_url, max_retries=10)
     if prefix.startswith("DEEPINFRA"):
         deepinfra_url = client.base_url
         deepinfra_models = {}
@@ -75,9 +107,19 @@
             mods = list(
                 {m['model_name'] for m in mods if m['task'] == 'text-generation' and m['status'] == 'running'})
             deepinfra_models.update({m: m for m in mods})
-        models = [m for m in deepinfra_models.keys() if is_vision_model(m)]
+        models = list(deepinfra_models.keys())
     elif prefix.startswith("AZURE"):
         models = comma_separated_to_list(os.getenv(prefix + "OPENAI_DEPLOYMENT_ID", None))
+    elif prefix.startswith("APOLLO"):
+        apollo_url = client.base_url
+        public_models_list_url = f"{apollo_url}models"
+        response = requests.get(public_models_list_url, verify=False,
+                                headers={'Accept': "application/json", 'Authorization': f"Bearer {client.api_key}"})
+        if response.ok:
+            resp = response.json()
+            mods = sorted(resp["data"], key=sort_by_created, reverse=True)
+            models = list(
+                {m['id'] for m in mods})
     else:
         response = client.models.list(**kwargs)
         models = sorted(response.data, key=sort_by_created, reverse=True)
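
Model listing is now provider-specific: DeepInfra models are scraped from its public model list, Azure deployments come from <PREFIX>OPENAI_DEPLOYMENT_ID, APOLLO queries the gateway's /models endpoint (TLS verification disabled), and everything else falls back to the standard client.models.list() call. A usage sketch mirroring the call made by create_openai_model_enum below:

    # An empty prefix targets the plain OpenAI endpoint; the fresh
    # OAuthToken is only exercised for APOLLO-prefixed configurations.
    models = openai_list_models("", OAuthToken(), base_url=None)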
@@ -85,20 +127,31 @@
     return models
 
 
-def set_openai(prefix):
+def set_openai(prefix, oauth_token, base_url, max_retries=OPENAI_MAX_RETRIES):
+    api_key = get_api_key(prefix, oauth_token)
     if prefix.startswith("AZURE"):
         client = AzureOpenAI(
             # This is the default and can be omitted
-            api_key=os.getenv(prefix + "OPENAI_API_KEY"),
-            azure_endpoint=os.getenv(prefix + "OPENAI_API_BASE", None),
+            api_key=api_key,
+            azure_endpoint=base_url,
             api_version=os.getenv(prefix + "OPENAI_API_VERSION", None),
             # azure_deployment=os.getenv(prefix + "OPENAI_DEPLOYMENT_ID", None)
         )
     else:
+        # hack to support verify=False for Apollo
+        if prefix.startswith("APOLLO"):
+            http_client = SyncHttpxClientWrapper(
+                base_url="https://api.openai.com/v1" if base_url is None else base_url,
+                verify=False,
+            )
+        else:
+            http_client = None
         client = OpenAI(
             # This is the default and can be omitted
-            api_key=os.getenv(prefix + "OPENAI_API_KEY"),
-            base_url=os.getenv(prefix + "OPENAI_API_BASE", None)
+            api_key=api_key,
+            base_url=base_url,
+            http_client=http_client,
+            max_retries=max_retries
         )
     return client
 
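set_openai now resolves the key via get_api_key and builds one of three client flavors: AzureOpenAI for AZURE prefixes, a plain OpenAI client otherwise, and, for APOLLO, an OpenAI client wrapping a SyncHttpxClientWrapper with verify=False so the gateway's self-signed certificates are accepted. A sketch with hypothetical endpoints:

    plain = set_openai("", OAuthToken(), base_url=None)
    apollo = set_openai("APOLLO_", OAuthToken(),
                        base_url="https://apollo.internal/v1")  # TLS checks off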
@@ -107,14 +160,23 @@ def gpt_filter(m: str):
     return m.startswith('gpt') and not m.startswith('gpt-3.5-turbo-instruct') and 'vision' not in m
 
 
+def all_filter(m: str):
+    return True
+
+
+def apollo_filter(m: str):
+    return 'embed' not in m and 'vision' not in m and 'tts' not in m and 'mock' not in m
+
+
 NO_DEPLOYED_MODELS = 'no deployed models - check API key'
 
 
-def create_openai_model_enum(name, prefix="", key=lambda m: m):
+# @lru_cache(maxsize=None)
+def create_openai_model_enum(name, prefix="", base_url=None, key=all_filter):
     chat_gpt_models = []
     default_chat_gpt_model = None
     try:
-        chat_gpt_models = [m for m in openai_list_models(prefix) if key(m)]
+        chat_gpt_models = [m for m in openai_list_models(prefix, OAuthToken(), base_url) if key(m)]
         if chat_gpt_models:
             default_chat_gpt_model = DEFAULT_CHAT_GPT_MODEL if DEFAULT_CHAT_GPT_MODEL in chat_gpt_models else \
                 chat_gpt_models[0]
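
create_openai_model_enum gained a base_url parameter and now defaults to the pass-everything all_filter; the filters above (gpt_filter, apollo_filter) can be passed in to narrow the dynamically built enum. This mirrors how openai_vision.py builds its enums:

    ModelEnum, default_model = create_openai_model_enum(
        'OpenAIModel', prefix="", base_url=None, key=gpt_filter
    )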
--- /dev/null
+++ pyconverters_openai_vision-0.5.22/pyconverters_openai_vision/openai_vision.py
@@ -0,0 +1,466 @@
+import base64
+import os
+import re
+from enum import Enum
+from logging import Logger
+from re import Pattern
+from typing import List, cast, Type, Dict, Any, Optional
+
+import filetype as filetype
+from log_with_context import add_logging_context
+from pydantic import Field, BaseModel
+from pymultirole_plugins.v1.converter import ConverterParameters, ConverterBase
+from pymultirole_plugins.v1.processor import ProcessorParameters, ProcessorBase
+from pymultirole_plugins.v1.schema import Document, AltText
+from starlette.datastructures import UploadFile
+
+from .openai_utils import create_openai_model_enum, openai_chat_completion, gpt_filter, \
+    NO_DEPLOYED_MODELS, OAuthToken, all_filter, check_litellm_defined
+
+logger = Logger("pymultirole")
+SHOW_INTERNAL = os.getenv("SHOW_INTERNAL", "false").lower() == "true"
+
+
+class OpenAIVisionBaseParameters(ConverterParameters):
+    base_url: str = Field(
+        None,
+        description="""OpenAI endpoint base url""", extra="advanced"
+    )
+    model_str: str = Field(
+        None, extra="advanced"
+    )
+    model: str = Field(
+        None, extra="internal"
+    )
+    prompt: str = Field(
+        """If the attached file is an image: describe the image.""",
+        description="""Contains the prompt as a string""",
+        extra="multiline",
+    )
+    max_tokens: int = Field(
+        16384,
+        description="""The maximum number of tokens to generate in the completion.
+        The token count of your prompt plus max_tokens cannot exceed the model's context length.
+        Most models have a context length of 2048 tokens (except for the newest models, which support 4096).""",
+    )
+    system_prompt: str = Field(
+        None,
+        description="""Contains the system prompt""",
+        extra="multiline,advanced",
+    )
+    temperature: float = Field(
+        0.1,
+        description="""What sampling temperature to use, between 0 and 2.
+        Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.
+        We generally recommend altering this or `top_p` but not both.""",
+        extra="advanced",
+    )
+    top_p: int = Field(
+        1,
+        description="""An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass.
+        So 0.1 means only the tokens comprising the top 10% probability mass are considered.
+        We generally recommend altering this or `temperature` but not both.""",
+        extra="advanced",
+    )
+    n: int = Field(
+        1,
+        description="""How many completions to generate for each prompt.
+        Note: Because this parameter generates many completions, it can quickly consume your token quota.
+        Use carefully and ensure that you have reasonable settings for `max_tokens`.""",
+        extra="advanced",
+    )
+    best_of: int = Field(
+        1,
+        description="""Generates best_of completions server-side and returns the "best" (the one with the highest log probability per token).
+        Results cannot be streamed.
+        When used with `n`, `best_of` controls the number of candidate completions and `n` specifies how many to return – `best_of` must be greater than `n`.
+        Use carefully and ensure that you have reasonable settings for `max_tokens`.""",
+        extra="advanced",
+    )
+    presence_penalty: float = Field(
+        0.0,
+        description="""Number between -2.0 and 2.0.
+        Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.""",
+        extra="advanced",
+    )
+    frequency_penalty: float = Field(
+        0.0,
+        description="""Number between -2.0 and 2.0.
+        Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.""",
+        extra="advanced",
+    )
+
+
+class OpenAIVisionModel(str, Enum):
+    gpt_4o_mini = "gpt-4o-mini"
+    gpt_4o = "gpt-4o"
+    gpt_4_1 = "gpt-4.1"
+    gpt_4_1_mini = "gpt-4.1-mini"
+    gpt_4_1_nano = "gpt-4.1-nano"
+    gpt_5 = "gpt-5"
+    gpt_5_mini = "gpt-5-mini"
+    gpt_5_nano = "gpt-5-nano"
+
+
+check_litellm_defined()
+OPENAI_PREFIX = ""
+OPENAI_API_BASE = os.getenv(OPENAI_PREFIX + "OPENAI_API_BASE", None)
+CHAT_GPT_MODEL_ENUM, DEFAULT_CHAT_GPT_MODEL = create_openai_model_enum('OpenAIModel2', prefix=OPENAI_PREFIX,
+                                                                       base_url=OPENAI_API_BASE,
+                                                                       key=gpt_filter if OPENAI_API_BASE is None else all_filter)
+
+
+class OpenAIVisionParameters(OpenAIVisionBaseParameters):
+    base_url: Optional[str] = Field(
+        os.getenv(OPENAI_PREFIX + "OPENAI_API_BASE", None),
+        description="""OpenAI endpoint base url""", extra="advanced"
+    )
+    model: CHAT_GPT_MODEL_ENUM = Field(
+        DEFAULT_CHAT_GPT_MODEL,
+        description="""The [OpenAI model](https://platform.openai.com/docs/models) used for completion.""",
+        extra="pipeline-naming-hint"
+    )
+
+
+DEEPINFRA_PREFIX = "DEEPINFRA_"
+DEEPINFRA_OPENAI_API_BASE = os.getenv(DEEPINFRA_PREFIX + "OPENAI_API_BASE", None)
+DEEPINFRA_CHAT_GPT_MODEL_ENUM, DEEPINFRA_DEFAULT_CHAT_GPT_MODEL = create_openai_model_enum('DeepInfraOpenAIModel',
+                                                                                           prefix=DEEPINFRA_PREFIX,
+                                                                                           base_url=DEEPINFRA_OPENAI_API_BASE)
+
+
+class DeepInfraOpenAIVisionParameters(OpenAIVisionBaseParameters):
+    base_url: str = Field(
+        os.getenv(DEEPINFRA_PREFIX + "OPENAI_API_BASE", None),
+        description="""OpenAI endpoint base url""", extra="advanced"
+    )
+    model: DEEPINFRA_CHAT_GPT_MODEL_ENUM = Field(
+        None,
+        description="""The [DeepInfra 'OpenAI compatible' model](https://deepinfra.com/models?type=text-generation) used for completion. It must be deployed on your [DeepInfra dashboard](https://deepinfra.com/dash).""",
+        extra="pipeline-naming-hint"
+    )
+
+
+class OpenAIVisionConverterBase(ConverterBase):
+    __doc__ = """Generate text using [OpenAI Text Completion](https://platform.openai.com/docs/guides/completion) API
+    You input some text as a prompt, and the model will generate a text completion that attempts to match whatever context or pattern you gave it."""
+    PREFIX: str = ""
+    oauth_token: OAuthToken = OAuthToken()
+
+    def compute_args(self, params: OpenAIVisionBaseParameters, source: UploadFile, kind
+                     ) -> Dict[str, Any]:
+        data = source.file.read()
+        rv = base64.b64encode(data)
+        if kind.mime.startswith("image"):
+            binary_block = {
+                "type": "image_url",
+                "image_url": {
+                    "url": f"data:image/jpeg;base64,{rv.decode('utf-8')}"
+                }
+            }
+        messages = [{"role": "system", "content": params.system_prompt}] if params.system_prompt is not None else []
+        messages.append({"role": "user",
+                         "content": [
+                             {
+                                 "type": "text",
+                                 "text": params.prompt
+                             },
+                             binary_block
+                         ]})
+        kwargs = {
+            'model': params.model_str,
+            'messages': messages,
+            'max_tokens': params.max_tokens,
+            'temperature': params.temperature,
+            'top_p': params.top_p,
+            'n': params.n,
+            'frequency_penalty': params.frequency_penalty,
+            'presence_penalty': params.presence_penalty,
+        }
+        return kwargs
+
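compute_args above reads the uploaded file, base64-encodes it, and inlines it as a data URL inside an image_url content block; note the URL is always labeled image/jpeg even when the detected type differs, which vision endpoints generally tolerate. The resulting message list looks like this (base64 truncated):

    [
        {"role": "user",
         "content": [
             {"type": "text",
              "text": "If the attached file is an image: describe the image."},
             {"type": "image_url",
              "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQSkZJRg..."}},
         ]}
    ]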
+    def compute_result(self, base_url, **kwargs):
+        pattern: Pattern = re.compile(r"```(?:markdown\s+)?(\W.*?)```", re.DOTALL)
+        """Regex pattern to parse the output."""
+        response = openai_chat_completion(self.PREFIX, self.oauth_token, base_url, **kwargs)
+        contents = []
+        result = None
+        for choice in response.choices:
+            if choice.message.content:
+                if "```" in choice.message.content:
+                    action_match = pattern.search(choice.message.content)
+                    if action_match is not None:
+                        contents.append(action_match.group(1).strip())
+                else:
+                    contents.append(choice.message.content)
+        if contents:
+            result = "\n".join(contents)
+        return result
+
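compute_result strips an optional Markdown code fence from each choice before joining them. One caveat worth knowing: the capture group starts with \W, so fenced content beginning with a word character does not match and such a choice is silently dropped. The pattern in action on a hypothetical model output:

    import re
    pattern = re.compile(r"```(?:markdown\s+)?(\W.*?)```", re.DOTALL)
    output = "```markdown\n# Page 1\nSome recovered text.\n```"
    print(pattern.search(output).group(1).strip())
    # -> '# Page 1\nSome recovered text.'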
+    def convert(self, source: UploadFile, parameters: ConverterParameters) \
+            -> List[Document]:
+
+        params: OpenAIVisionBaseParameters = cast(
+            OpenAIVisionBaseParameters, parameters
+        )
+        OPENAI_MODEL = os.getenv(self.PREFIX + "OPENAI_MODEL", None)
+        if OPENAI_MODEL:
+            params.model_str = OPENAI_MODEL
+        doc = None
+        try:
+            kind = filetype.guess(source.file)
+            source.file.seek(0)
+            if kind.mime.startswith("image"):
+                result = None
+                kwargs = self.compute_args(params, source, kind)
+                if kwargs['model'] != NO_DEPLOYED_MODELS:
+                    result = self.compute_result(params.base_url, **kwargs)
+                if result:
+                    doc = Document(identifier=source.filename, text=result)
+                    doc.properties = {"fileName": source.filename}
+        except BaseException as err:
+            raise err
+        if doc is None:
+            raise TypeError(f"Conversion of file {source.filename} failed")
+        return [doc]
+
+    @classmethod
+    def get_model(cls) -> Type[BaseModel]:
+        return OpenAIVisionBaseParameters
+
+
+class OpenAIVisionConverter(OpenAIVisionConverterBase):
+    __doc__ = """Convert images using [OpenAI Vision](https://platform.openai.com/docs/guides/vision) API"""
+
+    def convert(self, source: UploadFile, parameters: ConverterParameters) \
+            -> List[Document]:
+        params: OpenAIVisionParameters = cast(
+            OpenAIVisionParameters, parameters
+        )
+        model_str = params.model_str if bool(params.model_str and params.model_str.strip()) else None
+        model = params.model.value if params.model is not None else None
+        params.model_str = model_str or model
+        return super().convert(source, params)
+
+    @classmethod
+    def get_model(cls) -> Type[BaseModel]:
+        return OpenAIVisionParameters
+
+
+class DeepInfraOpenAIVisionConverter(OpenAIVisionConverterBase):
+    __doc__ = """Convert images using [DeepInfra Vision](https://deepinfra.com/docs/tutorials/whisper) API"""
+    PREFIX = DEEPINFRA_PREFIX
+
+    def convert(self, source: UploadFile, parameters: ConverterParameters) \
+            -> List[Document]:
+        params: DeepInfraOpenAIVisionParameters = cast(
+            DeepInfraOpenAIVisionParameters, parameters
+        )
+        model_str = params.model_str if bool(params.model_str and params.model_str.strip()) else None
+        model = params.model.value if params.model is not None else None
+        params.model_str = model_str or model
+        return super().convert(source, params)
+
+    @classmethod
+    def get_model(cls) -> Type[BaseModel]:
+        return DeepInfraOpenAIVisionParameters
+
+
+def guess_kind(base64_src):
+    kind = None
+    img_regex = r"data:(image/[^;]+);base64"
+    matches = re.search(img_regex, base64_src)
+    if matches:
+        mime = matches.group(1)
+        kind = filetype.get_type(mime)
+    return kind
+
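guess_kind only inspects the data-URL prefix of an alternative text, mapping its declared MIME type back to a filetype descriptor:

    kind = guess_kind("data:image/png;base64,iVBORw0KGgoAAAANSUhEUg...")
    print(kind.mime)                      # -> 'image/png'
    print(guess_kind("just some text"))   # -> None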
+
+class OpenAIVisionProcessorBaseParameters(OpenAIVisionBaseParameters):
+    replace_refs_altTexts_by_descriptions: bool = Field(
+        False, extra="advanced"
+    )
+
+
+class OpenAIVisionProcessorBase(ProcessorBase):
+    __doc__ = """Generate text using [OpenAI Text Completion](https://platform.openai.com/docs/guides/completion) API
+    You input some text as a prompt, and the model will generate a text completion that attempts to match whatever context or pattern you gave it."""
+    PREFIX: str = ""
+    oauth_token: OAuthToken = OAuthToken()
+
+    def compute_args(self, params: OpenAIVisionBaseParameters, source: str, kind
+                     ) -> Dict[str, Any]:
+        if kind.mime.startswith("image"):
+            binary_block = {
+                "type": "image_url",
+                "image_url": {
+                    "url": source
+                }
+            }
+        messages = [{"role": "system", "content": params.system_prompt}] if params.system_prompt is not None else []
+        messages.append({"role": "user",
+                         "content": [
+                             {
+                                 "type": "text",
+                                 "text": params.prompt
+                             },
+                             binary_block
+                         ]})
+        kwargs = {
+            'model': params.model_str,
+            'messages': messages,
+            'max_tokens': params.max_tokens,
+            'temperature': params.temperature,
+            'top_p': params.top_p,
+            'n': params.n,
+            'frequency_penalty': params.frequency_penalty,
+            'presence_penalty': params.presence_penalty,
+        }
+        return kwargs
+
+    def compute_result(self, base_url, **kwargs):
+        pattern: Pattern = re.compile(r"```(?:markdown\s+)?(\W.*?)```", re.DOTALL)
+        """Regex pattern to parse the output."""
+        response = openai_chat_completion(self.PREFIX, self.oauth_token, base_url, **kwargs)
+        contents = []
+        result = None
+        for choice in response.choices:
+            if choice.message.content:
+                if "```" in choice.message.content:
+                    action_match = pattern.search(choice.message.content)
+                    if action_match is not None:
+                        contents.append(action_match.group(1).strip())
+                else:
+                    contents.append(choice.message.content)
+        if contents:
+            result = "\n".join(contents)
+        return result
+
+    def process(
+            self, documents: List[Document], parameters: ProcessorParameters
+    ) -> List[Document]:
+        # supported_languages = comma_separated_to_list(SUPPORTED_LANGUAGES)
+
+        params: OpenAIVisionProcessorBaseParameters = cast(
+            OpenAIVisionProcessorBaseParameters, parameters
+        )
+        OPENAI_MODEL = os.getenv(self.PREFIX + "OPENAI_MODEL", None)
+        if OPENAI_MODEL:
+            params.model_str = OPENAI_MODEL
+        try:
+            for document in documents:
+                with add_logging_context(docid=document.identifier):
+                    if document.altTexts:
+                        altTexts = document.altTexts
+                        alts = {altText.name: altText.text for altText in document.altTexts}
+                        anames = list(alts.keys())
+                        for aname in anames:
+                            atext = alts[aname]
+                            result = None
+                            kind = guess_kind(atext)
+                            if kind is not None and kind.mime.startswith("image"):
+                                kwargs = self.compute_args(params, atext, kind)
+                                if kwargs['model'] != NO_DEPLOYED_MODELS:
+                                    result = self.compute_result(params.base_url, **kwargs)
+                            if result is not None and isinstance(result, str):
+                                alts[aname] = result
+                            else:
+                                del alts[aname]
+                        if alts:
+                            document.altTexts = []
+
+                            if params.replace_refs_altTexts_by_descriptions:
+                                text = document.text
+                                link_regex = r"!\[([^]]+)\]\(([^]]+)\)"
+
+                                def convert_links(matchobj):
+                                    m = matchobj.group(0)
+                                    m_id = matchobj.group(1)
+                                    if m_id in alts:
+                                        m_desc = alts[m_id]
+                                        return f"{m}\n___\n{m_desc}\n___\n"
+                                    return m
+
+                                ptext = re.sub(link_regex, convert_links, text, 0,
+                                               re.MULTILINE)
+                                document.text = ptext
+                                for altText in altTexts:
+                                    if altText.name not in alts:
+                                        document.altTexts.append(altText)
+                            else:
+                                for altText in altTexts:
+                                    if altText.name in alts:
+                                        document.altTexts.append(AltText(name=altText.name, text=alts[altText.name]))
+                                    else:
+                                        document.altTexts.append(altText)
+
+        except BaseException as err:
+            raise err
+        return documents
+
+    @classmethod
+    def get_model(cls) -> Type[BaseModel]:
+        return OpenAIVisionProcessorBaseParameters
+
+
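When replace_refs_altTexts_by_descriptions is enabled, the generated descriptions are injected into the document body right after each Markdown image reference whose label matches an altText name. A standalone sketch with hypothetical names:

    import re
    alts = {"img_p1_1": "A bar chart of 2025 revenue by region."}
    text = "See ![img_p1_1](img_p1_1.png) for details."

    def convert_links(matchobj):
        m, m_id = matchobj.group(0), matchobj.group(1)
        return f"{m}\n___\n{alts[m_id]}\n___\n" if m_id in alts else m

    print(re.sub(r"!\[([^]]+)\]\(([^]]+)\)", convert_links, text))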
+class OpenAIVisionProcessorParameters(OpenAIVisionProcessorBaseParameters):
+    base_url: Optional[str] = Field(
+        os.getenv(OPENAI_PREFIX + "OPENAI_API_BASE", None),
+        description="""OpenAI endpoint base url""", extra="advanced"
+    )
+    model: CHAT_GPT_MODEL_ENUM = Field(
+        DEFAULT_CHAT_GPT_MODEL,
+        description="""The [OpenAI model](https://platform.openai.com/docs/models) used for completion.""",
+        extra="pipeline-naming-hint"
+    )
+
+
+class OpenAIVisionProcessor(OpenAIVisionProcessorBase):
+    __doc__ = """Describe images using [OpenAI Vision](https://platform.openai.com/docs/guides/vision) API"""
+
+    def process(
+            self, documents: List[Document], parameters: ProcessorParameters
+    ) -> List[Document]:
+        params: OpenAIVisionProcessorParameters = cast(
+            OpenAIVisionProcessorParameters, parameters
+        )
+        model_str = params.model_str if bool(params.model_str and params.model_str.strip()) else None
+        model = params.model.value if params.model is not None else None
+        params.model_str = model_str or model
+        return super().process(documents, params)
+
+    @classmethod
+    def get_model(cls) -> Type[BaseModel]:
+        return OpenAIVisionProcessorParameters
+
+
+class DeepInfraOpenAIVisionProcessorParameters(OpenAIVisionProcessorBaseParameters):
+    base_url: str = Field(
+        os.getenv(DEEPINFRA_PREFIX + "OPENAI_API_BASE", None),
+        description="""OpenAI endpoint base url""", extra="advanced"
+    )
+    model: DEEPINFRA_CHAT_GPT_MODEL_ENUM = Field(
+        None,
+        description="""The [DeepInfra 'OpenAI compatible' model](https://deepinfra.com/models?type=text-generation) used for completion. It must be deployed on your [DeepInfra dashboard](https://deepinfra.com/dash).""",
+        extra="pipeline-naming-hint"
+    )
+
+
+class DeepInfraOpenAIVisionProcessor(OpenAIVisionProcessorBase):
+    __doc__ = """Convert images using [DeepInfra Vision](https://deepinfra.com/docs/tutorials/whisper) API"""
+    PREFIX = DEEPINFRA_PREFIX
+
+    def process(
+            self, documents: List[Document], parameters: ProcessorParameters
+    ) -> List[Document]:
+        params: DeepInfraOpenAIVisionProcessorParameters = cast(
+            DeepInfraOpenAIVisionProcessorParameters, parameters
+        )
+        model_str = params.model_str if bool(params.model_str and params.model_str.strip()) else None
+        model = params.model.value if params.model is not None else None
+        params.model_str = model_str or model
+        return super().process(documents, params)
+
+    @classmethod
+    def get_model(cls) -> Type[BaseModel]:
+        return DeepInfraOpenAIVisionProcessorParameters
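
Across the converters and processors above, the effective model is resolved the same way: a <PREFIX>OPENAI_MODEL environment variable overrides everything, otherwise a non-blank free-text model_str wins, otherwise the enum model value is used. In miniature:

    model_str, model = "  ", "gpt-4o-mini"     # hypothetical values
    effective = (model_str if model_str and model_str.strip() else None) or model
    print(effective)                            # -> 'gpt-4o-mini'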
--- pyconverters_openai_vision-0.5.18/pyproject.toml
+++ pyconverters_openai_vision-0.5.22/pyproject.toml
@@ -30,6 +30,7 @@ classifiers = [
 ]
 requires = [
     "pymultirole-plugins>=0.5.0,<0.6.0",
+    "httpx<0.28",
     "openai==1.9.0",
     "Jinja2",
     "tenacity",
@@ -46,7 +47,9 @@ requires-python = ">=3.8"
 openai_vision = "pyconverters_openai_vision.openai_vision:OpenAIVisionConverter"
 deepinfra_openai_vision = "pyconverters_openai_vision.openai_vision:DeepInfraOpenAIVisionConverter"
 # azure_openai_vision = "pyconverters_openai_vision.openai_vision:AzureOpenAIVisionConverter"
-
+[tool.flit.entrypoints."pyprocessors.plugins"]
+openai_vision = "pyconverters_openai_vision.openai_vision:OpenAIVisionProcessor"
+deepinfra_openai_vision = "pyconverters_openai_vision.openai_vision:DeepInfraOpenAIVisionProcessor"
 
 [tool.flit.metadata.requires-extra]
 test = [
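
The new [tool.flit.entrypoints."pyprocessors.plugins"] table registers the processor variants alongside the existing converters, so pymultirole can discover them at runtime through standard entry points. A discovery sketch (group name from the table above; the lookup shown uses the Python 3.8/3.9 importlib.metadata dict interface):

    from importlib.metadata import entry_points
    for ep in entry_points().get("pyprocessors.plugins", []):
        print(ep.name, "->", ep.value)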