pyconverters-openai_vision: pyconverters_openai_vision-0.5.3-py3-none-any.whl → pyconverters_openai_vision-0.5.4-py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -1,2 +1,2 @@
1
1
  """OpenAIVision converter"""
2
- __version__ = "0.5.3"
2
+ __version__ = "0.5.4"
@@ -1,7 +1,9 @@
1
1
  import base64
2
2
  import os
3
+ import re
3
4
  from enum import Enum
4
5
  from logging import Logger
6
+ from re import Pattern
5
7
  from typing import List, cast, Type, Dict, Any
6
8
 
7
9
  import filetype as filetype
@@ -24,12 +26,13 @@ class OpenAIVisionBaseParameters(ConverterParameters):
24
26
  None, extra="internal"
25
27
  )
26
28
  prompt: str = Field(
27
- "Describe the image with a lot of details",
29
+ """If the attached file is an image: describe the image with a lot of details.",
30
+ If the attached file is a PDF document: convert the PDF document into Markdown format. The output must be just the markdown result without any explanation or introductory prefix.""",
28
31
  description="""Contains the prompt as a string""",
29
32
  extra="multiline",
30
33
  )
31
34
  max_tokens: int = Field(
32
- 256,
35
+ 16384,
33
36
  description="""The maximum number of tokens to generate in the completion.
34
37
  The token count of your prompt plus max_tokens cannot exceed the model's context length.
35
38
  Most models have a context length of 2048 tokens (except for the newest models, which support 4096).""",
@@ -85,13 +88,14 @@ class OpenAIVisionBaseParameters(ConverterParameters):
85
88
  class OpenAIVisionModel(str, Enum):
86
89
  gpt_4o_mini = "gpt-4o-mini"
87
90
  gpt_4o = "gpt-4o"
91
+ o3_mini = "o3-mini"
88
92
 
89
93
 
90
94
  class OpenAIVisionParameters(OpenAIVisionBaseParameters):
91
95
  model: OpenAIVisionModel = Field(
92
96
  OpenAIVisionModel.gpt_4o_mini,
93
- description="""The [OpenAI model](https://platform.openai.com/docs/models) used for speech to text transcription. Options currently available:</br>
94
- <li>`whisper-1` - state-of-the-art open source large-v2 Whisper model.
97
+ description="""The [OpenAI model](https://platform.openai.com/docs/models) used for vision. Options currently available:</br>
98
+
95
99
  """, extra="pipeline-naming-hint"
96
100
  )
97
101
 
@@ -126,10 +130,24 @@ class OpenAIVisionConverterBase(ConverterBase):
126
130
  You input some text as a prompt, and the model will generate a text completion that attempts to match whatever context or pattern you gave it."""
127
131
  PREFIX: str = ""
128
132
 
129
- def compute_args(self, params: OpenAIVisionBaseParameters, source: UploadFile
133
+ def compute_args(self, params: OpenAIVisionBaseParameters, source: UploadFile, kind
130
134
  ) -> Dict[str, Any]:
131
135
  data = source.file.read()
132
136
  rv = base64.b64encode(data)
137
+ if kind.mime.startswith("image"):
138
+ binary_block = {
139
+ "type": "image_url",
140
+ "image_url": {
141
+ "url": f"data:image/jpeg;base64,{rv.decode('utf-8')}"
142
+ }
143
+ }
144
+ else:
145
+ binary_block = {
146
+ "type": "file",
147
+ "file": {
148
+ "filename": source.filename,
149
+ "file_data": f"data:application/pdf;base64,{rv.decode('utf-8')}"}
150
+ }
133
151
  messages = [{"role": "system", "content": params.system_prompt}] if params.system_prompt is not None else []
134
152
  messages.append({"role": "user",
135
153
  "content": [
@@ -137,12 +155,8 @@ class OpenAIVisionConverterBase(ConverterBase):
137
155
  "type": "text",
138
156
  "text": params.prompt
139
157
  },
140
- {
141
- "type": "image_url",
142
- "image_url": {
143
- "url": f"data:image/jpeg;base64,{rv.decode('utf-8')}"
144
- }
145
- }]})
158
+ binary_block
159
+ ]})
146
160
  kwargs = {
147
161
  'model': params.model_str,
148
162
  'messages': messages,
@@ -156,11 +170,18 @@ class OpenAIVisionConverterBase(ConverterBase):
156
170
  return kwargs
157
171
 
158
172
  def compute_result(self, **kwargs):
173
+ pattern: Pattern = re.compile(r"```(?:markdown\s+)?(\W.*?)```", re.DOTALL)
174
+ """Regex pattern to parse the output."""
159
175
  response = openai_chat_completion(self.PREFIX, **kwargs)
160
176
  contents = []
161
177
  for choice in response.choices:
162
178
  if choice.message.content:
163
- contents.append(choice.message.content)
179
+ if "```" in choice.message.content:
180
+ action_match = pattern.search(choice.message.content)
181
+ if action_match is not None:
182
+ contents.append(action_match.group(1).strip())
183
+ else:
184
+ contents.append(choice.message.content)
164
185
  if contents:
165
186
  result = "\n".join(contents)
166
187
  return result
@@ -178,9 +199,9 @@ class OpenAIVisionConverterBase(ConverterBase):
178
199
  try:
179
200
  kind = filetype.guess(source.file)
180
201
  source.file.seek(0)
181
- if kind.mime.startswith("image"):
202
+ if kind.mime.startswith("image") or kind.mime.endswith("pdf"):
182
203
  result = None
183
- kwargs = self.compute_args(params, source)
204
+ kwargs = self.compute_args(params, source, kind)
184
205
  if kwargs['model'] != NO_DEPLOYED_MODELS:
185
206
  result = self.compute_result(**kwargs)
186
207
  if result:
@@ -189,7 +210,7 @@ class OpenAIVisionConverterBase(ConverterBase):
189
210
  except BaseException as err:
190
211
  raise err
191
212
  if doc is None:
192
- raise TypeError(f"Conversion of audio file {source.filename} failed")
213
+ raise TypeError(f"Conversion of file {source.filename} failed")
193
214
  return [doc]
194
215
 
195
216
  @classmethod
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: pyconverters-openai_vision
3
- Version: 0.5.3
3
+ Version: 0.5.4
4
4
  Summary: OpenAIVision converter
5
5
  Home-page: https://kairntech.com/
6
6
  Author: Olivier Terrier
@@ -0,0 +1,7 @@
1
+ pyconverters_openai_vision/__init__.py,sha256=gg9uPdn4gUxA8hQ1hvO_tebG_dq6EzNtTSggy4MA9R0,51
2
+ pyconverters_openai_vision/openai_utils.py,sha256=HRJ6sJg88en66gkQbOpQKh7cbwtfoAwVLNh7JQSA9ps,5014
3
+ pyconverters_openai_vision/openai_vision.py,sha256=ACpeOEFNphqKceQqyWHwpP6PvuOOig8qTTHwImq34j8,10445
4
+ pyconverters_openai_vision-0.5.4.dist-info/entry_points.txt,sha256=-DS1gRUTf08Fjb79S_8sqCaqxBifC3q3EJZqXXdcf7Q,197
5
+ pyconverters_openai_vision-0.5.4.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
6
+ pyconverters_openai_vision-0.5.4.dist-info/METADATA,sha256=FMB-_MGpLRxKMKyjr5InU-P6VJRtB-4hixc12PF2Xyg,2635
7
+ pyconverters_openai_vision-0.5.4.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: flit 3.9.0
2
+ Generator: flit 3.12.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,7 +0,0 @@
1
- pyconverters_openai_vision/__init__.py,sha256=3OKw8raUFiHJX-jh8L1RvKEFeD7DZW5-8hpnp21EK-A,51
2
- pyconverters_openai_vision/openai_utils.py,sha256=HRJ6sJg88en66gkQbOpQKh7cbwtfoAwVLNh7JQSA9ps,5014
3
- pyconverters_openai_vision/openai_vision.py,sha256=MoEyjYCZ75jWiMonaQdR1erpFBpWgYEiqabp9D8102Y,9562
4
- pyconverters_openai_vision-0.5.3.dist-info/entry_points.txt,sha256=-DS1gRUTf08Fjb79S_8sqCaqxBifC3q3EJZqXXdcf7Q,197
5
- pyconverters_openai_vision-0.5.3.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
6
- pyconverters_openai_vision-0.5.3.dist-info/METADATA,sha256=PJWwNjeSChCjjJQhgWxR1RIn9jA7-HvcPvKHWT7sJjk,2635
7
- pyconverters_openai_vision-0.5.3.dist-info/RECORD,,