llama-index-llms-openai 0.3.18__tar.gz → 0.3.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
{llama_index_llms_openai-0.3.18 → llama_index_llms_openai-0.3.19}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama-index-llms-openai
-Version: 0.3.18
+Version: 0.3.19
 Summary: llama-index llms openai integration
 License: MIT
 Author: llama-index
@@ -11,7 +11,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Requires-Dist: llama-index-core (>=0.12.4,<0.13.0)
+Requires-Dist: llama-index-core (>=0.12.17,<0.13.0)
 Requires-Dist: openai (>=1.58.1,<2.0.0)
 Description-Content-Type: text/markdown
 
{llama_index_llms_openai-0.3.18 → llama_index_llms_openai-0.3.19}/llama_index/llms/openai/base.py

@@ -222,6 +222,14 @@ class OpenAI(FunctionCallingLLM):
         default=None,
         description="The effort to use for reasoning models.",
     )
+    modalities: Optional[List[str]] = Field(
+        default=None,
+        description="The output modalities to use for the model.",
+    )
+    audio_config: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="The audio configuration to use for the model.",
+    )
 
     _client: Optional[SyncOpenAI] = PrivateAttr()
     _aclient: Optional[AsyncOpenAI] = PrivateAttr()
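The two new fields above are the public switch for audio support: `modalities` selects the output types and `audio_config` is forwarded as OpenAI's `audio` request parameter. A minimal usage sketch, assuming a valid `OPENAI_API_KEY` in the environment (the `voice`/`format` keys follow OpenAI's documented chat-completions `audio` parameter):

```python
from llama_index.llms.openai import OpenAI

# Request both text and synthesized speech from an audio-capable model.
llm = OpenAI(
    model="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    audio_config={"voice": "alloy", "format": "mp3"},
)
```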
@@ -254,6 +262,8 @@ class OpenAI(FunctionCallingLLM):
         output_parser: Optional[BaseOutputParser] = None,
         strict: bool = False,
         reasoning_effort: Optional[Literal["low", "medium", "high"]] = None,
+        modalities: Optional[List[str]] = None,
+        audio_config: Optional[Dict[str, Any]] = None,
         **kwargs: Any,
     ) -> None:
         additional_kwargs = additional_kwargs or {}
@@ -288,6 +298,8 @@ class OpenAI(FunctionCallingLLM):
             output_parser=output_parser,
             strict=strict,
             reasoning_effort=reasoning_effort,
+            modalities=modalities,
+            audio_config=audio_config,
             **kwargs,
         )
 
@@ -375,6 +387,11 @@ class OpenAI(FunctionCallingLLM):
     def complete(
         self, prompt: str, formatted: bool = False, **kwargs: Any
     ) -> CompletionResponse:
+        if self.modalities and "audio" in self.modalities:
+            raise ValueError(
+                "Audio is not supported for completion. Use chat/achat instead."
+            )
+
         if self._use_chat_completions(kwargs):
             complete_fn = chat_to_completion_decorator(self._chat)
         else:
@@ -434,6 +451,11 @@ class OpenAI(FunctionCallingLLM):
             # O1 models support reasoning_effort of low, medium, high
             all_kwargs["reasoning_effort"] = self.reasoning_effort
 
+        if self.modalities is not None:
+            all_kwargs["modalities"] = self.modalities
+        if self.audio_config is not None:
+            all_kwargs["audio"] = self.audio_config
+
         return all_kwargs
 
     @llm_retry_decorator
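`_get_model_kwargs` forwards both settings into every request (note the rename from `audio_config` to OpenAI's `audio` key), so the wrapper call is equivalent to hitting the SDK directly. A sketch of the raw request it now builds, using only documented `openai` SDK parameters:

```python
from openai import OpenAI as SyncOpenAI

client = SyncOpenAI()  # reads OPENAI_API_KEY from the environment

# What the llama-index wrapper sends under the hood (sketch):
response = client.chat.completions.create(
    model="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    audio={"voice": "alloy", "format": "mp3"},
    messages=[{"role": "user", "content": "Say hello out loud."}],
)
```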
@@ -459,7 +481,9 @@ class OpenAI(FunctionCallingLLM):
         )
 
         openai_message = response.choices[0].message
-        message = from_openai_message(openai_message)
+        message = from_openai_message(
+            openai_message, modalities=self.modalities or ["text"]
+        )
         openai_token_logprobs = response.choices[0].logprobs
         logprobs = None
         if openai_token_logprobs and openai_token_logprobs.content:
@@ -476,6 +500,9 @@ class OpenAI(FunctionCallingLLM):
     def _stream_chat(
         self, messages: Sequence[ChatMessage], **kwargs: Any
     ) -> ChatResponseGen:
+        if self.modalities and "audio" in self.modalities:
+            raise ValueError("Audio is not supported for chat streaming")
+
         client = self._get_client()
         message_dicts = to_openai_message_dicts(
             messages,
@@ -667,6 +694,11 @@ class OpenAI(FunctionCallingLLM):
     async def acomplete(
         self, prompt: str, formatted: bool = False, **kwargs: Any
     ) -> CompletionResponse:
+        if self.modalities and "audio" in self.modalities:
+            raise ValueError(
+                "Audio is not supported for completion. Use chat/achat instead."
+            )
+
         if self._use_chat_completions(kwargs):
             acomplete_fn = achat_to_completion_decorator(self._achat)
         else:
@@ -708,7 +740,9 @@ class OpenAI(FunctionCallingLLM):
         )
 
         openai_message = response.choices[0].message
-        message = from_openai_message(openai_message)
+        message = from_openai_message(
+            openai_message, modalities=self.modalities or ["text"]
+        )
         openai_token_logprobs = response.choices[0].logprobs
         logprobs = None
         if openai_token_logprobs and openai_token_logprobs.content:
@@ -725,6 +759,9 @@ class OpenAI(FunctionCallingLLM):
     async def _astream_chat(
         self, messages: Sequence[ChatMessage], **kwargs: Any
     ) -> ChatResponseAsyncGen:
+        if self.modalities and "audio" in self.modalities:
+            raise ValueError("Audio is not supported for chat streaming")
+
         aclient = self._get_aclient()
         message_dicts = to_openai_message_dicts(
             messages,
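As the guards above show, audio is routed through `chat`/`achat` only; `complete`, `acomplete`, and both streaming paths raise `ValueError` when `"audio"` is among the modalities. An end-to-end sketch (the file name is a placeholder; `AudioBlock(audio=..., format=...)` matches the field names used later in this diff):

```python
from llama_index.core.base.llms.types import AudioBlock, ChatMessage, MessageRole, TextBlock
from llama_index.llms.openai import OpenAI

llm = OpenAI(
    model="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    audio_config={"voice": "alloy", "format": "mp3"},
)

with open("question.mp3", "rb") as f:  # placeholder input clip
    audio_bytes = f.read()

message = ChatMessage(
    role=MessageRole.USER,
    blocks=[
        TextBlock(text="Answer the question in this recording."),
        AudioBlock(audio=audio_bytes, format="mp3"),
    ],
)

response = llm.chat([message])   # supported
# llm.complete("hi")             # raises ValueError while audio is enabled
# llm.stream_chat([message])     # raises ValueError: streaming is text-only
```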
{llama_index_llms_openai-0.3.18 → llama_index_llms_openai-0.3.19}/llama_index/llms/openai/utils.py

@@ -27,6 +27,7 @@ from llama_index.core.base.llms.types import (
     LogProb,
     MessageRole,
     TextBlock,
+    AudioBlock,
 )
 from llama_index.core.bridge.pydantic import BaseModel
 
@@ -68,6 +69,11 @@ GPT4_MODELS: Dict[str, int] = {
     "gpt-4-turbo-2024-04-09": 128000,
     "gpt-4-turbo": 128000,
     "gpt-4o": 128000,
+    "gpt-4o-audio-preview": 128000,
+    "gpt-4o-audio-preview-2024-12-17": 128000,
+    "gpt-4o-audio-preview-2024-10-01": 128000,
+    "gpt-4o-mini-audio-preview": 128000,
+    "gpt-4o-mini-audio-preview-2024-12-17": 128000,
     "gpt-4o-2024-05-13": 128000,
     "gpt-4o-2024-08-06": 128000,
     "gpt-4o-2024-11-20": 128000,
@@ -270,7 +276,16 @@ def to_openai_message_dict(
     """Convert a ChatMessage to an OpenAI message dict."""
     content = []
     content_txt = ""
+    reference_audio_id = None
     for block in message.blocks:
+        if message.role == MessageRole.ASSISTANT:
+            reference_audio_id = message.additional_kwargs.get(
+                "reference_audio_id", None
+            )
+            # if reference audio id is provided, we don't need to send the audio
+            if reference_audio_id:
+                continue
+
         if isinstance(block, TextBlock):
             content.append({"type": "text", "text": block.text})
             content_txt += block.text
@@ -291,6 +306,18 @@ def to_openai_message_dict(
                     },
                 }
             )
+        elif isinstance(block, AudioBlock):
+            audio_bytes = block.resolve_audio(as_base64=True).read()
+            audio_str = audio_bytes.decode("utf-8")
+            content.append(
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        "data": audio_str,
+                        "format": block.format,
+                    },
+                }
+            )
         else:
             msg = f"Unsupported content block type: {type(block).__name__}"
             raise ValueError(msg)
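The `AudioBlock` branch base64-encodes the clip into OpenAI's `input_audio` content part. A sketch of the resulting wire format for a user message carrying one text and one audio block (the base64 payload is elided):

```python
# Shape produced by to_openai_message_dict for an audio-bearing user message:
message_dict = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Answer the question in this recording."},
        {
            "type": "input_audio",
            "input_audio": {"data": "<base64-encoded mp3>", "format": "mp3"},
        },
    ],
}
```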
@@ -304,19 +331,27 @@ def to_openai_message_dict(
         else content_txt
     )
 
-    # NOTE: Despite what the openai docs say, if the role is ASSISTANT, SYSTEM
-    # or TOOL, 'content' cannot be a list and must be string instead.
-    # Furthermore, if all blocks are text blocks, we can use the content_txt
-    # as the content. This will avoid breaking openai-like APIs.
-    message_dict = {
-        "role": message.role.value,
-        "content": (
-            content_txt
-            if message.role.value in ("assistant", "tool", "system")
-            or all(isinstance(block, TextBlock) for block in message.blocks)
-            else content
-        ),
-    }
+    # If reference audio id is provided, we don't need to send the audio
+    # NOTE: this is only a thing for assistant messages
+    if reference_audio_id:
+        message_dict = {
+            "role": message.role.value,
+            "audio": {"id": reference_audio_id},
+        }
+    else:
+        # NOTE: Despite what the openai docs say, if the role is ASSISTANT, SYSTEM
+        # or TOOL, 'content' cannot be a list and must be string instead.
+        # Furthermore, if all blocks are text blocks, we can use the content_txt
+        # as the content. This will avoid breaking openai-like APIs.
+        message_dict = {
+            "role": message.role.value,
+            "content": (
+                content_txt
+                if message.role.value in ("assistant", "tool", "system")
+                or all(isinstance(block, TextBlock) for block in message.blocks)
+                else content
+            ),
+        }
 
     # TODO: O1 models do not support system prompts
     if model is not None and model in O1_MODELS:
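For assistant turns, resending the synthesized audio on every round trip would be wasteful, so the serializer replaces the message body with OpenAI's server-side audio reference whenever a prior response recorded one. A sketch of the multi-turn payload this produces (the id value is illustrative):

```python
# Follow-up request: the assistant's earlier audio reply is replayed by id.
messages = [
    {"role": "user", "content": "Tell me a short joke."},
    {"role": "assistant", "audio": {"id": "audio_abc123"}},  # id from the prior response
    {"role": "user", "content": "Now explain why it is funny."},
]
```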
@@ -353,20 +388,29 @@ def to_openai_message_dicts(
     ]
 
 
-def from_openai_message(openai_message: ChatCompletionMessage) -> ChatMessage:
+def from_openai_message(
+    openai_message: ChatCompletionMessage, modalities: List[str]
+) -> ChatMessage:
     """Convert openai message dict to generic message."""
     role = openai_message.role
     # NOTE: Azure OpenAI returns function calling messages without a content key
-    content = openai_message.content
-
-    # function_call = None  # deprecated in OpenAI v 1.1.0
+    if "text" in modalities and openai_message.content:
+        blocks = [TextBlock(text=openai_message.content or "")]
+    else:
+        blocks = []
 
     additional_kwargs: Dict[str, Any] = {}
     if openai_message.tool_calls:
         tool_calls: List[ChatCompletionMessageToolCall] = openai_message.tool_calls
         additional_kwargs.update(tool_calls=tool_calls)
 
-    return ChatMessage(role=role, content=content, additional_kwargs=additional_kwargs)
+    if openai_message.audio and "audio" in modalities:
+        reference_audio_id = openai_message.audio.id
+        audio_data = openai_message.audio.data
+        additional_kwargs["reference_audio_id"] = reference_audio_id
+        blocks.append(AudioBlock(audio=audio_data, format="mp3"))
+
+    return ChatMessage(role=role, blocks=blocks, additional_kwargs=additional_kwargs)
 
 
 def from_openai_token_logprob(
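`from_openai_message` now builds block-based messages: text lands in a `TextBlock`, returned audio in an `AudioBlock` (format hardcoded to mp3), and the server-side id is stashed in `additional_kwargs["reference_audio_id"]` to enable the by-id replay shown earlier. A sketch of consuming the audio on the caller side, assuming `resolve_audio()` with no arguments yields the decoded bytes:

```python
from llama_index.core.base.llms.types import AudioBlock

response = llm.chat([message])  # llm/message from the earlier sketch

for block in response.message.blocks:
    if isinstance(block, AudioBlock):
        # Assumed default: resolve_audio() returns raw (non-base64) bytes.
        with open("reply.mp3", "wb") as out:
            out.write(block.resolve_audio().read())

# The id that to_openai_message_dict will reuse on the next turn:
print(response.message.additional_kwargs.get("reference_audio_id"))
```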
@@ -421,10 +465,10 @@ def from_openai_completion_logprobs(
 
 
 def from_openai_messages(
-    openai_messages: Sequence[ChatCompletionMessage],
+    openai_messages: Sequence[ChatCompletionMessage], modalities: List[str]
 ) -> List[ChatMessage]:
     """Convert openai message dicts to generic messages."""
-    return [from_openai_message(message) for message in openai_messages]
+    return [from_openai_message(message, modalities) for message in openai_messages]
 
 
 def from_openai_message_dict(message_dict: dict) -> ChatMessage:
{llama_index_llms_openai-0.3.18 → llama_index_llms_openai-0.3.19}/pyproject.toml

@@ -29,12 +29,12 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-llms-openai"
 readme = "README.md"
-version = "0.3.18"
+version = "0.3.19"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
 openai = "^1.58.1"
-llama-index-core = "^0.12.4"
+llama-index-core = "^0.12.17"
 
 [tool.poetry.group.dev.dependencies]
 ipython = "8.10.0"