openai-sdk-helpers 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,392 @@
1
+ """File attachment utilities for responses.
2
+
3
+ This module provides functions for processing file attachments, automatically
4
+ detecting file types (images vs documents), and preparing them for the OpenAI API
5
+ with appropriate encoding (base64 or vector store). Supports both individual and
6
+ batch file processing.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from concurrent.futures import ThreadPoolExecutor, as_completed
12
+ from pathlib import Path
13
+ from typing import TYPE_CHECKING, Any, cast
14
+
15
+ from openai.types.responses.response_input_file_content_param import (
16
+ ResponseInputFileContentParam,
17
+ )
18
+ from openai.types.responses.response_input_file_param import ResponseInputFileParam
19
+ from openai.types.responses.response_input_image_content_param import (
20
+ ResponseInputImageContentParam,
21
+ )
22
+
23
+ from ..utils import create_file_data_url, create_image_data_url, is_image_file, log
24
+
25
+ if TYPE_CHECKING: # pragma: no cover
26
+ from .base import BaseResponse
27
+
28
+
29
def process_files(
    response: BaseResponse[Any],
    files: list[str],
    use_vector_store: bool = False,
    batch_size: int = 10,
    max_workers: int = 5,
) -> tuple[
    list[ResponseInputFileParam],
    list[ResponseInputFileContentParam],
    list[ResponseInputImageContentParam],
]:
    """Split attachments into images and documents and prepare each group.

    Every path is classified with ``is_image_file``. Documents are either
    uploaded through ``_upload_to_vector_store`` or encoded inline through
    ``_encode_documents_base64_batch``; images always go through
    ``_encode_images_base64_batch``.

    Parameters
    ----------
    response : BaseResponse[Any]
        Response instance the attachments are prepared for. It is only
        forwarded to the vector store upload helper.
    files : list[str]
        Paths of the files to attach.
    use_vector_store : bool, default False
        When True, non-image files are uploaded to a vector store instead
        of being encoded inline.
    batch_size : int, default 10
        Batch size forwarded to the batch encoders.
    max_workers : int, default 5
        Worker count forwarded to the batch encoders.

    Returns
    -------
    tuple[list, list, list]
        Three lists, in this order:

        1. Vector store file references (ResponseInputFileParam)
        2. Inline document content (ResponseInputFileContentParam)
        3. Inline image content (ResponseInputImageContentParam)

    Examples
    --------
    >>> from openai_sdk_helpers.response import process_files
    >>> vector_files, base64_files, images = process_files(
    ...     response,
    ...     ["photo.jpg", "document.pdf"],
    ...     use_vector_store=False,
    ... )
    """
    # Partition in a single pass so the classifier runs once per path.
    partitioned: dict[bool, list[str]] = {True: [], False: []}
    for candidate in files:
        partitioned[bool(is_image_file(candidate))].append(candidate)
    pictures = partitioned[True]
    documents = partitioned[False]

    uploaded_refs: list[ResponseInputFileParam] = []
    inline_documents: list[ResponseInputFileContentParam] = []

    # Documents take exactly one route; with no documents neither helper runs.
    if documents and use_vector_store:
        uploaded_refs = _upload_to_vector_store(response, documents)
    elif documents:
        inline_documents = _encode_documents_base64_batch(
            documents, batch_size, max_workers
        )

    # Images are encoded after the documents have been handled.
    inline_images = _encode_images_base64_batch(pictures, batch_size, max_workers)

    return uploaded_refs, inline_documents, inline_images
114
+
115
+
116
def _upload_to_vector_store(
    response: BaseResponse[Any], document_files: list[str]
) -> list[ResponseInputFileParam]:
    """Upload documents to the response's user vector store.

    If the response has no user vector store yet, one is created and a
    ``file_search`` tool pointing at it is registered, unless the response
    already has a tool of that type. Each document is then uploaded with
    ``purpose="user_data"``.

    Parameters
    ----------
    response : BaseResponse[Any]
        Response instance whose ``_user_vector_storage`` and ``_tools``
        are read and, when needed, updated.
    document_files : list[str]
        Paths of the documents to upload.

    Returns
    -------
    list[ResponseInputFileParam]
        One ``input_file`` reference per uploaded document, in input order.

    Notes
    -----
    Uploaded files are also reported to ``response._files_manager`` when
    that attribute exists and is not None; an ``AttributeError`` raised
    while tracking is ignored. Deletion of the uploaded files is not done
    here -- presumably handled by the response's own cleanup (TODO confirm).
    """
    if response._user_vector_storage is None:
        # Imported lazily: only needed when a store has to be created.
        from openai_sdk_helpers.vector_storage import VectorStorage

        response_kind = response.__class__.__name__.lower()
        response._user_vector_storage = VectorStorage(
            store_name=f"{response_kind}_{response._name}_{response.uuid}_user",
            client=response._client,
            model=response._model,
        )
        created_storage = cast(Any, response._user_vector_storage)

        # Register a file_search tool only if none is present yet.
        has_file_search = False
        for tool in response._tools:
            if tool.get("type") == "file_search":
                has_file_search = True
                break
        if not has_file_search:
            file_search_tool = {
                "type": "file_search",
                "vector_store_ids": [created_storage.id],
            }
            response._tools.append(file_search_tool)

    storage = cast(Any, response._user_vector_storage)
    references: list[ResponseInputFileParam] = []

    for document_path in document_files:
        stored = storage.upload_file(document_path, purpose="user_data")
        references.append(
            ResponseInputFileParam(type="input_file", file_id=stored.id)
        )

        # Tracking is best-effort and must never break the upload itself.
        manager = getattr(response, "_files_manager", None)
        if manager is None:
            continue
        try:
            # Preferred form: track by file ID.
            manager.track_file(stored.id)
        except AttributeError:
            try:
                # Fallback form: hand over the whole uploaded object.
                manager.track_file(stored)
            except AttributeError:
                # Neither form is supported; skip tracking silently.
                pass

    return references
184
+
185
+
186
+ def _encode_documents_base64(
187
+ document_files: list[str],
188
+ ) -> list[ResponseInputFileContentParam]:
189
+ """Encode documents as base64 for inline attachment.
190
+
191
+ Parameters
192
+ ----------
193
+ document_files : list[str]
194
+ List of document file paths to encode.
195
+
196
+ Returns
197
+ -------
198
+ list[ResponseInputFileContentParam]
199
+ List of base64-encoded file content parameters.
200
+ """
201
+ base64_files: list[ResponseInputFileContentParam] = []
202
+
203
+ for file_path in document_files:
204
+ file_data_url = create_file_data_url(file_path)
205
+ filename = Path(file_path).name
206
+ base64_files.append(
207
+ ResponseInputFileContentParam(
208
+ type="input_file",
209
+ file_data=file_data_url,
210
+ filename=filename,
211
+ )
212
+ )
213
+
214
+ return base64_files
215
+
216
+
217
+ def _encode_documents_base64_batch(
218
+ document_files: list[str],
219
+ batch_size: int = 10,
220
+ max_workers: int = 5,
221
+ ) -> list[ResponseInputFileContentParam]:
222
+ """Encode documents as base64 with batch processing.
223
+
224
+ Uses thread pool for concurrent encoding of multiple files.
225
+
226
+ Parameters
227
+ ----------
228
+ document_files : list[str]
229
+ List of document file paths to encode.
230
+ batch_size : int, default 10
231
+ Number of files to process in each batch.
232
+ max_workers : int, default 5
233
+ Maximum number of concurrent workers.
234
+
235
+ Returns
236
+ -------
237
+ list[ResponseInputFileContentParam]
238
+ List of base64-encoded file content parameters.
239
+ """
240
+ if not document_files:
241
+ return []
242
+
243
+ # If small number of files, process sequentially
244
+ if len(document_files) <= 3:
245
+ return _encode_documents_base64(document_files)
246
+
247
+ base64_files: list[ResponseInputFileContentParam] = []
248
+
249
+ def encode_single_document(file_path: str) -> ResponseInputFileContentParam:
250
+ """Encode a single document file."""
251
+ file_data_url = create_file_data_url(file_path)
252
+ filename = Path(file_path).name
253
+ return ResponseInputFileContentParam(
254
+ type="input_file",
255
+ file_data=file_data_url,
256
+ filename=filename,
257
+ )
258
+
259
+ # Process files concurrently in batches using thread pool
260
+ log(
261
+ f"Processing {len(document_files)} documents in batches of {batch_size} "
262
+ f"with {max_workers} workers"
263
+ )
264
+
265
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
266
+ # Process files in batches to avoid overwhelming the executor
267
+ for batch_start in range(0, len(document_files), batch_size):
268
+ batch_end = min(batch_start + batch_size, len(document_files))
269
+ batch = document_files[batch_start:batch_end]
270
+
271
+ # Submit this batch of tasks
272
+ future_to_file = {
273
+ executor.submit(encode_single_document, file_path): file_path
274
+ for file_path in batch
275
+ }
276
+
277
+ # Collect results as they complete
278
+ for future in as_completed(future_to_file):
279
+ try:
280
+ result = future.result()
281
+ base64_files.append(result)
282
+ except Exception as exc:
283
+ file_path = future_to_file[future]
284
+ log(f"Error encoding document {file_path}: {exc}")
285
+ raise
286
+
287
+ return base64_files
288
+
289
+
290
+ def _encode_images_base64(
291
+ image_files: list[str],
292
+ ) -> list[ResponseInputImageContentParam]:
293
+ """Encode images as base64 for inline attachment.
294
+
295
+ Parameters
296
+ ----------
297
+ image_files : list[str]
298
+ List of image file paths to encode.
299
+
300
+ Returns
301
+ -------
302
+ list[ResponseInputImageContentParam]
303
+ List of base64-encoded image content parameters.
304
+ """
305
+ image_contents: list[ResponseInputImageContentParam] = []
306
+
307
+ for image_path in image_files:
308
+ image_url, detail = create_image_data_url(image_path, detail="auto")
309
+ image_contents.append(
310
+ ResponseInputImageContentParam(
311
+ type="input_image",
312
+ image_url=image_url,
313
+ detail=detail,
314
+ )
315
+ )
316
+
317
+ return image_contents
318
+
319
+
320
+ def _encode_images_base64_batch(
321
+ image_files: list[str],
322
+ batch_size: int = 10,
323
+ max_workers: int = 5,
324
+ ) -> list[ResponseInputImageContentParam]:
325
+ """Encode images as base64 with batch processing.
326
+
327
+ Uses thread pool for concurrent encoding of multiple images.
328
+
329
+ Parameters
330
+ ----------
331
+ image_files : list[str]
332
+ List of image file paths to encode.
333
+ batch_size : int, default 10
334
+ Number of images to process in each batch.
335
+ max_workers : int, default 5
336
+ Maximum number of concurrent workers.
337
+
338
+ Returns
339
+ -------
340
+ list[ResponseInputImageContentParam]
341
+ List of base64-encoded image content parameters.
342
+ """
343
+ if not image_files:
344
+ return []
345
+
346
+ # If small number of files, process sequentially
347
+ if len(image_files) <= 3:
348
+ return _encode_images_base64(image_files)
349
+
350
+ image_contents: list[ResponseInputImageContentParam] = []
351
+
352
+ def encode_single_image(image_path: str) -> ResponseInputImageContentParam:
353
+ """Encode a single image file."""
354
+ image_url, detail = create_image_data_url(image_path, detail="auto")
355
+ return ResponseInputImageContentParam(
356
+ type="input_image",
357
+ image_url=image_url,
358
+ detail=detail,
359
+ )
360
+
361
+ # Process images concurrently in batches using thread pool
362
+ log(
363
+ f"Processing {len(image_files)} images in batches of {batch_size} "
364
+ f"with {max_workers} workers"
365
+ )
366
+
367
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
368
+ # Process images in batches to avoid overwhelming the executor
369
+ for batch_start in range(0, len(image_files), batch_size):
370
+ batch_end = min(batch_start + batch_size, len(image_files))
371
+ batch = image_files[batch_start:batch_end]
372
+
373
+ # Submit this batch of tasks
374
+ future_to_file = {
375
+ executor.submit(encode_single_image, image_path): image_path
376
+ for image_path in batch
377
+ }
378
+
379
+ # Collect results as they complete
380
+ for future in as_completed(future_to_file):
381
+ try:
382
+ result = future.result()
383
+ image_contents.append(result)
384
+ except Exception as exc:
385
+ image_path = future_to_file[future]
386
+ log(f"Error encoding image {image_path}: {exc}")
387
+ raise
388
+
389
+ return image_contents
390
+
391
+
392
+ __all__ = ["process_files"]
@@ -8,6 +8,7 @@ rendering, response execution, and resource cleanup.
8
8
  from __future__ import annotations
9
9
 
10
10
  import json
11
+ import tempfile
11
12
  from pathlib import Path
12
13
  from typing import Any
13
14
 
@@ -29,6 +30,36 @@ from openai_sdk_helpers.utils import (
29
30
  log,
30
31
  )
31
32
 
33
+ # Supported file extensions for OpenAI Assistants file search
34
+ SUPPORTED_FILE_EXTENSIONS = (
35
+ ".csv",
36
+ ".docx",
37
+ ".html",
38
+ ".json",
39
+ ".md",
40
+ ".pdf",
41
+ ".pptx",
42
+ ".txt",
43
+ ".xlsx",
44
+ )
45
+
46
+
47
+ def _validate_file_type(filename: str) -> bool:
48
+ """Check if a file has a supported extension for vector storage.
49
+
50
+ Parameters
51
+ ----------
52
+ filename : str
53
+ Name of the file to validate.
54
+
55
+ Returns
56
+ -------
57
+ bool
58
+ True if the file extension is supported, False otherwise.
59
+ """
60
+ file_ext = Path(filename).suffix.lower()
61
+ return file_ext in SUPPORTED_FILE_EXTENSIONS
62
+
32
63
 
33
64
  def _extract_assistant_text(response: BaseResponse[Any]) -> str:
34
65
  """Extract the latest assistant message as readable text.
@@ -227,6 +258,8 @@ def _init_session_state() -> None:
227
258
  """
228
259
  if "chat_history" not in st.session_state:
229
260
  st.session_state["chat_history"] = []
261
+ if "uploaded_files" not in st.session_state:
262
+ st.session_state["uploaded_files"] = []
230
263
 
231
264
 
232
265
  def _render_chat_history() -> None:
@@ -252,9 +285,16 @@ def _render_chat_history() -> None:
252
285
  st.json(raw_output)
253
286
  else:
254
287
  st.markdown(message.get("content", ""))
288
+ attachments = message.get("attachments", [])
289
+ if attachments:
290
+ st.caption(
291
+ f"📎 {len(attachments)} file(s) attached: {', '.join(attachments)}"
292
+ )
255
293
 
256
294
 
257
- def _handle_user_message(prompt: str, config: StreamlitAppConfig) -> None:
295
+ def _handle_user_message(
296
+ prompt: str, config: StreamlitAppConfig, attachment_paths: list[str] | None = None
297
+ ) -> None:
258
298
  """Process user input and generate assistant response.
259
299
 
260
300
  Appends the user message to chat history, executes the response
@@ -267,6 +307,8 @@ def _handle_user_message(prompt: str, config: StreamlitAppConfig) -> None:
267
307
  User-entered text to send to the assistant.
268
308
  config : StreamlitAppConfig
269
309
  Loaded configuration with response handler definition.
310
+ attachment_paths : list[str] or None, default None
311
+ Optional list of file paths to attach to the message.
270
312
 
271
313
  Notes
272
314
  -----
@@ -274,7 +316,12 @@ def _handle_user_message(prompt: str, config: StreamlitAppConfig) -> None:
274
316
  chat transcript rather than crashing the application. The function
275
317
  triggers a Streamlit rerun after successful response generation.
276
318
  """
277
- st.session_state["chat_history"].append({"role": "user", "content": prompt})
319
+ attachment_names = (
320
+ [Path(p).name for p in attachment_paths] if attachment_paths else []
321
+ )
322
+ st.session_state["chat_history"].append(
323
+ {"role": "user", "content": prompt, "attachments": attachment_names}
324
+ )
278
325
  try:
279
326
  response = _get_response_instance(config)
280
327
  except Exception as exc: # pragma: no cover - surfaced in UI
@@ -283,7 +330,7 @@ def _handle_user_message(prompt: str, config: StreamlitAppConfig) -> None:
283
330
 
284
331
  try:
285
332
  with st.spinner("Thinking..."):
286
- result = response.run_sync(content=prompt)
333
+ result = response.run_sync(content=prompt, files=attachment_paths)
287
334
  summary = _render_summary(result, response)
288
335
  raw_output = _build_raw_output(result, response)
289
336
  st.session_state["chat_history"].append(
@@ -341,9 +388,41 @@ def main(config_path: Path) -> None:
341
388
 
342
389
  _render_chat_history()
343
390
 
391
+ # File uploader for attachments
392
+ uploaded_files = st.file_uploader(
393
+ "Attach files (optional)",
394
+ accept_multiple_files=True,
395
+ key="file_uploader",
396
+ help=f"Supported formats: {', '.join(sorted(SUPPORTED_FILE_EXTENSIONS))}",
397
+ )
398
+
399
+ # Save uploaded files to temporary directory and track paths
400
+ attachment_paths: list[str] = []
401
+ if uploaded_files:
402
+ invalid_files = []
403
+ for uploaded_file in uploaded_files:
404
+ if not _validate_file_type(uploaded_file.name):
405
+ invalid_files.append(uploaded_file.name)
406
+ continue
407
+ with tempfile.NamedTemporaryFile(
408
+ delete=False, suffix=Path(uploaded_file.name).suffix
409
+ ) as tmp_file:
410
+ tmp_file.write(uploaded_file.getbuffer())
411
+ attachment_paths.append(tmp_file.name)
412
+
413
+ if invalid_files:
414
+ st.warning(
415
+ f"⚠️ Unsupported file types skipped: {', '.join(invalid_files)}. "
416
+ f"Supported formats: {', '.join(sorted(SUPPORTED_FILE_EXTENSIONS))}"
417
+ )
418
+ if attachment_paths:
419
+ st.caption(f"📎 {len(attachment_paths)} file(s) ready to attach")
420
+
344
421
  prompt = st.chat_input("Message the assistant")
345
422
  if prompt:
346
- _handle_user_message(prompt, config)
423
+ _handle_user_message(
424
+ prompt, config, attachment_paths if attachment_paths else None
425
+ )
347
426
 
348
427
 
349
428
  if __name__ == "__main__":
@@ -7,6 +7,7 @@ The utils package collects cross-cutting helpers used across the project:
7
7
  * Concurrency: async bridging helpers.
8
8
  * Output validation: JSON Schema and semantic validators.
9
9
  * Instrumentation helpers: deprecation utilities.
10
+ * Encoding: base64 encoding for images and files.
10
11
 
11
12
  Import style
12
13
  ------------
@@ -32,6 +33,8 @@ output_validation
32
33
  JSON Schema and semantic output validation utilities.
33
34
  deprecation
34
35
  Deprecation helpers and warning utilities.
36
+ encoding
37
+ Base64 encoding helpers for images and files.
35
38
  """
36
39
 
37
40
  from __future__ import annotations
@@ -69,6 +72,14 @@ from .output_validation import (
69
72
  validate_output,
70
73
  )
71
74
  from .deprecation import DeprecationHelper, deprecated, warn_deprecated
75
+ from .encoding import (
76
+ create_file_data_url,
77
+ create_image_data_url,
78
+ encode_file,
79
+ encode_image,
80
+ get_mime_type,
81
+ is_image_file,
82
+ )
72
83
 
73
84
  __all__ = [
74
85
  "ensure_list",
@@ -104,4 +115,11 @@ __all__ = [
104
115
  "deprecated",
105
116
  "warn_deprecated",
106
117
  "DeprecationHelper",
118
+ # Encoding
119
+ "encode_image",
120
+ "encode_file",
121
+ "get_mime_type",
122
+ "create_image_data_url",
123
+ "create_file_data_url",
124
+ "is_image_file",
107
125
  ]