datamarket 0.6.0__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of datamarket has been flagged as a potentially problematic release.

Files changed (38)
  1. datamarket/__init__.py +0 -1
  2. datamarket/exceptions/__init__.py +1 -0
  3. datamarket/exceptions/main.py +118 -0
  4. datamarket/interfaces/alchemy.py +1934 -25
  5. datamarket/interfaces/aws.py +81 -14
  6. datamarket/interfaces/azure.py +127 -0
  7. datamarket/interfaces/drive.py +60 -10
  8. datamarket/interfaces/ftp.py +37 -14
  9. datamarket/interfaces/llm.py +1220 -0
  10. datamarket/interfaces/nominatim.py +314 -42
  11. datamarket/interfaces/peerdb.py +272 -104
  12. datamarket/interfaces/proxy.py +354 -50
  13. datamarket/interfaces/tinybird.py +7 -15
  14. datamarket/params/nominatim.py +439 -0
  15. datamarket/utils/__init__.py +1 -1
  16. datamarket/utils/airflow.py +10 -7
  17. datamarket/utils/alchemy.py +2 -1
  18. datamarket/utils/logs.py +88 -0
  19. datamarket/utils/main.py +138 -10
  20. datamarket/utils/nominatim.py +201 -0
  21. datamarket/utils/playwright/__init__.py +0 -0
  22. datamarket/utils/playwright/async_api.py +274 -0
  23. datamarket/utils/playwright/sync_api.py +281 -0
  24. datamarket/utils/requests.py +655 -0
  25. datamarket/utils/selenium.py +6 -12
  26. datamarket/utils/strings/__init__.py +1 -0
  27. datamarket/utils/strings/normalization.py +217 -0
  28. datamarket/utils/strings/obfuscation.py +153 -0
  29. datamarket/utils/strings/standardization.py +40 -0
  30. datamarket/utils/typer.py +2 -1
  31. datamarket/utils/types.py +1 -0
  32. datamarket-0.10.3.dist-info/METADATA +172 -0
  33. datamarket-0.10.3.dist-info/RECORD +38 -0
  34. {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info}/WHEEL +1 -2
  35. datamarket-0.6.0.dist-info/METADATA +0 -49
  36. datamarket-0.6.0.dist-info/RECORD +0 -24
  37. datamarket-0.6.0.dist-info/top_level.txt +0 -1
  38. {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info/licenses}/LICENSE +0 -0
datamarket/interfaces/llm.py (new file)
@@ -0,0 +1,1220 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ import base64
5
+ import json
6
+ import logging
7
+ import tempfile
8
+ import time
9
+ import uuid
10
+ from abc import ABC, abstractmethod
11
+ from pathlib import Path
12
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Type
13
+
14
+ from openai import OpenAI
15
+ from pydantic import BaseModel, ValidationError
16
+ from tenacity import retry, retry_if_not_exception_type, stop_after_attempt, wait_exponential
17
+
18
+ ########################################################################################################################
19
+ # PARAMETERS
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ ########################################################################################################################
24
+ # BASE CLASSES
25
+
26
+
27
+ class BaseLLMProvider(ABC):
28
+ """
29
+ Abstract base class for LLM providers.
30
+ """
31
+
32
+ @abstractmethod
33
+ def invoke_text(
34
+ self,
35
+ *,
36
+ instructions: str,
37
+ input_texts: List[str],
38
+ model: str,
39
+ image_path: Optional[str] = None,
40
+ image_url: Optional[str] = None,
41
+ **kwargs,
42
+ ) -> Tuple[str, Dict[str, Any], Any]:
43
+ """
44
+ Generate text output from the LLM.
45
+ Supports both local images (image_path) and image URLs (image_url).
46
+ """
47
+ pass
48
+
49
+ @abstractmethod
50
+ def invoke_structured(
51
+ self,
52
+ *,
53
+ instructions: str,
54
+ input_texts: List[str],
55
+ model_schema: Type[BaseModel],
56
+ model: str,
57
+ image_path: Optional[str] = None,
58
+ image_url: Optional[str] = None,
59
+ **kwargs,
60
+ ) -> Tuple[BaseModel, Dict[str, Any], Any]:
61
+ """
62
+ Generate structured output from the LLM using a Pydantic schema.
63
+ Supports both local images (image_path) and image URLs (image_url).
64
+ """
65
+ pass
66
+
67
+
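For orientation, here is a minimal sketch of what a second provider would have to implement against this contract. The EchoProvider class below is hypothetical and not part of this release; a real provider would call its own SDK where the comments indicate.

    from typing import Any, Dict, List, Optional, Tuple, Type
    from pydantic import BaseModel
    from datamarket.interfaces.llm import BaseLLMProvider

    class EchoProvider(BaseLLMProvider):
        """Hypothetical provider used only to illustrate the abstract interface."""

        def invoke_text(self, *, instructions: str, input_texts: List[str], model: str,
                        image_path: Optional[str] = None, image_url: Optional[str] = None,
                        **kwargs) -> Tuple[str, Dict[str, Any], Any]:
            # A real provider would call its SDK here; this stub just echoes the input.
            output = " ".join(input_texts)
            meta = {"model": model, "input_tokens": 0, "output_tokens": 0}
            return output, meta, None

        def invoke_structured(self, *, instructions: str, input_texts: List[str],
                              model_schema: Type[BaseModel], model: str,
                              image_path: Optional[str] = None, image_url: Optional[str] = None,
                              **kwargs) -> Tuple[BaseModel, Dict[str, Any], Any]:
            # model_construct() skips validation, which is enough for a stub.
            parsed = model_schema.model_construct()
            meta = {"model": model, "input_tokens": 0, "output_tokens": 0}
            return parsed, meta, None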
68
+ class OpenAIProvider(BaseLLMProvider):
69
+ """
70
+ OpenAI provider implementation using the 'client.responses' pattern.
71
+ """
72
+
73
+ def __init__(self, api_key: str, default_model: str = "gpt-4o-mini") -> None:
74
+ self.client = OpenAI(api_key=api_key)
75
+ self.default_model = default_model
76
+
77
+ @staticmethod
78
+ def _batch_to_dict(batch) -> Dict[str, Any]:
79
+ """
80
+ Convert an OpenAI Batch object to a dictionary.
81
+
82
+ Uses model_dump() for robust serialization that adapts to SDK changes.
83
+
84
+ Args:
85
+ batch: OpenAI Batch object (Pydantic model)
86
+
87
+ Returns:
88
+ Dictionary representation of the batch
89
+ """
90
+ return batch.model_dump()
91
+
92
+ @staticmethod
93
+ def _encode_image(image_path: str) -> str:
94
+ """Encodes a local image file to a base64 string."""
95
+ with open(image_path, "rb") as image_file:
96
+ return base64.b64encode(image_file.read()).decode("utf-8")
97
+
98
+ @staticmethod
99
+ def _get_image_media_type(image_path: str) -> str:
100
+ """
101
+ Determine the media type based on file extension.
102
+
103
+ Supported formats: PNG, JPEG, WEBP, non-animated GIF
104
+
105
+ Args:
106
+ image_path: Path to the image file
107
+
108
+ Returns:
109
+ Media type string (e.g., "image/jpeg", "image/png")
110
+ """
111
+ extension = Path(image_path).suffix.lower()
112
+ media_types = {
113
+ ".jpg": "image/jpeg",
114
+ ".jpeg": "image/jpeg",
115
+ ".png": "image/png",
116
+ ".gif": "image/gif",
117
+ ".webp": "image/webp",
118
+ }
119
+ return media_types.get(extension, "not_compatible") # Return 'not_compatible' for unsupported extensions
120
+
121
+ @staticmethod
122
+ def _is_url(path: str) -> bool:
123
+ """Check if the provided path is a URL."""
124
+ return path.startswith(("http://", "https://"))
125
+
126
+ def _build_vision_content(
127
+ self,
128
+ input_texts: List[str],
129
+ image_path: Optional[str] = None,
130
+ image_url: Optional[str] = None,
131
+ detail: str = "auto",
132
+ ) -> List[Dict[str, Any]]:
133
+ """
134
+ Builds the content list for the user message following OpenAI Responses API pattern.
135
+ Handles text and optional image (from local file or URL) for Vision capabilities.
136
+
137
+ Args:
138
+ input_texts: List of text strings to include
139
+ image_path: Path to a local image file (will be base64 encoded)
140
+ image_url: URL of an image (used directly)
141
+
142
+ Returns:
143
+ List of content dictionaries with proper OpenAI Responses API format
144
+ """
145
+ content = []
146
+
147
+ # Add text inputs using input_text type
148
+ for text in input_texts:
149
+ content.append({"type": "input_text", "text": text})
150
+
151
+ # Add image if provided (prioritize image_url over image_path)
152
+ if image_url:
153
+ # Use URL directly with input_image type (OpenAI Responses API format)
154
+ content.append({"type": "input_image", "image_url": image_url})
155
+ elif image_path:
156
+ # Check if image_path is actually a URL
157
+ if self._is_url(image_path):
158
+ content.append({"type": "input_image", "image_url": image_path})
159
+ else:
160
+ # Local file: encode to base64 and use data URL with correct media type
161
+ base64_image = self._encode_image(image_path)
162
+ media_type = self._get_image_media_type(image_path)
163
+ if media_type == "not_compatible":
164
+ raise ValueError(f"Unsupported image format for file: {image_path}")
165
+ content.append(
166
+ {"type": "input_image", "image_url": f"data:{media_type};base64,{base64_image}", "detail": detail}
167
+ )
168
+
169
+ return content
170
+
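To make the shape of this payload concrete: a call such as _build_vision_content(["Describe this"], image_path="chart.png", detail="high") would return roughly the following list (the file name is illustrative and the base64 data is abridged).

    [
        {"type": "input_text", "text": "Describe this"},
        {
            "type": "input_image",
            "image_url": "data:image/png;base64,iVBORw0KGgo...",  # truncated for brevity
            "detail": "high",
        },
    ]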
171
+ @staticmethod
172
+ def _extract_usage_meta(resp) -> Dict[str, Any]:
173
+ """
174
+ Extract usage metadata.
175
+ Note: The structure of 'resp' depends on the specific API version.
176
+ We attempt to read standard usage fields.
177
+ """
178
+ usage = getattr(resp, "usage", None)
179
+ input_tokens = getattr(usage, "input_tokens", 0) if usage else 0
180
+ output_tokens = getattr(usage, "output_tokens", 0) if usage else 0
181
+
182
+ # Calculate number of cached tokens
183
+ cached_tokens = 0
184
+ details = getattr(usage, "input_tokens_details", None)
185
+ if details is not None and hasattr(details, "cached_tokens"):
186
+ cached_tokens = details.cached_tokens
187
+
188
+ return {
189
+ "response_id": getattr(resp, "id", None),
190
+ "model": getattr(resp, "model", None),
191
+ "input_tokens": input_tokens,
192
+ "cached_tokens": cached_tokens,
193
+ "uncached_tokens": (input_tokens - cached_tokens),
194
+ "output_tokens": output_tokens,
195
+ }
196
+
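For reference, the dictionary assembled here ends up looking roughly like this (all values are illustrative); the calling methods below merge an additional latency_sec key into it.

    {
        "response_id": "resp_abc123",
        "model": "gpt-4o-mini",
        "input_tokens": 1200,
        "cached_tokens": 800,
        "uncached_tokens": 400,
        "output_tokens": 150,
    }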
197
+ @retry(
198
+ reraise=True,
199
+ stop=stop_after_attempt(3),
200
+ wait=wait_exponential(multiplier=0.5, min=0.5, max=5),
201
+ retry=retry_if_not_exception_type(ValidationError),
202
+ )
203
+ def invoke_text(
204
+ self,
205
+ *,
206
+ instructions: str,
207
+ input_texts: List[str],
208
+ model: str,
209
+ image_path: Optional[str] = None,
210
+ image_url: Optional[str] = None,
211
+ **kwargs,
212
+ ) -> Tuple[str, Dict[str, Any], Any]:
213
+ """
214
+ Generate text output using 'client.responses.create'.
215
+
216
+ Args:
217
+ instructions: System instructions for the model
218
+ input_texts: List of text inputs
219
+ model: Model identifier
220
+ image_path: Path to local image file OR image URL (auto-detected)
221
+ image_url: Direct image URL (takes priority over image_path)
222
+ **kwargs: Additional arguments
223
+
224
+ Returns:
225
+ Tuple of (output_text, usage_metadata, raw_response)
226
+
227
+ Examples:
228
+ # Text only
229
+ >>> text, meta, _ = provider.invoke_text(
230
+ ... instructions="Summarize",
231
+ ... input_texts=["Long text here..."],
232
+ ... model="gpt-4o-mini"
233
+ ... )
234
+
235
+ # With image URL
236
+ >>> text, meta, _ = provider.invoke_text(
237
+ ... instructions="Describe this image",
238
+ ... input_texts=["What's in this image?"],
239
+ ... model="gpt-4o-mini",
240
+ ... image_url="https://example.com/image.jpg"
241
+ ... )
242
+ """
243
+ t0 = time.time()
244
+
245
+ # Build content with optional image support
246
+ user_content = self._build_vision_content(input_texts, image_path, image_url, detail="high")
247
+
248
+ # Construct input list (System + User)
249
+ input_payload = [
250
+ {"role": "system", "content": instructions},
251
+ {"role": "user", "content": user_content},
252
+ ]
253
+
254
+ logger.info(f"Prompting {model} (text via responses.create)...")
255
+
256
+ # Using the requested syntax
257
+ resp = self.client.responses.create(
258
+ model=model,
259
+ input=input_payload,
260
+ **kwargs,
261
+ )
262
+ # Extract output text
263
+ output_text = getattr(resp, "output_text", "")
264
+ if not output_text and hasattr(resp, "choices"):
265
+ output_text = resp.choices[0].message.content
266
+
267
+ meta = self._extract_usage_meta(resp) | {"latency_sec": round(time.time() - t0, 3)}
268
+ return output_text, meta, resp
269
+
270
+ @retry(
271
+ reraise=True,
272
+ stop=stop_after_attempt(3),
273
+ wait=wait_exponential(multiplier=0.5, min=0.5, max=5),
274
+ retry=retry_if_not_exception_type(ValidationError),
275
+ )
276
+ def invoke_structured(
277
+ self,
278
+ *,
279
+ instructions: str,
280
+ input_texts: List[str],
281
+ model_schema: Type[BaseModel],
282
+ model: str,
283
+ image_path: Optional[str] = None,
284
+ image_url: Optional[str] = None,
285
+ detail: Optional[str] = None,
286
+ **kwargs,
287
+ ) -> Tuple[BaseModel, Dict[str, Any], Any]:
288
+ """
289
+ Generate structured output using 'client.responses.parse'.
290
+ Strictly follows the syntax: client.responses.parse(model=..., input=..., text_format=...)
291
+
292
+ Args:
293
+ instructions: System instructions for the model
294
+ input_texts: List of text inputs
295
+ model_schema: Pydantic model class for structured output
296
+ model: Model identifier
297
+ image_path: Path to local image file OR image URL (auto-detected)
298
+ image_url: Direct image URL (takes priority over image_path)
299
+ **kwargs: Additional arguments
300
+
301
+ Returns:
302
+ Tuple of (pydantic_instance, usage_metadata, raw_response)
303
+ """
304
+ t0 = time.time()
305
+
306
+ # Prepare content (handles text + optional image from path or URL)
307
+ user_content_list = self._build_vision_content(input_texts, image_path, image_url, detail)
308
+
309
+ # Construct input payload
310
+ input_payload = [
311
+ {"role": "system", "content": instructions},
312
+ {"role": "user", "content": user_content_list},
313
+ ]
314
+
315
+ logger.info(f"Prompting {model} (structured via responses.parse)...")
316
+
317
+ # EXACT SYNTAX REQUESTED
318
+ resp = self.client.responses.parse(model=model, input=input_payload, text_format=model_schema, **kwargs)
319
+
320
+ # Access parsed output as requested: response.output_parsed
321
+ parsed_output = resp.output_parsed
322
+
323
+ meta = self._extract_usage_meta(resp) | {"latency_sec": round(time.time() - t0, 3)}
324
+ return parsed_output, meta, resp
325
+
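A hedged usage sketch for the structured call, since this docstring carries no example: the Invoice schema, its field names and the input text are made up for illustration, and any extra keyword arguments are forwarded to responses.parse unchanged.

    from pydantic import BaseModel

    class Invoice(BaseModel):  # hypothetical schema
        vendor: str
        total_eur: float

    provider = OpenAIProvider(api_key="sk-...")  # placeholder key
    invoice, meta, _ = provider.invoke_structured(
        instructions="Extract the vendor name and the total amount in EUR.",
        input_texts=["ACME S.L. ... TOTAL: 1234.56 EUR"],
        model_schema=Invoice,
        model="gpt-4o-mini",
    )
    print(invoice.vendor, invoice.total_eur, meta["latency_sec"])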
326
+ # invoke_chat implementation omitted or mapped to invoke_text depending on need
327
+ def invoke_chat(self, **kwargs):
328
+ raise NotImplementedError("Chat method not implemented for Responses API pattern.")
329
+
330
+ # ==========================================================================================================
331
+ # BATCH PROCESSING METHODS
332
+ # ==========================================================================================================
333
+
334
+ def _build_batch_request(
335
+ self,
336
+ custom_id: str,
337
+ instructions: str,
338
+ input_texts: List[str],
339
+ model: str,
340
+ image_path: Optional[str] = None,
341
+ image_url: Optional[str] = None,
342
+ endpoint: str = "/v1/responses",
343
+ **kwargs,
344
+ ) -> Dict[str, Any]:
345
+ """
346
+ Build a single batch request in the format required by OpenAI Batch API.
347
+
348
+ Args:
349
+ custom_id: Unique identifier for this request (used to match results)
350
+ instructions: System instructions for the model
351
+ input_texts: List of text inputs
352
+ model: Model identifier
353
+ image_path: Path to local image file OR image URL (auto-detected)
354
+ image_url: Direct image URL (takes priority over image_path)
355
+ endpoint: API endpoint ("/v1/responses" or "/v1/chat/completions")
356
+ **kwargs: Additional arguments for the API call
357
+
358
+ Returns:
359
+ Dictionary representing a single batch request line
360
+ """
361
+ # Build content with optional image support
362
+ user_content = self._build_vision_content(input_texts, image_path, image_url, detail="high")
363
+
364
+ # Construct input payload
365
+ input_payload = [
366
+ {"role": "system", "content": instructions},
367
+ {"role": "user", "content": user_content},
368
+ ]
369
+
370
+ return {
371
+ "custom_id": custom_id,
372
+ "method": "POST",
373
+ "url": endpoint,
374
+ "body": {
375
+ "model": model,
376
+ "input": input_payload,
377
+ **kwargs,
378
+ },
379
+ }
380
+
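Concretely, each entry written to the batch JSONL file is one of these dictionaries serialized onto a single line; pretty-printed and with illustrative values, one request looks roughly like this.

    {
        "custom_id": "req-1",
        "method": "POST",
        "url": "/v1/responses",
        "body": {
            "model": "gpt-4o-mini",
            "input": [
                {"role": "system", "content": "Summarize"},
                {"role": "user", "content": [{"type": "input_text", "text": "Long text here..."}]}
            ]
        }
    }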
381
+ def create_batch_file(
382
+ self,
383
+ requests: List[Dict[str, Any]],
384
+ output_path: Optional[str] = None,
385
+ ) -> str:
386
+ """
387
+ Create a JSONL batch input file from a list of request configurations.
388
+
389
+ Args:
390
+ requests: List of request configurations. Each dict should have:
391
+ - custom_id: Unique identifier
392
+ - instructions: System instructions
393
+ - input_texts: List of input texts
394
+ - model: Model to use (optional, uses default)
395
+ - image_url: Image URL (optional)
396
+ - image_path: Local image path (optional)
397
+ - Any additional kwargs for the API
398
+ output_path: Path to save the JSONL file (optional, creates temp file if not provided)
399
+
400
+ Returns:
401
+ Path to the created JSONL file
402
+
403
+ Example:
404
+ >>> requests = [
405
+ ... {"custom_id": "req-1", "instructions": "Summarize", "input_texts": ["Text 1..."]},
406
+ ... {"custom_id": "req-2", "instructions": "Summarize", "input_texts": ["Text 2..."]},
407
+ ... ]
408
+ >>> filepath = provider.create_batch_file(requests)
409
+ """
410
+ if output_path is None:
411
+ with tempfile.NamedTemporaryFile(suffix=".jsonl", prefix="batch_input_", delete=False) as temp_file:
412
+ output_path = temp_file.name
413
+
414
+ with open(output_path, "w", encoding="utf-8") as f:
415
+ for req in requests:
416
+ custom_id = req.get("custom_id", str(uuid.uuid4()))
417
+ instructions = req.get("instructions", "")
418
+ input_texts = req.get("input_texts", [])
419
+ model = req.get("model", self.default_model)
420
+ image_url = req.get("image_url")
421
+ image_path = req.get("image_path")
422
+ endpoint = req.get("endpoint", "/v1/responses")
423
+
424
+ # Get additional kwargs (exclude known keys)
425
+ known_keys = {
426
+ "custom_id",
427
+ "instructions",
428
+ "input_texts",
429
+ "model",
430
+ "image_url",
431
+ "image_path",
432
+ "endpoint",
433
+ }
434
+ extra_kwargs = {k: v for k, v in req.items() if k not in known_keys}
435
+
436
+ batch_request = self._build_batch_request(
437
+ custom_id=custom_id,
438
+ instructions=instructions,
439
+ input_texts=input_texts,
440
+ model=model,
441
+ image_path=image_path,
442
+ image_url=image_url,
443
+ endpoint=endpoint,
444
+ **extra_kwargs,
445
+ )
446
+
447
+ f.write(json.dumps(batch_request, ensure_ascii=False) + "\n")
448
+
449
+ logger.info(f"Created batch file with {len(requests)} requests: {output_path}")
450
+ return output_path
451
+
452
+ def upload_batch_file(self, filepath: str) -> str:
453
+ """
454
+ Upload a batch input file to OpenAI.
455
+
456
+ Args:
457
+ filepath: Path to the JSONL batch file
458
+
459
+ Returns:
460
+ File ID from OpenAI
461
+
462
+ Example:
463
+ >>> file_id = provider.upload_batch_file("batch_input.jsonl")
464
+ >>> print(file_id) # "file-abc123"
465
+ """
466
+ with open(filepath, "rb") as f:
467
+ file_obj = self.client.files.create(file=f, purpose="batch")
468
+
469
+ logger.info(f"Uploaded batch file: {file_obj.id}")
470
+ return file_obj.id
471
+
472
+ def create_batch(
473
+ self,
474
+ input_file_id: str,
475
+ endpoint: str = "/v1/responses",
476
+ completion_window: str = "24h",
477
+ metadata: Optional[Dict[str, str]] = None,
478
+ ) -> Dict[str, Any]:
479
+ """
480
+ Create a new batch processing job.
481
+
482
+ Args:
483
+ input_file_id: ID of the uploaded batch input file
484
+ endpoint: API endpoint ("/v1/responses" or "/v1/chat/completions")
485
+ completion_window: Time window for completion ("24h")
486
+ metadata: Optional metadata dict for the batch
487
+
488
+ Returns:
489
+ Batch object with job details
490
+
491
+ Example:
492
+ >>> batch = provider.create_batch("file-abc123")
493
+ >>> print(batch["id"]) # "batch_abc123"
494
+ >>> print(batch["status"]) # "validating"
495
+ """
496
+ batch = self.client.batches.create(
497
+ input_file_id=input_file_id,
498
+ endpoint=endpoint,
499
+ completion_window=completion_window,
500
+ metadata=metadata,
501
+ )
502
+
503
+ batch_dict = self._batch_to_dict(batch)
504
+
505
+ logger.info(f"Created batch: {batch.id} (status: {batch.status})")
506
+ return batch_dict
507
+
508
+ def get_batch_status(self, batch_id: str) -> Dict[str, Any]:
509
+ """
510
+ Get the current status of a batch job.
511
+
512
+ Args:
513
+ batch_id: ID of the batch
514
+
515
+ Returns:
516
+ Batch object with current status
517
+
518
+ Status values:
519
+ - validating: Input file being validated
520
+ - failed: Validation failed
521
+ - in_progress: Batch is running
522
+ - finalizing: Completing and preparing results
523
+ - completed: Done, results ready
524
+ - expired: Did not complete in time
525
+ - cancelling: Being cancelled
526
+ - cancelled: Was cancelled
527
+ """
528
+ batch = self.client.batches.retrieve(batch_id)
529
+
530
+ return self._batch_to_dict(batch)
531
+
532
+ def wait_for_batch(
533
+ self,
534
+ batch_id: str,
535
+ poll_interval: float = 30.0,
536
+ timeout: Optional[float] = None,
537
+ callback: Optional[Callable[[Dict[str, Any]], None]] = None,
538
+ ) -> Dict[str, Any]:
539
+ """
540
+ Wait for a batch job to complete, polling periodically.
541
+
542
+ Args:
543
+ batch_id: ID of the batch
544
+ poll_interval: Seconds between status checks (default: 30)
545
+ timeout: Maximum seconds to wait (default: None = no timeout)
546
+ callback: Optional function called with status on each poll
547
+
548
+ Returns:
549
+ Final batch status
550
+
551
+ Raises:
552
+ TimeoutError: If timeout is reached before completion
553
+ RuntimeError: If batch fails or is cancelled
554
+
555
+ Example:
556
+ >>> def on_update(status):
557
+ ... print(f"Status: {status['status']}, Completed: {status['request_counts']['completed']}")
558
+ >>> result = provider.wait_for_batch("batch_abc123", callback=on_update)
559
+ """
560
+ start_time = time.time()
561
+ terminal_statuses = {"completed", "failed", "expired", "cancelled"}
562
+
563
+ while True:
564
+ status = self.get_batch_status(batch_id)
565
+
566
+ if callback:
567
+ callback(status)
568
+
569
+ if status["status"] in terminal_statuses:
570
+ if status["status"] == "completed":
571
+ logger.info(f"Batch {batch_id} completed successfully")
572
+ elif status["status"] == "failed":
573
+ raise RuntimeError(f"Batch {batch_id} failed")
574
+ elif status["status"] == "expired":
575
+ raise RuntimeError(f"Batch {batch_id} expired before completion")
576
+ elif status["status"] == "cancelled":
577
+ raise RuntimeError(f"Batch {batch_id} was cancelled")
578
+
579
+ return status
580
+
581
+ if timeout and (time.time() - start_time) > timeout:
582
+ raise TimeoutError(f"Timeout waiting for batch {batch_id}")
583
+
584
+ logger.debug(
585
+ f"Batch {batch_id} status: {status['status']} "
586
+ f"({status['request_counts']['completed']}/{status['request_counts']['total']} completed)"
587
+ )
588
+
589
+ time.sleep(poll_interval)
590
+
591
+ @staticmethod
592
+ def _parse_jsonl_content(content: str) -> List[Dict[str, Any]]:
593
+ """
594
+ Parse JSONL (JSON Lines) content into a list of dictionaries.
595
+
596
+ Args:
597
+ content: Raw JSONL content string
598
+
599
+ Returns:
600
+ List of parsed JSON objects
601
+ """
602
+ results = []
603
+ for line in content.strip().split("\n"):
604
+ if line:
605
+ results.append(json.loads(line))
606
+ return results
607
+
608
+ def _save_batch_results_to_file(self, content: str, output_path: str) -> None:
609
+ """
610
+ Save batch results content to a file.
611
+
612
+ Args:
613
+ content: Raw JSONL content to save
614
+ output_path: Path where to save the file
615
+ """
616
+ with open(output_path, "w", encoding="utf-8") as f:
617
+ f.write(content)
618
+ logger.info(f"Saved batch results to: {output_path}")
619
+
620
+ def _fetch_and_parse_file(self, file_id: str) -> List[Dict[str, Any]]:
621
+ """
622
+ Fetch a file from OpenAI and parse its JSONL content.
623
+
624
+ Args:
625
+ file_id: ID of the file to fetch
626
+
627
+ Returns:
628
+ List of parsed JSON objects from the file
629
+ """
630
+ file_response = self.client.files.content(file_id)
631
+ return self._parse_jsonl_content(file_response.text)
632
+
633
+ def _process_output_file(
634
+ self,
635
+ batch_id: str,
636
+ output_file_id: Optional[str],
637
+ output_path: Optional[str],
638
+ ) -> List[Dict[str, Any]]:
639
+ """
640
+ Process successful batch results from the output file.
641
+
642
+ Args:
643
+ batch_id: ID of the batch (for logging)
644
+ output_file_id: ID of the output file (None if no successes)
645
+ output_path: Path to optionally save results
646
+
647
+ Returns:
648
+ List of result dictionaries from successful requests
649
+ """
650
+ if not output_file_id:
651
+ if output_path:
652
+ logger.warning(f"No output file for batch {batch_id}, skipping save to {output_path}")
653
+ return []
654
+
655
+ file_response = self.client.files.content(output_file_id)
656
+ content = file_response.text
657
+
658
+ if output_path:
659
+ self._save_batch_results_to_file(content, output_path)
660
+
661
+ return self._parse_jsonl_content(content)
662
+
663
+ def _process_error_file(self, batch_id: str, error_file_id: Optional[str]) -> List[Dict[str, Any]]:
664
+ """
665
+ Process error results from the error file.
666
+
667
+ Args:
668
+ batch_id: ID of the batch (for logging)
669
+ error_file_id: ID of the error file (None if no errors)
670
+
671
+ Returns:
672
+ List of error dictionaries
673
+ """
674
+ if not error_file_id:
675
+ return []
676
+
677
+ logger.info(f"Batch {batch_id} has errors (file: {error_file_id}). Fetching...")
678
+ try:
679
+ return self._fetch_and_parse_file(error_file_id)
680
+ except Exception as e:
681
+ logger.error(f"Failed to retrieve error file {error_file_id}: {e}")
682
+ return []
683
+
684
+ def get_batch_results(
685
+ self,
686
+ batch_id: str,
687
+ output_path: Optional[str] = None,
688
+ ) -> List[Dict[str, Any]]:
689
+ """
690
+ Retrieve results from a completed batch.
691
+
692
+ Args:
693
+ batch_id: ID of the batch
694
+ output_path: Optional path to save the raw JSONL output
695
+
696
+ Returns:
697
+ List of result dictionaries, each containing:
698
+ - custom_id: The original request ID
699
+ - response: Response data (if successful)
700
+ - error: Error data (if failed)
701
+
702
+ Example:
703
+ >>> results = provider.get_batch_results("batch_abc123")
704
+ >>> for r in results:
705
+ ... print(f"{r['custom_id']}: {r['response']['body']['output_text']}")
706
+ """
707
+ status = self.get_batch_status(batch_id)
708
+
709
+ if status["status"] != "completed":
710
+ raise RuntimeError(f"Batch {batch_id} is not completed (status: {status['status']})")
711
+
712
+ # Process both output and error files
713
+ output_results = self._process_output_file(
714
+ batch_id,
715
+ status.get("output_file_id"),
716
+ output_path,
717
+ )
718
+ error_results = self._process_error_file(batch_id, status.get("error_file_id"))
719
+
720
+ results = output_results + error_results
721
+
722
+ # Log warning if batch has no results
723
+ if not results and not status.get("output_file_id") and not status.get("error_file_id"):
724
+ logger.warning(f"Batch {batch_id} completed but has no output or error file.")
725
+
726
+ logger.info(f"Retrieved {len(results)} results from batch {batch_id}")
727
+ return results
728
+
729
+ def get_batch_errors(self, batch_id: str) -> List[Dict[str, Any]]:
730
+ """
731
+ Retrieve errors from a batch (if any).
732
+
733
+ Args:
734
+ batch_id: ID of the batch
735
+
736
+ Returns:
737
+ List of error dictionaries
738
+ """
739
+ status = self.get_batch_status(batch_id)
740
+
741
+ error_file_id = status["error_file_id"]
742
+ if not error_file_id:
743
+ return []
744
+
745
+ file_response = self.client.files.content(error_file_id)
746
+ content = file_response.text
747
+
748
+ errors = []
749
+ for line in content.strip().split("\n"):
750
+ if line:
751
+ errors.append(json.loads(line))
752
+
753
+ return errors
754
+
755
+ def cancel_batch(self, batch_id: str) -> Dict[str, Any]:
756
+ """
757
+ Cancel a running batch job.
758
+
759
+ Args:
760
+ batch_id: ID of the batch to cancel
761
+
762
+ Returns:
763
+ Updated batch status
764
+ """
765
+ batch = self.client.batches.cancel(batch_id)
766
+ logger.info(f"Cancelling batch {batch_id}")
767
+
768
+ return self._batch_to_dict(batch)
769
+
770
+ def list_batches(self, limit: int = 20, after: Optional[str] = None) -> List[Dict[str, Any]]:
771
+ """
772
+ List all batches.
773
+
774
+ Args:
775
+ limit: Maximum number of batches to return
776
+ after: Cursor for pagination
777
+
778
+ Returns:
779
+ List of batch objects
780
+ """
781
+ batches = self.client.batches.list(limit=limit, after=after)
782
+
783
+ return [self._batch_to_dict(b) for b in batches.data]
784
+
785
+ def submit_batch(
786
+ self,
787
+ requests: List[Dict[str, Any]],
788
+ endpoint: str = "/v1/responses",
789
+ metadata: Optional[Dict[str, str]] = None,
790
+ wait: bool = False,
791
+ poll_interval: float = 30.0,
792
+ callback: Optional[Callable[[Dict[str, Any]], None]] = None,
793
+ ) -> Tuple[str, Optional[List[Dict[str, Any]]]]:
794
+ """
795
+ High-level method to submit a batch job (create file, upload, and start batch).
796
+
797
+ Args:
798
+ requests: List of request configurations (see create_batch_file for format)
799
+ endpoint: API endpoint ("/v1/responses" or "/v1/chat/completions")
800
+ metadata: Optional metadata for the batch
801
+ wait: If True, wait for completion and return results
802
+ poll_interval: Seconds between status checks when waiting
803
+ callback: Optional callback for status updates when waiting
804
+
805
+ Returns:
806
+ Tuple of (batch_id, results). Results is None if wait=False.
807
+
808
+ Example:
809
+ >>> requests = [
810
+ ... {"custom_id": "img-1", "instructions": "Describe", "input_texts": ["What is this?"], "image_url": "..."},
811
+ ... {"custom_id": "img-2", "instructions": "Describe", "input_texts": ["What is this?"], "image_url": "..."},
812
+ ... ]
813
+ >>> batch_id, results = provider.submit_batch(requests, wait=True)
814
+ >>> for r in results:
815
+ ... print(f"{r['custom_id']}: {r['response']['body']['output_text']}")
816
+ """
817
+ # Step 1: Create the batch file
818
+ filepath = self.create_batch_file(requests)
819
+
820
+ try:
821
+ # Step 2: Upload the file
822
+ file_id = self.upload_batch_file(filepath)
823
+
824
+ # Step 3: Create the batch
825
+ batch = self.create_batch(
826
+ input_file_id=file_id,
827
+ endpoint=endpoint,
828
+ metadata=metadata,
829
+ )
830
+
831
+ batch_id = batch["id"]
832
+
833
+ if not wait:
834
+ return batch_id, None
835
+
836
+ # Step 4: Wait for completion
837
+ self.wait_for_batch(batch_id, poll_interval=poll_interval, callback=callback)
838
+
839
+ # Step 5: Get results
840
+ results = self.get_batch_results(batch_id)
841
+
842
+ return batch_id, results
843
+
844
+ finally:
845
+ # Clean up temp file
846
+ Path(filepath).unlink(missing_ok=True)
847
+
848
+
849
+ ########################################################################################################################
850
+ # MAIN INTERFACE
851
+
852
+
853
+ class LLMInterface:
854
+ """
855
+ Unified interface for working with Large Language Models.
856
+ """
857
+
858
+ SUPPORTED_PROVIDERS = {
859
+ "openai": OpenAIProvider,
860
+ }
861
+
862
+ def __init__(self, config) -> None:
863
+ if "llm" not in config:
864
+ raise ValueError("Configuration must contain 'llm' section")
865
+
866
+ llm_config = config["llm"]
867
+ self.provider_name = llm_config.get("provider", "openai").lower()
868
+
869
+ if self.provider_name not in self.SUPPORTED_PROVIDERS:
870
+ raise ValueError(f"Unsupported provider '{self.provider_name}'")
871
+
872
+ api_key = llm_config.get("api_key")
873
+ if not api_key:
874
+ raise ValueError(f"API key required for provider '{self.provider_name}'")
875
+
876
+ default_model = llm_config.get("model", "gpt-5-nano")
877
+
878
+ provider_class = self.SUPPORTED_PROVIDERS[self.provider_name]
879
+ self.provider: BaseLLMProvider = provider_class(api_key=api_key, default_model=default_model)
880
+ self.default_model = default_model
881
+
882
+ logger.info(f"Initialized LLM interface with provider: {self.provider_name}")
883
+
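A minimal sketch of the configuration this constructor expects. A plain dict is shown, but any mapping with an "llm" section (for example a parsed TOML or INI file) should work; the API key is a placeholder.

    config = {
        "llm": {
            "provider": "openai",    # currently the only supported provider
            "api_key": "sk-...",     # placeholder
            "model": "gpt-4o-mini",  # optional; defaults to "gpt-5-nano" if omitted
        }
    }
    llm = LLMInterface(config)
    text, meta, _ = llm.invoke_text(instructions="Translate to Spanish", input_texts=["Hello world"])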
884
+ def invoke_text(
885
+ self,
886
+ *,
887
+ instructions: str,
888
+ input_texts: List[str],
889
+ image_path: Optional[str] = None,
890
+ image_url: Optional[str] = None,
891
+ model: Optional[str] = None,
892
+ **kwargs,
893
+ ) -> Tuple[str, Dict[str, Any], Any]:
894
+ """
895
+ Wrapper for text generation with optional Vision support.
896
+
897
+ Args:
898
+ instructions: System instructions for the model
899
+ input_texts: List of text inputs
900
+ image_path: Path to local image file OR image URL (auto-detected)
901
+ image_url: Direct image URL (takes priority over image_path)
902
+ model: Model identifier (uses default if not specified)
903
+ **kwargs: Additional arguments
904
+
905
+ Returns:
906
+ Tuple of (output_text, usage_metadata, raw_response)
907
+
908
+ Examples:
909
+ # Text only
910
+ >>> text, meta, _ = llm.invoke_text(
911
+ ... instructions="Translate to Spanish",
912
+ ... input_texts=["Hello world"]
913
+ ... )
914
+
915
+ # With image URL
916
+ >>> text, meta, _ = llm.invoke_text(
917
+ ... instructions="Describe what you see",
918
+ ... input_texts=["What's in this image?"],
919
+ ... image_url="https://example.com/image.jpg"
920
+ ... )
921
+
922
+ # With local image
923
+ >>> text, meta, _ = llm.invoke_text(
924
+ ... instructions="Read the text",
925
+ ... input_texts=["Extract text from this image"],
926
+ ... image_path="/path/to/document.jpg"
927
+ ... )
928
+ """
929
+ model = model or self.default_model
930
+ return self.provider.invoke_text(
931
+ instructions=instructions,
932
+ input_texts=input_texts,
933
+ model=model,
934
+ image_path=image_path,
935
+ image_url=image_url,
936
+ **kwargs,
937
+ )
938
+
939
+ def invoke_structured(
940
+ self,
941
+ *,
942
+ instructions: str,
943
+ input_texts: List[str],
944
+ model_schema: Type[BaseModel],
945
+ image_path: Optional[str] = None,
946
+ image_url: Optional[str] = None,
947
+ model: Optional[str] = None,
948
+ **kwargs,
949
+ ) -> Tuple[BaseModel, Dict[str, Any], Any]:
950
+ """
951
+ Wrapper for structured output generation with Vision support.
952
+
953
+ Args:
954
+ instructions: System instructions for the model
955
+ input_texts: List of text inputs
956
+ model_schema: Pydantic model class for structured output
957
+ image_path: Path to local image file OR image URL (auto-detected)
958
+ image_url: Direct image URL (takes priority over image_path)
959
+ model: Model identifier (uses default if not specified)
960
+ **kwargs: Additional arguments
961
+
962
+ Returns:
963
+ Tuple of (pydantic_instance, usage_metadata, raw_response)
964
+
965
+ Examples:
966
+ # Using image URL
967
+ >>> result, meta, _ = llm.invoke_structured(
968
+ ... instructions="Describe this image",
969
+ ... input_texts=["What do you see?"],
970
+ ... model_schema=ImageDescription,
971
+ ... image_url="https://example.com/image.jpg"
972
+ ... )
973
+
974
+ # Using local image file
975
+ >>> result, meta, _ = llm.invoke_structured(
976
+ ... instructions="Extract text from image",
977
+ ... input_texts=["Read the label"],
978
+ ... model_schema=LabelText,
979
+ ... image_path="/path/to/image.jpg"
980
+ ... )
981
+
982
+ # image_path also accepts URLs (auto-detected)
983
+ >>> result, meta, _ = llm.invoke_structured(
984
+ ... instructions="Analyze",
985
+ ... input_texts=["What is this?"],
986
+ ... model_schema=Analysis,
987
+ ... image_path="https://example.com/photo.jpg"
988
+ ... )
989
+ """
990
+ model = model or self.default_model
991
+ return self.provider.invoke_structured(
992
+ instructions=instructions,
993
+ input_texts=input_texts,
994
+ model_schema=model_schema,
995
+ model=model,
996
+ image_path=image_path,
997
+ image_url=image_url,
998
+ **kwargs,
999
+ )
1000
+
1001
+ def get_default_model(self) -> str:
1002
+ return self.default_model
1003
+
1004
+ # ==========================================================================================================
1005
+ # BATCH PROCESSING METHODS
1006
+ # ==========================================================================================================
1007
+
1008
+ def create_batch_file(
1009
+ self,
1010
+ requests: List[Dict[str, Any]],
1011
+ output_path: Optional[str] = None,
1012
+ ) -> str:
1013
+ """
1014
+ Create a JSONL batch input file from a list of request configurations.
1015
+
1016
+ Args:
1017
+ requests: List of request configurations. Each dict should have:
1018
+ - custom_id: Unique identifier
1019
+ - instructions: System instructions
1020
+ - input_texts: List of input texts
1021
+ - model: Model to use (optional, uses default)
1022
+ - image_url: Image URL (optional)
1023
+ - image_path: Local image path (optional)
1024
+ - Any additional kwargs for the API
1025
+ output_path: Path to save the JSONL file (optional, creates temp file if not provided)
1026
+
1027
+ Returns:
1028
+ Path to the created JSONL file
1029
+ """
1030
+ if not hasattr(self.provider, "create_batch_file"):
1031
+ raise NotImplementedError(f"Batch processing not supported by {self.provider_name} provider")
1032
+ return self.provider.create_batch_file(requests, output_path)
1033
+
1034
+ def upload_batch_file(self, filepath: str) -> str:
1035
+ """
1036
+ Upload a batch input file to the provider.
1037
+
1038
+ Args:
1039
+ filepath: Path to the JSONL batch file
1040
+
1041
+ Returns:
1042
+ File ID from the provider
1043
+ """
1044
+ if not hasattr(self.provider, "upload_batch_file"):
1045
+ raise NotImplementedError(f"Batch processing not supported by {self.provider_name} provider")
1046
+ return self.provider.upload_batch_file(filepath)
1047
+
1048
+ def create_batch(
1049
+ self,
1050
+ input_file_id: str,
1051
+ endpoint: str = "/v1/responses",
1052
+ completion_window: str = "24h",
1053
+ metadata: Optional[Dict[str, str]] = None,
1054
+ ) -> Dict[str, Any]:
1055
+ """
1056
+ Create a new batch processing job.
1057
+
1058
+ Args:
1059
+ input_file_id: ID of the uploaded batch input file
1060
+ endpoint: API endpoint ("/v1/responses" or "/v1/chat/completions")
1061
+ completion_window: Time window for completion ("24h")
1062
+ metadata: Optional metadata dict for the batch
1063
+
1064
+ Returns:
1065
+ Batch object with job details
1066
+ """
1067
+ if not hasattr(self.provider, "create_batch"):
1068
+ raise NotImplementedError(f"Batch processing not supported by {self.provider_name} provider")
1069
+ return self.provider.create_batch(input_file_id, endpoint, completion_window, metadata)
1070
+
1071
+ def get_batch_status(self, batch_id: str) -> Dict[str, Any]:
1072
+ """
1073
+ Get the current status of a batch job.
1074
+
1075
+ Args:
1076
+ batch_id: ID of the batch
1077
+
1078
+ Returns:
1079
+ Batch object with current status
1080
+ """
1081
+ if not hasattr(self.provider, "get_batch_status"):
1082
+ raise NotImplementedError(f"Batch processing not supported by {self.provider_name} provider")
1083
+ return self.provider.get_batch_status(batch_id)
1084
+
1085
+ def wait_for_batch(
1086
+ self,
1087
+ batch_id: str,
1088
+ poll_interval: float = 30.0,
1089
+ timeout: Optional[float] = None,
1090
+ callback: Optional[Callable[[Dict[str, Any]], None]] = None,
1091
+ ) -> Dict[str, Any]:
1092
+ """
1093
+ Wait for a batch job to complete, polling periodically.
1094
+
1095
+ Args:
1096
+ batch_id: ID of the batch
1097
+ poll_interval: Seconds between status checks (default: 30)
1098
+ timeout: Maximum seconds to wait (default: None = no timeout)
1099
+ callback: Optional function called with status on each poll
1100
+
1101
+ Returns:
1102
+ Final batch status
1103
+
1104
+ Raises:
1105
+ TimeoutError: If timeout is reached before completion
1106
+ RuntimeError: If batch fails or is cancelled
1107
+ """
1108
+ if not hasattr(self.provider, "wait_for_batch"):
1109
+ raise NotImplementedError(f"Batch processing not supported by {self.provider_name} provider")
1110
+ return self.provider.wait_for_batch(batch_id, poll_interval, timeout, callback)
1111
+
1112
+ def get_batch_results(
1113
+ self,
1114
+ batch_id: str,
1115
+ output_path: Optional[str] = None,
1116
+ ) -> List[Dict[str, Any]]:
1117
+ """
1118
+ Retrieve results from a completed batch.
1119
+
1120
+ Args:
1121
+ batch_id: ID of the batch
1122
+ output_path: Optional path to save the raw JSONL output
1123
+
1124
+ Returns:
1125
+ List of result dictionaries, each containing:
1126
+ - custom_id: The original request ID
1127
+ - response: Response data (if successful)
1128
+ - error: Error data (if failed)
1129
+ """
1130
+ if not hasattr(self.provider, "get_batch_results"):
1131
+ raise NotImplementedError(f"Batch processing not supported by {self.provider_name} provider")
1132
+ return self.provider.get_batch_results(batch_id, output_path)
1133
+
1134
+ def get_batch_errors(self, batch_id: str) -> List[Dict[str, Any]]:
1135
+ """
1136
+ Retrieve errors from a batch (if any).
1137
+
1138
+ Args:
1139
+ batch_id: ID of the batch
1140
+
1141
+ Returns:
1142
+ List of error dictionaries
1143
+ """
1144
+ if not hasattr(self.provider, "get_batch_errors"):
1145
+ raise NotImplementedError(f"Batch processing not supported by {self.provider_name} provider")
1146
+ return self.provider.get_batch_errors(batch_id)
1147
+
1148
+ def cancel_batch(self, batch_id: str) -> Dict[str, Any]:
1149
+ """
1150
+ Cancel a running batch job.
1151
+
1152
+ Args:
1153
+ batch_id: ID of the batch to cancel
1154
+
1155
+ Returns:
1156
+ Updated batch status
1157
+ """
1158
+ if not hasattr(self.provider, "cancel_batch"):
1159
+ raise NotImplementedError(f"Batch processing not supported by {self.provider_name} provider")
1160
+ return self.provider.cancel_batch(batch_id)
1161
+
1162
+ def list_batches(self, limit: int = 20, after: Optional[str] = None) -> List[Dict[str, Any]]:
1163
+ """
1164
+ List all batches.
1165
+
1166
+ Args:
1167
+ limit: Maximum number of batches to return
1168
+ after: Cursor for pagination
1169
+
1170
+ Returns:
1171
+ List of batch objects
1172
+ """
1173
+ if not hasattr(self.provider, "list_batches"):
1174
+ raise NotImplementedError(f"Batch processing not supported by {self.provider_name} provider")
1175
+ return self.provider.list_batches(limit, after)
1176
+
1177
+ def submit_batch(
1178
+ self,
1179
+ requests: List[Dict[str, Any]],
1180
+ endpoint: str = "/v1/responses",
1181
+ metadata: Optional[Dict[str, str]] = None,
1182
+ wait: bool = False,
1183
+ poll_interval: float = 30.0,
1184
+ callback: Optional[Callable[[Dict[str, Any]], None]] = None,
1185
+ ) -> Tuple[str, Optional[List[Dict[str, Any]]]]:
1186
+ """
1187
+ High-level method to submit a batch job (create file, upload, and start batch).
1188
+
1189
+ This is the recommended way to submit batch jobs - it handles file creation,
1190
+ upload, batch creation, and optionally waiting for results.
1191
+
1192
+ Args:
1193
+ requests: List of request configurations. Each dict should have:
1194
+ - custom_id: Unique identifier
1195
+ - instructions: System instructions
1196
+ - input_texts: List of input texts
1197
+ - model: Model to use (optional, uses default)
1198
+ - image_url: Image URL (optional)
1199
+ - image_path: Local image path (optional)
1200
+ endpoint: API endpoint ("/v1/responses" or "/v1/chat/completions")
1201
+ metadata: Optional metadata for the batch
1202
+ wait: If True, wait for completion and return results
1203
+ poll_interval: Seconds between status checks when waiting
1204
+ callback: Optional callback for status updates when waiting
1205
+
1206
+ Returns:
1207
+ Tuple of (batch_id, results). Results is None if wait=False.
1208
+
1209
+ Example:
1210
+ >>> requests = [
1211
+ ... {"custom_id": "img-1", "instructions": "Describe", "input_texts": ["What is this?"], "image_url": "..."},
1212
+ ... {"custom_id": "img-2", "instructions": "Describe", "input_texts": ["What is this?"], "image_url": "..."},
1213
+ ... ]
1214
+ >>> batch_id, results = llm.submit_batch(requests, wait=True)
1215
+ >>> for r in results:
1216
+ ... print(f"{r['custom_id']}: {r['response']['body']['output_text']}")
1217
+ """
1218
+ if not hasattr(self.provider, "submit_batch"):
1219
+ raise NotImplementedError(f"Batch processing not supported by {self.provider_name} provider")
1220
+ return self.provider.submit_batch(requests, endpoint, metadata, wait, poll_interval, callback)
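Tying the batch helpers together, a hedged end-to-end sketch using this wrapper; the config, custom_id values and instructions are illustrative, and wait=True blocks (polling every poll_interval seconds) until OpenAI finishes the batch within its 24h completion window.

    from datamarket.interfaces.llm import LLMInterface

    config = {"llm": {"provider": "openai", "api_key": "sk-...", "model": "gpt-4o-mini"}}
    llm = LLMInterface(config)

    requests = [
        {"custom_id": "doc-1", "instructions": "Summarize in one sentence.", "input_texts": ["First document ..."]},
        {"custom_id": "doc-2", "instructions": "Summarize in one sentence.", "input_texts": ["Second document ..."]},
    ]

    batch_id, results = llm.submit_batch(requests, wait=True, poll_interval=60)
    for r in results or []:
        body = r.get("response", {}).get("body", {})
        print(r["custom_id"], body.get("output_text", body))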