ai-pipeline-core 0.3.0__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,6 +24,8 @@ EXTENSION_MIME_MAP = {
     "gif": "image/gif",
     "bmp": "image/bmp",
     "webp": "image/webp",
+    "heic": "image/heic",
+    "heif": "image/heif",
     "json": "application/json",
     "yaml": "application/yaml",
     "yml": "application/yaml",
@@ -266,3 +268,29 @@ def is_image_mime_type(mime_type: str) -> bool:
         False
     """
     return mime_type.startswith("image/")
+
+
+LLM_SUPPORTED_IMAGE_MIME_TYPES: frozenset[str] = frozenset({
+    "image/png",
+    "image/jpeg",
+    "image/webp",
+    "image/heic",
+    "image/heif",
+})
+
+
+def is_llm_supported_image(mime_type: str) -> bool:
+    """Check if MIME type is an image format directly supported by LLMs.
+
+    Unsupported image formats (gif, bmp, tiff, svg, etc.) need conversion
+    to PNG before sending to the LLM.
+
+    @public
+
+    Args:
+        mime_type: MIME type string to check.
+
+    Returns:
+        True if the image format is natively supported by LLMs.
+    """
+    return mime_type in LLM_SUPPORTED_IMAGE_MIME_TYPES
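
A short sketch of the intended gate, using the module path that the import added later in this diff confirms (`ai_pipeline_core.documents.mime_type`); the actual PNG conversion lands in `AIMessages.document_to_prompt` below, and the helper here is hypothetical:

```python
from ai_pipeline_core.documents.mime_type import is_llm_supported_image

def needs_png_conversion(mime_type: str) -> bool:
    """Hypothetical helper: image formats outside the supported set get re-encoded."""
    return mime_type.startswith("image/") and not is_llm_supported_image(mime_type)

assert needs_png_conversion("image/gif")            # gif -> convert to PNG
assert not needs_png_conversion("image/jpeg")       # natively supported
assert not needs_png_conversion("application/pdf")  # not an image at all
```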
@@ -41,7 +41,7 @@ class FlowOptions(BaseSettings):
 
         >>> # Or create programmatically:
         >>> options = MyFlowOptions(
-        ...     core_model="gemini-2.5-pro",
+        ...     core_model="gemini-3-pro",
         ...     temperature=0.9
         ... )
 
@@ -61,11 +61,11 @@ class FlowOptions(BaseSettings):
     """
 
     core_model: ModelName = Field(
-        default="gemini-2.5-pro",
+        default="gemini-3-pro",
         description="Primary model for complex analysis and generation tasks.",
     )
     small_model: ModelName = Field(
-        default="grok-4-fast",
+        default="grok-4.1-fast",
         description="Fast, cost-effective model for simple tasks and orchestration.",
    )
 
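
Only the defaults move here; explicit values win as before. A hedged sketch using the `MyFlowOptions` subclass from the docstring above (a hypothetical class; `FlowOptions` is a pydantic `BaseSettings`, so environment overrides also still apply):

```python
options = MyFlowOptions()
assert options.core_model == "gemini-3-pro"      # new default
assert options.small_model == "grok-4.1-fast"    # new default

# Pinning the previous models keeps the 0.3.0 behavior:
pinned = MyFlowOptions(core_model="gemini-2.5-pro", small_model="grok-4-fast")
```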
@@ -0,0 +1,362 @@
+"""Image processing utilities for LLM vision models.
+
+@public
+
+Splits large images, compresses to JPEG, and respects model-specific constraints.
+Designed for website screenshots, document pages, and other visual content
+sent to vision-capable LLMs.
+
+Quick Start:
+    >>> from ai_pipeline_core.images import process_image, ImagePreset
+    >>>
+    >>> result = process_image(screenshot_bytes)
+    >>> for part in result:
+    ...     send_to_llm(part.data, context=part.label)
+    >>>
+    >>> result = process_image(screenshot_bytes, preset=ImagePreset.GEMINI)
+"""
+
+from enum import StrEnum
+
+from pydantic import BaseModel, Field
+
+from ai_pipeline_core.documents import Document, TemporaryDocument
+
+from ._processing import execute_split, load_and_normalize, plan_split
+
+__all__ = [
+    "ImagePreset",
+    "ImageProcessingConfig",
+    "ImagePart",
+    "ProcessedImage",
+    "ImageProcessingError",
+    "process_image",
+    "process_image_to_documents",
+]
+
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+
+
+class ImagePreset(StrEnum):
+    """Presets for LLM vision model constraints.
+
+    @public
+    """
+
+    GEMINI = "gemini"
+    CLAUDE = "claude"
+    GPT4V = "gpt4v"
+
+
+class ImageProcessingConfig(BaseModel):
+    """Configuration for image processing.
+
+    @public
+
+    Use ``for_preset`` for standard configurations or construct directly for
+    custom constraints.
+
+    Example:
+        >>> config = ImageProcessingConfig.for_preset(ImagePreset.GEMINI)
+        >>> config = ImageProcessingConfig(max_dimension=2000, jpeg_quality=80)
+    """
+
+    model_config = {"frozen": True}
+
+    max_dimension: int = Field(
+        default=3000,
+        ge=100,
+        le=8192,
+        description="Maximum width AND height in pixels",
+    )
+    max_pixels: int = Field(
+        default=9_000_000,
+        ge=10_000,
+        description="Maximum total pixels per output image part",
+    )
+    overlap_fraction: float = Field(
+        default=0.20,
+        ge=0.0,
+        le=0.5,
+        description="Overlap between adjacent vertical parts (0.0-0.5)",
+    )
+    max_parts: int = Field(
+        default=20,
+        ge=1,
+        le=100,
+        description="Maximum number of output image parts",
+    )
+    jpeg_quality: int = Field(
+        default=60,
+        ge=10,
+        le=95,
+        description="JPEG compression quality (10-95)",
+    )
+
+    @classmethod
+    def for_preset(cls, preset: ImagePreset) -> "ImageProcessingConfig":
+        """Create configuration from a model preset.
+
+        @public
+        """
+        return _PRESETS[preset]
+
+
+_PRESETS: dict[ImagePreset, ImageProcessingConfig] = {
+    ImagePreset.GEMINI: ImageProcessingConfig(
+        max_dimension=3000,
+        max_pixels=9_000_000,
+        jpeg_quality=75,
+    ),
+    ImagePreset.CLAUDE: ImageProcessingConfig(
+        max_dimension=1568,
+        max_pixels=1_150_000,
+        jpeg_quality=60,
+    ),
+    ImagePreset.GPT4V: ImageProcessingConfig(
+        max_dimension=2048,
+        max_pixels=4_000_000,
+        jpeg_quality=70,
+    ),
+}
+
+
+# ---------------------------------------------------------------------------
+# Result models
+# ---------------------------------------------------------------------------
+
+
+class ImagePart(BaseModel):
+    """A single processed image part.
+
+    @public
+    """
+
+    model_config = {"frozen": True}
+
+    data: bytes = Field(repr=False)
+    width: int
+    height: int
+    index: int = Field(ge=0, description="0-indexed position")
+    total: int = Field(ge=1, description="Total number of parts")
+    source_y: int = Field(ge=0, description="Y offset in original image")
+    source_height: int = Field(ge=1, description="Height of region in original")
+
+    @property
+    def label(self) -> str:
+        """Human-readable label for LLM context, 1-indexed.
+
+        @public
+        """
+        if self.total == 1:
+            return "Full image"
+        return f"Part {self.index + 1}/{self.total}"
+
+
+class ProcessedImage(BaseModel):
+    """Result of image processing.
+
+    @public
+
+    Iterable: ``for part in result`` iterates over parts.
+    """
+
+    model_config = {"frozen": True}
+
+    parts: list[ImagePart]
+    original_width: int
+    original_height: int
+    original_bytes: int
+    output_bytes: int
+    was_trimmed: bool = Field(description="True if width was trimmed to fit")
+    warnings: list[str] = Field(default_factory=list)
+
+    @property
+    def compression_ratio(self) -> float:
+        """Output size / input size (lower means more compression).
+
+        @public
+        """
+        if self.original_bytes <= 0:
+            return 1.0
+        return self.output_bytes / self.original_bytes
+
+    def __len__(self) -> int:
+        return len(self.parts)
+
+    def __iter__(self):  # type: ignore[override]
+        return iter(self.parts)
+
+    def __getitem__(self, idx: int) -> ImagePart:
+        return self.parts[idx]
+
+
+# ---------------------------------------------------------------------------
+# Exceptions
+# ---------------------------------------------------------------------------
+
+
+class ImageProcessingError(Exception):
+    """Image processing failed.
+
+    @public
+    """
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def process_image(
+    image: bytes | Document,
+    preset: ImagePreset = ImagePreset.GEMINI,
+    config: ImageProcessingConfig | None = None,
+) -> ProcessedImage:
+    """Process an image for LLM vision models.
+
+    @public
+
+    Splits tall images vertically with overlap, trims width if needed, and
+    compresses to JPEG. The default preset is **GEMINI** (3000 px, 9M pixels).
+
+    Args:
+        image: Raw image bytes or a Document whose content is an image.
+        preset: Model preset (ignored when *config* is provided).
+        config: Custom configuration that overrides the preset.
+
+    Returns:
+        A ``ProcessedImage`` containing one or more ``ImagePart`` objects.
+
+    Raises:
+        ImageProcessingError: If the image cannot be decoded or processed.
+
+    Example:
+        >>> result = process_image(screenshot_bytes)
+        >>> for part in result:
+        ...     print(part.label, len(part.data))
+    """
+    effective = config if config is not None else ImageProcessingConfig.for_preset(preset)
+
+    # Resolve input bytes
+    raw: bytes
+    if isinstance(image, Document):
+        raw = image.content
+    elif isinstance(image, bytes):  # type: ignore[reportUnnecessaryIsInstance]
+        raw = image
+    else:
+        raise ImageProcessingError(f"Unsupported image input type: {type(image)}")
+
+    if not raw:
+        raise ImageProcessingError("Empty image data")
+
+    original_bytes = len(raw)
+
+    # Load & normalize
+    try:
+        img = load_and_normalize(raw)
+    except Exception as exc:
+        raise ImageProcessingError(f"Failed to decode image: {exc}") from exc
+
+    original_width, original_height = img.size
+
+    # Plan
+    plan = plan_split(
+        width=original_width,
+        height=original_height,
+        max_dimension=effective.max_dimension,
+        max_pixels=effective.max_pixels,
+        overlap_fraction=effective.overlap_fraction,
+        max_parts=effective.max_parts,
+    )
+
+    # Execute
+    raw_parts = execute_split(img, plan, effective.jpeg_quality)
+
+    # Build result
+    parts: list[ImagePart] = []
+    total = len(raw_parts)
+    total_output = 0
+
+    for idx, (data, w, h, sy, sh) in enumerate(raw_parts):
+        total_output += len(data)
+        parts.append(
+            ImagePart(
+                data=data,
+                width=w,
+                height=h,
+                index=idx,
+                total=total,
+                source_y=sy,
+                source_height=sh,
+            )
+        )
+
+    return ProcessedImage(
+        parts=parts,
+        original_width=original_width,
+        original_height=original_height,
+        original_bytes=original_bytes,
+        output_bytes=total_output,
+        was_trimmed=plan.trim_width is not None,
+        warnings=plan.warnings,
+    )
+
+
+def process_image_to_documents(
+    image: bytes | Document,
+    preset: ImagePreset = ImagePreset.GEMINI,
+    config: ImageProcessingConfig | None = None,
+    name_prefix: str = "image",
+    sources: list[str] | None = None,
+) -> list[TemporaryDocument]:
+    """Process an image and return the parts as a list of ``TemporaryDocument``.
+
+    @public
+
+    Convenience wrapper around ``process_image`` for direct integration
+    with ``AIMessages``.
+
+    Args:
+        image: Raw image bytes or a Document.
+        preset: Model preset (ignored when *config* is provided).
+        config: Custom configuration.
+        name_prefix: Prefix for generated document names.
+        sources: Optional provenance references attached to each document.
+
+    Returns:
+        List of ``TemporaryDocument`` instances with JPEG image data.
+
+    Example:
+        >>> docs = process_image_to_documents(screenshot_bytes)
+        >>> messages = AIMessages(docs)
+    """
+    result = process_image(image, preset=preset, config=config)
+
+    # Resolve sources
+    doc_sources: list[str] = list(sources or [])
+    if isinstance(image, Document):
+        doc_sources.append(image.sha256)
+
+    documents: list[TemporaryDocument] = []
+    for part in result.parts:
+        if len(result.parts) == 1:
+            name = f"{name_prefix}.jpg"
+            desc = None
+        else:
+            name = f"{name_prefix}_{part.index + 1:02d}_of_{part.total:02d}.jpg"
+            desc = part.label
+
+        documents.append(
+            TemporaryDocument.create(
+                name=name,
+                content=part.data,
+                description=desc,
+                sources=doc_sources or None,
+            )
+        )
+
+    return documents
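
Putting the new module's two entry points together; a sketch, not canonical usage. The `AIMessages`/`llm.generate` wiring mirrors the doctests elsewhere in this diff, and the `AIMessages` import path is an assumption:

```python
from ai_pipeline_core import AIMessages, llm
from ai_pipeline_core.images import (
    ImagePreset,
    ImageProcessingConfig,
    process_image,
    process_image_to_documents,
)

async def describe_screenshot(screenshot_bytes: bytes) -> str:
    # Inspect how the image would be split and compressed.
    result = process_image(screenshot_bytes, preset=ImagePreset.CLAUDE)
    print(f"{len(result)} part(s), compression {result.compression_ratio:.2f}")

    # A custom config overrides the preset entirely.
    config = ImageProcessingConfig(max_dimension=2000, jpeg_quality=80)
    docs = process_image_to_documents(screenshot_bytes, config=config, name_prefix="shot")

    messages = AIMessages(docs)
    messages.append("Describe what this page shows.")
    response = await llm.generate("gemini-3-pro", messages=messages)
    return response.content
```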
@@ -0,0 +1,157 @@
+"""Internal image processing logic: planning, splitting, encoding."""
+
+from dataclasses import dataclass
+from io import BytesIO
+from math import ceil
+
+from PIL import Image, ImageOps
+
+PIL_MAX_PIXELS = 100_000_000  # 100MP security limit
+
+
+@dataclass(frozen=True)
+class SplitPlan:
+    """Describes how to split an image into parts."""
+
+    tile_width: int
+    tile_height: int
+    step_y: int
+    num_parts: int
+    trim_width: int | None  # None = no trim needed
+    warnings: list[str]
+
+
+def plan_split(
+    width: int,
+    height: int,
+    max_dimension: int,
+    max_pixels: int,
+    overlap_fraction: float,
+    max_parts: int,
+) -> SplitPlan:
+    """Calculate how to split an image. Pure function, no side effects.
+
+    Returns a SplitPlan describing tile size, step, and number of parts.
+    """
+    warnings: list[str] = []
+
+    # Effective tile size respecting both max_dimension and max_pixels
+    tile_size = max_dimension
+    while tile_size * tile_size > max_pixels and tile_size > 100:
+        tile_size -= 10
+
+    # Width: trim if needed (left-aligned crop, since web content is left-aligned)
+    trim_width = tile_size if width > tile_size else None
+
+    effective_width = min(width, tile_size)
+
+    # If the single-tile pixel budget is still exceeded by width * tile_height, reduce tile_height
+    tile_h = tile_size
+    while effective_width * tile_h > max_pixels and tile_h > 100:
+        tile_h -= 10
+
+    # No vertical split needed
+    if height <= tile_h:
+        return SplitPlan(
+            tile_width=effective_width,
+            tile_height=height,
+            step_y=0,
+            num_parts=1,
+            trim_width=trim_width,
+            warnings=warnings,
+        )
+
+    # Vertical split with overlap
+    overlap_px = int(tile_h * overlap_fraction)
+    step = tile_h - overlap_px
+    if step <= 0:
+        step = 1
+
+    num_parts = 1 + ceil((height - tile_h) / step)
+
+    # Auto-reduce if the part count exceeds max_parts
+    if num_parts > max_parts:
+        warnings.append(
+            f"Image requires {num_parts} parts but max is {max_parts}. "
+            f"Reducing to {max_parts} parts with a larger step."
+        )
+        num_parts = max_parts
+        if num_parts > 1:
+            step = (height - tile_h) // (num_parts - 1)
+        else:
+            step = 0
+
+    return SplitPlan(
+        tile_width=effective_width,
+        tile_height=tile_h,
+        step_y=step,
+        num_parts=num_parts,
+        trim_width=trim_width,
+        warnings=warnings,
+    )
+
+
+def load_and_normalize(data: bytes) -> Image.Image:
+    """Load an image from bytes, apply EXIF orientation, validate size."""
+    img = Image.open(BytesIO(data))
+    img.load()
+
+    if img.width * img.height > PIL_MAX_PIXELS:
+        raise ValueError(
+            f"Image too large: {img.width}x{img.height} = {img.width * img.height:,} pixels "
+            f"(limit: {PIL_MAX_PIXELS:,})"
+        )
+
+    # Fix EXIF orientation (important for mobile photos)
+    img = ImageOps.exif_transpose(img)
+    return img
+
+
+def encode_jpeg(img: Image.Image, quality: int) -> bytes:
+    """Encode a PIL Image as JPEG bytes."""
+    # Convert to RGB if needed (JPEG doesn't support alpha)
+    if img.mode not in ("RGB", "L"):
+        img = img.convert("RGB")
+
+    buf = BytesIO()
+    img.save(buf, format="JPEG", quality=quality, optimize=True)
+    return buf.getvalue()
+
+
+def execute_split(
+    img: Image.Image,
+    plan: SplitPlan,
+    jpeg_quality: int,
+) -> list[tuple[bytes, int, int, int, int]]:
+    """Execute a split plan on an image.
+
+    Returns a list of (data, width, height, source_y, source_height) tuples.
+    """
+    width, height = img.size
+
+    # Trim width if needed (left-aligned crop)
+    if plan.trim_width is not None and width > plan.trim_width:
+        img = img.crop((0, 0, plan.trim_width, height))
+        width = plan.trim_width
+
+    # Convert to RGB once for JPEG
+    if img.mode not in ("RGB", "L"):
+        img = img.convert("RGB")
+
+    parts: list[tuple[bytes, int, int, int, int]] = []
+
+    for i in range(plan.num_parts):
+        if plan.num_parts == 1:
+            y = 0
+        else:
+            y = i * plan.step_y
+            # Clamp so the last tile aligns to the bottom
+            y = min(y, max(0, height - plan.tile_height))
+
+        h = min(plan.tile_height, height - y)
+        tile = img.crop((0, y, width, y + h))
+
+        data = encode_jpeg(tile, jpeg_quality)
+        parts.append((data, width, h, y, h))
+
+    return parts
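
`plan_split` is pure, so the tiling arithmetic can be checked by hand. For a 1200x10000 screenshot with the GEMINI numbers, the tile stays at 3000 (3000^2 = 9,000,000 does not exceed max_pixels), overlap is int(3000 * 0.20) = 600, the step is 2400, and parts = 1 + ceil((10000 - 3000) / 2400) = 4. A sketch (note `_processing` is private, so importing it directly is an assumption):

```python
from ai_pipeline_core.images._processing import plan_split

plan = plan_split(
    width=1200,
    height=10_000,
    max_dimension=3000,
    max_pixels=9_000_000,
    overlap_fraction=0.20,
    max_parts=20,
)
assert plan.trim_width is None   # 1200 <= 3000: no width trim
assert plan.tile_height == 3000  # 1200 * 3000 = 3.6M pixels, under budget
assert plan.step_y == 2400       # 3000 - 600 px of overlap
assert plan.num_parts == 4       # 1 + ceil(7000 / 2400)
```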
@@ -8,6 +8,7 @@ including text, documents, and model responses.
 
 import base64
 import hashlib
+import io
 import json
 from copy import deepcopy
 from typing import Any, Callable, Iterable, SupportsIndex, Union
@@ -17,9 +18,11 @@ from openai.types.chat import (
     ChatCompletionContentPartParam,
     ChatCompletionMessageParam,
 )
+from PIL import Image
 from prefect.logging import get_logger
 
 from ai_pipeline_core.documents import Document
+from ai_pipeline_core.documents.mime_type import is_llm_supported_image
 
 from .model_response import ModelResponse
 
@@ -53,7 +56,7 @@ class AIMessages(list[AIMessageType]):
     Note: Document conversion is automatic. Text content becomes user text messages.
 
     VISION/PDF MODEL COMPATIBILITY WARNING:
-        Images require vision-capable models (e.g., gpt-4o, gemini-pro-vision, claude-3-haiku).
+        Images require vision-capable models (e.g., gpt-5.1, gemini-3-flash, gemini-3-pro).
         Non-vision models will raise ValueError when encountering image documents.
         PDFs require models with document processing support - check your model's capabilities
         before including PDF documents in messages. Unsupported models may fall back to
@@ -74,7 +77,7 @@ class AIMessages(list[AIMessageType]):
         >>> from ai_pipeline_core import llm
         >>> messages = AIMessages()
         >>> messages.append("What is the capital of France?")
-        >>> response = await llm.generate("gpt-5", messages=messages)
+        >>> response = await llm.generate("gpt-5.1", messages=messages)
         >>> messages.append(response)  # Add the actual response
     """
 
@@ -264,10 +267,31 @@ class AIMessages(list[AIMessageType]):
             elif isinstance(message, Document):
                 messages.append({"role": "user", "content": AIMessages.document_to_prompt(message)})
             elif isinstance(message, ModelResponse):  # type: ignore
-                messages.append({
+                # Build base assistant message
+                assistant_message: ChatCompletionMessageParam = {
                     "role": "assistant",
                     "content": [{"type": "text", "text": message.content}],
-                })
+                }
+
+                # Preserve reasoning_content (Gemini Flash 3+, O1, O3, GPT-5)
+                if reasoning_content := message.reasoning_content:
+                    assistant_message["reasoning_content"] = reasoning_content  # type: ignore[typeddict-item]
+
+                # Preserve thinking_blocks (structured thinking)
+                if hasattr(message.choices[0].message, "thinking_blocks"):
+                    thinking_blocks = getattr(message.choices[0].message, "thinking_blocks", None)
+                    if thinking_blocks:
+                        assistant_message["thinking_blocks"] = thinking_blocks  # type: ignore[typeddict-item]
+
+                # Preserve provider_specific_fields (thought_signatures for Gemini multi-turn)
+                if hasattr(message.choices[0].message, "provider_specific_fields"):
+                    provider_fields = getattr(
+                        message.choices[0].message, "provider_specific_fields", None
+                    )
+                    if provider_fields:
+                        assistant_message["provider_specific_fields"] = provider_fields  # type: ignore[typeddict-item]
+
+                messages.append(assistant_message)
             else:
                 raise ValueError(f"Unsupported message type: {type(message)}")
 
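
The upshot: round-tripping a `ModelResponse` through `AIMessages` no longer drops reasoning artifacts, which matters for Gemini's multi-turn thought signatures. A hedged sketch of the pattern (whether `reasoning_content` and friends are populated depends on the provider and model):

```python
async def follow_up_turn() -> None:
    messages = AIMessages()
    messages.append("Summarize this page.")
    response = await llm.generate("gemini-3-pro", messages=messages)

    # Appending the response now carries reasoning_content, thinking_blocks,
    # and provider_specific_fields back into the next request's wire format.
    messages.append(response)
    messages.append("Now list any visible error messages.")
    await llm.generate("gemini-3-pro", messages=messages)
```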
@@ -376,9 +400,19 @@ class AIMessages(list[AIMessageType]):
                 "text": f"{header_text}<content>\n",
             })
 
-        # Encode binary content
-        base64_content = base64.b64encode(document.content).decode("utf-8")
-        data_uri = f"data:{document.mime_type};base64,{base64_content}"
+        # Encode binary content, converting unsupported image formats to PNG
+        if document.is_image and not is_llm_supported_image(document.mime_type):
+            img = Image.open(io.BytesIO(document.content))
+            buf = io.BytesIO()
+            img.save(buf, format="PNG")
+            content_bytes = buf.getvalue()
+            mime_type = "image/png"
+        else:
+            content_bytes = document.content
+            mime_type = document.mime_type
+
+        base64_content = base64.b64encode(content_bytes).decode("utf-8")
+        data_uri = f"data:{mime_type};base64,{base64_content}"
 
         # Add appropriate content type
         if document.is_pdf:
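
A standalone sketch of what the new branch does for, say, a BMP or GIF document (illustrative only; note that Pillow alone cannot rasterize SVG, so `image/svg+xml` inputs would still fail at `Image.open`):

```python
import io
from PIL import Image

def to_llm_safe_png(content: bytes) -> bytes:
    # Re-encode as PNG, mirroring the fallback above; animated GIFs
    # contribute only their first frame.
    img = Image.open(io.BytesIO(content))
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue()

with open("screenshot.bmp", "rb") as f:
    png_bytes = to_llm_safe_png(f.read())  # sent as data:image/png;base64,...
```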