ai-pipeline-core 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. ai_pipeline_core/__init__.py +84 -4
  2. ai_pipeline_core/documents/__init__.py +9 -0
  3. ai_pipeline_core/documents/document.py +1034 -151
  4. ai_pipeline_core/documents/document_list.py +147 -38
  5. ai_pipeline_core/documents/flow_document.py +112 -11
  6. ai_pipeline_core/documents/mime_type.py +173 -15
  7. ai_pipeline_core/documents/task_document.py +117 -12
  8. ai_pipeline_core/documents/temporary_document.py +84 -5
  9. ai_pipeline_core/documents/utils.py +41 -9
  10. ai_pipeline_core/exceptions.py +47 -11
  11. ai_pipeline_core/flow/__init__.py +2 -0
  12. ai_pipeline_core/flow/config.py +232 -23
  13. ai_pipeline_core/flow/options.py +50 -1
  14. ai_pipeline_core/llm/__init__.py +6 -0
  15. ai_pipeline_core/llm/ai_messages.py +125 -27
  16. ai_pipeline_core/llm/client.py +278 -26
  17. ai_pipeline_core/llm/model_options.py +130 -1
  18. ai_pipeline_core/llm/model_response.py +239 -35
  19. ai_pipeline_core/llm/model_types.py +67 -0
  20. ai_pipeline_core/logging/__init__.py +13 -0
  21. ai_pipeline_core/logging/logging_config.py +72 -20
  22. ai_pipeline_core/logging/logging_mixin.py +38 -32
  23. ai_pipeline_core/pipeline.py +308 -60
  24. ai_pipeline_core/prefect.py +48 -1
  25. ai_pipeline_core/prompt_manager.py +209 -24
  26. ai_pipeline_core/settings.py +108 -4
  27. ai_pipeline_core/simple_runner/__init__.py +5 -0
  28. ai_pipeline_core/simple_runner/cli.py +96 -11
  29. ai_pipeline_core/simple_runner/simple_runner.py +237 -4
  30. ai_pipeline_core/tracing.py +232 -30
  31. ai_pipeline_core-0.1.11.dist-info/METADATA +450 -0
  32. ai_pipeline_core-0.1.11.dist-info/RECORD +36 -0
  33. ai_pipeline_core-0.1.10.dist-info/METADATA +0 -538
  34. ai_pipeline_core-0.1.10.dist-info/RECORD +0 -36
  35. {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.11.dist-info}/WHEEL +0 -0
  36. {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.11.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/llm/client.py
@@ -1,3 +1,16 @@
+ """LLM client implementation for AI model interactions.
+
+ @public
+
+ This module provides the core functionality for interacting with language models
+ through a unified interface. It handles retries, caching, structured outputs,
+ and integration with various LLM providers via LiteLLM.
+
+ Key functions:
+ - generate(): Text generation with optional context caching
+ - generate_structured(): Type-safe structured output generation
+ """
+
  import asyncio
  from typing import Any, TypeVar
 
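To ground the module docstring above, here is a minimal usage sketch of the two entry points it names. The import paths, the llm alias, and the Summary model are illustrative assumptions based on the file list and the docstring examples further down, not code taken from this diff:

    import asyncio

    from pydantic import BaseModel

    from ai_pipeline_core import llm  # assumed import alias, matching the docstring examples
    from ai_pipeline_core.llm import AIMessages  # assumed module path from the file list


    class Summary(BaseModel):
        # Hypothetical schema for the structured example below.
        title: str
        bullet_points: list[str]


    async def main() -> None:
        # Plain text generation; options are omitted so the documented defaults apply.
        response = await llm.generate("gpt-5", messages="Explain context caching in two sentences")
        print(response.content)

        # Structured generation returns a validated Pydantic instance via .parsed.
        structured = await llm.generate_structured(
            "gpt-5", response_format=Summary, messages=AIMessages(["Summarize the answer above"])
        )
        print(structured.parsed.title)


    asyncio.run(main())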
@@ -26,17 +39,36 @@ def _process_messages(
  messages: AIMessages,
  system_prompt: str | None = None,
  ) -> list[ChatCompletionMessageParam]:
- """Convert context and messages to OpenAI-compatible format.
+ """Process and format messages for LLM API consumption.
+
+ Internal function that combines context and messages into a single
+ list of API-compatible messages. Applies caching directives to
+ context messages for efficiency.
 
  Args:
- context: Messages to be cached (optional)
- messages: Regular messages that won't be cached
- system_prompt: Optional system prompt
+ context: Messages to be cached (typically expensive/static content).
+ messages: Regular messages without caching (dynamic queries).
+ system_prompt: Optional system instructions for the model.
 
  Returns:
- List of formatted messages for OpenAI API
+ List of formatted messages ready for API calls, with:
+ - System prompt at the beginning (if provided)
+ - Context messages with cache_control on the last one
+ - Regular messages without caching
+
+ System Prompt Location:
+ The system prompt from ModelOptions.system_prompt is always injected
+ as the FIRST message with role="system". It is NOT cached with context,
+ allowing dynamic system prompts without breaking cache efficiency.
+
+ Cache behavior:
+ The last context message gets ephemeral caching (120s TTL)
+ to reduce token usage on repeated calls with same context.
+
+ Note:
+ This is an internal function used by _generate_with_retry().
+ The context/messages split enables efficient token usage.
  """
-
  processed_messages: list[ChatCompletionMessageParam] = []
 
  # Add system prompt if provided
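As a rough illustration of the ordering this docstring describes (system prompt first, cache directive only on the last context message, dynamic messages last), the assembled list might look like the sketch below. The exact cache_control payload is an assumption in the ephemeral style the docstring alludes to; the real structure is not shown in this hunk:

    # Hypothetical shape of the list _process_messages() is described as returning.
    formatted_messages = [
        # 1. System prompt first (from ModelOptions.system_prompt); never cached.
        {"role": "system", "content": "You are a helpful assistant"},
        # 2. Context messages; only the LAST one carries the cache directive (~120s TTL).
        {"role": "user", "content": "<large static document>"},
        {
            "role": "user",
            "content": "<few-shot examples>",
            "cache_control": {"type": "ephemeral"},
        },
        # 3. Regular messages appended last, never cached.
        {"role": "user", "content": "Summarize the document"},
    ]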
@@ -67,6 +99,28 @@ def _process_messages(
  async def _generate(
  model: str, messages: list[ChatCompletionMessageParam], completion_kwargs: dict[str, Any]
  ) -> ModelResponse:
+ """Execute a single LLM API call.
+
+ Internal function that makes the actual API request to the LLM provider.
+ Handles both regular and structured output generation.
+
+ Args:
+ model: Model identifier (e.g., "gpt-5", "gemini-2.5-pro").
+ messages: Formatted messages for the API.
+ completion_kwargs: Additional parameters for the completion API.
+
+ Returns:
+ ModelResponse with generated content and metadata.
+
+ API selection:
+ - Uses client.chat.completions.parse() for structured output
+ - Uses client.chat.completions.create() for regular text
+
+ Note:
+ - Uses AsyncOpenAI client configured via settings
+ - Captures response headers for cost tracking
+ - Response includes model options for debugging
+ """
  async with AsyncOpenAI(
  api_key=settings.openai_api_key,
  base_url=settings.openai_base_url,
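A simplified sketch of the API selection described in the docstring above; this is illustrative only and not the package's actual code, though the two client methods named are the ones the docstring itself cites:

    # Structured output goes through .parse(), plain text through .create().
    response_format = completion_kwargs.pop("response_format", None)
    if response_format is not None:
        raw = await client.chat.completions.parse(
            model=model, messages=messages, response_format=response_format, **completion_kwargs
        )
    else:
        raw = await client.chat.completions.create(
            model=model, messages=messages, **completion_kwargs
        )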
@@ -93,7 +147,27 @@ async def _generate_with_retry(
  messages: AIMessages,
  options: ModelOptions,
  ) -> ModelResponse:
- """Core generation logic with exponential backoff retry."""
+ """Core LLM generation with automatic retry logic.
+
+ Internal function that orchestrates the complete generation process
+ including message processing, retries, caching, and tracing.
+
+ Args:
+ model: Model identifier string.
+ context: Cached context messages (can be empty).
+ messages: Dynamic query messages.
+ options: Configuration including retries, timeout, temperature.
+
+ Returns:
+ ModelResponse with generated content.
+
+ Raises:
+ ValueError: If model is not provided or both context and messages are empty.
+ LLMError: If all retry attempts are exhausted.
+
+ Note:
+ Empty responses trigger a retry as they indicate API issues.
+ """
  if not model:
  raise ValueError("Model must be provided")
  if not context and not messages:
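A simplified sketch of the retry behaviour this docstring describes, assuming a fixed delay taken from options.retry_delay_seconds; message processing, caching, and tracing are omitted, and this is not the package's actual implementation:

    last_error: Exception | None = None
    for attempt in range(options.retries):
        try:
            response = await _generate(model, processed_messages, completion_kwargs)
            if not response.content:
                # Empty responses are treated as failures and retried.
                raise LLMError("Empty response from model")
            return response
        except Exception as exc:
            last_error = exc
            if attempt < options.retries - 1:
                await asyncio.sleep(options.retry_delay_seconds)
    raise LLMError(f"All {options.retries} attempts failed") from last_error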
@@ -143,28 +217,135 @@ async def _generate_with_retry(
  async def generate(
  model: ModelName | str,
  *,
- context: AIMessages = AIMessages(),
+ context: AIMessages | None = None,
  messages: AIMessages | str,
- options: ModelOptions = ModelOptions(),
+ options: ModelOptions | None = None,
  ) -> ModelResponse:
- """Generate response using a large or small model.
+ """Generate text response from a language model.
+
+ @public
+
+ Main entry point for LLM text generation with smart context caching.
+ The context/messages split enables efficient token usage by caching
+ expensive static content separately from dynamic queries.
+
+ Best Practices:
+ 1. OPTIONS: Omit in 90% of cases - defaults are optimized
+ 2. MESSAGES: Use AIMessages or str - wrap Documents in AIMessages
+ 3. CONTEXT vs MESSAGES: Use context for static/cacheable, messages for dynamic
 
  Args:
- model: The model to use for generation
- context: Messages to be cached (optional) - keyword only
- messages: Regular messages that won't be cached - keyword only
- options: Model options - keyword only
+ model: Model to use (e.g., "gpt-5", "gemini-2.5-pro", "grok-4").
+ Can be ModelName literal or any string for custom models.
+ context: Static context to cache (documents, examples, instructions).
+ Defaults to None (empty context). Cached for 120 seconds.
+ messages: Dynamic messages/queries. AIMessages or str ONLY.
+ Do not pass Document or DocumentList directly.
+ If string, converted to AIMessages internally.
+ options: Model configuration (temperature, retries, timeout, etc.).
+ Defaults to None (uses ModelOptions() with standard settings).
 
  Returns:
- Model response
+ ModelResponse containing:
+ - Generated text content
+ - Usage statistics
+ - Cost information (if available)
+ - Model metadata
+
+ Raises:
+ ValueError: If model is empty or messages are invalid.
+ LLMError: If generation fails after all retries.
+
+ Document Handling:
+ Wrap Documents in AIMessages - DO NOT pass directly or convert to .text:
+
+ # CORRECT - wrap Document in AIMessages
+ response = await llm.generate("gpt-5", messages=AIMessages([my_document]))
+
+ # WRONG - don't pass Document directly
+ response = await llm.generate("gpt-5", messages=my_document) # NO!
+
+ # WRONG - don't convert to string yourself
+ response = await llm.generate("gpt-5", messages=my_document.text) # NO!
+
+ Context vs Messages Strategy:
+ context: Static, reusable content (cached 120 seconds)
+ - Large documents, instructions, examples
+ - Same across multiple calls
+
+ messages: Dynamic, query-specific content
+ - User questions, current conversation turn
+ - Changes every call
+
+ Example:
+ >>> # Simple case - no options needed (90% of cases)
+ >>> response = await llm.generate("gpt-5", messages="Explain quantum computing")
+ >>> print(response.content) # In production, use get_pipeline_logger instead of print
+
+ >>> # With context caching for efficiency
+ >>> # Context and messages are both AIMessages or str; wrap any Documents
+ >>> static_doc = AIMessages([large_document, "few-shot example: ..."])
+ >>>
+ >>> # First call: caches context
+ >>> r1 = await llm.generate("gpt-5", context=static_doc, messages="Summarize")
+ >>>
+ >>> # Second call: reuses cache, saves tokens!
+ >>> r2 = await llm.generate("gpt-5", context=static_doc, messages="Key points?")
+
+ >>> # AVOID unnecessary options (defaults are optimal)
+ >>> response = await llm.generate(
+ ... "gpt-5",
+ ... messages="Hello",
+ ... options=ModelOptions(temperature=0.7) # Default is probably fine!
+ ... )
+
+ >>> # Multi-turn conversation
+ >>> messages = AIMessages([
+ ... "What is Python?",
+ ... previous_response,
+ ... "Can you give an example?"
+ ... ])
+ >>> response = await llm.generate("gpt-5", messages=messages)
+
+ Performance:
+ - Context caching saves ~50-90% tokens on repeated calls
+ - First call: full token cost
+ - Subsequent calls (within 120s): only messages tokens
+ - Default retry delay is 10s (configurable via ModelOptions.retry_delay_seconds)
+
+ Caching:
+ When enabled in your LiteLLM proxy and supported by the upstream provider,
+ context messages may be cached (typical TTL ~120s) to reduce token usage on
+ repeated calls. Savings depend on provider and payload; treat this as an
+ optimization, not a guarantee. Cache behavior varies by proxy configuration.
+
+ Note:
+ - Context argument is ignored by the tracer to avoid recording large data
+ - All models are accessed via LiteLLM proxy
+ - Automatic retry with configurable delay between attempts
+ - Cost tracking via response headers
+
+ See Also:
+ - generate_structured: For typed/structured output
+ - AIMessages: Message container with document support
+ - ModelOptions: Configuration options
  """
  if isinstance(messages, str):
  messages = AIMessages([messages])
 
- return await _generate_with_retry(model, context, messages, options)
+ if context is None:
+ context = AIMessages()
+ if options is None:
+ options = ModelOptions()
+
+ try:
+ return await _generate_with_retry(model, context, messages, options)
+ except (ValueError, LLMError):
+ raise # Explicitly re-raise to satisfy DOC502
 
 
  T = TypeVar("T", bound=BaseModel)
+ """Type variable for Pydantic model types in structured generation."""
 
 
  @trace(ignore_inputs=["context"])
@@ -172,29 +353,100 @@ async def generate_structured(
  model: ModelName | str,
  response_format: type[T],
  *,
- context: AIMessages = AIMessages(),
+ context: AIMessages | None = None,
  messages: AIMessages | str,
- options: ModelOptions = ModelOptions(),
+ options: ModelOptions | None = None,
  ) -> StructuredModelResponse[T]:
- """Generate structured response using Pydantic models.
+ """Generate structured output conforming to a Pydantic model.
+
+ @public
+
+ Type-safe generation that returns validated Pydantic model instances.
+ Uses OpenAI's structured output feature for guaranteed schema compliance.
+
+ Best Practices (same as generate):
+ 1. OPTIONS: Omit in 90% of cases - defaults are optimized
+ 2. MESSAGES: Use AIMessages or str - wrap Documents in AIMessages
+ 3. CONTEXT vs MESSAGES: Use context for static/cacheable, messages for dynamic
 
  Args:
- model: The model to use for generation
- response_format: A Pydantic model class
- context: Messages to be cached (optional) - keyword only
- messages: Regular messages that won't be cached - keyword only
- options: Model options - keyword only
+ model: Model to use (must support structured output).
+ response_format: Pydantic model class defining the output schema.
+ The model will generate JSON matching this schema.
+ context: Static context to cache (documents, schemas, examples).
+ Defaults to None (empty AIMessages).
+ messages: Dynamic prompts/queries. AIMessages or str ONLY.
+ Do not pass Document or DocumentList directly.
+ options: Model configuration. response_format is set automatically.
 
  Returns:
- A StructuredModelResponse containing the parsed Pydantic model instance
+ StructuredModelResponse[T] containing:
+ - parsed: Validated instance of response_format class
+ - All fields from regular ModelResponse (content, usage, etc.)
+
+ Raises:
+ TypeError: If response_format is not a Pydantic model class.
+ ValueError: If model doesn't support structured output or no parsed content returned.
+ LLMError: If generation fails after retries.
+ ValidationError: If response cannot be parsed into response_format.
+
+ Example:
+ >>> from pydantic import BaseModel, Field
+ >>>
+ >>> class Analysis(BaseModel):
+ ... summary: str = Field(description="Brief summary")
+ ... sentiment: float = Field(ge=-1, le=1)
+ ... key_points: list[str] = Field(max_length=5)
+ >>>
+ >>> response = await llm.generate_structured(
+ ... model="gpt-5",
+ ... response_format=Analysis,
+ ... messages="Analyze this product review: ..."
+ ... )
+ >>>
+ >>> analysis = response.parsed # Type: Analysis
+ >>> print(f"Sentiment: {analysis.sentiment}")
+ >>> for point in analysis.key_points:
+ ... print(f"- {point}")
+
+ Supported models:
+ Support varies by provider and model. Generally includes:
+ - OpenAI: GPT-4 and newer models
+ - Anthropic: Claude 3+ models
+ - Google: Gemini Pro models
+ Check provider documentation for specific model support.
+
+ Performance:
+ - Structured output may use more tokens than free text
+ - Complex schemas increase generation time
+ - Validation overhead is minimal (Pydantic is fast)
+
+ Note:
+ - Pydantic model is converted to JSON Schema for the API
+ - The model generates JSON matching the schema
+ - Validation happens automatically via Pydantic
+ - Use Field() descriptions to guide generation
+
+ See Also:
+ - generate: For unstructured text generation
+ - ModelOptions: Configuration including response_format
+ - StructuredModelResponse: Response wrapper with .parsed property
  """
+ if context is None:
+ context = AIMessages()
+ if options is None:
+ options = ModelOptions()
+
  options.response_format = response_format
 
  if isinstance(messages, str):
  messages = AIMessages([messages])
 
  # Call the internal generate function with structured output enabled
- response = await _generate_with_retry(model, context, messages, options)
+ try:
+ response = await _generate_with_retry(model, context, messages, options)
+ except (ValueError, LLMError):
+ raise # Explicitly re-raise to satisfy DOC502
 
  # Extract the parsed value from the response
  parsed_value: T | None = None
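Given the Raises section above, a caller-side sketch of defensive handling might look like the following; the Analysis model mirrors the docstring's own example, and the import location of LLMError (ai_pipeline_core.exceptions) is an assumption based on the file list:

    from pydantic import ValidationError

    from ai_pipeline_core.exceptions import LLMError  # assumed location

    try:
        response = await llm.generate_structured(
            "gpt-5", response_format=Analysis, messages="Analyze this product review: ..."
        )
        analysis = response.parsed
    except ValidationError:
        # The model returned JSON that does not match the Analysis schema.
        ...
    except LLMError:
        # All retries were exhausted; log and surface the failure.
        ...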
ai_pipeline_core/llm/model_options.py
@@ -1,9 +1,103 @@
+ """Configuration options for LLM generation.
+
+ @public
+
+ Provides the ModelOptions class for configuring model behavior,
+ retry logic, and advanced features like web search and reasoning.
+ """
+
  from typing import Any, Literal
 
  from pydantic import BaseModel
 
 
  class ModelOptions(BaseModel):
+ """Configuration options for LLM generation requests.
+
+ @public
+
+ ModelOptions encapsulates all configuration parameters for model
+ generation, including model behavior settings, retry logic, and
+ advanced features. All fields are optional with sensible defaults.
+
+ Attributes:
+ temperature: Controls randomness in generation (0.0-2.0).
+ Lower values = more deterministic, higher = more creative.
+ If None, the parameter is omitted from the API call,
+ causing the provider to use its own default (often 1.0).
+
+ system_prompt: System-level instructions for the model.
+ Sets the model's behavior and persona.
+
+ search_context_size: Web search result depth for search-enabled models.
+ Literal["low", "medium", "high"] | None
+ "low": Minimal context (~1-2 results)
+ "medium": Moderate context (~3-5 results)
+ "high": Extensive context (~6+ results)
+
+ reasoning_effort: Reasoning intensity for models that support explicit reasoning.
+ Literal["low", "medium", "high"] | None
+ "low": Quick reasoning
+ "medium": Balanced reasoning
+ "high": Deep, thorough reasoning
+ Note: Availability and effect vary by provider and model. Only models
+ that expose an explicit reasoning control will honor this parameter.
+
+ retries: Number of retry attempts on failure (default: 3).
+
+ retry_delay_seconds: Seconds to wait between retries (default: 10).
+
+ timeout: Maximum seconds to wait for response (default: 300).
+
+ service_tier: API tier selection for performance/cost trade-offs.
+ "auto": Let API choose
+ "default": Standard tier
+ "flex": Flexible (cheaper, may be slower)
+ "scale": Scaled performance
+ "priority": Priority processing
+ Note: Service tiers are correct as of Q3 2025. Only OpenAI models
+ support this parameter. Other providers (Anthropic, Google, Grok)
+ silently ignore it.
+
+ max_completion_tokens: Maximum tokens to generate.
+ None uses model default.
+
+ response_format: Pydantic model class for structured output.
+ Pass a Pydantic model; the client converts it to JSON Schema.
+ Set automatically by generate_structured(). Provider support varies.
+
+ Example:
+ >>> # Basic configuration
+ >>> options = ModelOptions(
+ ... temperature=0.7,
+ ... max_completion_tokens=1000
+ ... )
+ >>>
+ >>> # With system prompt
+ >>> options = ModelOptions(
+ ... system_prompt="You are a helpful coding assistant",
+ ... temperature=0.3 # Lower for code generation
+ ... )
+ >>>
+ >>> # For search-enabled models
+ >>> options = ModelOptions(
+ ... search_context_size="high", # Get more search results
+ ... max_completion_tokens=2000
+ ... )
+ >>>
+ >>> # For reasoning models
+ >>> options = ModelOptions(
+ ... reasoning_effort="high", # Deep reasoning
+ ... timeout=600 # More time for complex reasoning
+ ... )
+
+ Note:
+ - Not all options apply to all models
+ - search_context_size only works with search models
+ - reasoning_effort only works with models that support explicit reasoning
+ - response_format is set internally by generate_structured()
+ """
+
  temperature: float | None = None
  system_prompt: str | None = None
  search_context_size: Literal["low", "medium", "high"] | None = None
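The docstring examples above do not exercise the retry-related fields; here is one more sketch, under the same import-path assumptions as earlier, combining them with generate():

    from ai_pipeline_core.llm import ModelOptions  # assumed module path

    # A more patient configuration for slow or flaky upstream providers.
    options = ModelOptions(
        retries=5,                # default: 3
        retry_delay_seconds=30,   # default: 10
        timeout=600,              # default: 300 seconds
    )
    response = await llm.generate("gpt-5", messages="Run the long analysis", options=options)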
@@ -16,7 +110,42 @@ class ModelOptions(BaseModel):
  response_format: type[BaseModel] | None = None
 
  def to_openai_completion_kwargs(self) -> dict[str, Any]:
- """Convert ModelOptions to OpenAI completion kwargs."""
+ """Convert options to OpenAI API completion parameters.
+
+ Transforms ModelOptions fields into the format expected by
+ the OpenAI completion API. Only includes non-None values.
+
+ Returns:
+ Dictionary with OpenAI API parameters:
+ - Always includes 'timeout' and 'extra_body'
+ - Conditionally includes other parameters if set
+ - Maps search_context_size to web_search_options
+ - Passes reasoning_effort directly
+
+ API parameter mapping:
+ - temperature -> temperature
+ - max_completion_tokens -> max_completion_tokens
+ - reasoning_effort -> reasoning_effort
+ - search_context_size -> web_search_options.search_context_size
+ - response_format -> response_format
+ - service_tier -> service_tier
+
+ Web Search Structure:
+ When search_context_size is set, creates:
+ {"web_search_options": {"search_context_size": "low|medium|high"}}
+ Non-search models silently ignore this parameter.
+
+ Example:
+ >>> options = ModelOptions(temperature=0.5, timeout=60)
+ >>> kwargs = options.to_openai_completion_kwargs()
+ >>> kwargs
+ {'timeout': 60, 'extra_body': {}, 'temperature': 0.5}
+
+ Note:
+ - system_prompt is handled separately in _process_messages()
+ - retries and retry_delay_seconds are used by retry logic
+ - extra_body is always included for potential extensions
+ """
  kwargs: dict[str, Any] = {
  "timeout": self.timeout,
  "extra_body": {},