ai-pipeline-core 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +5 -2
- ai_pipeline_core/documents/__init__.py +2 -1
- ai_pipeline_core/documents/document.py +239 -14
- ai_pipeline_core/documents/document_list.py +72 -16
- ai_pipeline_core/documents/flow_document.py +6 -23
- ai_pipeline_core/documents/task_document.py +6 -23
- ai_pipeline_core/documents/temporary_document.py +5 -19
- ai_pipeline_core/documents/utils.py +64 -1
- ai_pipeline_core/flow/options.py +2 -2
- ai_pipeline_core/llm/__init__.py +5 -0
- ai_pipeline_core/llm/ai_messages.py +0 -3
- ai_pipeline_core/llm/client.py +50 -19
- ai_pipeline_core/llm/model_options.py +18 -0
- ai_pipeline_core/llm/model_response.py +62 -15
- ai_pipeline_core/llm/model_types.py +38 -36
- ai_pipeline_core/pipeline.py +28 -2
- ai_pipeline_core/settings.py +4 -0
- ai_pipeline_core/simple_runner/simple_runner.py +18 -1
- ai_pipeline_core/tracing.py +115 -7
- {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.1.14.dist-info}/METADATA +42 -7
- ai_pipeline_core-0.1.14.dist-info/RECORD +36 -0
- ai_pipeline_core-0.1.12.dist-info/RECORD +0 -36
- {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.1.14.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.1.14.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/documents/task_document.py CHANGED

@@ -29,24 +29,8 @@ class TaskDocument(Document):
         - Reduces persistent I/O for temporary data
 
     Creating TaskDocuments:
-
-
-
-        >>> from enum import StrEnum
-        >>>
-        >>> # Simple task document:
-        >>> class TempDoc(TaskDocument):
-        ...     pass
-        >>>
-        >>> # With restricted files:
-        >>> class CacheDoc(TaskDocument):
-        ...     class FILES(StrEnum):
-        ...         CACHE = "cache.json"
-        ...         INDEX = "index.dat"
-        >>>
-        >>> # RECOMMENDED - automatic conversion:
-        >>> doc = TempDoc.create(name="temp.json", content={"status": "processing"})
-        >>> doc = CacheDoc.create(name="cache.json", content={"data": [1, 2, 3]})
+        Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
+        See Document.create() for detailed usage examples.
 
     Use Cases:
         - Intermediate transformation results

@@ -71,13 +55,11 @@ class TaskDocument(Document):
        name: str,
        content: bytes,
        description: str | None = None,
+        sources: list[str] = [],
    ) -> None:
        """Initialize a TaskDocument with raw bytes content.
 
-
-        **Most users should use the `create` classmethod instead of __init__.**
-        The create method provides automatic content conversion for various types
-        (str, dict, list, Pydantic models) while __init__ only accepts bytes.
+        See Document.__init__() for parameter details and usage notes.
 
        Prevents direct instantiation of the abstract TaskDocument class.
        TaskDocument must be subclassed for specific temporary document types.

@@ -86,6 +68,7 @@ class TaskDocument(Document):
            name: Document filename (required, keyword-only)
            content: Document content as raw bytes (required, keyword-only)
            description: Optional human-readable description (keyword-only)
+            sources: Optional list of strings for provenance tracking
 
        Raises:
            TypeError: If attempting to instantiate TaskDocument directly

@@ -114,7 +97,7 @@ class TaskDocument(Document):
        """
        if type(self) is TaskDocument:
            raise TypeError("Cannot instantiate abstract TaskDocument class directly")
-        super().__init__(name=name, content=content, description=description)
+        super().__init__(name=name, content=content, description=description, sources=sources)
 
    @final
    def get_base_type(self) -> Literal["task"]:
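The new `sources` parameter threads provenance through the constructor down to `Document.__init__`. Below is a minimal sketch of how a pipeline step might record where a temporary artifact came from; the `ScratchDoc` subclass, the file name, and the source strings are illustrative, and whether `Document` validates the source values is an assumption here.

```python
from ai_pipeline_core.documents.task_document import TaskDocument


class ScratchDoc(TaskDocument):  # hypothetical subclass for illustration
    """Intermediate artifact produced mid-flow; never persisted."""


# Raw-bytes constructor, mirroring the new signature in this diff:
# name, content, optional description, optional sources
doc = ScratchDoc(
    name="chunk_embeddings.bin",
    content=b"\x00\x01\x02",
    description="Embeddings for chunk 17",
    sources=["P3AEMA2PSYILKFYVBUALJLMIYWVZIS2QDI3S5VTMD2X7SOODF2YQ"],  # e.g. a parent document hash
)
assert doc.get_base_type() == "task"
```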
ai_pipeline_core/documents/temporary_document.py CHANGED

@@ -30,25 +30,11 @@ class TemporaryDocument(Document):
         - Ignored by simple_runner save operations
 
     Creating TemporaryDocuments:
-
-
-
-
-        >>> doc = TemporaryDocument.create(
-        ...     name="api_response.json",
-        ...     content={"status": "ok", "data": [1, 2, 3]}
-        ... )
-        >>> doc = TemporaryDocument.create(
-        ...     name="credentials.txt",
-        ...     content="secret_token_xyz"
-        ... )
-        >>>
-        >>> # Direct constructor - only for bytes:
-        >>> doc = TemporaryDocument(
-        ...     name="binary.dat",
-        ...     content=b"\x00\x01\x02"
-        ... )
-        >>>
+        Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
+        Unlike abstract document types, TemporaryDocument can be instantiated directly.
+        See Document.create() for detailed usage examples.
+
+        >>> doc = TemporaryDocument.create(name="api.json", content={"status": "ok"})
         >>> doc.is_temporary  # Always True
 
     Use Cases:
ai_pipeline_core/documents/utils.py CHANGED

@@ -1,7 +1,7 @@
 """Utility functions for document handling.
 
 Provides helper functions for URL sanitization, naming conventions,
-
+canonical key generation, and hash validation used throughout the document system.
 """
 
 import re
@@ -115,3 +115,66 @@ def canonical_name_key(
            break
 
    return camel_to_snake(name)
+
+
+def is_document_sha256(value: str) -> bool:
+    """Check if a string is a valid base32-encoded SHA256 hash with proper entropy.
+
+    @public
+
+    This function validates that a string is not just formatted like a SHA256 hash,
+    but actually has the entropy characteristics of a real hash. It checks:
+    1. Correct length (52 characters without padding)
+    2. Valid base32 characters (A-Z, 2-7)
+    3. Sufficient entropy (at least 8 unique characters)
+
+    The entropy check prevents false positives like 'AAAAAAA...AAA' from being
+    identified as valid document hashes.
+
+    Args:
+        value: String to check if it's a document SHA256 hash.
+
+    Returns:
+        True if the string appears to be a real base32-encoded SHA256 hash,
+        False otherwise.
+
+    Examples:
+        >>> # Real SHA256 hash
+        >>> is_document_sha256("P3AEMA2PSYILKFYVBUALJLMIYWVZIS2QDI3S5VTMD2X7SOODF2YQ")
+        True
+
+        >>> # Too uniform - lacks entropy
+        >>> is_document_sha256("A" * 52)
+        False
+
+        >>> # Wrong length
+        >>> is_document_sha256("ABC123")
+        False
+
+        >>> # Invalid characters
+        >>> is_document_sha256("a" * 52)  # lowercase
+        False
+    """
+    # Check basic format: exactly 52 uppercase base32 characters
+    try:
+        if not value or len(value) != 52:
+            return False
+    except (TypeError, AttributeError):
+        return False
+
+    # Check if all characters are valid base32 (A-Z, 2-7)
+    try:
+        if not re.match(r"^[A-Z2-7]{52}$", value):
+            return False
+    except TypeError:
+        # re.match raises TypeError for non-string types like bytes
+        return False
+
+    # Check entropy: real SHA256 hashes have high entropy
+    # Require at least 8 unique characters (out of 32 possible in base32)
+    # This prevents patterns like "AAAAAAA..." from being identified as real hashes
+    unique_chars = len(set(value))
+    if unique_chars < 8:
+        return False
+
+    return True
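A hedged sketch of how `is_document_sha256` could be used alongside the new `sources` provenance field to separate hash references from free-form labels; the `sources` list below and the partition logic are illustrative, not part of the library.

```python
from ai_pipeline_core.documents.utils import is_document_sha256

# Illustrative provenance entries: one document hash, one free-form reference
sources = [
    "P3AEMA2PSYILKFYVBUALJLMIYWVZIS2QDI3S5VTMD2X7SOODF2YQ",
    "https://example.com/raw-input.csv",
]

# Partition into document-hash references vs. everything else
doc_hashes = [s for s in sources if is_document_sha256(s)]
other_refs = [s for s in sources if not is_document_sha256(s)]

assert len(doc_hashes) == 1 and len(other_refs) == 1
```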
ai_pipeline_core/flow/options.py CHANGED

@@ -60,11 +60,11 @@ class FlowOptions(BaseSettings):
    add flow-specific parameters with appropriate validation.
    """
 
-    core_model: ModelName
+    core_model: ModelName = Field(
        default="gpt-5",
        description="Primary model for complex analysis and generation tasks.",
    )
-    small_model: ModelName
+    small_model: ModelName = Field(
        default="gpt-5-mini",
        description="Fast, cost-effective model for simple tasks and orchestration.",
    )
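With defaults now supplied via `Field`, a project-level subclass only has to add its own knobs. A minimal sketch, assuming `FlowOptions` and `ModelName` are importable from the package root; the extra `review_model` field and its default are hypothetical.

```python
from pydantic import Field

from ai_pipeline_core import FlowOptions, ModelName  # import path assumed


class MyFlowOptions(FlowOptions):
    # Inherits core_model="gpt-5" and small_model="gpt-5-mini" defaults
    review_model: ModelName = Field(
        default="gemini-2.5-flash",
        description="Cheaper model used for a hypothetical review pass.",
    )


opts = MyFlowOptions()   # environment variables can still override (BaseSettings)
print(opts.core_model)   # -> "gpt-5"
```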
ai_pipeline_core/llm/__init__.py CHANGED

@@ -8,6 +8,8 @@ from .ai_messages import AIMessages, AIMessageType
 from .client import (
    generate,
    generate_structured,
+    generate_with_retry_for_testing,
+    process_messages_for_testing,
 )
 from .model_options import ModelOptions
 from .model_response import ModelResponse, StructuredModelResponse

@@ -22,4 +24,7 @@ __all__ = [
    "StructuredModelResponse",
    "generate",
    "generate_structured",
+    # Internal functions exposed for testing only
+    "process_messages_for_testing",
+    "generate_with_retry_for_testing",
 ]
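A hedged sketch of the kind of unit test these aliases enable, exercising the new `cache_ttl` handling. The exact message shapes produced by `AIMessages.to_prompt()` are an assumption, so the indices and asserted keys below are illustrative rather than authoritative.

```python
from ai_pipeline_core.llm import AIMessages, process_messages_for_testing


def test_last_context_message_gets_cache_control():
    context = AIMessages(["large static reference text"])
    messages = AIMessages(["short dynamic question"])

    formatted = process_messages_for_testing(
        context, messages, "You are terse.", "60s"
    )

    # System prompt first, then context, then dynamic messages (per the docstring)
    assert formatted[0]["role"] == "system"
    # The last context message should carry ephemeral cache_control with the given TTL
    # (index 1 assumes a single context item maps to a single formatted message)
    assert formatted[1].get("cache_control") == {"type": "ephemeral", "ttl": "60s"}


def test_cache_disabled_when_ttl_is_none():
    formatted = process_messages_for_testing(
        AIMessages(["ctx"]), AIMessages(["msg"]), None, None
    )
    assert all("cache_control" not in m for m in formatted)
```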
ai_pipeline_core/llm/ai_messages.py CHANGED

@@ -63,7 +63,6 @@ class AIMessages(list[AIMessageType]):
        >>> messages.append("What is the capital of France?")
        >>> response = await llm.generate("gpt-5", messages=messages)
        >>> messages.append(response)  # Add the actual response
-        >>> prompt = messages.get_last_message_as_str()  # Get the last message as a string
    """
 
    def get_last_message(self) -> AIMessageType:

@@ -78,8 +77,6 @@ class AIMessages(list[AIMessageType]):
    def get_last_message_as_str(self) -> str:
        """Get the last message as a string, raising if not a string.
 
-        @public
-
        Returns:
            The last message as a string.
 
ai_pipeline_core/llm/client.py CHANGED

@@ -38,6 +38,7 @@ def _process_messages(
    context: AIMessages,
    messages: AIMessages,
    system_prompt: str | None = None,
+    cache_ttl: str | None = "120s",
 ) -> list[ChatCompletionMessageParam]:
    """Process and format messages for LLM API consumption.
 

@@ -49,11 +50,13 @@ def _process_messages(
        context: Messages to be cached (typically expensive/static content).
        messages: Regular messages without caching (dynamic queries).
        system_prompt: Optional system instructions for the model.
+        cache_ttl: Cache TTL for context messages (e.g. "120s", "5m", "1h").
+            Set to None or empty string to disable caching.
 
    Returns:
        List of formatted messages ready for API calls, with:
        - System prompt at the beginning (if provided)
-        - Context messages with cache_control on the last one
+        - Context messages with cache_control on the last one (if cache_ttl)
        - Regular messages without caching
 
    System Prompt Location:

@@ -62,8 +65,10 @@ def _process_messages(
        allowing dynamic system prompts without breaking cache efficiency.
 
    Cache behavior:
-        The last context message gets ephemeral caching
+        The last context message gets ephemeral caching with specified TTL
        to reduce token usage on repeated calls with same context.
+        If cache_ttl is None or empty string (falsy), no caching is applied.
+        Only the last context message receives cache_control to maximize efficiency.
 
    Note:
        This is an internal function used by _generate_with_retry().
@@ -80,11 +85,12 @@ def _process_messages(
    # Use AIMessages.to_prompt() for context
    context_messages = context.to_prompt()
 
-    # Apply caching to last context message
-    context_messages[-1]["cache_control"] = {  # type: ignore
-        "type": "ephemeral",
-        "ttl": "120s",
-    }
+    # Apply caching to last context message if cache_ttl is set
+    if cache_ttl:
+        context_messages[-1]["cache_control"] = {  # type: ignore
+            "type": "ephemeral",
+            "ttl": cache_ttl,
+        }
 
    processed_messages.extend(context_messages)
 
@@ -173,7 +179,9 @@ async def _generate_with_retry(
    if not context and not messages:
        raise ValueError("Either context or messages must be provided")
 
-    processed_messages = _process_messages(context, messages, options.system_prompt)
+    processed_messages = _process_messages(
+        context, messages, options.system_prompt, options.cache_ttl
+    )
    completion_kwargs: dict[str, Any] = {
        "model": model,
        "messages": processed_messages,
@@ -215,7 +223,7 @@ async def _generate_with_retry(
 
 @trace(ignore_inputs=["context"])
 async def generate(
-    model: ModelName
+    model: ModelName,
    *,
    context: AIMessages | None = None,
    messages: AIMessages | str,
@@ -236,7 +244,7 @@ async def generate(
 
    Args:
        model: Model to use (e.g., "gpt-5", "gemini-2.5-pro", "grok-4").
-
+            Accepts predefined models or any string for custom models.
        context: Static context to cache (documents, examples, instructions).
            Defaults to None (empty context). Cached for 120 seconds.
        messages: Dynamic messages/queries. AIMessages or str ONLY.
@@ -292,6 +300,22 @@ async def generate(
        >>> # Second call: reuses cache, saves tokens!
        >>> r2 = await llm.generate("gpt-5", context=static_doc, messages="Key points?")
 
+        >>> # Custom cache TTL for longer-lived contexts
+        >>> response = await llm.generate(
+        ...     "gpt-5",
+        ...     context=static_doc,
+        ...     messages="Analyze this",
+        ...     options=ModelOptions(cache_ttl="300s")  # Cache for 5 minutes
+        ... )
+
+        >>> # Disable caching when context changes frequently
+        >>> response = await llm.generate(
+        ...     "gpt-5",
+        ...     context=dynamic_doc,
+        ...     messages="Process this",
+        ...     options=ModelOptions(cache_ttl=None)  # No caching
+        ... )
+
        >>> # AVOID unnecessary options (defaults are optimal)
        >>> response = await llm.generate(
        ...     "gpt-5",
@@ -310,14 +334,17 @@ async def generate(
    Performance:
        - Context caching saves ~50-90% tokens on repeated calls
        - First call: full token cost
-        - Subsequent calls (within
+        - Subsequent calls (within cache TTL): only messages tokens
+        - Default cache TTL is 120s (configurable via ModelOptions.cache_ttl)
        - Default retry delay is 10s (configurable via ModelOptions.retry_delay_seconds)
 
    Caching:
        When enabled in your LiteLLM proxy and supported by the upstream provider,
-        context messages may be cached
-
-
+        context messages may be cached to reduce token usage on repeated calls.
+        Default TTL is 120s, configurable via ModelOptions.cache_ttl (e.g. "300s", "5m").
+        Set cache_ttl=None to disable caching. Savings depend on provider and payload;
+        treat this as an optimization, not a guarantee. Cache behavior varies by proxy
+        configuration.
 
    Note:
        - Context argument is ignored by the tracer to avoid recording large data
@@ -350,7 +377,7 @@ T = TypeVar("T", bound=BaseModel)
 
 @trace(ignore_inputs=["context"])
 async def generate_structured(
-    model: ModelName
+    model: ModelName,
    response_format: type[T],
    *,
    context: AIMessages | None = None,
@@ -364,10 +391,8 @@ async def generate_structured(
    Type-safe generation that returns validated Pydantic model instances.
    Uses OpenAI's structured output feature for guaranteed schema compliance.
 
-    Best Practices
-
-        2. MESSAGES: Use AIMessages or str - wrap Documents in AIMessages
-        3. CONTEXT vs MESSAGES: Use context for static/cacheable, messages for dynamic
+    Best Practices:
+        Same as generate() - see generate() documentation for details.
 
    Args:
        model: Model to use (must support structured output).
@@ -473,3 +498,9 @@ async def generate_structured(
 
    # Create a StructuredModelResponse with the parsed value
    return StructuredModelResponse[T](chat_completion=response, parsed_value=parsed_value)
+
+
+# Public aliases for testing internal functions
+# These are exported to allow testing of implementation details
+process_messages_for_testing = _process_messages
+generate_with_retry_for_testing = _generate_with_retry
ai_pipeline_core/llm/model_options.py CHANGED

@@ -49,6 +49,10 @@ class ModelOptions(BaseModel):
 
        timeout: Maximum seconds to wait for response (default: 300).
 
+        cache_ttl: Cache TTL for context messages (default: "120s").
+            String format like "60s", "5m", or None to disable caching.
+            Applied to the last context message for efficient token reuse.
+
        service_tier: API tier selection for performance/cost trade-offs.
            "auto": Let API choose
            "default": Standard tier

@@ -79,6 +83,18 @@ class ModelOptions(BaseModel):
        ...     temperature=0.3  # Lower for code generation
        ... )
        >>>
+        >>> # With custom cache TTL
+        >>> options = ModelOptions(
+        ...     cache_ttl="300s",  # Cache context for 5 minutes
+        ...     max_completion_tokens=1000
+        ... )
+        >>>
+        >>> # Disable caching
+        >>> options = ModelOptions(
+        ...     cache_ttl=None,  # No context caching
+        ...     temperature=0.5
+        ... )
+        >>>
        >>> # For search-enabled models
        >>> options = ModelOptions(
        ...     search_context_size="high",  # Get more search results

@@ -96,6 +112,7 @@ class ModelOptions(BaseModel):
        - search_context_size only works with search models
        - reasoning_effort only works with models that support explicit reasoning
        - response_format is set internally by generate_structured()
+        - cache_ttl accepts formats like "120s", "5m", "1h" or None to disable caching
    """
 
    temperature: float | None = None

@@ -105,6 +122,7 @@ class ModelOptions(BaseModel):
    retries: int = 3
    retry_delay_seconds: int = 10
    timeout: int = 300
+    cache_ttl: str | None = "120s"
    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
    max_completion_tokens: int | None = None
    response_format: type[BaseModel] | None = None
ai_pipeline_core/llm/model_response.py CHANGED

@@ -146,36 +146,83 @@ class ModelResponse(ChatCompletion):
        self.headers = copy.deepcopy(headers)
 
    def get_laminar_metadata(self) -> dict[str, str | int | float]:
-        """Extract metadata for LMNR (Laminar) observability.
+        """Extract metadata for LMNR (Laminar) observability including cost tracking.
 
-        Collects comprehensive metadata about the generation for
-
+        Collects comprehensive metadata about the generation for tracing,
+        monitoring, and cost analysis in the LMNR platform. This method
+        provides detailed insights into token usage, caching effectiveness,
+        and generation costs.
 
        Returns:
            Dictionary containing:
-            - LiteLLM headers (call ID, costs, etc.)
-            - Token usage statistics
-            - Model configuration
-            - Cost information
-            - Cached token counts
+            - LiteLLM headers (call ID, costs, model info, etc.)
+            - Token usage statistics (input, output, total, cached)
+            - Model configuration used for generation
+            - Cost information in multiple formats
+            - Cached token counts (when context caching enabled)
            - Reasoning token counts (for O1 models)
 
        Metadata structure:
            - litellm.*: All LiteLLM-specific headers
-            - gen_ai.usage
+            - gen_ai.usage.prompt_tokens: Input token count
+            - gen_ai.usage.completion_tokens: Output token count
+            - gen_ai.usage.total_tokens: Total tokens used
+            - gen_ai.usage.cached_tokens: Cached tokens (if applicable)
+            - gen_ai.usage.reasoning_tokens: Reasoning tokens (O1 models)
+            - gen_ai.usage.output_cost: Generation cost in dollars
+            - gen_ai.usage.cost: Alternative cost field (same value)
+            - gen_ai.cost: Simple cost field (same value)
            - gen_ai.response.*: Response identifiers
-            - gen_ai.cost: Cost information
            - model_options.*: Configuration used
 
+        Cost tracking:
+            Cost information is extracted from two sources:
+            1. x-litellm-response-cost header (primary)
+            2. usage.cost attribute (fallback)
+
+            Cost is stored in three fields for compatibility:
+            - gen_ai.usage.output_cost (standard)
+            - gen_ai.usage.cost (alternative)
+            - gen_ai.cost (simple)
+
        Example:
-            >>> response = await llm.generate(
+            >>> response = await llm.generate(
+            ...     "gpt-5",
+            ...     context=large_doc,
+            ...     messages="Summarize this",
+            ...     options=ModelOptions(cache_ttl="300s")
+            ... )
+            >>>
+            >>> # Get comprehensive metadata
            >>> metadata = response.get_laminar_metadata()
-            >>>
-            >>>
+            >>>
+            >>> # Track generation cost
+            >>> cost = metadata.get('gen_ai.usage.output_cost', 0)
+            >>> if cost > 0:
+            ...     print(f"Generation cost: ${cost:.4f}")
+            >>>
+            >>> # Monitor token usage
+            >>> print(f"Input: {metadata.get('gen_ai.usage.prompt_tokens', 0)} tokens")
+            >>> print(f"Output: {metadata.get('gen_ai.usage.completion_tokens', 0)} tokens")
+            >>> print(f"Total: {metadata.get('gen_ai.usage.total_tokens', 0)} tokens")
+            >>>
+            >>> # Check cache effectiveness
+            >>> cached = metadata.get('gen_ai.usage.cached_tokens', 0)
+            >>> if cached > 0:
+            ...     total = metadata.get('gen_ai.usage.total_tokens', 1)
+            ...     savings = (cached / total) * 100
+            ...     print(f"Cache hit: {cached} tokens ({savings:.1f}% savings)")
+            >>>
+            >>> # Calculate cost per token
+            >>> if cost > 0 and metadata.get('gen_ai.usage.total_tokens'):
+            ...     cost_per_1k = (cost / metadata['gen_ai.usage.total_tokens']) * 1000
+            ...     print(f"Cost per 1K tokens: ${cost_per_1k:.4f}")
 
        Note:
-
-
+            - Cost availability depends on LiteLLM proxy configuration
+            - Not all providers return cost information
+            - Cached tokens reduce actual cost but may not be reflected
+            - Used internally by tracing but accessible for cost analysis
        """
        metadata: dict[str, str | int | float] = {}
 
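Since each `ModelResponse` reports its cost under `gen_ai.usage.output_cost`, a pipeline can total spend across calls. A small sketch; the responses list and the treatment of missing cost values are illustrative.

```python
from ai_pipeline_core.llm import ModelResponse


def total_cost(responses: list[ModelResponse]) -> float:
    """Sum reported generation costs, skipping responses without cost data."""
    total = 0.0
    for response in responses:
        metadata = response.get_laminar_metadata()
        cost = metadata.get("gen_ai.usage.output_cost", 0)
        total += float(cost) if cost else 0.0  # some providers return no cost
    return total
```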
ai_pipeline_core/llm/model_types.py CHANGED

@@ -12,28 +12,32 @@ Model categories:
 
 from typing import Literal, TypeAlias
 
-ModelName: TypeAlias =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+ModelName: TypeAlias = (
+    Literal[
+        # Core models
+        "gemini-2.5-pro",
+        "gpt-5",
+        "grok-4",
+        # Small models
+        "gemini-2.5-flash",
+        "gpt-5-mini",
+        "grok-3-mini",
+        # Search models
+        "gemini-2.5-flash-search",
+        "sonar-pro-search",
+        "gpt-4o-search",
+        "grok-3-mini-search",
+    ]
+    | str
+)
+"""Type-safe model name identifiers with support for custom models.
 
 @public
 
-Provides
-
-and
+Provides IDE autocompletion for common model names while allowing any
+string for custom models. The type is a union of predefined literals
+and str, giving you the best of both worlds: suggestions for known
+models and flexibility for custom ones.
 
 Note: These are example common model names as of Q3 2025. Actual availability
 depends on your LiteLLM proxy configuration and provider access.
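A hedged sketch of how the widened `ModelName` type behaves at a call site; the routing rule and the custom model id are made up for illustration.

```python
from ai_pipeline_core import ModelName


def pick_model(prompt: str, prefer_cheap: bool = True) -> ModelName:
    """Route to a small predefined model or a custom deployment (hypothetical rule)."""
    if prefer_cheap and len(prompt) < 2_000:
        return "gpt-5-mini"        # predefined literal: IDE autocomplete applies
    return "my-company-llm-v1"     # any string type-checks because ModelName includes str
```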
@@ -51,32 +55,30 @@ Model categories:
    Models with integrated web search capabilities for retrieving
    and synthesizing current information.
 
-
-
-
-
-
-    3. Or simply use strings: model = "any-model-via-litellm"
+Using custom models:
+    ModelName now includes str, so you can use any model name directly:
+    - Predefined models get IDE autocomplete and validation
+    - Custom models work seamlessly as strings
+    - No need for Union types or additional type aliases
 
 Example:
    >>> from ai_pipeline_core import llm, ModelName
    >>>
-    >>> #
-    >>> model: ModelName = "gpt-5"  # IDE
+    >>> # Predefined model with IDE autocomplete
+    >>> model: ModelName = "gpt-5"  # IDE suggests common models
    >>> response = await llm.generate(model, messages="Hello")
    >>>
-    >>> #
-    >>>
+    >>> # Custom model works directly
+    >>> model: ModelName = "custom-model-v2"  # Any string is valid
+    >>> response = await llm.generate(model, messages="Hello")
    >>>
-    >>> #
-    >>>
-    >>> MyModel = Literal["company-llm-v1"]
-    >>> model: ModelName | MyModel = "company-llm-v1"
+    >>> # Both types work seamlessly
+    >>> models: list[ModelName] = ["gpt-5", "custom-llm", "gemini-2.5-pro"]
 
 Note:
-
-
-
+    The ModelName type includes both predefined literals and str,
+    allowing full flexibility while maintaining IDE support for
+    common models.
 
 See Also:
    - llm.generate: Main generation function