ai-pipeline-core 0.1.5__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/PKG-INFO +92 -30
  2. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/README.md +90 -28
  3. ai_pipeline_core-0.1.7/ai_pipeline_core/__init__.py +77 -0
  4. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/documents/__init__.py +3 -0
  5. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/documents/document.py +57 -3
  6. ai_pipeline_core-0.1.7/ai_pipeline_core/documents/mime_type.py +110 -0
  7. ai_pipeline_core-0.1.7/ai_pipeline_core/flow/__init__.py +7 -0
  8. ai_pipeline_core-0.1.7/ai_pipeline_core/flow/options.py +26 -0
  9. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/llm/client.py +5 -3
  10. ai_pipeline_core-0.1.7/ai_pipeline_core/pipeline.py +418 -0
  11. ai_pipeline_core-0.1.7/ai_pipeline_core/prefect.py +7 -0
  12. ai_pipeline_core-0.1.7/ai_pipeline_core/simple_runner/__init__.py +19 -0
  13. ai_pipeline_core-0.1.7/ai_pipeline_core/simple_runner/cli.py +95 -0
  14. ai_pipeline_core-0.1.7/ai_pipeline_core/simple_runner/simple_runner.py +147 -0
  15. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/tracing.py +63 -20
  16. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/pyproject.toml +3 -3
  17. ai_pipeline_core-0.1.5/ai_pipeline_core/__init__.py +0 -36
  18. ai_pipeline_core-0.1.5/ai_pipeline_core/documents/mime_type.py +0 -78
  19. ai_pipeline_core-0.1.5/ai_pipeline_core/flow/__init__.py +0 -3
  20. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/.gitignore +0 -0
  21. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/LICENSE +0 -0
  22. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/documents/document_list.py +0 -0
  23. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/documents/flow_document.py +0 -0
  24. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/documents/task_document.py +0 -0
  25. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/documents/utils.py +0 -0
  26. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/exceptions.py +0 -0
  27. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/flow/config.py +0 -0
  28. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/llm/__init__.py +0 -0
  29. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/llm/ai_messages.py +0 -0
  30. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/llm/model_options.py +0 -0
  31. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/llm/model_response.py +0 -0
  32. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/llm/model_types.py +0 -0
  33. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/logging/__init__.py +0 -0
  34. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/logging/logging.yml +0 -0
  35. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/logging/logging_config.py +0 -0
  36. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/logging/logging_mixin.py +0 -0
  37. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/prompt_manager.py +0 -0
  38. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/py.typed +0 -0
  39. {ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/settings.py +0 -0

{ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ai-pipeline-core
- Version: 0.1.5
+ Version: 0.1.7
  Summary: Core utilities for AI-powered processing pipelines using prefect
  Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
  Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
@@ -20,7 +20,7 @@ Classifier: Typing :: Typed
  Requires-Python: >=3.12
  Requires-Dist: httpx>=0.28.1
  Requires-Dist: jinja2>=3.1.6
- Requires-Dist: lmnr>=0.7.5
+ Requires-Dist: lmnr>=0.7.6
  Requires-Dist: openai>=1.99.9
  Requires-Dist: prefect>=3.4.13
  Requires-Dist: pydantic-settings>=2.10.1
@@ -151,40 +151,76 @@ async def process_document(doc: Document):
      return response.parsed
  ```
  
- ### Prefect Flow Integration
+ ### Enhanced Pipeline Decorators (New in v0.1.7)
  ```python
- from prefect import flow, task
- from ai_pipeline_core.documents import Document, DocumentList, FlowDocument
- from ai_pipeline_core.flow import FlowConfig
- from ai_pipeline_core.tracing import trace
+ from ai_pipeline_core import pipeline_flow, pipeline_task
+ from ai_pipeline_core.flow import FlowOptions
+ from ai_pipeline_core.documents import DocumentList, FlowDocument
  
- class OutputDocument(FlowDocument):
-     """Custom output document type"""
-     def get_type(self) -> str:
-         return "output"
+ class CustomFlowOptions(FlowOptions):
+     """Extend base options with your custom fields"""
+     batch_size: int = 100
+     temperature: float = 0.7
  
- class MyFlowConfig(FlowConfig):
-     INPUT_DOCUMENT_TYPES = [InputDocument]
-     OUTPUT_DOCUMENT_TYPE = OutputDocument
-
- @task
- @trace
+ @pipeline_task(trace_level="always", retries=3)
  async def process_task(doc: Document) -> Document:
-     # Task-level processing with automatic tracing
+     # Task with automatic tracing and retries
      result = await process_document(doc)
-     # Convert result to JSON string for document content
-     import json
-     return OutputDocument(name="result", content=json.dumps(result.model_dump()).encode())
+     return OutputDocument(name="result", content=result.encode())
+
+ @pipeline_flow(trace_level="always")
+ async def my_pipeline(
+     project_name: str,
+     documents: DocumentList,
+     flow_options: CustomFlowOptions  # Type-safe custom options
+ ) -> DocumentList:
+     # Pipeline flow with enforced signature and tracing
+     results = []
+     for doc in documents:
+         result = await process_task(doc)
+         results.append(result)
+     return DocumentList(results)
+ ```
  
- @flow
- async def my_pipeline(documents: DocumentList):
-     config = MyFlowConfig()
-     input_docs = config.get_input_documents(documents)
+ ### Simple Runner Utility (New in v0.1.7)
+ ```python
+ from ai_pipeline_core.simple_runner import run_cli, run_pipeline
+ from ai_pipeline_core.flow import FlowOptions
+
+ # CLI-based pipeline execution
+ if __name__ == "__main__":
+     run_cli(
+         flows=[my_pipeline],
+         flow_configs=[MyFlowConfig],
+         options_cls=CustomFlowOptions
+     )
  
-     results = await process_task.map(input_docs)
+ # Or programmatic execution
+ async def main():
+     result = await run_pipeline(
+         project_name="my-project",
+         output_dir=Path("./output"),
+         flow=my_pipeline,
+         flow_config=MyFlowConfig,
+         flow_options=CustomFlowOptions(batch_size=50)
+     )
+ ```
  
-     config.validate_output_documents(results)
-     return results
+ ### Clean Prefect Decorators (New in v0.1.7)
+ ```python
+ # Import clean Prefect decorators without tracing
+ from ai_pipeline_core.prefect import flow, task
+
+ # Or use pipeline decorators with tracing
+ from ai_pipeline_core import pipeline_flow, pipeline_task
+
+ @task  # Clean Prefect task
+ def compute(x: int) -> int:
+     return x * 2
+
+ @pipeline_task(trace_level="always")  # With tracing
+ def compute_traced(x: int) -> int:
+     return x * 2
  ```
  
  ## Core Modules
@@ -291,8 +327,14 @@ ai_pipeline_core/
  │   ├── client.py            # Async client implementation
  │   └── model_options.py     # Configuration models
  ├── flow/                    # Prefect flow utilities
- │   └── config.py            # Type-safe flow configuration
+ │   ├── config.py            # Type-safe flow configuration
+ │   └── options.py           # FlowOptions base class (v0.1.7)
+ ├── simple_runner/           # Pipeline execution utilities (v0.1.7)
+ │   ├── cli.py               # CLI interface
+ │   └── simple_runner.py     # Core runner logic
  ├── logging/                 # Structured logging
+ ├── pipeline.py              # Enhanced decorators (v0.1.7)
+ ├── prefect.py               # Clean Prefect exports (v0.1.7)
  ├── tracing.py               # Observability decorators
  └── settings.py              # Centralized configuration
  ```
@@ -469,9 +511,29 @@ Built with:
  - [LiteLLM](https://litellm.ai/) - LLM proxy
  - [Pydantic](https://pydantic-docs.helpmanual.io/) - Data validation
  
+ ## What's New in v0.1.7
+
+ ### Major Additions
+ - **Enhanced Pipeline Decorators**: New `pipeline_flow` and `pipeline_task` decorators combining Prefect functionality with automatic LMNR tracing
+ - **FlowOptions Base Class**: Extensible configuration system for flows with type-safe inheritance
+ - **Simple Runner Module**: CLI and programmatic utilities for easy pipeline execution
+ - **Clean Prefect Exports**: Separate imports for Prefect decorators with and without tracing
+ - **Expanded Exports**: All major components now accessible from top-level package import
+
+ ### API Improvements
+ - Better type inference for document flows with custom options
+ - Support for custom FlowOptions inheritance in pipeline flows
+ - Improved error messages for invalid flow signatures
+ - Enhanced document utility functions (`canonical_name_key`, `sanitize_url`)
+
+ ### Developer Experience
+ - Simplified imports - most components available from `ai_pipeline_core` directly
+ - Better separation of concerns between clean Prefect and traced pipeline decorators
+ - More intuitive flow configuration with `FlowOptions` inheritance
+
  ## Stability Notice
  
- **Current Version**: 0.1.5
+ **Current Version**: 0.1.7
  **Status**: Internal Preview
  **API Stability**: Unstable - Breaking changes expected
  **Recommended Use**: Learning and reference only

{ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/README.md
@@ -109,40 +109,76 @@ async def process_document(doc: Document):
      return response.parsed
  ```
  
- ### Prefect Flow Integration
+ ### Enhanced Pipeline Decorators (New in v0.1.7)
  ```python
- from prefect import flow, task
- from ai_pipeline_core.documents import Document, DocumentList, FlowDocument
- from ai_pipeline_core.flow import FlowConfig
- from ai_pipeline_core.tracing import trace
+ from ai_pipeline_core import pipeline_flow, pipeline_task
+ from ai_pipeline_core.flow import FlowOptions
+ from ai_pipeline_core.documents import DocumentList, FlowDocument
  
- class OutputDocument(FlowDocument):
-     """Custom output document type"""
-     def get_type(self) -> str:
-         return "output"
+ class CustomFlowOptions(FlowOptions):
+     """Extend base options with your custom fields"""
+     batch_size: int = 100
+     temperature: float = 0.7
  
- class MyFlowConfig(FlowConfig):
-     INPUT_DOCUMENT_TYPES = [InputDocument]
-     OUTPUT_DOCUMENT_TYPE = OutputDocument
-
- @task
- @trace
+ @pipeline_task(trace_level="always", retries=3)
  async def process_task(doc: Document) -> Document:
-     # Task-level processing with automatic tracing
+     # Task with automatic tracing and retries
      result = await process_document(doc)
-     # Convert result to JSON string for document content
-     import json
-     return OutputDocument(name="result", content=json.dumps(result.model_dump()).encode())
+     return OutputDocument(name="result", content=result.encode())
+
+ @pipeline_flow(trace_level="always")
+ async def my_pipeline(
+     project_name: str,
+     documents: DocumentList,
+     flow_options: CustomFlowOptions  # Type-safe custom options
+ ) -> DocumentList:
+     # Pipeline flow with enforced signature and tracing
+     results = []
+     for doc in documents:
+         result = await process_task(doc)
+         results.append(result)
+     return DocumentList(results)
+ ```
  
- @flow
- async def my_pipeline(documents: DocumentList):
-     config = MyFlowConfig()
-     input_docs = config.get_input_documents(documents)
+ ### Simple Runner Utility (New in v0.1.7)
+ ```python
+ from ai_pipeline_core.simple_runner import run_cli, run_pipeline
+ from ai_pipeline_core.flow import FlowOptions
+
+ # CLI-based pipeline execution
+ if __name__ == "__main__":
+     run_cli(
+         flows=[my_pipeline],
+         flow_configs=[MyFlowConfig],
+         options_cls=CustomFlowOptions
+     )
  
-     results = await process_task.map(input_docs)
+ # Or programmatic execution
+ async def main():
+     result = await run_pipeline(
+         project_name="my-project",
+         output_dir=Path("./output"),
+         flow=my_pipeline,
+         flow_config=MyFlowConfig,
+         flow_options=CustomFlowOptions(batch_size=50)
+     )
+ ```
  
-     config.validate_output_documents(results)
-     return results
+ ### Clean Prefect Decorators (New in v0.1.7)
+ ```python
+ # Import clean Prefect decorators without tracing
+ from ai_pipeline_core.prefect import flow, task
+
+ # Or use pipeline decorators with tracing
+ from ai_pipeline_core import pipeline_flow, pipeline_task
+
+ @task  # Clean Prefect task
+ def compute(x: int) -> int:
+     return x * 2
+
+ @pipeline_task(trace_level="always")  # With tracing
+ def compute_traced(x: int) -> int:
+     return x * 2
  ```
  
  ## Core Modules
@@ -249,8 +285,14 @@ ai_pipeline_core/
  │   ├── client.py            # Async client implementation
  │   └── model_options.py     # Configuration models
  ├── flow/                    # Prefect flow utilities
- │   └── config.py            # Type-safe flow configuration
+ │   ├── config.py            # Type-safe flow configuration
+ │   └── options.py           # FlowOptions base class (v0.1.7)
+ ├── simple_runner/           # Pipeline execution utilities (v0.1.7)
+ │   ├── cli.py               # CLI interface
+ │   └── simple_runner.py     # Core runner logic
  ├── logging/                 # Structured logging
+ ├── pipeline.py              # Enhanced decorators (v0.1.7)
+ ├── prefect.py               # Clean Prefect exports (v0.1.7)
  ├── tracing.py               # Observability decorators
  └── settings.py              # Centralized configuration
  ```
@@ -427,9 +469,29 @@ Built with:
  - [LiteLLM](https://litellm.ai/) - LLM proxy
  - [Pydantic](https://pydantic-docs.helpmanual.io/) - Data validation
  
+ ## What's New in v0.1.7
+
+ ### Major Additions
+ - **Enhanced Pipeline Decorators**: New `pipeline_flow` and `pipeline_task` decorators combining Prefect functionality with automatic LMNR tracing
+ - **FlowOptions Base Class**: Extensible configuration system for flows with type-safe inheritance
+ - **Simple Runner Module**: CLI and programmatic utilities for easy pipeline execution
+ - **Clean Prefect Exports**: Separate imports for Prefect decorators with and without tracing
+ - **Expanded Exports**: All major components now accessible from top-level package import
+
+ ### API Improvements
+ - Better type inference for document flows with custom options
+ - Support for custom FlowOptions inheritance in pipeline flows
+ - Improved error messages for invalid flow signatures
+ - Enhanced document utility functions (`canonical_name_key`, `sanitize_url`)
+
+ ### Developer Experience
+ - Simplified imports - most components available from `ai_pipeline_core` directly
+ - Better separation of concerns between clean Prefect and traced pipeline decorators
+ - More intuitive flow configuration with `FlowOptions` inheritance
+
  ## Stability Notice
  
- **Current Version**: 0.1.5
+ **Current Version**: 0.1.7
  **Status**: Internal Preview
  **API Stability**: Unstable - Breaking changes expected
  **Recommended Use**: Learning and reference only

ai_pipeline_core-0.1.7/ai_pipeline_core/__init__.py
@@ -0,0 +1,77 @@
+ """Pipeline Core - Shared infrastructure for AI pipelines."""
+
+ from . import llm
+ from .documents import (
+     Document,
+     DocumentList,
+     FlowDocument,
+     TaskDocument,
+     canonical_name_key,
+     sanitize_url,
+ )
+ from .flow import FlowConfig, FlowOptions
+ from .llm import (
+     AIMessages,
+     AIMessageType,
+     ModelName,
+     ModelOptions,
+     ModelResponse,
+     StructuredModelResponse,
+ )
+ from .logging import (
+     LoggerMixin,
+     LoggingConfig,
+     StructuredLoggerMixin,
+     get_pipeline_logger,
+     setup_logging,
+ )
+ from .logging import get_pipeline_logger as get_logger
+ from .pipeline import pipeline_flow, pipeline_task
+ from .prefect import flow, task
+ from .prompt_manager import PromptManager
+ from .settings import settings
+ from .tracing import TraceInfo, TraceLevel, trace
+
+ __version__ = "0.1.7"
+
+ __all__ = [
+     # Config/Settings
+     "settings",
+     # Logging
+     "get_logger",
+     "get_pipeline_logger",
+     "LoggerMixin",
+     "LoggingConfig",
+     "setup_logging",
+     "StructuredLoggerMixin",
+     # Documents
+     "Document",
+     "DocumentList",
+     "FlowDocument",
+     "TaskDocument",
+     "canonical_name_key",
+     "sanitize_url",
+     # Flow/Task
+     "FlowConfig",
+     "FlowOptions",
+     # Prefect decorators (clean, no tracing)
+     "task",
+     "flow",
+     # Pipeline decorators (with tracing)
+     "pipeline_task",
+     "pipeline_flow",
+     # LLM
+     "llm",
+     "ModelName",
+     "ModelOptions",
+     "ModelResponse",
+     "StructuredModelResponse",
+     "AIMessages",
+     "AIMessageType",
+     # Tracing
+     "trace",
+     "TraceLevel",
+     "TraceInfo",
+     # Utils
+     "PromptManager",
+ ]
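
The new `__init__.py` flattens the public surface; a minimal sketch of user code against these exports (the task body and its decorator arguments are illustrative assumptions, not package documentation):

```python
# Sketch only: exercises the top-level exports listed in the diff above.
from ai_pipeline_core import AIMessages, DocumentList, get_logger, pipeline_task

logger = get_logger(__name__)  # alias for get_pipeline_logger

@pipeline_task(trace_level="always")  # traced variant; the plain `task` export skips tracing
async def summarize(documents: DocumentList) -> AIMessages:
    logger.info("summarizing %d documents", len(documents))
    return AIMessages()
```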

{ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/documents/__init__.py
@@ -2,10 +2,13 @@ from .document import Document
  from .document_list import DocumentList
  from .flow_document import FlowDocument
  from .task_document import TaskDocument
+ from .utils import canonical_name_key, sanitize_url
  
  __all__ = [
      "Document",
      "DocumentList",
      "FlowDocument",
      "TaskDocument",
+     "canonical_name_key",
+     "sanitize_url",
  ]

{ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/documents/document.py
@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
  from base64 import b32encode
  from enum import StrEnum
  from functools import cached_property
- from typing import Any, ClassVar, Literal, Self
+ from typing import Any, ClassVar, Literal, Self, TypeVar
  
  from pydantic import BaseModel, ConfigDict, field_serializer, field_validator
  from ruamel.yaml import YAML
@@ -19,8 +19,11 @@ from .mime_type import (
      is_image_mime_type,
      is_pdf_mime_type,
      is_text_mime_type,
+     is_yaml_mime_type,
  )
  
+ TModel = TypeVar("TModel", bound=BaseModel)
+
  
  class Document(BaseModel, ABC):
      """Abstract base class for all documents"""
@@ -207,15 +210,40 @@ class Document(BaseModel, ABC):
          """Parse document as JSON"""
          return json.loads(self.as_text())
  
+     def as_pydantic_model(self, model_type: type[TModel]) -> TModel:
+         """Parse document as a pydantic model and return the validated instance"""
+         data = self.as_yaml() if is_yaml_mime_type(self.mime_type) else self.as_json()
+         return model_type.model_validate(data)
+
      def as_markdown_list(self) -> list[str]:
          """Parse document as a markdown list"""
          return self.as_text().split(self.MARKDOWN_LIST_SEPARATOR)
  
      @classmethod
-     def create(cls, name: str, description: str | None, content: bytes | str) -> Self:
+     def create(
+         cls,
+         name: str,
+         description: str | None,
+         content: bytes | str | BaseModel | list[str] | Any,
+     ) -> Self:
          """Create a document from a name, description, and content"""
-         if isinstance(content, str):
+         is_yaml_extension = name.endswith(".yaml") or name.endswith(".yml")
+         is_json_extension = name.endswith(".json")
+         is_markdown_extension = name.endswith(".md")
+         is_str_list = isinstance(content, list) and all(isinstance(item, str) for item in content)
+         if isinstance(content, bytes):
+             pass
+         elif isinstance(content, str):
              content = content.encode("utf-8")
+         elif is_str_list and is_markdown_extension:
+             return cls.create_as_markdown_list(name, description, content)  # type: ignore[arg-type]
+         elif is_yaml_extension:
+             return cls.create_as_yaml(name, description, content)
+         elif is_json_extension:
+             return cls.create_as_json(name, description, content)
+         else:
+             raise ValueError(f"Unsupported content type: {type(content)} for {name}")
+
          return cls(name=name, description=description, content=content)
  
      @classmethod
@@ -230,6 +258,32 @@ class Document(BaseModel, ABC):
          content = Document.MARKDOWN_LIST_SEPARATOR.join(cleaned_items)
          return cls.create(name, description, content)
  
+     @classmethod
+     def create_as_json(cls, name: str, description: str | None, data: Any) -> Self:
+         """Create a document from a name, description, and JSON data"""
+         assert name.endswith(".json"), f"Document name must end with .json: {name}"
+         if isinstance(data, BaseModel):
+             data = data.model_dump(mode="json")
+         content = json.dumps(data, indent=2).encode("utf-8")
+         return cls.create(name, description, content)
+
+     @classmethod
+     def create_as_yaml(cls, name: str, description: str | None, data: Any) -> Self:
+         """Create a document from a name, description, and YAML data"""
+         assert name.endswith(".yaml") or name.endswith(".yml"), (
+             f"Document name must end with .yaml or .yml: {name}"
+         )
+         if isinstance(data, BaseModel):
+             data = data.model_dump()
+         yaml = YAML()
+         yaml.indent(mapping=2, sequence=4, offset=2)
+         from io import BytesIO
+
+         stream = BytesIO()
+         yaml.dump(data, stream)
+         content = stream.getvalue()
+         return cls.create(name, description, content)
+
      def serialize_model(self) -> dict[str, Any]:
          """Serialize document to a dictionary with proper encoding."""
          result = {
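
Together with `as_pydantic_model` above, the new `create_as_*` helpers enable a pydantic round-trip through a document. A sketch, assuming `ReportDocument` is some concrete `FlowDocument` subclass (the diff does not define one):

```python
# Hypothetical round-trip; ReportDocument is illustrative, not part of the diff.
from pydantic import BaseModel

class Stats(BaseModel):
    total: int
    label: str

# create() now dispatches on extension: ".yaml"/".yml" routes to create_as_yaml()
doc = ReportDocument.create("stats.yaml", None, Stats(total=3, label="demo"))

# as_pydantic_model() parses via as_yaml() or as_json() depending on the MIME type,
# then re-validates the payload against the given model
restored = doc.as_pydantic_model(Stats)
assert restored == Stats(total=3, label="demo")
```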

ai_pipeline_core-0.1.7/ai_pipeline_core/documents/mime_type.py
@@ -0,0 +1,110 @@
+ """MIME type detection utilities for documents"""
+
+ import magic
+
+ from ai_pipeline_core.logging import get_pipeline_logger
+
+ logger = get_pipeline_logger(__name__)
+
+ # Extension to MIME type mapping for common formats
+ # These are formats where extension-based detection is more reliable
+ EXTENSION_MIME_MAP = {
+     "md": "text/markdown",
+     "txt": "text/plain",
+     "pdf": "application/pdf",
+     "png": "image/png",
+     "jpg": "image/jpeg",
+     "jpeg": "image/jpeg",
+     "gif": "image/gif",
+     "bmp": "image/bmp",
+     "webp": "image/webp",
+     "json": "application/json",
+     "yaml": "application/yaml",
+     "yml": "application/yaml",
+     "xml": "text/xml",
+     "html": "text/html",
+     "htm": "text/html",
+     "py": "text/x-python",
+     "css": "text/css",
+     "js": "application/javascript",
+     "ts": "application/typescript",
+     "tsx": "application/typescript",
+     "jsx": "application/javascript",
+ }
+
+
+ def detect_mime_type(content: bytes, name: str) -> str:
+     """Detect MIME type from content and filename
+
+     Uses a hybrid approach:
+     1. Check for empty content
+     2. Try extension-based detection for known formats
+     3. Fall back to magic content detection
+     4. Final fallback to application/octet-stream
+     """
+
+     # Check for empty content
+     if len(content) == 0:
+         return "application/x-empty"
+
+     # Try extension-based detection first for known formats
+     # This is more reliable for text formats that magic might misidentify
+     ext = name.lower().split(".")[-1] if "." in name else ""
+     if ext in EXTENSION_MIME_MAP:
+         return EXTENSION_MIME_MAP[ext]
+
+     # Try content-based detection with magic
+     try:
+         mime = magic.from_buffer(content[:1024], mime=True)
+         # If magic returns a valid mime type, use it
+         if mime and mime != "application/octet-stream":
+             return mime
+     except (AttributeError, OSError, magic.MagicException) as e:
+         logger.warning(f"MIME detection failed for {name}: {e}")
+     except Exception as e:
+         logger.error(f"Unexpected error in MIME detection for {name}: {e}")
+
+     # Final fallback based on extension or default
+     return EXTENSION_MIME_MAP.get(ext, "application/octet-stream")
+
+
+ def mime_type_from_extension(name: str) -> str:
+     """Get MIME type based on file extension
+
+     Legacy function kept for compatibility
+     """
+     ext = name.lower().split(".")[-1] if "." in name else ""
+     return EXTENSION_MIME_MAP.get(ext, "application/octet-stream")
+
+
+ def is_text_mime_type(mime_type: str) -> bool:
+     """Check if MIME type represents text content"""
+     text_types = [
+         "text/",
+         "application/json",
+         "application/xml",
+         "application/javascript",
+         "application/yaml",
+         "application/x-yaml",
+     ]
+     return any(mime_type.startswith(t) for t in text_types)
+
+
+ def is_json_mime_type(mime_type: str) -> bool:
+     """Check if MIME type is JSON"""
+     return mime_type == "application/json"
+
+
+ def is_yaml_mime_type(mime_type: str) -> bool:
+     """Check if MIME type is YAML"""
+     return mime_type == "application/yaml" or mime_type == "application/x-yaml"
+
+
+ def is_pdf_mime_type(mime_type: str) -> bool:
+     """Check if MIME type is PDF"""
+     return mime_type == "application/pdf"
+
+
+ def is_image_mime_type(mime_type: str) -> bool:
+     """Check if MIME type is an image"""
+     return mime_type.startswith("image/")
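
The detection precedence reads directly off the code above; a sketch of the observable behavior (requires libmagic at runtime, and the content-sniffing result depends on the installed magic database):

```python
from ai_pipeline_core.documents.mime_type import detect_mime_type

assert detect_mime_type(b"", "notes.md") == "application/x-empty"   # 1. empty content
assert detect_mime_type(b"# Title", "notes.md") == "text/markdown"  # 2. extension map
# 3. unknown extension falls through to libmagic content sniffing,
#    which should report "image/png" for a PNG signature
detect_mime_type(b"\x89PNG\r\n\x1a\n", "blob")
# 4. if libmagic also gives up, the result is "application/octet-stream"
```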

ai_pipeline_core-0.1.7/ai_pipeline_core/flow/__init__.py
@@ -0,0 +1,7 @@
+ from .config import FlowConfig
+ from .options import FlowOptions
+
+ __all__ = [
+     "FlowConfig",
+     "FlowOptions",
+ ]

ai_pipeline_core-0.1.7/ai_pipeline_core/flow/options.py
@@ -0,0 +1,26 @@
+ from typing import TypeVar
+
+ from pydantic import Field
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+ from ai_pipeline_core.llm import ModelName
+
+ T = TypeVar("T", bound="FlowOptions")
+
+
+ class FlowOptions(BaseSettings):
+     """Base configuration for AI Pipeline flows."""
+
+     core_model: ModelName | str = Field(
+         default="gpt-5",
+         description="Primary model for complex analysis and generation tasks.",
+     )
+     small_model: ModelName | str = Field(
+         default="gpt-5-mini",
+         description="Fast, cost-effective model for simple tasks and orchestration.",
+     )
+
+     model_config = SettingsConfigDict(frozen=True, extra="ignore")
+
+
+ __all__ = ["FlowOptions"]
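
Since `FlowOptions` is a frozen `pydantic-settings` model, subclasses inherit its fields plus settings-style loading, and instances are immutable. A sketch (`batch_size` is an illustrative field, not part of the package):

```python
from pydantic import Field
from ai_pipeline_core.flow import FlowOptions

class MyOptions(FlowOptions):
    batch_size: int = Field(default=100, description="Documents per batch")

opts = MyOptions(batch_size=50)
print(opts.small_model)  # "gpt-5-mini", inherited default

# frozen=True forbids mutation; derive a modified copy instead
tuned = opts.model_copy(update={"core_model": "gpt-5-mini"})
```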

{ai_pipeline_core-0.1.5 → ai_pipeline_core-0.1.7}/ai_pipeline_core/llm/client.py
@@ -118,11 +118,13 @@ async def _generate_with_retry(
              span.set_attributes(response.get_laminar_metadata())
              Laminar.set_span_output(response.content)
              if not response.content:
-                 # disable cache in case of empty response
-                 completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
                  raise ValueError(f"Model {model} returned an empty response.")
              return response
          except (asyncio.TimeoutError, ValueError, Exception) as e:
+             if not isinstance(e, asyncio.TimeoutError):
+                 # disable cache if it's not a timeout because it may cause an error
+                 completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
+
              logger.warning(
                  "LLM generation failed (attempt %d/%d): %s",
                  attempt + 1,
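
The change above moves cache-busting out of the empty-response branch and into the exception handler, and skips it for timeouts (a timed-out request says nothing about a cached bad response). In isolation the pattern looks roughly like this (a simplified sketch, not the client's actual code):

```python
# Simplified sketch of the retry/cache-busting pattern; names are illustrative.
import asyncio

async def call_with_retry(make_call, completion_kwargs, attempts=3):
    last_error = None
    for attempt in range(attempts):
        try:
            return await make_call(**completion_kwargs)
        except Exception as e:
            last_error = e
            if not isinstance(e, asyncio.TimeoutError):
                # non-timeout failures may stem from a cached bad response,
                # so bypass the cache on the next attempt
                completion_kwargs.setdefault("extra_body", {})["cache"] = {"no-cache": True}
    raise last_error
```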
@@ -167,7 +169,7 @@ T = TypeVar("T", bound=BaseModel)
  
  @trace(ignore_inputs=["context"])
  async def generate_structured(
-     model: ModelName,
+     model: ModelName | str,
      response_format: type[T],
      *,
      context: AIMessages = AIMessages(),