ai-pipeline-core 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +54 -13
- ai_pipeline_core/documents/__init__.py +3 -0
- ai_pipeline_core/documents/document.py +57 -3
- ai_pipeline_core/documents/mime_type.py +64 -32
- ai_pipeline_core/flow/__init__.py +5 -1
- ai_pipeline_core/flow/options.py +26 -0
- ai_pipeline_core/llm/client.py +5 -3
- ai_pipeline_core/pipeline.py +418 -0
- ai_pipeline_core/prefect.py +7 -0
- ai_pipeline_core/simple_runner/__init__.py +19 -0
- ai_pipeline_core/simple_runner/cli.py +95 -0
- ai_pipeline_core/simple_runner/simple_runner.py +147 -0
- ai_pipeline_core/tracing.py +63 -20
- {ai_pipeline_core-0.1.5.dist-info → ai_pipeline_core-0.1.7.dist-info}/METADATA +92 -30
- {ai_pipeline_core-0.1.5.dist-info → ai_pipeline_core-0.1.7.dist-info}/RECORD +17 -11
- {ai_pipeline_core-0.1.5.dist-info → ai_pipeline_core-0.1.7.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.5.dist-info → ai_pipeline_core-0.1.7.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/__init__.py
CHANGED
@@ -1,7 +1,23 @@
 """Pipeline Core - Shared infrastructure for AI pipelines."""
 
-from .
-from .
+from . import llm
+from .documents import (
+    Document,
+    DocumentList,
+    FlowDocument,
+    TaskDocument,
+    canonical_name_key,
+    sanitize_url,
+)
+from .flow import FlowConfig, FlowOptions
+from .llm import (
+    AIMessages,
+    AIMessageType,
+    ModelName,
+    ModelOptions,
+    ModelResponse,
+    StructuredModelResponse,
+)
 from .logging import (
     LoggerMixin,
     LoggingConfig,
@@ -9,28 +25,53 @@ from .logging import (
     get_pipeline_logger,
     setup_logging,
 )
-from .logging import
-
-
+from .logging import get_pipeline_logger as get_logger
+from .pipeline import pipeline_flow, pipeline_task
+from .prefect import flow, task
 from .prompt_manager import PromptManager
 from .settings import settings
-from .tracing import trace
+from .tracing import TraceInfo, TraceLevel, trace
 
-__version__ = "0.1.5"
+__version__ = "0.1.7"
 
 __all__ = [
-
-    "
-
-    "FlowDocument",
+    # Config/Settings
+    "settings",
+    # Logging
     "get_logger",
     "get_pipeline_logger",
     "LoggerMixin",
     "LoggingConfig",
-    "PromptManager",
-    "settings",
     "setup_logging",
     "StructuredLoggerMixin",
+    # Documents
+    "Document",
+    "DocumentList",
+    "FlowDocument",
     "TaskDocument",
+    "canonical_name_key",
+    "sanitize_url",
+    # Flow/Task
+    "FlowConfig",
+    "FlowOptions",
+    # Prefect decorators (clean, no tracing)
+    "task",
+    "flow",
+    # Pipeline decorators (with tracing)
+    "pipeline_task",
+    "pipeline_flow",
+    # LLM
+    "llm",
+    "ModelName",
+    "ModelOptions",
+    "ModelResponse",
+    "StructuredModelResponse",
+    "AIMessages",
+    "AIMessageType",
+    # Tracing
     "trace",
+    "TraceLevel",
+    "TraceInfo",
+    # Utils
+    "PromptManager",
 ]
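The package root now re-exports the documents, flow, LLM, and tracing APIs alongside the new `pipeline_task`/`pipeline_flow` decorators. A minimal sketch of what an importer sees after upgrading; the names come straight from the new `__all__`, and anything beyond the imports and the `get_logger(__name__)` call is an assumption:

```python
# Sketch only: names are taken from the 0.1.7 __all__ list above.
from ai_pipeline_core import (
    AIMessages,
    FlowOptions,
    get_logger,
    pipeline_flow,
    pipeline_task,
)

# get_logger is the new alias for get_pipeline_logger, which the package
# itself calls with a module name (see mime_type.py below).
logger = get_logger(__name__)
logger.info("available: %s", [AIMessages, FlowOptions, pipeline_flow, pipeline_task])
```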
ai_pipeline_core/documents/__init__.py
CHANGED

@@ -2,10 +2,13 @@ from .document import Document
 from .document_list import DocumentList
 from .flow_document import FlowDocument
 from .task_document import TaskDocument
+from .utils import canonical_name_key, sanitize_url
 
 __all__ = [
     "Document",
     "DocumentList",
     "FlowDocument",
     "TaskDocument",
+    "canonical_name_key",
+    "sanitize_url",
 ]
ai_pipeline_core/documents/document.py
CHANGED

@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
 from base64 import b32encode
 from enum import StrEnum
 from functools import cached_property
-from typing import Any, ClassVar, Literal, Self
+from typing import Any, ClassVar, Literal, Self, TypeVar
 
 from pydantic import BaseModel, ConfigDict, field_serializer, field_validator
 from ruamel.yaml import YAML
@@ -19,8 +19,11 @@ from .mime_type import (
     is_image_mime_type,
     is_pdf_mime_type,
     is_text_mime_type,
+    is_yaml_mime_type,
 )
 
+TModel = TypeVar("TModel", bound=BaseModel)
+
 
 class Document(BaseModel, ABC):
     """Abstract base class for all documents"""
@@ -207,15 +210,40 @@ class Document(BaseModel, ABC):
         """Parse document as JSON"""
         return json.loads(self.as_text())
 
+    def as_pydantic_model(self, model_type: type[TModel]) -> TModel:
+        """Parse document as a pydantic model and return the validated instance"""
+        data = self.as_yaml() if is_yaml_mime_type(self.mime_type) else self.as_json()
+        return model_type.model_validate(data)
+
     def as_markdown_list(self) -> list[str]:
         """Parse document as a markdown list"""
         return self.as_text().split(self.MARKDOWN_LIST_SEPARATOR)
 
     @classmethod
-    def create(
+    def create(
+        cls,
+        name: str,
+        description: str | None,
+        content: bytes | str | BaseModel | list[str] | Any,
+    ) -> Self:
         """Create a document from a name, description, and content"""
-
+        is_yaml_extension = name.endswith(".yaml") or name.endswith(".yml")
+        is_json_extension = name.endswith(".json")
+        is_markdown_extension = name.endswith(".md")
+        is_str_list = isinstance(content, list) and all(isinstance(item, str) for item in content)
+        if isinstance(content, bytes):
+            pass
+        elif isinstance(content, str):
             content = content.encode("utf-8")
+        elif is_str_list and is_markdown_extension:
+            return cls.create_as_markdown_list(name, description, content)  # type: ignore[arg-type]
+        elif is_yaml_extension:
+            return cls.create_as_yaml(name, description, content)
+        elif is_json_extension:
+            return cls.create_as_json(name, description, content)
+        else:
+            raise ValueError(f"Unsupported content type: {type(content)} for {name}")
+
         return cls(name=name, description=description, content=content)
 
     @classmethod
@@ -230,6 +258,32 @@ class Document(BaseModel, ABC):
         content = Document.MARKDOWN_LIST_SEPARATOR.join(cleaned_items)
         return cls.create(name, description, content)
 
+    @classmethod
+    def create_as_json(cls, name: str, description: str | None, data: Any) -> Self:
+        """Create a document from a name, description, and JSON data"""
+        assert name.endswith(".json"), f"Document name must end with .json: {name}"
+        if isinstance(data, BaseModel):
+            data = data.model_dump(mode="json")
+        content = json.dumps(data, indent=2).encode("utf-8")
+        return cls.create(name, description, content)
+
+    @classmethod
+    def create_as_yaml(cls, name: str, description: str | None, data: Any) -> Self:
+        """Create a document from a name, description, and YAML data"""
+        assert name.endswith(".yaml") or name.endswith(".yml"), (
+            f"Document name must end with .yaml or .yml: {name}"
+        )
+        if isinstance(data, BaseModel):
+            data = data.model_dump()
+        yaml = YAML()
+        yaml.indent(mapping=2, sequence=4, offset=2)
+        from io import BytesIO
+
+        stream = BytesIO()
+        yaml.dump(data, stream)
+        content = stream.getvalue()
+        return cls.create(name, description, content)
+
     def serialize_model(self) -> dict[str, Any]:
         """Serialize document to a dictionary with proper encoding."""
         result = {
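`Document.create` now dispatches on the target extension (`.yaml`/`.yml`, `.json`, `.md` plus a list of strings) to the new `create_as_yaml`, `create_as_json`, and `create_as_markdown_list` helpers, and `as_pydantic_model` validates the stored content back into a Pydantic model, picking the YAML or JSON parser from the MIME type. A hedged round-trip sketch; the `ConfigDocument` subclass, and the assumption that `FlowDocument` needs no further overrides to be instantiable, are illustrative only:

```python
from pydantic import BaseModel

from ai_pipeline_core import FlowDocument


class Settings(BaseModel):
    model: str
    temperature: float


# Hypothetical concrete document type; whether FlowDocument requires extra
# overrides is an assumption.
class ConfigDocument(FlowDocument):
    pass


# A .yaml name plus a BaseModel payload routes through create_as_yaml,
# which dumps the model with ruamel.yaml and stores the resulting bytes.
doc = ConfigDocument.create(
    name="settings.yaml",
    description="run configuration",
    content=Settings(model="gpt-5", temperature=0.2),
)

# as_pydantic_model picks the YAML parser because the MIME type is
# application/yaml, then validates the data back into the model class.
restored = doc.as_pydantic_model(Settings)
assert restored.temperature == 0.2
```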
ai_pipeline_core/documents/mime_type.py
CHANGED

@@ -6,53 +6,75 @@ from ai_pipeline_core.logging import get_pipeline_logger
 
 logger = get_pipeline_logger(__name__)
 
+# Extension to MIME type mapping for common formats
+# These are formats where extension-based detection is more reliable
+EXTENSION_MIME_MAP = {
+    "md": "text/markdown",
+    "txt": "text/plain",
+    "pdf": "application/pdf",
+    "png": "image/png",
+    "jpg": "image/jpeg",
+    "jpeg": "image/jpeg",
+    "gif": "image/gif",
+    "bmp": "image/bmp",
+    "webp": "image/webp",
+    "json": "application/json",
+    "yaml": "application/yaml",
+    "yml": "application/yaml",
+    "xml": "text/xml",
+    "html": "text/html",
+    "htm": "text/html",
+    "py": "text/x-python",
+    "css": "text/css",
+    "js": "application/javascript",
+    "ts": "application/typescript",
+    "tsx": "application/typescript",
+    "jsx": "application/javascript",
+}
+
 
 def detect_mime_type(content: bytes, name: str) -> str:
-    """Detect MIME type from content
+    """Detect MIME type from content and filename
 
-
-
-
-
-
+    Uses a hybrid approach:
+    1. Check for empty content
+    2. Try extension-based detection for known formats
+    3. Fall back to magic content detection
+    4. Final fallback to application/octet-stream
+    """
 
-
+    # Check for empty content
+    if len(content) == 0:
         return "application/x-empty"
 
+    # Try extension-based detection first for known formats
+    # This is more reliable for text formats that magic might misidentify
+    ext = name.lower().split(".")[-1] if "." in name else ""
+    if ext in EXTENSION_MIME_MAP:
+        return EXTENSION_MIME_MAP[ext]
+
+    # Try content-based detection with magic
     try:
         mime = magic.from_buffer(content[:1024], mime=True)
-
+        # If magic returns a valid mime type, use it
+        if mime and mime != "application/octet-stream":
+            return mime
     except (AttributeError, OSError, magic.MagicException) as e:
-        logger.warning(f"MIME detection failed for {name}: {e}
-        return mime_type_from_extension(name)
+        logger.warning(f"MIME detection failed for {name}: {e}")
     except Exception as e:
         logger.error(f"Unexpected error in MIME detection for {name}: {e}")
-
+
+    # Final fallback based on extension or default
+    return EXTENSION_MIME_MAP.get(ext, "application/octet-stream")
 
 
 def mime_type_from_extension(name: str) -> str:
-    """Get MIME type based on file extension
-    ext = name.lower().split(".")[-1] if "." in name else ""
+    """Get MIME type based on file extension
 
-
-
-
-
-    "png": "image/png",
-    "jpg": "image/jpeg",
-    "jpeg": "image/jpeg",
-    "gif": "image/gif",
-    "bmp": "image/bmp",
-    "webp": "image/webp",
-    "json": "application/json",
-    "yaml": "application/yaml",
-    "yml": "application/yaml",
-    "xml": "text/xml",
-    "html": "text/html",
-    "htm": "text/html",
-    }
-
-    return mime_map.get(ext, "application/octet-stream")
+    Legacy function kept for compatibility
+    """
+    ext = name.lower().split(".")[-1] if "." in name else ""
+    return EXTENSION_MIME_MAP.get(ext, "application/octet-stream")
 
 
 def is_text_mime_type(mime_type: str) -> bool:
@@ -68,6 +90,16 @@ def is_text_mime_type(mime_type: str) -> bool:
     return any(mime_type.startswith(t) for t in text_types)
 
 
+def is_json_mime_type(mime_type: str) -> bool:
+    """Check if MIME type is JSON"""
+    return mime_type == "application/json"
+
+
+def is_yaml_mime_type(mime_type: str) -> bool:
+    """Check if MIME type is YAML"""
+    return mime_type == "application/yaml" or mime_type == "application/x-yaml"
+
+
 def is_pdf_mime_type(mime_type: str) -> bool:
     """Check if MIME type is PDF"""
     return mime_type == "application/pdf"
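The reworked `detect_mime_type` is extension-first: empty content short-circuits, known extensions resolve through `EXTENSION_MIME_MAP`, and only then does python-magic sniff the bytes, with `application/octet-stream` as the last resort. A small sketch of that ordering; the file names and byte strings are illustrative:

```python
from ai_pipeline_core.documents.mime_type import detect_mime_type

# Known extensions win before any content sniffing: .md maps straight to
# text/markdown via EXTENSION_MIME_MAP.
assert detect_mime_type(b"# Title\n", "notes.md") == "text/markdown"

# Empty content short-circuits before the extension and magic checks.
assert detect_mime_type(b"", "anything.bin") == "application/x-empty"

# An unknown extension falls through to python-magic, and finally to
# application/octet-stream if nothing conclusive is detected.
print(detect_mime_type(b"\x00\x01\x02\x03", "blob.unknownext"))
```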
ai_pipeline_core/flow/options.py
ADDED

@@ -0,0 +1,26 @@
+from typing import TypeVar
+
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+from ai_pipeline_core.llm import ModelName
+
+T = TypeVar("T", bound="FlowOptions")
+
+
+class FlowOptions(BaseSettings):
+    """Base configuration for AI Pipeline flows."""
+
+    core_model: ModelName | str = Field(
+        default="gpt-5",
+        description="Primary model for complex analysis and generation tasks.",
+    )
+    small_model: ModelName | str = Field(
+        default="gpt-5-mini",
+        description="Fast, cost-effective model for simple tasks and orchestration.",
+    )
+
+    model_config = SettingsConfigDict(frozen=True, extra="ignore")
+
+
+__all__ = ["FlowOptions"]
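`FlowOptions` is a frozen `pydantic-settings` model, so values can be supplied via constructor kwargs or the environment and are immutable afterwards; it reads like a base class that individual pipelines subclass. A sketch under that assumption; the `ResearchFlowOptions` subclass and its `max_sources` field are hypothetical:

```python
from pydantic import Field

from ai_pipeline_core import FlowOptions


# Hypothetical project-specific options; only core_model and small_model
# come from the library base class.
class ResearchFlowOptions(FlowOptions):
    max_sources: int = Field(default=10, description="How many sources to gather.")


opts = ResearchFlowOptions(small_model="gpt-5-mini")
print(opts.core_model)   # "gpt-5" unless overridden via kwargs or environment
print(opts.max_sources)  # 10

# SettingsConfigDict(frozen=True) makes instances immutable:
# opts.core_model = "other-model"  # would raise a pydantic ValidationError
```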
ai_pipeline_core/llm/client.py
CHANGED
@@ -118,11 +118,13 @@ async def _generate_with_retry(
             span.set_attributes(response.get_laminar_metadata())
             Laminar.set_span_output(response.content)
             if not response.content:
-                # disable cache in case of empty response
-                completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
                 raise ValueError(f"Model {model} returned an empty response.")
             return response
         except (asyncio.TimeoutError, ValueError, Exception) as e:
+            if not isinstance(e, asyncio.TimeoutError):
+                # disable cache if it's not a timeout because it may cause an error
+                completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
+
             logger.warning(
                 "LLM generation failed (attempt %d/%d): %s",
                 attempt + 1,
@@ -167,7 +169,7 @@ T = TypeVar("T", bound=BaseModel)
 
 @trace(ignore_inputs=["context"])
 async def generate_structured(
-    model: ModelName,
+    model: ModelName | str,
     response_format: type[T],
     *,
     context: AIMessages = AIMessages(),