ai-pipeline-core 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,23 @@
1
1
  """Pipeline Core - Shared infrastructure for AI pipelines."""
2
2
 
3
- from .documents import Document, DocumentList, FlowDocument, TaskDocument
4
- from .flow import FlowConfig
3
+ from . import llm
4
+ from .documents import (
5
+ Document,
6
+ DocumentList,
7
+ FlowDocument,
8
+ TaskDocument,
9
+ canonical_name_key,
10
+ sanitize_url,
11
+ )
12
+ from .flow import FlowConfig, FlowOptions
13
+ from .llm import (
14
+ AIMessages,
15
+ AIMessageType,
16
+ ModelName,
17
+ ModelOptions,
18
+ ModelResponse,
19
+ StructuredModelResponse,
20
+ )
5
21
  from .logging import (
6
22
  LoggerMixin,
7
23
  LoggingConfig,
@@ -9,28 +25,53 @@ from .logging import (
9
25
  get_pipeline_logger,
10
26
  setup_logging,
11
27
  )
12
- from .logging import (
13
- get_pipeline_logger as get_logger,
14
- )
28
+ from .logging import get_pipeline_logger as get_logger
29
+ from .pipeline import pipeline_flow, pipeline_task
30
+ from .prefect import flow, task
15
31
  from .prompt_manager import PromptManager
16
32
  from .settings import settings
17
- from .tracing import trace
33
+ from .tracing import TraceInfo, TraceLevel, trace
18
34
 
19
- __version__ = "0.1.4"
35
+ __version__ = "0.1.7"
20
36
 
21
37
  __all__ = [
22
- "Document",
23
- "DocumentList",
24
- "FlowConfig",
25
- "FlowDocument",
38
+ # Config/Settings
39
+ "settings",
40
+ # Logging
26
41
  "get_logger",
27
42
  "get_pipeline_logger",
28
43
  "LoggerMixin",
29
44
  "LoggingConfig",
30
- "PromptManager",
31
- "settings",
32
45
  "setup_logging",
33
46
  "StructuredLoggerMixin",
47
+ # Documents
48
+ "Document",
49
+ "DocumentList",
50
+ "FlowDocument",
34
51
  "TaskDocument",
52
+ "canonical_name_key",
53
+ "sanitize_url",
54
+ # Flow/Task
55
+ "FlowConfig",
56
+ "FlowOptions",
57
+ # Prefect decorators (clean, no tracing)
58
+ "task",
59
+ "flow",
60
+ # Pipeline decorators (with tracing)
61
+ "pipeline_task",
62
+ "pipeline_flow",
63
+ # LLM
64
+ "llm",
65
+ "ModelName",
66
+ "ModelOptions",
67
+ "ModelResponse",
68
+ "StructuredModelResponse",
69
+ "AIMessages",
70
+ "AIMessageType",
71
+ # Tracing
35
72
  "trace",
73
+ "TraceLevel",
74
+ "TraceInfo",
75
+ # Utils
76
+ "PromptManager",
36
77
  ]
@@ -2,10 +2,13 @@ from .document import Document
2
2
  from .document_list import DocumentList
3
3
  from .flow_document import FlowDocument
4
4
  from .task_document import TaskDocument
5
+ from .utils import canonical_name_key, sanitize_url
5
6
 
6
7
  __all__ = [
7
8
  "Document",
8
9
  "DocumentList",
9
10
  "FlowDocument",
10
11
  "TaskDocument",
12
+ "canonical_name_key",
13
+ "sanitize_url",
11
14
  ]
@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
6
6
  from base64 import b32encode
7
7
  from enum import StrEnum
8
8
  from functools import cached_property
9
- from typing import Any, ClassVar, Literal, Self
9
+ from typing import Any, ClassVar, Literal, Self, TypeVar
10
10
 
11
11
  from pydantic import BaseModel, ConfigDict, field_serializer, field_validator
12
12
  from ruamel.yaml import YAML
@@ -19,8 +19,11 @@ from .mime_type import (
19
19
  is_image_mime_type,
20
20
  is_pdf_mime_type,
21
21
  is_text_mime_type,
22
+ is_yaml_mime_type,
22
23
  )
23
24
 
25
+ TModel = TypeVar("TModel", bound=BaseModel)
26
+
24
27
 
25
28
  class Document(BaseModel, ABC):
26
29
  """Abstract base class for all documents"""
@@ -207,15 +210,40 @@ class Document(BaseModel, ABC):
207
210
  """Parse document as JSON"""
208
211
  return json.loads(self.as_text())
209
212
 
213
+ def as_pydantic_model(self, model_type: type[TModel]) -> TModel:
214
+ """Parse document as a pydantic model and return the validated instance"""
215
+ data = self.as_yaml() if is_yaml_mime_type(self.mime_type) else self.as_json()
216
+ return model_type.model_validate(data)
217
+
210
218
  def as_markdown_list(self) -> list[str]:
211
219
  """Parse document as a markdown list"""
212
220
  return self.as_text().split(self.MARKDOWN_LIST_SEPARATOR)
213
221
 
214
222
  @classmethod
215
- def create(cls, name: str, description: str | None, content: bytes | str) -> Self:
223
+ def create(
224
+ cls,
225
+ name: str,
226
+ description: str | None,
227
+ content: bytes | str | BaseModel | list[str] | Any,
228
+ ) -> Self:
216
229
  """Create a document from a name, description, and content"""
217
- if isinstance(content, str):
230
+ is_yaml_extension = name.endswith(".yaml") or name.endswith(".yml")
231
+ is_json_extension = name.endswith(".json")
232
+ is_markdown_extension = name.endswith(".md")
233
+ is_str_list = isinstance(content, list) and all(isinstance(item, str) for item in content)
234
+ if isinstance(content, bytes):
235
+ pass
236
+ elif isinstance(content, str):
218
237
  content = content.encode("utf-8")
238
+ elif is_str_list and is_markdown_extension:
239
+ return cls.create_as_markdown_list(name, description, content) # type: ignore[arg-type]
240
+ elif is_yaml_extension:
241
+ return cls.create_as_yaml(name, description, content)
242
+ elif is_json_extension:
243
+ return cls.create_as_json(name, description, content)
244
+ else:
245
+ raise ValueError(f"Unsupported content type: {type(content)} for {name}")
246
+
219
247
  return cls(name=name, description=description, content=content)
220
248
 
221
249
  @classmethod
@@ -230,6 +258,32 @@ class Document(BaseModel, ABC):
230
258
  content = Document.MARKDOWN_LIST_SEPARATOR.join(cleaned_items)
231
259
  return cls.create(name, description, content)
232
260
 
261
+ @classmethod
262
+ def create_as_json(cls, name: str, description: str | None, data: Any) -> Self:
263
+ """Create a document from a name, description, and JSON data"""
264
+ assert name.endswith(".json"), f"Document name must end with .json: {name}"
265
+ if isinstance(data, BaseModel):
266
+ data = data.model_dump(mode="json")
267
+ content = json.dumps(data, indent=2).encode("utf-8")
268
+ return cls.create(name, description, content)
269
+
270
+ @classmethod
271
+ def create_as_yaml(cls, name: str, description: str | None, data: Any) -> Self:
272
+ """Create a document from a name, description, and YAML data"""
273
+ assert name.endswith(".yaml") or name.endswith(".yml"), (
274
+ f"Document name must end with .yaml or .yml: {name}"
275
+ )
276
+ if isinstance(data, BaseModel):
277
+ data = data.model_dump()
278
+ yaml = YAML()
279
+ yaml.indent(mapping=2, sequence=4, offset=2)
280
+ from io import BytesIO
281
+
282
+ stream = BytesIO()
283
+ yaml.dump(data, stream)
284
+ content = stream.getvalue()
285
+ return cls.create(name, description, content)
286
+
233
287
  def serialize_model(self) -> dict[str, Any]:
234
288
  """Serialize document to a dictionary with proper encoding."""
235
289
  result = {
@@ -6,53 +6,75 @@ from ai_pipeline_core.logging import get_pipeline_logger
6
6
 
7
7
  logger = get_pipeline_logger(__name__)
8
8
 
9
+ # Extension to MIME type mapping for common formats
10
+ # These are formats where extension-based detection is more reliable
11
+ EXTENSION_MIME_MAP = {
12
+ "md": "text/markdown",
13
+ "txt": "text/plain",
14
+ "pdf": "application/pdf",
15
+ "png": "image/png",
16
+ "jpg": "image/jpeg",
17
+ "jpeg": "image/jpeg",
18
+ "gif": "image/gif",
19
+ "bmp": "image/bmp",
20
+ "webp": "image/webp",
21
+ "json": "application/json",
22
+ "yaml": "application/yaml",
23
+ "yml": "application/yaml",
24
+ "xml": "text/xml",
25
+ "html": "text/html",
26
+ "htm": "text/html",
27
+ "py": "text/x-python",
28
+ "css": "text/css",
29
+ "js": "application/javascript",
30
+ "ts": "application/typescript",
31
+ "tsx": "application/typescript",
32
+ "jsx": "application/javascript",
33
+ }
34
+
9
35
 
10
36
  def detect_mime_type(content: bytes, name: str) -> str:
11
- """Detect MIME type from content using python-magic"""
37
+ """Detect MIME type from content and filename
12
38
 
13
- try:
14
- if name.endswith(".md") and content.decode("utf-8"):
15
- return "text/markdown"
16
- except UnicodeDecodeError:
17
- pass
39
+ Uses a hybrid approach:
40
+ 1. Check for empty content
41
+ 2. Try extension-based detection for known formats
42
+ 3. Fall back to magic content detection
43
+ 4. Final fallback to application/octet-stream
44
+ """
18
45
 
19
- if len(content) <= 4:
46
+ # Check for empty content
47
+ if len(content) == 0:
20
48
  return "application/x-empty"
21
49
 
50
+ # Try extension-based detection first for known formats
51
+ # This is more reliable for text formats that magic might misidentify
52
+ ext = name.lower().split(".")[-1] if "." in name else ""
53
+ if ext in EXTENSION_MIME_MAP:
54
+ return EXTENSION_MIME_MAP[ext]
55
+
56
+ # Try content-based detection with magic
22
57
  try:
23
58
  mime = magic.from_buffer(content[:1024], mime=True)
24
- return mime
59
+ # If magic returns a valid mime type, use it
60
+ if mime and mime != "application/octet-stream":
61
+ return mime
25
62
  except (AttributeError, OSError, magic.MagicException) as e:
26
- logger.warning(f"MIME detection failed for {name}: {e}, falling back to extension")
27
- return mime_type_from_extension(name)
63
+ logger.warning(f"MIME detection failed for {name}: {e}")
28
64
  except Exception as e:
29
65
  logger.error(f"Unexpected error in MIME detection for {name}: {e}")
30
- return mime_type_from_extension(name)
66
+
67
+ # Final fallback based on extension or default
68
+ return EXTENSION_MIME_MAP.get(ext, "application/octet-stream")
31
69
 
32
70
 
33
71
  def mime_type_from_extension(name: str) -> str:
34
- """Get MIME type based on file extension"""
35
- ext = name.lower().split(".")[-1] if "." in name else ""
72
+ """Get MIME type based on file extension
36
73
 
37
- mime_map = {
38
- "md": "text/markdown",
39
- "txt": "text/plain",
40
- "pdf": "application/pdf",
41
- "png": "image/png",
42
- "jpg": "image/jpeg",
43
- "jpeg": "image/jpeg",
44
- "gif": "image/gif",
45
- "bmp": "image/bmp",
46
- "webp": "image/webp",
47
- "json": "application/json",
48
- "yaml": "application/yaml",
49
- "yml": "application/yaml",
50
- "xml": "text/xml",
51
- "html": "text/html",
52
- "htm": "text/html",
53
- }
54
-
55
- return mime_map.get(ext, "application/octet-stream")
74
+ Legacy function kept for compatibility
75
+ """
76
+ ext = name.lower().split(".")[-1] if "." in name else ""
77
+ return EXTENSION_MIME_MAP.get(ext, "application/octet-stream")
56
78
 
57
79
 
58
80
  def is_text_mime_type(mime_type: str) -> bool:
@@ -68,6 +90,16 @@ def is_text_mime_type(mime_type: str) -> bool:
68
90
  return any(mime_type.startswith(t) for t in text_types)
69
91
 
70
92
 
93
+ def is_json_mime_type(mime_type: str) -> bool:
94
+ """Check if MIME type is JSON"""
95
+ return mime_type == "application/json"
96
+
97
+
98
+ def is_yaml_mime_type(mime_type: str) -> bool:
99
+ """Check if MIME type is YAML"""
100
+ return mime_type == "application/yaml" or mime_type == "application/x-yaml"
101
+
102
+
71
103
  def is_pdf_mime_type(mime_type: str) -> bool:
72
104
  """Check if MIME type is PDF"""
73
105
  return mime_type == "application/pdf"
@@ -1,3 +1,7 @@
1
1
  from .config import FlowConfig
2
+ from .options import FlowOptions
2
3
 
3
- __all__ = ["FlowConfig"]
4
+ __all__ = [
5
+ "FlowConfig",
6
+ "FlowOptions",
7
+ ]
@@ -0,0 +1,26 @@
1
+ from typing import TypeVar
2
+
3
+ from pydantic import Field
4
+ from pydantic_settings import BaseSettings, SettingsConfigDict
5
+
6
+ from ai_pipeline_core.llm import ModelName
7
+
8
+ T = TypeVar("T", bound="FlowOptions")
9
+
10
+
11
+ class FlowOptions(BaseSettings):
12
+ """Base configuration for AI Pipeline flows."""
13
+
14
+ core_model: ModelName | str = Field(
15
+ default="gpt-5",
16
+ description="Primary model for complex analysis and generation tasks.",
17
+ )
18
+ small_model: ModelName | str = Field(
19
+ default="gpt-5-mini",
20
+ description="Fast, cost-effective model for simple tasks and orchestration.",
21
+ )
22
+
23
+ model_config = SettingsConfigDict(frozen=True, extra="ignore")
24
+
25
+
26
+ __all__ = ["FlowOptions"]
@@ -118,11 +118,13 @@ async def _generate_with_retry(
118
118
  span.set_attributes(response.get_laminar_metadata())
119
119
  Laminar.set_span_output(response.content)
120
120
  if not response.content:
121
- # disable cache in case of empty response
122
- completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
123
121
  raise ValueError(f"Model {model} returned an empty response.")
124
122
  return response
125
123
  except (asyncio.TimeoutError, ValueError, Exception) as e:
124
+ if not isinstance(e, asyncio.TimeoutError):
125
+ # disable cache if it's not a timeout because it may cause an error
126
+ completion_kwargs["extra_body"]["cache"] = {"no-cache": True}
127
+
126
128
  logger.warning(
127
129
  "LLM generation failed (attempt %d/%d): %s",
128
130
  attempt + 1,
@@ -167,7 +169,7 @@ T = TypeVar("T", bound=BaseModel)
167
169
 
168
170
  @trace(ignore_inputs=["context"])
169
171
  async def generate_structured(
170
- model: ModelName,
172
+ model: ModelName | str,
171
173
  response_format: type[T],
172
174
  *,
173
175
  context: AIMessages = AIMessages(),