ai-pipeline-core 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
1
"""Pipeline Core - Shared infrastructure for AI pipelines."""

from .documents import Document, DocumentList, FlowDocument, TaskDocument
from .flow import FlowConfig

# Single import from .logging: the original split this into two statements,
# importing get_pipeline_logger twice; the alias can live in the same import.
from .logging import (
    LoggerMixin,
    LoggingConfig,
    StructuredLoggerMixin,
    get_pipeline_logger,
    get_pipeline_logger as get_logger,
    setup_logging,
)
from .prompt_manager import PromptManager
from .settings import settings
from .tracing import trace

# Package version; keep in sync with the distribution metadata.
__version__ = "0.1.1"

# Public API, sorted case-insensitively.
__all__ = [
    "Document",
    "DocumentList",
    "FlowConfig",
    "FlowDocument",
    "get_logger",
    "get_pipeline_logger",
    "LoggerMixin",
    "LoggingConfig",
    "PromptManager",
    "settings",
    "setup_logging",
    "StructuredLoggerMixin",
    "TaskDocument",
    "trace",
]
@@ -0,0 +1,11 @@
1
"""Document primitives: base Document, list container, and flow/task variants."""

from .document import Document
from .document_list import DocumentList
from .flow_document import FlowDocument
from .task_document import TaskDocument

__all__ = [
    "Document",
    "DocumentList",
    "FlowDocument",
    "TaskDocument",
]
@@ -0,0 +1,252 @@
1
+ import base64
2
+ import hashlib
3
+ import json
4
+ from abc import ABC, abstractmethod
5
+ from base64 import b32encode
6
+ from enum import StrEnum
7
+ from functools import cached_property
8
+ from typing import Any, ClassVar, Literal, Self
9
+
10
+ from pydantic import BaseModel, ConfigDict, field_serializer, field_validator
11
+ from ruamel.yaml import YAML
12
+
13
+ from ai_pipeline_core.exceptions import DocumentNameError, DocumentSizeError
14
+
15
+ from .mime_type import (
16
+ detect_mime_type,
17
+ is_image_mime_type,
18
+ is_pdf_mime_type,
19
+ is_text_mime_type,
20
+ )
21
+
22
+
23
class Document(BaseModel, ABC):
    """
    Abstract, immutable base class for all pipeline documents.

    A document couples a validated filename with raw byte content and an
    optional description. Concrete subclasses implement get_base_type() to
    declare whether they are flow-scoped or task-scoped.
    """

    # Upper bound enforced on `content` by validate_content.
    MAX_CONTENT_SIZE: ClassVar[int] = 10 * 1024 * 1024  # 10MB default
    # Reserved suffix for companion description files; disallowed in `name`.
    DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"

    # Optional enum of allowed file names. Subclasses may set this.
    # This is used to validate the document name.
    FILES: ClassVar[type[StrEnum] | None] = None

    name: str
    description: str | None = None
    content: bytes

    # Pydantic configuration
    model_config = ConfigDict(
        frozen=True,  # Make documents immutable
        arbitrary_types_allowed=True,
    )

    @abstractmethod
    def get_base_type(self) -> Literal["flow", "task"]:
        """Return the document's base type - must be implemented by subclasses."""
        raise NotImplementedError("Subclasses must implement this method")

    @property
    def base_type(self) -> Literal["flow", "task"]:
        """Property alias for get_base_type()."""
        return self.get_base_type()

    @property
    def is_flow(self) -> bool:
        """True when get_base_type() reports "flow"."""
        return self.get_base_type() == "flow"

    @property
    def is_task(self) -> bool:
        """True when get_base_type() reports "task"."""
        return self.get_base_type() == "task"

    @classmethod
    def get_expected_files(cls) -> list[str] | None:
        """
        Return the list of allowed file names for this document class, or
        None when unrestricted (FILES unset, or an empty enum).

        Raises:
            DocumentNameError: If FILES is not iterable as an enum.
        """
        if cls.FILES is None:
            return None
        try:
            values = [member.value for member in cls.FILES]
        except TypeError:
            raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values")
        if len(values) == 0:
            # An empty enum means "no restriction", same as FILES is None.
            return None
        return values

    @classmethod
    def validate_file_name(cls, name: str) -> None:
        """
        Optional file-name validation hook.

        Default behavior:
          - If the `FILES` enum is defined on the subclass, `name` must match
            one of the enum values exactly (full-string comparison).
          - If `FILES` is None, do nothing.

        Override this method in subclasses for custom conventions (regex,
        prefixes, etc.). Raise DocumentNameError when invalid.
        """
        if cls.FILES is None:
            return

        try:
            allowed = {str(member.value) for member in cls.FILES}  # type: ignore[arg-type]
        except TypeError:
            raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values")

        if name not in allowed:
            allowed_str = ", ".join(sorted(allowed))
            raise DocumentNameError(f"Invalid filename '{name}'. Allowed names: {allowed_str}")

    @field_validator("name")
    def validate_name(cls, v: str) -> str:
        """Reject reserved suffixes, path traversal, and malformed names."""
        if v.endswith(cls.DESCRIPTION_EXTENSION):
            raise DocumentNameError(
                f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
            )

        # Path separators and ".." would allow escaping the storage directory.
        if ".." in v or "\\" in v or "/" in v:
            raise DocumentNameError(f"Invalid filename - contains path traversal characters: {v}")

        if not v or v.startswith(" ") or v.endswith(" "):
            raise DocumentNameError(f"Invalid filename format: {v}")

        # Delegate to the subclass hook (FILES whitelist by default).
        cls.validate_file_name(v)

        return v

    @field_validator("content")
    def validate_content(cls, v: bytes) -> bytes:
        """Enforce the MAX_CONTENT_SIZE limit on raw content."""
        # NOTE(review): the 100MB getattr fallback is effectively dead code -
        # MAX_CONTENT_SIZE is always defined on this class (10MB default).
        max_size = getattr(cls, "MAX_CONTENT_SIZE", 100 * 1024 * 1024)
        if len(v) > max_size:
            raise DocumentSizeError(
                f"Document size ({len(v)} bytes) exceeds maximum allowed size ({max_size} bytes)"
            )

        return v

    @field_serializer("content")
    def serialize_content(self, v: bytes) -> str:
        """
        Serialize bytes content to string for JSON serialization.

        NOTE(review): the output does not record which branch was taken, so
        UTF-8 text and base64-encoded binary are indistinguishable on
        round-trip; serialize_model() tags the encoding explicitly instead.
        """
        try:
            return v.decode("utf-8")
        except UnicodeDecodeError:
            # Fall back to base64 for binary content
            return base64.b64encode(v).decode("ascii")

    @property
    def id(self) -> str:
        """First 6 characters of the base32-encoded SHA256 of the content."""
        return self.sha256[:6]

    @cached_property
    def sha256(self) -> str:
        """Full SHA256 hash of content, base32-encoded (computed once)."""
        return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper()

    @property
    def size(self) -> int:
        """Size of content in bytes."""
        return len(self.content)

    @cached_property
    def detected_mime_type(self) -> str:
        """MIME type sniffed from content via python-magic (cached)."""
        return detect_mime_type(self.content, self.name)

    @property
    def mime_type(self) -> str:
        """Get MIME type - currently always the content-detected value."""
        return self.detected_mime_type

    @property
    def is_text(self) -> bool:
        """Check if document is text based on MIME type."""
        return is_text_mime_type(self.mime_type)

    @property
    def is_pdf(self) -> bool:
        """Check if document is PDF."""
        return is_pdf_mime_type(self.mime_type)

    @property
    def is_image(self) -> bool:
        """Check if document is an image."""
        return is_image_mime_type(self.mime_type)

    @property
    def should_be_cached(self) -> bool:
        """Whether this document should be cached; subclasses may override."""
        return False

    def as_text(self) -> str:
        """Decode content as UTF-8 text; raises ValueError for non-text MIME."""
        if not self.is_text:
            raise ValueError(f"Document is not text: {self.name}")
        return self.content.decode("utf-8")

    def as_yaml(self) -> Any:
        """Parse content as YAML; raises ValueError for non-text MIME."""
        if not self.is_text:
            raise ValueError(f"Document is not text: {self.name}")
        return YAML().load(self.content.decode("utf-8"))  # type: ignore

    def as_json(self) -> Any:
        """Parse content as JSON; raises ValueError for non-text MIME."""
        if not self.is_text:
            raise ValueError(f"Document is not text: {self.name}")
        return json.loads(self.content.decode("utf-8"))

    def serialize_model(self) -> dict[str, Any]:
        """
        Serialize document to a dictionary with proper encoding.

        Unlike serialize_content, the result carries a "content_encoding"
        key ("utf-8" or "base64") so from_dict can restore the exact bytes.
        """
        result = {
            "name": self.name,
            "description": self.description,
            "base_type": self.get_base_type(),
            "size": self.size,
            "id": self.id,
            "sha256": self.sha256,
            "mime_type": self.mime_type,
        }

        # Try to encode content as UTF-8, fall back to base64
        if self.is_text or self.mime_type.startswith("text/"):
            try:
                result["content"] = self.content.decode("utf-8")
                result["content_encoding"] = "utf-8"
            except UnicodeDecodeError:
                # For text files with encoding issues, use UTF-8 with replacement
                # (lossy: the original bytes are not recoverable in this branch).
                result["content"] = self.content.decode("utf-8", errors="replace")
                result["content_encoding"] = "utf-8"
        else:
            # Binary content - use base64
            result["content"] = base64.b64encode(self.content).decode("ascii")
            result["content_encoding"] = "base64"

        return result

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Self:
        """
        Deserialize document from a dictionary (as built by serialize_model).

        Derived fields (size, id, sha256, mime_type) are ignored here and
        recomputed; only name, content, and description are consumed.
        """
        # Extract content and encoding
        content_str = data.get("content", "")
        content_encoding = data.get("content_encoding", "utf-8")

        # Decode content based on encoding
        if content_encoding == "base64":
            content = base64.b64decode(content_str)
        else:
            # Default to UTF-8
            content = content_str.encode("utf-8")

        # Create document with the required fields
        return cls(
            name=data["name"],
            content=content,
            description=data.get("description"),
        )
@@ -0,0 +1,131 @@
1
+ from typing import Any, Iterable, SupportsIndex, Union, overload
2
+
3
+ from typing_extensions import Self
4
+
5
+ from .document import Document
6
+
7
+
8
class DocumentList(list[Document]):
    """
    A list subclass specialized for Document objects with built-in validation.

    Features:
    - Optionally rejects duplicate filenames within the list
    - Optionally requires every document to share one concrete class
      (intended for flow outputs)
    - Convenience helpers for filtering and name lookup
    - Works with both FlowDocument and TaskDocument subclasses
    """

    def __init__(
        self,
        documents: list[Document] | None = None,
        validate_same_type: bool = False,
        validate_duplicates: bool = False,
    ) -> None:
        """
        Initialize the list, optionally seeding it with documents.

        Args:
            documents: Initial documents (validated as they are added).
            validate_same_type: When True, all documents must share the same
                concrete class. Use True for flow outputs, False for inputs.
            validate_duplicates: When True, duplicate filenames are rejected.
        """
        super().__init__()
        self._validate_same_type = validate_same_type
        self._validate_duplicates = validate_duplicates
        if documents:
            self.extend(documents)

    def _validate_no_duplicates(self) -> None:
        """Reject the current contents if any filename appears more than once."""
        if not self._validate_duplicates:
            return

        seen: set[str] = set()
        duplicates: list[str] = []
        for name in (doc.name for doc in self):
            if name in seen:
                duplicates.append(name)
            seen.add(name)
        if duplicates:
            unique_duplicates = list(set(duplicates))
            raise ValueError(f"Duplicate document names found: {unique_duplicates}")

    def _validate_no_description_files(self) -> None:
        """Reject documents whose name carries the reserved description suffix."""
        description_files = [
            doc.name for doc in self if doc.name.endswith(Document.DESCRIPTION_EXTENSION)
        ]
        if description_files:
            raise ValueError(
                f"Documents with {Document.DESCRIPTION_EXTENSION} suffix are not allowed: "
                f"{description_files}"
            )

    def _validate_types(self) -> None:
        """When enabled, require every document to share one concrete class."""
        if not self._validate_same_type or not self:
            return

        first_class = type(self[0])
        if any(type(doc) is not first_class for doc in self):
            types = list({type(doc).__name__ for doc in self})
            raise ValueError(f"All documents must have the same type. Found types: {types}")

    def _validate(self) -> None:
        """Run every configured validation pass over the current contents."""
        self._validate_no_duplicates()
        self._validate_no_description_files()
        self._validate_types()

    def append(self, document: Document) -> None:
        """Append a document, then re-validate the whole list."""
        super().append(document)
        self._validate()

    def extend(self, documents: Iterable[Document]) -> None:
        """Extend with multiple documents, then re-validate the whole list."""
        super().extend(documents)
        self._validate()

    def insert(self, index: SupportsIndex, document: Document) -> None:
        """Insert a document at the given index, then re-validate."""
        super().insert(index, document)
        self._validate()

    @overload
    def __setitem__(self, index: SupportsIndex, value: Document) -> None: ...

    @overload
    def __setitem__(self, index: slice, value: Iterable[Document]) -> None: ...

    def __setitem__(self, index: Union[SupportsIndex, slice], value: Any) -> None:
        """Assign item(s), then re-validate the whole list."""
        super().__setitem__(index, value)
        self._validate()

    def __iadd__(self, other: Any) -> "Self":
        """In-place concatenation with validation."""
        updated = super().__iadd__(other)
        self._validate()
        return updated

    def filter_by_type(self, document_type: type[Document]) -> "DocumentList":
        """Return a new DocumentList of documents whose exact class is document_type."""
        matches = [doc for doc in self if type(doc) is document_type]
        return DocumentList(matches)

    def filter_by_types(self, document_types: list[type[Document]]) -> "DocumentList":
        """Return a new DocumentList of documents matching any of the given classes."""
        collected = DocumentList()
        for doc_cls in document_types:
            collected.extend(self.filter_by_type(doc_cls))
        return collected

    def get_by_name(self, name: str) -> Document | None:
        """Return the first document named `name`, or None if absent."""
        return next((doc for doc in self if doc.name == name), None)
@@ -0,0 +1,21 @@
1
+ """Flow-specific document base class."""
2
+
3
+ from typing import Literal, final
4
+
5
+ from .document import Document
6
+
7
+
8
class FlowDocument(Document):
    """
    Abstract base class for flow-specific documents.

    Flow documents represent inputs, outputs, and intermediate results
    within a Prefect flow execution context.

    Unlike TaskDocument, FlowDocument instances are persisted across
    Prefect flow runs.
    """

    @final
    def get_base_type(self) -> Literal["flow"]:
        """Return the fixed base-type discriminator for flow documents."""
        return "flow"
@@ -0,0 +1,78 @@
1
+ """MIME type detection utilities for documents"""
2
+
3
+ import magic
4
+
5
+ from ai_pipeline_core.logging import get_pipeline_logger
6
+
7
+ logger = get_pipeline_logger(__name__)
8
+
9
+
10
def detect_mime_type(content: bytes, name: str) -> str:
    """
    Detect the MIME type of `content`, preferring content sniffing via
    python-magic and falling back to extension lookup on any failure.

    Args:
        content: Raw document bytes.
        name: Filename, used for the .md shortcut and the extension fallback.

    Returns:
        A MIME type string. Never raises; detection errors degrade to
        mime_type_from_extension(name).
    """

    # Shortcut: a UTF-8-decodable .md file is reported as markdown (magic
    # would otherwise sniff it as plain text). NOTE: an empty .md file falls
    # through here because b"".decode("utf-8") is falsy.
    try:
        if name.endswith(".md") and content.decode("utf-8"):
            return "text/markdown"
    except UnicodeDecodeError:
        pass

    # Payloads of 4 bytes or fewer are labeled empty - too short for
    # reliable sniffing (even though they may not be literally empty).
    if len(content) <= 4:
        return "application/x-empty"

    try:
        # Sniff only the first 1KB; sufficient for magic's signatures.
        mime = magic.from_buffer(content[:1024], mime=True)
        return mime
    except (AttributeError, OSError, magic.MagicException) as e:
        # Known failure modes (missing magic database, I/O errors):
        # degrade gracefully to the extension-based mapping.
        logger.warning(f"MIME detection failed for {name}: {e}, falling back to extension")
        return mime_type_from_extension(name)
    except Exception as e:
        # Unknown failure: log louder, but still fall back rather than raise.
        logger.error(f"Unexpected error in MIME detection for {name}: {e}")
        return mime_type_from_extension(name)
31
+
32
+
33
def mime_type_from_extension(name: str) -> str:
    """
    Map a filename's extension to a MIME type.

    Matching is case-insensitive; unknown or missing extensions yield
    "application/octet-stream".
    """
    if "." not in name:
        return "application/octet-stream"

    extension = name.lower().rsplit(".", 1)[-1]

    known_types = {
        # text
        "md": "text/markdown",
        "txt": "text/plain",
        "html": "text/html",
        "htm": "text/html",
        "xml": "text/xml",
        # structured data / documents
        "json": "application/json",
        "yaml": "application/yaml",
        "yml": "application/yaml",
        "pdf": "application/pdf",
        # images
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "gif": "image/gif",
        "bmp": "image/bmp",
        "webp": "image/webp",
    }

    return known_types.get(extension, "application/octet-stream")
56
+
57
+
58
def is_text_mime_type(mime_type: str) -> bool:
    """Return True when the MIME type denotes textual content."""
    # str.startswith accepts a tuple of prefixes - one call checks them all.
    return mime_type.startswith((
        "text/",
        "application/json",
        "application/xml",
        "application/javascript",
        "application/yaml",
        "application/x-yaml",
    ))
69
+
70
+
71
def is_pdf_mime_type(mime_type: str) -> bool:
    """Return True only for the exact PDF MIME type."""
    pdf_mime = "application/pdf"
    return mime_type == pdf_mime
74
+
75
+
76
def is_image_mime_type(mime_type: str) -> bool:
    """Return True for any MIME type in the image/* family."""
    image_prefix = "image/"
    return mime_type.startswith(image_prefix)
@@ -0,0 +1,22 @@
1
+ """Task-specific document base class."""
2
+
3
+ from typing import Literal, final
4
+
5
+ from .document import Document
6
+
7
+
8
class TaskDocument(Document):
    """
    Abstract base class for task-specific documents.

    Task documents represent inputs, outputs, and intermediate results
    within a Prefect task execution context.

    Unlike FlowDocument, TaskDocument instances are not persisted across
    Prefect task runs; they hold intermediate results that are not needed
    after the task completes.
    """

    @final
    def get_base_type(self) -> Literal["task"]:
        """Return the fixed base-type discriminator for task documents."""
        return "task"
@@ -0,0 +1,33 @@
1
+ import re
2
+ from urllib.parse import urlparse
3
+
4
+
5
def sanitize_url(url: str) -> str:
    """
    Sanitize a URL or query string for safe use as a filename component.

    Strips the scheme from http(s) URLs, replaces characters invalid in
    filenames with underscores, collapses underscore runs, trims leading and
    trailing separators, and caps the result at 100 characters.

    Args:
        url: Raw URL or arbitrary query string.

    Returns:
        A non-empty, filesystem-safe string ("unnamed" if nothing survives).
    """
    # Drop the scheme for http(s) URLs; keep only domain + path
    # (query string and fragment are intentionally discarded).
    if url.startswith(("http://", "https://")):
        parsed = urlparse(url)
        url = parsed.netloc + parsed.path

    # Replace characters that are invalid in filenames on common OSes.
    sanitized = re.sub(r'[<>:"/\\|?*]', "_", url)

    # Collapse runs of underscores produced by consecutive replacements.
    sanitized = re.sub(r"_+", "_", sanitized)

    # Trim leading/trailing underscores and dots (hidden-file and
    # Windows trailing-dot issues).
    sanitized = sanitized.strip("_.")

    # Cap the length; re-strip afterwards so truncation cannot leave a
    # dangling "_" or "." at the end (the original stripped only before
    # truncating). The first character is non-separator here, so the
    # result stays non-empty.
    if len(sanitized) > 100:
        sanitized = sanitized[:100].rstrip("_.")

    # Guarantee a usable, non-empty name.
    return sanitized or "unnamed"
@@ -0,0 +1,61 @@
1
"""Exception hierarchy for AI Pipeline Core."""


class PipelineCoreError(Exception):
    """Root of the package exception hierarchy; catch this for any pipeline failure."""


class DocumentError(PipelineCoreError):
    """Base class for document-related errors."""


class DocumentValidationError(DocumentError):
    """Raised when a document fails validation."""


class DocumentSizeError(DocumentValidationError):
    """Raised when document content exceeds the configured size limit."""


class DocumentNameError(DocumentValidationError):
    """Raised when a document name is invalid or not in the allowed set."""


class LLMError(PipelineCoreError):
    """Base class for LLM-related errors."""


class PromptError(PipelineCoreError):
    """Base class for prompt-related errors."""


class PromptRenderError(PromptError):
    """Raised when a prompt template fails to render."""


class PromptNotFoundError(PromptError):
    """Raised when a prompt template cannot be located."""


class MimeTypeError(DocumentError):
    """Raised on MIME type detection or validation failure."""
@@ -0,0 +1,3 @@
1
"""Flow configuration exports for the flow subpackage."""

from .config import FlowConfig

__all__ = ["FlowConfig"]