ai-pipeline-core 0.1.1__py3-none-any.whl
- ai_pipeline_core/__init__.py +36 -0
- ai_pipeline_core/documents/__init__.py +11 -0
- ai_pipeline_core/documents/document.py +252 -0
- ai_pipeline_core/documents/document_list.py +131 -0
- ai_pipeline_core/documents/flow_document.py +21 -0
- ai_pipeline_core/documents/mime_type.py +78 -0
- ai_pipeline_core/documents/task_document.py +22 -0
- ai_pipeline_core/documents/utils.py +33 -0
- ai_pipeline_core/exceptions.py +61 -0
- ai_pipeline_core/flow/__init__.py +3 -0
- ai_pipeline_core/flow/config.py +66 -0
- ai_pipeline_core/llm/__init__.py +19 -0
- ai_pipeline_core/llm/ai_messages.py +129 -0
- ai_pipeline_core/llm/client.py +218 -0
- ai_pipeline_core/llm/model_options.py +39 -0
- ai_pipeline_core/llm/model_response.py +149 -0
- ai_pipeline_core/llm/model_types.py +17 -0
- ai_pipeline_core/logging/__init__.py +10 -0
- ai_pipeline_core/logging/logging.yml +66 -0
- ai_pipeline_core/logging/logging_config.py +154 -0
- ai_pipeline_core/logging/logging_mixin.py +223 -0
- ai_pipeline_core/prompt_manager.py +115 -0
- ai_pipeline_core/py.typed +0 -0
- ai_pipeline_core/settings.py +24 -0
- ai_pipeline_core/tracing.py +205 -0
- ai_pipeline_core-0.1.1.dist-info/METADATA +477 -0
- ai_pipeline_core-0.1.1.dist-info/RECORD +29 -0
- ai_pipeline_core-0.1.1.dist-info/WHEEL +4 -0
- ai_pipeline_core-0.1.1.dist-info/licenses/LICENSE +21 -0
ai_pipeline_core/__init__.py
@@ -0,0 +1,36 @@
"""Pipeline Core - Shared infrastructure for AI pipelines."""

from .documents import Document, DocumentList, FlowDocument, TaskDocument
from .flow import FlowConfig
from .logging import (
    LoggerMixin,
    LoggingConfig,
    StructuredLoggerMixin,
    get_pipeline_logger,
    setup_logging,
)
from .logging import (
    get_pipeline_logger as get_logger,
)
from .prompt_manager import PromptManager
from .settings import settings
from .tracing import trace

__version__ = "0.1.1"

__all__ = [
    "Document",
    "DocumentList",
    "FlowConfig",
    "FlowDocument",
    "get_logger",
    "get_pipeline_logger",
    "LoggerMixin",
    "LoggingConfig",
    "PromptManager",
    "settings",
    "setup_logging",
    "StructuredLoggerMixin",
    "TaskDocument",
    "trace",
]
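Everything above is re-exported flat, so downstream pipelines need only one import path. A minimal consumer sketch (`ReportDocument` is hypothetical, defined here purely for illustration):

# Hypothetical consumer code; ReportDocument is illustrative, not part of this package.
from ai_pipeline_core import DocumentList, FlowDocument, get_logger

logger = get_logger(__name__)


class ReportDocument(FlowDocument):
    """Concrete flow document for generated reports."""


docs = DocumentList([ReportDocument(name="report.md", content=b"# Q1 Report")])
logger.info("Loaded %d document(s)", len(docs))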
ai_pipeline_core/documents/document.py
@@ -0,0 +1,252 @@
import base64
import hashlib
import json
from abc import ABC, abstractmethod
from base64 import b32encode
from enum import StrEnum
from functools import cached_property
from typing import Any, ClassVar, Literal, Self

from pydantic import BaseModel, ConfigDict, field_serializer, field_validator
from ruamel.yaml import YAML

from ai_pipeline_core.exceptions import DocumentNameError, DocumentSizeError

from .mime_type import (
    detect_mime_type,
    is_image_mime_type,
    is_pdf_mime_type,
    is_text_mime_type,
)


class Document(BaseModel, ABC):
    """Abstract base class for all documents."""

    MAX_CONTENT_SIZE: ClassVar[int] = 10 * 1024 * 1024  # 10MB default
    DESCRIPTION_EXTENSION: ClassVar[str] = ".description.md"

    # Optional enum of allowed file names. Subclasses may set this.
    # This is used to validate the document name.
    FILES: ClassVar[type[StrEnum] | None] = None

    name: str
    description: str | None = None
    content: bytes

    # Pydantic configuration
    model_config = ConfigDict(
        frozen=True,  # Make documents immutable
        arbitrary_types_allowed=True,
    )

    @abstractmethod
    def get_base_type(self) -> Literal["flow", "task"]:
        """Get the type of the document - must be implemented by subclasses."""
        raise NotImplementedError("Subclasses must implement this method")

    @property
    def base_type(self) -> Literal["flow", "task"]:
        """Alias for get_base_type(), kept for backward compatibility."""
        return self.get_base_type()

    @property
    def is_flow(self) -> bool:
        """Check if document is a flow document."""
        return self.get_base_type() == "flow"

    @property
    def is_task(self) -> bool:
        """Check if document is a task document."""
        return self.get_base_type() == "task"

    @classmethod
    def get_expected_files(cls) -> list[str] | None:
        """
        Return the list of allowed file names for this document class, or None if unrestricted.
        """
        if cls.FILES is None:
            return None
        try:
            values = [member.value for member in cls.FILES]
        except TypeError:
            raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values")
        if len(values) == 0:
            return None
        return values

    @classmethod
    def validate_file_name(cls, name: str) -> None:
        """
        Optional file-name validation hook.

        Default behavior:
        - If `FILES` enum is defined on the subclass, ensure the **basename** of `name`
          equals one of the enum values (exact string match).
        - If `FILES` is None, do nothing.

        Override this method in subclasses for custom conventions (regex, prefixes, etc.).
        Raise DocumentNameError when invalid.
        """
        if cls.FILES is None:
            return

        try:
            allowed = {str(member.value) for member in cls.FILES}  # type: ignore[arg-type]
        except TypeError:
            raise DocumentNameError(f"{cls.__name__}.FILES must be an Enum of string values")

        if name not in allowed:
            allowed_str = ", ".join(sorted(allowed))
            raise DocumentNameError(f"Invalid filename '{name}'. Allowed names: {allowed_str}")

    @field_validator("name")
    def validate_name(cls, v: str) -> str:
        """Validate document name matches expected patterns and is secure."""
        if v.endswith(cls.DESCRIPTION_EXTENSION):
            raise DocumentNameError(
                f"Document names cannot end with {cls.DESCRIPTION_EXTENSION}: {v}"
            )

        if ".." in v or "\\" in v or "/" in v:
            raise DocumentNameError(f"Invalid filename - contains path traversal characters: {v}")

        if not v or v.startswith(" ") or v.endswith(" "):
            raise DocumentNameError(f"Invalid filename format: {v}")

        cls.validate_file_name(v)

        return v

    @field_validator("content")
    def validate_content(cls, v: bytes) -> bytes:
        """Validate content size."""
        # Check content size limit
        max_size = getattr(cls, "MAX_CONTENT_SIZE", 100 * 1024 * 1024)
        if len(v) > max_size:
            raise DocumentSizeError(
                f"Document size ({len(v)} bytes) exceeds maximum allowed size ({max_size} bytes)"
            )

        return v

    @field_serializer("content")
    def serialize_content(self, v: bytes) -> str:
        """Serialize bytes content to string for JSON serialization."""
        try:
            return v.decode("utf-8")
        except UnicodeDecodeError:
            # Fall back to base64 for binary content
            return base64.b64encode(v).decode("ascii")

    @property
    def id(self) -> str:
        """Return the first 6 characters of the SHA256 hash of the content, encoded in base32."""
        return self.sha256[:6]

    @cached_property
    def sha256(self) -> str:
        """Full SHA256 hash of content, encoded in base32."""
        return b32encode(hashlib.sha256(self.content).digest()).decode("ascii").upper()

    @property
    def size(self) -> int:
        """Size of content in bytes."""
        return len(self.content)

    @cached_property
    def detected_mime_type(self) -> str:
        """Detect MIME type from content using python-magic."""
        return detect_mime_type(self.content, self.name)

    @property
    def mime_type(self) -> str:
        """Get MIME type - uses content detection with fallback to extension."""
        return self.detected_mime_type

    @property
    def is_text(self) -> bool:
        """Check if document is text based on MIME type."""
        return is_text_mime_type(self.mime_type)

    @property
    def is_pdf(self) -> bool:
        """Check if document is PDF."""
        return is_pdf_mime_type(self.mime_type)

    @property
    def is_image(self) -> bool:
        """Check if document is an image."""
        return is_image_mime_type(self.mime_type)

    @property
    def should_be_cached(self) -> bool:
        """Check if document should be cached."""
        return False

    def as_text(self) -> str:
        """Parse document as text."""
        if not self.is_text:
            raise ValueError(f"Document is not text: {self.name}")
        return self.content.decode("utf-8")

    def as_yaml(self) -> Any:
        """Parse document as YAML."""
        if not self.is_text:
            raise ValueError(f"Document is not text: {self.name}")
        return YAML().load(self.content.decode("utf-8"))  # type: ignore

    def as_json(self) -> Any:
        """Parse document as JSON."""
        if not self.is_text:
            raise ValueError(f"Document is not text: {self.name}")
        return json.loads(self.content.decode("utf-8"))

    def serialize_model(self) -> dict[str, Any]:
        """Serialize document to a dictionary with proper encoding."""
        result = {
            "name": self.name,
            "description": self.description,
            "base_type": self.get_base_type(),
            "size": self.size,
            "id": self.id,
            "sha256": self.sha256,
            "mime_type": self.mime_type,
        }

        # Try to encode content as UTF-8, fall back to base64
        if self.is_text or self.mime_type.startswith("text/"):
            try:
                result["content"] = self.content.decode("utf-8")
                result["content_encoding"] = "utf-8"
            except UnicodeDecodeError:
                # For text files with encoding issues, use UTF-8 with replacement
                result["content"] = self.content.decode("utf-8", errors="replace")
                result["content_encoding"] = "utf-8"
        else:
            # Binary content - use base64
            result["content"] = base64.b64encode(self.content).decode("ascii")
            result["content_encoding"] = "base64"

        return result

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Self:
        """Deserialize document from dictionary."""
        # Extract content and encoding
        content_str = data.get("content", "")
        content_encoding = data.get("content_encoding", "utf-8")

        # Decode content based on encoding
        if content_encoding == "base64":
            content = base64.b64decode(content_str)
        else:
            # Default to UTF-8
            content = content_str.encode("utf-8")

        # Create document with the required fields
        return cls(
            name=data["name"],
            content=content,
            description=data.get("description"),
        )
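A sketch of how the `FILES` hook and the serialize/deserialize pair compose, assuming a hypothetical `ConfigDocument` subclass. Note that `DocumentNameError` is a plain `Exception` subclass rather than a `ValueError`, so pydantic should propagate it from the validator unchanged instead of wrapping it in a `ValidationError`:

from enum import StrEnum

from ai_pipeline_core.documents import FlowDocument
from ai_pipeline_core.exceptions import DocumentNameError


class ConfigDocument(FlowDocument):
    """Hypothetical document restricted to two file names."""

    class FILES(StrEnum):
        MAIN = "config.yaml"
        LOCAL = "config.local.yaml"


doc = ConfigDocument(name="config.yaml", content=b"retries: 3\n")
assert doc.is_text and doc.as_yaml()["retries"] == 3

data = doc.serialize_model()           # text content is stored as UTF-8
restored = ConfigDocument.from_dict(data)
assert restored.sha256 == doc.sha256   # content survives the round trip

try:
    ConfigDocument(name="other.yaml", content=b"retries: 0\n")
except DocumentNameError:
    pass  # rejected by validate_file_name: "other.yaml" is not in FILES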
ai_pipeline_core/documents/document_list.py
@@ -0,0 +1,131 @@
from typing import Any, Iterable, SupportsIndex, Union, overload

from typing_extensions import Self

from .document import Document


class DocumentList(list[Document]):
    """
    A specialized list for Document objects with built-in validation.

    Features:
    - Optionally ensures no duplicate filenames within the list
    - Optionally validates that all documents have the same type (for flow outputs)
    - Provides convenience methods for document operations
    - Works with both FlowDocument and TaskDocument classes
    """

    def __init__(
        self,
        documents: list[Document] | None = None,
        validate_same_type: bool = False,
        validate_duplicates: bool = False,
    ) -> None:
        """
        Initialize DocumentList with optional initial documents.

        Args:
            documents: Initial list of documents
            validate_same_type: If True, validates that all documents have the same type.
                Should be True for flow outputs, False for inputs.
            validate_duplicates: If True, rejects duplicate document names.
        """
        super().__init__()
        self._validate_same_type = validate_same_type
        self._validate_duplicates = validate_duplicates
        if documents:
            self.extend(documents)

    def _validate_no_duplicates(self) -> None:
        """Validate that there are no duplicate filenames."""
        if not self._validate_duplicates:
            return

        filenames = [doc.name for doc in self]
        seen: set[str] = set()
        duplicates: list[str] = []
        for name in filenames:
            if name in seen:
                duplicates.append(name)
            seen.add(name)
        if duplicates:
            unique_duplicates = list(set(duplicates))
            raise ValueError(f"Duplicate document names found: {unique_duplicates}")

    def _validate_no_description_files(self) -> None:
        """Validate that no documents have DESCRIPTION_EXTENSION suffix."""
        description_files = [
            doc.name for doc in self if doc.name.endswith(Document.DESCRIPTION_EXTENSION)
        ]
        if description_files:
            raise ValueError(
                f"Documents with {Document.DESCRIPTION_EXTENSION} suffix are not allowed: "
                f"{description_files}"
            )

    def _validate_types(self) -> None:
        """Validate that all documents have the same class type if required."""
        if not self._validate_same_type or not self:
            return

        first_class = type(self[0])
        different_types = [doc for doc in self if type(doc) is not first_class]
        if different_types:
            types = list({type(doc).__name__ for doc in self})
            raise ValueError(f"All documents must have the same type. Found types: {types}")

    def _validate(self) -> None:
        """Run all validations."""
        self._validate_no_duplicates()
        self._validate_no_description_files()
        self._validate_types()

    def append(self, document: Document) -> None:
        """Add a document to the list with validation."""
        super().append(document)
        self._validate()

    def extend(self, documents: Iterable[Document]) -> None:
        """Extend the list with multiple documents with validation."""
        super().extend(documents)
        self._validate()

    def insert(self, index: SupportsIndex, document: Document) -> None:
        """Insert a document at the specified index with validation."""
        super().insert(index, document)
        self._validate()

    @overload
    def __setitem__(self, index: SupportsIndex, value: Document) -> None: ...

    @overload
    def __setitem__(self, index: slice, value: Iterable[Document]) -> None: ...

    def __setitem__(self, index: Union[SupportsIndex, slice], value: Any) -> None:
        """Set item with validation."""
        super().__setitem__(index, value)
        self._validate()

    def __iadd__(self, other: Any) -> "Self":
        """In-place addition with validation."""
        result = super().__iadd__(other)
        self._validate()
        return result

    def filter_by_type(self, document_type: type[Document]) -> "DocumentList":
        """Return a new DocumentList containing only instances of the specified document class."""
        return DocumentList([doc for doc in self if type(doc) is document_type])

    def filter_by_types(self, document_types: list[type[Document]]) -> "DocumentList":
        """Return a new DocumentList containing only instances of the specified document classes."""
        documents = DocumentList()
        for document_type in document_types:
            documents.extend(self.filter_by_type(document_type))
        return documents

    def get_by_name(self, name: str) -> Document | None:
        """Get a document by its name."""
        for doc in self:
            if doc.name == name:
                return doc
        return None
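One behavioral detail worth noting: the mutating methods validate after delegating to `list`, so a failed `append` raises but leaves the offending document in place. A sketch with hypothetical subclasses:

from ai_pipeline_core.documents import DocumentList, FlowDocument, TaskDocument


class InputDoc(FlowDocument):
    """Hypothetical flow document."""


class ScratchDoc(TaskDocument):
    """Hypothetical task document."""


outputs = DocumentList(validate_same_type=True, validate_duplicates=True)
outputs.append(InputDoc(name="a.txt", content=b"alpha"))

try:
    outputs.append(ScratchDoc(name="b.txt", content=b"beta"))
except ValueError:
    pass  # mixed types rejected; note the ScratchDoc is still in the list

assert outputs.get_by_name("a.txt") is not None
assert len(outputs.filter_by_type(InputDoc)) == 1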
ai_pipeline_core/documents/flow_document.py
@@ -0,0 +1,21 @@
"""Flow-specific document base class."""

from typing import Literal, final

from .document import Document


class FlowDocument(Document):
    """
    Abstract base class for flow-specific documents.

    Flow documents represent inputs, outputs, and intermediate results
    within a Prefect flow execution context.

    Compared to TaskDocument, FlowDocuments are persisted across Prefect flow runs.
    """

    @final
    def get_base_type(self) -> Literal["flow"]:
        """Get the document type."""
        return "flow"
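Because `get_base_type` is `@final` and returns a `Literal`, a type checker can treat the flow/task split as exhaustive. A small hypothetical routing sketch:

from ai_pipeline_core.documents import Document, FlowDocument


class SummaryDocument(FlowDocument):
    """Hypothetical persistent flow output."""


def route(doc: Document) -> str:
    # base_type is Literal["flow", "task"], so this branch is exhaustive
    return "persist" if doc.is_flow else "discard"


assert route(SummaryDocument(name="summary.md", content=b"# Done")) == "persist"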
ai_pipeline_core/documents/mime_type.py
@@ -0,0 +1,78 @@
"""MIME type detection utilities for documents."""

import magic

from ai_pipeline_core.logging import get_pipeline_logger

logger = get_pipeline_logger(__name__)


def detect_mime_type(content: bytes, name: str) -> str:
    """Detect MIME type from content using python-magic."""

    try:
        if name.endswith(".md") and content.decode("utf-8"):
            return "text/markdown"
    except UnicodeDecodeError:
        pass

    if len(content) <= 4:
        return "application/x-empty"

    try:
        mime = magic.from_buffer(content[:1024], mime=True)
        return mime
    except (AttributeError, OSError, magic.MagicException) as e:
        logger.warning(f"MIME detection failed for {name}: {e}, falling back to extension")
        return mime_type_from_extension(name)
    except Exception as e:
        logger.error(f"Unexpected error in MIME detection for {name}: {e}")
        return mime_type_from_extension(name)


def mime_type_from_extension(name: str) -> str:
    """Get MIME type based on file extension."""
    ext = name.lower().split(".")[-1] if "." in name else ""

    mime_map = {
        "md": "text/markdown",
        "txt": "text/plain",
        "pdf": "application/pdf",
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "gif": "image/gif",
        "bmp": "image/bmp",
        "webp": "image/webp",
        "json": "application/json",
        "yaml": "application/yaml",
        "yml": "application/yaml",
        "xml": "text/xml",
        "html": "text/html",
        "htm": "text/html",
    }

    return mime_map.get(ext, "application/octet-stream")


def is_text_mime_type(mime_type: str) -> bool:
    """Check if MIME type represents text content"""
    text_types = [
        "text/",
        "application/json",
        "application/xml",
        "application/javascript",
        "application/yaml",
        "application/x-yaml",
    ]
    return any(mime_type.startswith(t) for t in text_types)


def is_pdf_mime_type(mime_type: str) -> bool:
    """Check if MIME type is PDF"""
    return mime_type == "application/pdf"


def is_image_mime_type(mime_type: str) -> bool:
    """Check if MIME type is an image"""
    return mime_type.startswith("image/")
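The detection order is: `.md` extension short-circuit, empty-content guard, libmagic on the first 1 KiB, then extension fallback. A few illustrative calls, with expected values inferred from the code above:

from ai_pipeline_core.documents.mime_type import (
    detect_mime_type,
    is_text_mime_type,
    mime_type_from_extension,
)

print(detect_mime_type(b"# Title\n", "notes.md"))  # text/markdown (extension short-circuit)
print(detect_mime_type(b"", "empty.bin"))          # application/x-empty (content <= 4 bytes)
print(mime_type_from_extension("photo.JPG"))       # image/jpeg (extension is lowercased)
print(is_text_mime_type("application/yaml"))       # True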
ai_pipeline_core/documents/task_document.py
@@ -0,0 +1,22 @@
"""Task-specific document base class."""

from typing import Literal, final

from .document import Document


class TaskDocument(Document):
    """
    Abstract base class for task-specific documents.

    Task documents represent inputs, outputs, and intermediate results
    within a Prefect task execution context.

    Compared to FlowDocument, TaskDocuments are not persisted across Prefect task runs.
    They are used for intermediate results that are not needed after the task completes.
    """

    @final
    def get_base_type(self) -> Literal["task"]:
        """Get the document type."""
        return "task"
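The split matters when deciding what to keep after a run: task documents are scratch space, flow documents are durable. A hypothetical filtering sketch:

from ai_pipeline_core import DocumentList, FlowDocument, TaskDocument


class FinalDoc(FlowDocument):
    """Hypothetical durable output."""


class DraftDoc(TaskDocument):
    """Hypothetical intermediate result."""


docs = DocumentList([
    FinalDoc(name="final.md", content=b"done"),
    DraftDoc(name="draft.md", content=b"work in progress"),
])
persistent = [d for d in docs if d.is_flow]  # keep only durable documents
assert [d.name for d in persistent] == ["final.md"]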
ai_pipeline_core/documents/utils.py
@@ -0,0 +1,33 @@
import re
from urllib.parse import urlparse


def sanitize_url(url: str) -> str:
    """
    Sanitize a URL or query string for use in filenames.

    Removes or replaces characters that are invalid in filenames.
    """
    # Remove protocol if it's a URL
    if url.startswith(("http://", "https://")):
        parsed = urlparse(url)
        # Use domain + path
        url = parsed.netloc + parsed.path

    # Replace invalid filename characters
    sanitized = re.sub(r'[<>:"/\\|?*]', "_", url)

    # Replace multiple underscores with a single one
    sanitized = re.sub(r"_+", "_", sanitized)

    # Remove leading/trailing underscores and dots
    sanitized = sanitized.strip("_.")

    # Limit length to prevent overly long filenames
    if len(sanitized) > 100:
        sanitized = sanitized[:100]

    # Ensure we have something
    if not sanitized:
        sanitized = "unnamed"

    return sanitized
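A few worked examples of the transformation, with expected outputs inferred from the regexes above:

from ai_pipeline_core.documents.utils import sanitize_url

print(sanitize_url("https://example.com/a/b?q=1"))  # example.com_a_b (query string is dropped)
print(sanitize_url("a<b>c"))                        # a_b_c
print(sanitize_url("???"))                          # unnamed (everything was stripped)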
ai_pipeline_core/exceptions.py
@@ -0,0 +1,61 @@
"""Exception hierarchy for AI Pipeline Core."""


class PipelineCoreError(Exception):
    """Base exception for all pipeline errors."""

    pass


class DocumentError(PipelineCoreError):
    """Document-related errors."""

    pass


class DocumentValidationError(DocumentError):
    """Document validation failed."""

    pass


class DocumentSizeError(DocumentValidationError):
    """Document size exceeds limits."""

    pass


class DocumentNameError(DocumentValidationError):
    """Invalid document name."""

    pass


class LLMError(PipelineCoreError):
    """LLM-related errors."""

    pass


class PromptError(PipelineCoreError):
    """Prompt-related errors."""

    pass


class PromptRenderError(PromptError):
    """Failed to render prompt template."""

    pass


class PromptNotFoundError(PromptError):
    """Prompt template not found."""

    pass


class MimeTypeError(DocumentError):
    """MIME type detection or validation error."""

    pass
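Since every error derives from `PipelineCoreError`, callers can choose how precisely to catch. A minimal triage sketch:

from ai_pipeline_core.exceptions import (
    DocumentError,
    DocumentNameError,
    DocumentValidationError,
    MimeTypeError,
    PipelineCoreError,
)


def classify(exc: PipelineCoreError) -> str:
    if isinstance(exc, DocumentValidationError):
        return "reject input"      # DocumentNameError, DocumentSizeError
    if isinstance(exc, DocumentError):
        return "document problem"  # e.g. MimeTypeError
    return "pipeline failure"      # LLMError, PromptError, ...


assert classify(DocumentNameError("bad name")) == "reject input"
assert classify(MimeTypeError("unknown type")) == "document problem"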