ai-pipeline-core 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +1 -1
- ai_pipeline_core/documents/document.py +57 -3
- ai_pipeline_core/documents/mime_type.py +64 -32
- {ai_pipeline_core-0.1.5.dist-info → ai_pipeline_core-0.1.6.dist-info}/METADATA +2 -2
- {ai_pipeline_core-0.1.5.dist-info → ai_pipeline_core-0.1.6.dist-info}/RECORD +7 -7
- {ai_pipeline_core-0.1.5.dist-info → ai_pipeline_core-0.1.6.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.5.dist-info → ai_pipeline_core-0.1.6.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/__init__.py
CHANGED
|
@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
|
|
|
6
6
|
from base64 import b32encode
|
|
7
7
|
from enum import StrEnum
|
|
8
8
|
from functools import cached_property
|
|
9
|
-
from typing import Any, ClassVar, Literal, Self
|
|
9
|
+
from typing import Any, ClassVar, Literal, Self, TypeVar
|
|
10
10
|
|
|
11
11
|
from pydantic import BaseModel, ConfigDict, field_serializer, field_validator
|
|
12
12
|
from ruamel.yaml import YAML
|
|
@@ -19,8 +19,11 @@ from .mime_type import (
|
|
|
19
19
|
is_image_mime_type,
|
|
20
20
|
is_pdf_mime_type,
|
|
21
21
|
is_text_mime_type,
|
|
22
|
+
is_yaml_mime_type,
|
|
22
23
|
)
|
|
23
24
|
|
|
25
|
+
TModel = TypeVar("TModel", bound=BaseModel)
|
|
26
|
+
|
|
24
27
|
|
|
25
28
|
class Document(BaseModel, ABC):
|
|
26
29
|
"""Abstract base class for all documents"""
|
|
@@ -207,15 +210,40 @@ class Document(BaseModel, ABC):
|
|
|
207
210
|
"""Parse document as JSON"""
|
|
208
211
|
return json.loads(self.as_text())
|
|
209
212
|
|
|
213
|
+
def as_pydantic_model(self, model_type: type[TModel]) -> TModel:
|
|
214
|
+
"""Parse document as a pydantic model and return the validated instance"""
|
|
215
|
+
data = self.as_yaml() if is_yaml_mime_type(self.mime_type) else self.as_json()
|
|
216
|
+
return model_type.model_validate(data)
|
|
217
|
+
|
|
210
218
|
def as_markdown_list(self) -> list[str]:
|
|
211
219
|
"""Parse document as a markdown list"""
|
|
212
220
|
return self.as_text().split(self.MARKDOWN_LIST_SEPARATOR)
|
|
213
221
|
|
|
214
222
|
@classmethod
|
|
215
|
-
def create(
|
|
223
|
+
def create(
|
|
224
|
+
cls,
|
|
225
|
+
name: str,
|
|
226
|
+
description: str | None,
|
|
227
|
+
content: bytes | str | BaseModel | list[str] | Any,
|
|
228
|
+
) -> Self:
|
|
216
229
|
"""Create a document from a name, description, and content"""
|
|
217
|
-
|
|
230
|
+
is_yaml_extension = name.endswith(".yaml") or name.endswith(".yml")
|
|
231
|
+
is_json_extension = name.endswith(".json")
|
|
232
|
+
is_markdown_extension = name.endswith(".md")
|
|
233
|
+
is_str_list = isinstance(content, list) and all(isinstance(item, str) for item in content)
|
|
234
|
+
if isinstance(content, bytes):
|
|
235
|
+
pass
|
|
236
|
+
elif isinstance(content, str):
|
|
218
237
|
content = content.encode("utf-8")
|
|
238
|
+
elif is_str_list and is_markdown_extension:
|
|
239
|
+
return cls.create_as_markdown_list(name, description, content) # type: ignore[arg-type]
|
|
240
|
+
elif is_yaml_extension:
|
|
241
|
+
return cls.create_as_yaml(name, description, content)
|
|
242
|
+
elif is_json_extension:
|
|
243
|
+
return cls.create_as_json(name, description, content)
|
|
244
|
+
else:
|
|
245
|
+
raise ValueError(f"Unsupported content type: {type(content)} for {name}")
|
|
246
|
+
|
|
219
247
|
return cls(name=name, description=description, content=content)
|
|
220
248
|
|
|
221
249
|
@classmethod
|
|
@@ -230,6 +258,32 @@ class Document(BaseModel, ABC):
|
|
|
230
258
|
content = Document.MARKDOWN_LIST_SEPARATOR.join(cleaned_items)
|
|
231
259
|
return cls.create(name, description, content)
|
|
232
260
|
|
|
261
|
+
@classmethod
|
|
262
|
+
def create_as_json(cls, name: str, description: str | None, data: Any) -> Self:
|
|
263
|
+
"""Create a document from a name, description, and JSON data"""
|
|
264
|
+
assert name.endswith(".json"), f"Document name must end with .json: {name}"
|
|
265
|
+
if isinstance(data, BaseModel):
|
|
266
|
+
data = data.model_dump(mode="json")
|
|
267
|
+
content = json.dumps(data, indent=2).encode("utf-8")
|
|
268
|
+
return cls.create(name, description, content)
|
|
269
|
+
|
|
270
|
+
@classmethod
|
|
271
|
+
def create_as_yaml(cls, name: str, description: str | None, data: Any) -> Self:
|
|
272
|
+
"""Create a document from a name, description, and YAML data"""
|
|
273
|
+
assert name.endswith(".yaml") or name.endswith(".yml"), (
|
|
274
|
+
f"Document name must end with .yaml or .yml: {name}"
|
|
275
|
+
)
|
|
276
|
+
if isinstance(data, BaseModel):
|
|
277
|
+
data = data.model_dump()
|
|
278
|
+
yaml = YAML()
|
|
279
|
+
yaml.indent(mapping=2, sequence=4, offset=2)
|
|
280
|
+
from io import BytesIO
|
|
281
|
+
|
|
282
|
+
stream = BytesIO()
|
|
283
|
+
yaml.dump(data, stream)
|
|
284
|
+
content = stream.getvalue()
|
|
285
|
+
return cls.create(name, description, content)
|
|
286
|
+
|
|
233
287
|
def serialize_model(self) -> dict[str, Any]:
|
|
234
288
|
"""Serialize document to a dictionary with proper encoding."""
|
|
235
289
|
result = {
|
|
@@ -6,53 +6,75 @@ from ai_pipeline_core.logging import get_pipeline_logger
|
|
|
6
6
|
|
|
7
7
|
logger = get_pipeline_logger(__name__)
|
|
8
8
|
|
|
9
|
+
# Extension to MIME type mapping for common formats
|
|
10
|
+
# These are formats where extension-based detection is more reliable
|
|
11
|
+
EXTENSION_MIME_MAP = {
|
|
12
|
+
"md": "text/markdown",
|
|
13
|
+
"txt": "text/plain",
|
|
14
|
+
"pdf": "application/pdf",
|
|
15
|
+
"png": "image/png",
|
|
16
|
+
"jpg": "image/jpeg",
|
|
17
|
+
"jpeg": "image/jpeg",
|
|
18
|
+
"gif": "image/gif",
|
|
19
|
+
"bmp": "image/bmp",
|
|
20
|
+
"webp": "image/webp",
|
|
21
|
+
"json": "application/json",
|
|
22
|
+
"yaml": "application/yaml",
|
|
23
|
+
"yml": "application/yaml",
|
|
24
|
+
"xml": "text/xml",
|
|
25
|
+
"html": "text/html",
|
|
26
|
+
"htm": "text/html",
|
|
27
|
+
"py": "text/x-python",
|
|
28
|
+
"css": "text/css",
|
|
29
|
+
"js": "application/javascript",
|
|
30
|
+
"ts": "application/typescript",
|
|
31
|
+
"tsx": "application/typescript",
|
|
32
|
+
"jsx": "application/javascript",
|
|
33
|
+
}
|
|
34
|
+
|
|
9
35
|
|
|
10
36
|
def detect_mime_type(content: bytes, name: str) -> str:
|
|
11
|
-
"""Detect MIME type from content
|
|
37
|
+
"""Detect MIME type from content and filename
|
|
12
38
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
39
|
+
Uses a hybrid approach:
|
|
40
|
+
1. Check for empty content
|
|
41
|
+
2. Try extension-based detection for known formats
|
|
42
|
+
3. Fall back to magic content detection
|
|
43
|
+
4. Final fallback to application/octet-stream
|
|
44
|
+
"""
|
|
18
45
|
|
|
19
|
-
|
|
46
|
+
# Check for empty content
|
|
47
|
+
if len(content) == 0:
|
|
20
48
|
return "application/x-empty"
|
|
21
49
|
|
|
50
|
+
# Try extension-based detection first for known formats
|
|
51
|
+
# This is more reliable for text formats that magic might misidentify
|
|
52
|
+
ext = name.lower().split(".")[-1] if "." in name else ""
|
|
53
|
+
if ext in EXTENSION_MIME_MAP:
|
|
54
|
+
return EXTENSION_MIME_MAP[ext]
|
|
55
|
+
|
|
56
|
+
# Try content-based detection with magic
|
|
22
57
|
try:
|
|
23
58
|
mime = magic.from_buffer(content[:1024], mime=True)
|
|
24
|
-
|
|
59
|
+
# If magic returns a valid mime type, use it
|
|
60
|
+
if mime and mime != "application/octet-stream":
|
|
61
|
+
return mime
|
|
25
62
|
except (AttributeError, OSError, magic.MagicException) as e:
|
|
26
|
-
logger.warning(f"MIME detection failed for {name}: {e}
|
|
27
|
-
return mime_type_from_extension(name)
|
|
63
|
+
logger.warning(f"MIME detection failed for {name}: {e}")
|
|
28
64
|
except Exception as e:
|
|
29
65
|
logger.error(f"Unexpected error in MIME detection for {name}: {e}")
|
|
30
|
-
|
|
66
|
+
|
|
67
|
+
# Final fallback based on extension or default
|
|
68
|
+
return EXTENSION_MIME_MAP.get(ext, "application/octet-stream")
|
|
31
69
|
|
|
32
70
|
|
|
33
71
|
def mime_type_from_extension(name: str) -> str:
|
|
34
|
-
"""Get MIME type based on file extension
|
|
35
|
-
ext = name.lower().split(".")[-1] if "." in name else ""
|
|
72
|
+
"""Get MIME type based on file extension
|
|
36
73
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
"png": "image/png",
|
|
42
|
-
"jpg": "image/jpeg",
|
|
43
|
-
"jpeg": "image/jpeg",
|
|
44
|
-
"gif": "image/gif",
|
|
45
|
-
"bmp": "image/bmp",
|
|
46
|
-
"webp": "image/webp",
|
|
47
|
-
"json": "application/json",
|
|
48
|
-
"yaml": "application/yaml",
|
|
49
|
-
"yml": "application/yaml",
|
|
50
|
-
"xml": "text/xml",
|
|
51
|
-
"html": "text/html",
|
|
52
|
-
"htm": "text/html",
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
return mime_map.get(ext, "application/octet-stream")
|
|
74
|
+
Legacy function kept for compatibility
|
|
75
|
+
"""
|
|
76
|
+
ext = name.lower().split(".")[-1] if "." in name else ""
|
|
77
|
+
return EXTENSION_MIME_MAP.get(ext, "application/octet-stream")
|
|
56
78
|
|
|
57
79
|
|
|
58
80
|
def is_text_mime_type(mime_type: str) -> bool:
|
|
@@ -68,6 +90,16 @@ def is_text_mime_type(mime_type: str) -> bool:
|
|
|
68
90
|
return any(mime_type.startswith(t) for t in text_types)
|
|
69
91
|
|
|
70
92
|
|
|
93
|
+
def is_json_mime_type(mime_type: str) -> bool:
|
|
94
|
+
"""Check if MIME type is JSON"""
|
|
95
|
+
return mime_type == "application/json"
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def is_yaml_mime_type(mime_type: str) -> bool:
|
|
99
|
+
"""Check if MIME type is YAML"""
|
|
100
|
+
return mime_type == "application/yaml" or mime_type == "application/x-yaml"
|
|
101
|
+
|
|
102
|
+
|
|
71
103
|
def is_pdf_mime_type(mime_type: str) -> bool:
|
|
72
104
|
"""Check if MIME type is PDF"""
|
|
73
105
|
return mime_type == "application/pdf"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ai-pipeline-core
|
|
3
|
-
Version: 0.1.5
|
|
3
|
+
Version: 0.1.6
|
|
4
4
|
Summary: Core utilities for AI-powered processing pipelines using prefect
|
|
5
5
|
Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
|
|
6
6
|
Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
|
|
@@ -471,7 +471,7 @@ Built with:
|
|
|
471
471
|
|
|
472
472
|
## Stability Notice
|
|
473
473
|
|
|
474
|
-
**Current Version**: 0.1.5
|
|
474
|
+
**Current Version**: 0.1.6
|
|
475
475
|
**Status**: Internal Preview
|
|
476
476
|
**API Stability**: Unstable - Breaking changes expected
|
|
477
477
|
**Recommended Use**: Learning and reference only
|
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
ai_pipeline_core/__init__.py,sha256=
|
|
1
|
+
ai_pipeline_core/__init__.py,sha256=2zVmBkQNdYcy2mQmw8qO1et3a5pv6KMTXfxrcTfA7EM,779
|
|
2
2
|
ai_pipeline_core/exceptions.py,sha256=_vW0Hbw2LGb5tcVvH0YzTKMff7QOPfCRr3w-w_zPyCE,968
|
|
3
3
|
ai_pipeline_core/prompt_manager.py,sha256=XmNUdMIC0WrE9fF0LIcfozAKOGrlYwj8AfXvCndIH-o,4693
|
|
4
4
|
ai_pipeline_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
5
|
ai_pipeline_core/settings.py,sha256=Zl2BPa6IHzh-B5V7cg5mtySr1dhWZQYYKxXz3BwrHlQ,615
|
|
6
6
|
ai_pipeline_core/tracing.py,sha256=_bijptKWXh7V_xENFQGF11-B70rOGwV6g0qdBoF-VCw,7890
|
|
7
7
|
ai_pipeline_core/documents/__init__.py,sha256=rEnKj-sSlZ9WnFlZAmSGVi1P8vnsHmU9O9_YwtP40ms,242
|
|
8
|
-
ai_pipeline_core/documents/document.py,sha256=
|
|
8
|
+
ai_pipeline_core/documents/document.py,sha256=e3IBr0TThucBAaOHvdqv0X--iCcBrqh2jzFTyaOp7O0,12418
|
|
9
9
|
ai_pipeline_core/documents/document_list.py,sha256=HOG_uZDazA9CJB7Lr_tNcDFzb5Ff9RUt0ELWQK_eYNM,4940
|
|
10
10
|
ai_pipeline_core/documents/flow_document.py,sha256=qsV-2JYOMhkvAj7lW54ZNH_4QUclld9h06CoU59tWww,815
|
|
11
|
-
ai_pipeline_core/documents/mime_type.py,sha256=
|
|
11
|
+
ai_pipeline_core/documents/mime_type.py,sha256=sBhNRoBJQ35JoHWhJzBGpp00WFDfMdEX0JZKKkR7QH0,3371
|
|
12
12
|
ai_pipeline_core/documents/task_document.py,sha256=WjHqtl1d60XFBBqewNRdz1OqBErGI0jRx15oQYCTHo8,907
|
|
13
13
|
ai_pipeline_core/documents/utils.py,sha256=BdE4taSl1vrBhxnFbOP5nDA7lXIcvY__AMRTHoaNb5M,2764
|
|
14
14
|
ai_pipeline_core/flow/__init__.py,sha256=_Sji2yY1ICkvVX6QiiGWKzqIXtg9UAiuvhjHSK_gdO8,57
|
|
@@ -23,7 +23,7 @@ ai_pipeline_core/logging/__init__.py,sha256=DOO6ckgnMVXl29Sy7q6jhO-iW96h54pCHQDz
|
|
|
23
23
|
ai_pipeline_core/logging/logging.yml,sha256=YTW48keO_K5bkkb-KXGM7ZuaYKiquLsjsURei8Ql0V4,1353
|
|
24
24
|
ai_pipeline_core/logging/logging_config.py,sha256=6MBz9nnVNvqiLDoyy9-R3sWkn6927Re5hdz4hwTptpI,4903
|
|
25
25
|
ai_pipeline_core/logging/logging_mixin.py,sha256=RDaR2ju2-vKTJRzXGa0DquGPT8_UxahWjvKJnaD0IV8,7810
|
|
26
|
-
ai_pipeline_core-0.1.
|
|
27
|
-
ai_pipeline_core-0.1.
|
|
28
|
-
ai_pipeline_core-0.1.
|
|
29
|
-
ai_pipeline_core-0.1.
|
|
26
|
+
ai_pipeline_core-0.1.6.dist-info/METADATA,sha256=BSLr818JTSrsTGPjOEB7bQoCv4q3ep-0YX55UgJRH4s,15869
|
|
27
|
+
ai_pipeline_core-0.1.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
28
|
+
ai_pipeline_core-0.1.6.dist-info/licenses/LICENSE,sha256=kKj8mfbdWwkyG3U6n7ztB3bAZlEwShTkAsvaY657i3I,1074
|
|
29
|
+
ai_pipeline_core-0.1.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|