ai-pipeline-core 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,7 +16,7 @@ from .prompt_manager import PromptManager
16
16
  from .settings import settings
17
17
  from .tracing import trace
18
18
 
19
- __version__ = "0.1.4"
19
+ __version__ = "0.1.6"
20
20
 
21
21
  __all__ = [
22
22
  "Document",
@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
6
6
  from base64 import b32encode
7
7
  from enum import StrEnum
8
8
  from functools import cached_property
9
- from typing import Any, ClassVar, Literal, Self
9
+ from typing import Any, ClassVar, Literal, Self, TypeVar
10
10
 
11
11
  from pydantic import BaseModel, ConfigDict, field_serializer, field_validator
12
12
  from ruamel.yaml import YAML
@@ -19,8 +19,11 @@ from .mime_type import (
19
19
  is_image_mime_type,
20
20
  is_pdf_mime_type,
21
21
  is_text_mime_type,
22
+ is_yaml_mime_type,
22
23
  )
23
24
 
25
+ TModel = TypeVar("TModel", bound=BaseModel)
26
+
24
27
 
25
28
  class Document(BaseModel, ABC):
26
29
  """Abstract base class for all documents"""
@@ -207,15 +210,40 @@ class Document(BaseModel, ABC):
207
210
  """Parse document as JSON"""
208
211
  return json.loads(self.as_text())
209
212
 
213
+ def as_pydantic_model(self, model_type: type[TModel]) -> TModel:
214
+ """Parse document as a pydantic model and return the validated instance"""
215
+ data = self.as_yaml() if is_yaml_mime_type(self.mime_type) else self.as_json()
216
+ return model_type.model_validate(data)
217
+
210
218
  def as_markdown_list(self) -> list[str]:
211
219
  """Parse document as a markdown list"""
212
220
  return self.as_text().split(self.MARKDOWN_LIST_SEPARATOR)
213
221
 
214
222
  @classmethod
215
- def create(cls, name: str, description: str | None, content: bytes | str) -> Self:
223
+ def create(
224
+ cls,
225
+ name: str,
226
+ description: str | None,
227
+ content: bytes | str | BaseModel | list[str] | Any,
228
+ ) -> Self:
216
229
  """Create a document from a name, description, and content"""
217
- if isinstance(content, str):
230
+ is_yaml_extension = name.endswith(".yaml") or name.endswith(".yml")
231
+ is_json_extension = name.endswith(".json")
232
+ is_markdown_extension = name.endswith(".md")
233
+ is_str_list = isinstance(content, list) and all(isinstance(item, str) for item in content)
234
+ if isinstance(content, bytes):
235
+ pass
236
+ elif isinstance(content, str):
218
237
  content = content.encode("utf-8")
238
+ elif is_str_list and is_markdown_extension:
239
+ return cls.create_as_markdown_list(name, description, content) # type: ignore[arg-type]
240
+ elif is_yaml_extension:
241
+ return cls.create_as_yaml(name, description, content)
242
+ elif is_json_extension:
243
+ return cls.create_as_json(name, description, content)
244
+ else:
245
+ raise ValueError(f"Unsupported content type: {type(content)} for {name}")
246
+
219
247
  return cls(name=name, description=description, content=content)
220
248
 
221
249
  @classmethod
@@ -230,6 +258,32 @@ class Document(BaseModel, ABC):
230
258
  content = Document.MARKDOWN_LIST_SEPARATOR.join(cleaned_items)
231
259
  return cls.create(name, description, content)
232
260
 
261
+ @classmethod
262
+ def create_as_json(cls, name: str, description: str | None, data: Any) -> Self:
263
+ """Create a document from a name, description, and JSON data"""
264
+ assert name.endswith(".json"), f"Document name must end with .json: {name}"
265
+ if isinstance(data, BaseModel):
266
+ data = data.model_dump(mode="json")
267
+ content = json.dumps(data, indent=2).encode("utf-8")
268
+ return cls.create(name, description, content)
269
+
270
+ @classmethod
271
+ def create_as_yaml(cls, name: str, description: str | None, data: Any) -> Self:
272
+ """Create a document from a name, description, and YAML data"""
273
+ assert name.endswith(".yaml") or name.endswith(".yml"), (
274
+ f"Document name must end with .yaml or .yml: {name}"
275
+ )
276
+ if isinstance(data, BaseModel):
277
+ data = data.model_dump()
278
+ yaml = YAML()
279
+ yaml.indent(mapping=2, sequence=4, offset=2)
280
+ from io import BytesIO
281
+
282
+ stream = BytesIO()
283
+ yaml.dump(data, stream)
284
+ content = stream.getvalue()
285
+ return cls.create(name, description, content)
286
+
233
287
  def serialize_model(self) -> dict[str, Any]:
234
288
  """Serialize document to a dictionary with proper encoding."""
235
289
  result = {
@@ -6,53 +6,75 @@ from ai_pipeline_core.logging import get_pipeline_logger
6
6
 
7
7
  logger = get_pipeline_logger(__name__)
8
8
 
9
+ # Extension to MIME type mapping for common formats
10
+ # These are formats where extension-based detection is more reliable
11
+ EXTENSION_MIME_MAP = {
12
+ "md": "text/markdown",
13
+ "txt": "text/plain",
14
+ "pdf": "application/pdf",
15
+ "png": "image/png",
16
+ "jpg": "image/jpeg",
17
+ "jpeg": "image/jpeg",
18
+ "gif": "image/gif",
19
+ "bmp": "image/bmp",
20
+ "webp": "image/webp",
21
+ "json": "application/json",
22
+ "yaml": "application/yaml",
23
+ "yml": "application/yaml",
24
+ "xml": "text/xml",
25
+ "html": "text/html",
26
+ "htm": "text/html",
27
+ "py": "text/x-python",
28
+ "css": "text/css",
29
+ "js": "application/javascript",
30
+ "ts": "application/typescript",
31
+ "tsx": "application/typescript",
32
+ "jsx": "application/javascript",
33
+ }
34
+
9
35
 
10
36
  def detect_mime_type(content: bytes, name: str) -> str:
11
- """Detect MIME type from content using python-magic"""
37
+ """Detect MIME type from content and filename
12
38
 
13
- try:
14
- if name.endswith(".md") and content.decode("utf-8"):
15
- return "text/markdown"
16
- except UnicodeDecodeError:
17
- pass
39
+ Uses a hybrid approach:
40
+ 1. Check for empty content
41
+ 2. Try extension-based detection for known formats
42
+ 3. Fall back to magic content detection
43
+ 4. Final fallback to application/octet-stream
44
+ """
18
45
 
19
- if len(content) <= 4:
46
+ # Check for empty content
47
+ if len(content) == 0:
20
48
  return "application/x-empty"
21
49
 
50
+ # Try extension-based detection first for known formats
51
+ # This is more reliable for text formats that magic might misidentify
52
+ ext = name.lower().split(".")[-1] if "." in name else ""
53
+ if ext in EXTENSION_MIME_MAP:
54
+ return EXTENSION_MIME_MAP[ext]
55
+
56
+ # Try content-based detection with magic
22
57
  try:
23
58
  mime = magic.from_buffer(content[:1024], mime=True)
24
- return mime
59
+ # If magic returns a valid mime type, use it
60
+ if mime and mime != "application/octet-stream":
61
+ return mime
25
62
  except (AttributeError, OSError, magic.MagicException) as e:
26
- logger.warning(f"MIME detection failed for {name}: {e}, falling back to extension")
27
- return mime_type_from_extension(name)
63
+ logger.warning(f"MIME detection failed for {name}: {e}")
28
64
  except Exception as e:
29
65
  logger.error(f"Unexpected error in MIME detection for {name}: {e}")
30
- return mime_type_from_extension(name)
66
+
67
+ # Final fallback based on extension or default
68
+ return EXTENSION_MIME_MAP.get(ext, "application/octet-stream")
31
69
 
32
70
 
33
71
  def mime_type_from_extension(name: str) -> str:
34
- """Get MIME type based on file extension"""
35
- ext = name.lower().split(".")[-1] if "." in name else ""
72
+ """Get MIME type based on file extension
36
73
 
37
- mime_map = {
38
- "md": "text/markdown",
39
- "txt": "text/plain",
40
- "pdf": "application/pdf",
41
- "png": "image/png",
42
- "jpg": "image/jpeg",
43
- "jpeg": "image/jpeg",
44
- "gif": "image/gif",
45
- "bmp": "image/bmp",
46
- "webp": "image/webp",
47
- "json": "application/json",
48
- "yaml": "application/yaml",
49
- "yml": "application/yaml",
50
- "xml": "text/xml",
51
- "html": "text/html",
52
- "htm": "text/html",
53
- }
54
-
55
- return mime_map.get(ext, "application/octet-stream")
74
+ Legacy function kept for compatibility
75
+ """
76
+ ext = name.lower().split(".")[-1] if "." in name else ""
77
+ return EXTENSION_MIME_MAP.get(ext, "application/octet-stream")
56
78
 
57
79
 
58
80
  def is_text_mime_type(mime_type: str) -> bool:
@@ -68,6 +90,16 @@ def is_text_mime_type(mime_type: str) -> bool:
68
90
  return any(mime_type.startswith(t) for t in text_types)
69
91
 
70
92
 
93
+ def is_json_mime_type(mime_type: str) -> bool:
94
+ """Check if MIME type is JSON"""
95
+ return mime_type == "application/json"
96
+
97
+
98
+ def is_yaml_mime_type(mime_type: str) -> bool:
99
+ """Check if MIME type is YAML"""
100
+ return mime_type == "application/yaml" or mime_type == "application/x-yaml"
101
+
102
+
71
103
  def is_pdf_mime_type(mime_type: str) -> bool:
72
104
  """Check if MIME type is PDF"""
73
105
  return mime_type == "application/pdf"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ai-pipeline-core
3
- Version: 0.1.5
3
+ Version: 0.1.6
4
4
  Summary: Core utilities for AI-powered processing pipelines using prefect
5
5
  Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
6
6
  Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
@@ -471,7 +471,7 @@ Built with:
471
471
 
472
472
  ## Stability Notice
473
473
 
474
- **Current Version**: 0.1.5
474
+ **Current Version**: 0.1.6
475
475
  **Status**: Internal Preview
476
476
  **API Stability**: Unstable - Breaking changes expected
477
477
  **Recommended Use**: Learning and reference only
@@ -1,14 +1,14 @@
1
- ai_pipeline_core/__init__.py,sha256=xjZh6D4fkepTm3LK42qaTpNHgN2meK4uthrW9pjwLfE,779
1
+ ai_pipeline_core/__init__.py,sha256=2zVmBkQNdYcy2mQmw8qO1et3a5pv6KMTXfxrcTfA7EM,779
2
2
  ai_pipeline_core/exceptions.py,sha256=_vW0Hbw2LGb5tcVvH0YzTKMff7QOPfCRr3w-w_zPyCE,968
3
3
  ai_pipeline_core/prompt_manager.py,sha256=XmNUdMIC0WrE9fF0LIcfozAKOGrlYwj8AfXvCndIH-o,4693
4
4
  ai_pipeline_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  ai_pipeline_core/settings.py,sha256=Zl2BPa6IHzh-B5V7cg5mtySr1dhWZQYYKxXz3BwrHlQ,615
6
6
  ai_pipeline_core/tracing.py,sha256=_bijptKWXh7V_xENFQGF11-B70rOGwV6g0qdBoF-VCw,7890
7
7
  ai_pipeline_core/documents/__init__.py,sha256=rEnKj-sSlZ9WnFlZAmSGVi1P8vnsHmU9O9_YwtP40ms,242
8
- ai_pipeline_core/documents/document.py,sha256=cKa9v0GpDYh48Aqj0u_tCwh4_uCcn5YsKzC537f0v0I,10089
8
+ ai_pipeline_core/documents/document.py,sha256=e3IBr0TThucBAaOHvdqv0X--iCcBrqh2jzFTyaOp7O0,12418
9
9
  ai_pipeline_core/documents/document_list.py,sha256=HOG_uZDazA9CJB7Lr_tNcDFzb5Ff9RUt0ELWQK_eYNM,4940
10
10
  ai_pipeline_core/documents/flow_document.py,sha256=qsV-2JYOMhkvAj7lW54ZNH_4QUclld9h06CoU59tWww,815
11
- ai_pipeline_core/documents/mime_type.py,sha256=tMWGH9PVmHe6a_IzdaJUqIHf4qnwQOwOCBhsgW2AyTE,2244
11
+ ai_pipeline_core/documents/mime_type.py,sha256=sBhNRoBJQ35JoHWhJzBGpp00WFDfMdEX0JZKKkR7QH0,3371
12
12
  ai_pipeline_core/documents/task_document.py,sha256=WjHqtl1d60XFBBqewNRdz1OqBErGI0jRx15oQYCTHo8,907
13
13
  ai_pipeline_core/documents/utils.py,sha256=BdE4taSl1vrBhxnFbOP5nDA7lXIcvY__AMRTHoaNb5M,2764
14
14
  ai_pipeline_core/flow/__init__.py,sha256=_Sji2yY1ICkvVX6QiiGWKzqIXtg9UAiuvhjHSK_gdO8,57
@@ -23,7 +23,7 @@ ai_pipeline_core/logging/__init__.py,sha256=DOO6ckgnMVXl29Sy7q6jhO-iW96h54pCHQDz
23
23
  ai_pipeline_core/logging/logging.yml,sha256=YTW48keO_K5bkkb-KXGM7ZuaYKiquLsjsURei8Ql0V4,1353
24
24
  ai_pipeline_core/logging/logging_config.py,sha256=6MBz9nnVNvqiLDoyy9-R3sWkn6927Re5hdz4hwTptpI,4903
25
25
  ai_pipeline_core/logging/logging_mixin.py,sha256=RDaR2ju2-vKTJRzXGa0DquGPT8_UxahWjvKJnaD0IV8,7810
26
- ai_pipeline_core-0.1.5.dist-info/METADATA,sha256=U1OIPjGwAGsuyJ3QnhUtJQWMzj-OqkXDuyH6cW8Dq70,15869
27
- ai_pipeline_core-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
28
- ai_pipeline_core-0.1.5.dist-info/licenses/LICENSE,sha256=kKj8mfbdWwkyG3U6n7ztB3bAZlEwShTkAsvaY657i3I,1074
29
- ai_pipeline_core-0.1.5.dist-info/RECORD,,
26
+ ai_pipeline_core-0.1.6.dist-info/METADATA,sha256=BSLr818JTSrsTGPjOEB7bQoCv4q3ep-0YX55UgJRH4s,15869
27
+ ai_pipeline_core-0.1.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
28
+ ai_pipeline_core-0.1.6.dist-info/licenses/LICENSE,sha256=kKj8mfbdWwkyG3U6n7ztB3bAZlEwShTkAsvaY657i3I,1074
29
+ ai_pipeline_core-0.1.6.dist-info/RECORD,,