ai-pipeline-core 0.1.10__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +84 -4
- ai_pipeline_core/documents/__init__.py +9 -0
- ai_pipeline_core/documents/document.py +1034 -151
- ai_pipeline_core/documents/document_list.py +147 -38
- ai_pipeline_core/documents/flow_document.py +112 -11
- ai_pipeline_core/documents/mime_type.py +173 -15
- ai_pipeline_core/documents/task_document.py +117 -12
- ai_pipeline_core/documents/temporary_document.py +84 -5
- ai_pipeline_core/documents/utils.py +41 -9
- ai_pipeline_core/exceptions.py +47 -11
- ai_pipeline_core/flow/__init__.py +2 -0
- ai_pipeline_core/flow/config.py +232 -23
- ai_pipeline_core/flow/options.py +50 -1
- ai_pipeline_core/llm/__init__.py +6 -0
- ai_pipeline_core/llm/ai_messages.py +125 -27
- ai_pipeline_core/llm/client.py +278 -26
- ai_pipeline_core/llm/model_options.py +130 -1
- ai_pipeline_core/llm/model_response.py +239 -35
- ai_pipeline_core/llm/model_types.py +67 -0
- ai_pipeline_core/logging/__init__.py +13 -0
- ai_pipeline_core/logging/logging_config.py +72 -20
- ai_pipeline_core/logging/logging_mixin.py +38 -32
- ai_pipeline_core/pipeline.py +308 -60
- ai_pipeline_core/prefect.py +48 -1
- ai_pipeline_core/prompt_manager.py +209 -24
- ai_pipeline_core/settings.py +108 -4
- ai_pipeline_core/simple_runner/__init__.py +5 -0
- ai_pipeline_core/simple_runner/cli.py +96 -11
- ai_pipeline_core/simple_runner/simple_runner.py +237 -4
- ai_pipeline_core/tracing.py +232 -30
- ai_pipeline_core-0.1.11.dist-info/METADATA +450 -0
- ai_pipeline_core-0.1.11.dist-info/RECORD +36 -0
- ai_pipeline_core-0.1.10.dist-info/METADATA +0 -538
- ai_pipeline_core-0.1.10.dist-info/RECORD +0 -36
- {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.11.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.1.10.dist-info → ai_pipeline_core-0.1.11.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
"""Type-safe list container for Document objects.
|
|
2
|
+
|
|
3
|
+
@public
|
|
4
|
+
"""
|
|
5
|
+
|
|
1
6
|
from typing import Any, Iterable, SupportsIndex, Union, overload
|
|
2
7
|
|
|
3
8
|
from typing_extensions import Self
|
|
@@ -6,14 +11,25 @@ from .document import Document
|
|
|
6
11
|
|
|
7
12
|
|
|
8
13
|
class DocumentList(list[Document]):
|
|
9
|
-
"""
|
|
10
|
-
|
|
14
|
+
"""Type-safe container for Document objects.
|
|
15
|
+
|
|
16
|
+
@public
|
|
17
|
+
|
|
18
|
+
Specialized list with validation and filtering for documents.
|
|
19
|
+
|
|
20
|
+
Best Practice: Use default constructor in 90% of cases. Only enable
|
|
21
|
+
validate_same_type or validate_duplicates when you explicitly need them.
|
|
11
22
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
23
|
+
Example:
|
|
24
|
+
>>> # RECOMMENDED - default constructor for most cases
|
|
25
|
+
>>> docs = DocumentList([doc1, doc2])
|
|
26
|
+
>>> # Or empty initialization
|
|
27
|
+
>>> docs = DocumentList()
|
|
28
|
+
>>> docs.append(MyDocument(name="file.txt", content=b"data"))
|
|
29
|
+
>>>
|
|
30
|
+
>>> # Only use validation flags when specifically needed:
|
|
31
|
+
>>> docs = DocumentList(validate_same_type=True) # Rare use case
|
|
32
|
+
>>> doc = docs.get_by("file.txt") # Get by name
|
|
17
33
|
"""
|
|
18
34
|
|
|
19
35
|
def __init__(
|
|
@@ -22,13 +38,14 @@ class DocumentList(list[Document]):
|
|
|
22
38
|
validate_same_type: bool = False,
|
|
23
39
|
validate_duplicates: bool = False,
|
|
24
40
|
) -> None:
|
|
25
|
-
"""
|
|
26
|
-
|
|
41
|
+
"""Initialize DocumentList.
|
|
42
|
+
|
|
43
|
+
@public
|
|
27
44
|
|
|
28
45
|
Args:
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
46
|
+
documents: Initial list of documents.
|
|
47
|
+
validate_same_type: Enforce same document type.
|
|
48
|
+
validate_duplicates: Prevent duplicate filenames.
|
|
32
49
|
"""
|
|
33
50
|
super().__init__()
|
|
34
51
|
self._validate_same_type = validate_same_type
|
|
@@ -37,7 +54,11 @@ class DocumentList(list[Document]):
|
|
|
37
54
|
self.extend(documents)
|
|
38
55
|
|
|
39
56
|
def _validate_no_duplicates(self) -> None:
|
|
40
|
-
"""
|
|
57
|
+
"""Check for duplicate document names.
|
|
58
|
+
|
|
59
|
+
Raises:
|
|
60
|
+
ValueError: If duplicate document names are found.
|
|
61
|
+
"""
|
|
41
62
|
if not self._validate_duplicates:
|
|
42
63
|
return
|
|
43
64
|
|
|
@@ -53,7 +74,11 @@ class DocumentList(list[Document]):
|
|
|
53
74
|
raise ValueError(f"Duplicate document names found: {unique_duplicates}")
|
|
54
75
|
|
|
55
76
|
def _validate_no_description_files(self) -> None:
|
|
56
|
-
"""
|
|
77
|
+
"""Ensure no documents use reserved description file extension.
|
|
78
|
+
|
|
79
|
+
Raises:
|
|
80
|
+
ValueError: If any document uses the reserved description file extension.
|
|
81
|
+
"""
|
|
57
82
|
description_files = [
|
|
58
83
|
doc.name for doc in self if doc.name.endswith(Document.DESCRIPTION_EXTENSION)
|
|
59
84
|
]
|
|
@@ -64,7 +89,11 @@ class DocumentList(list[Document]):
|
|
|
64
89
|
)
|
|
65
90
|
|
|
66
91
|
def _validate_types(self) -> None:
|
|
67
|
-
"""
|
|
92
|
+
"""Ensure all documents are of the same class type.
|
|
93
|
+
|
|
94
|
+
Raises:
|
|
95
|
+
ValueError: If documents have different class types.
|
|
96
|
+
"""
|
|
68
97
|
if not self._validate_same_type or not self:
|
|
69
98
|
return
|
|
70
99
|
|
|
@@ -75,23 +104,23 @@ class DocumentList(list[Document]):
|
|
|
75
104
|
raise ValueError(f"All documents must have the same type. Found types: {types}")
|
|
76
105
|
|
|
77
106
|
def _validate(self) -> None:
|
|
78
|
-
"""Run all
|
|
107
|
+
"""Run all configured validation checks."""
|
|
79
108
|
self._validate_no_duplicates()
|
|
80
109
|
self._validate_no_description_files()
|
|
81
110
|
self._validate_types()
|
|
82
111
|
|
|
83
112
|
def append(self, document: Document) -> None:
|
|
84
|
-
"""Add a document to the
|
|
113
|
+
"""Add a document to the end of the list."""
|
|
85
114
|
super().append(document)
|
|
86
115
|
self._validate()
|
|
87
116
|
|
|
88
117
|
def extend(self, documents: Iterable[Document]) -> None:
|
|
89
|
-
"""
|
|
118
|
+
"""Add multiple documents to the list."""
|
|
90
119
|
super().extend(documents)
|
|
91
120
|
self._validate()
|
|
92
121
|
|
|
93
122
|
def insert(self, index: SupportsIndex, document: Document) -> None:
|
|
94
|
-
"""Insert a document at the specified
|
|
123
|
+
"""Insert a document at the specified position."""
|
|
95
124
|
super().insert(index, document)
|
|
96
125
|
self._validate()
|
|
97
126
|
|
|
@@ -102,30 +131,110 @@ class DocumentList(list[Document]):
|
|
|
102
131
|
def __setitem__(self, index: slice, value: Iterable[Document]) -> None: ...
|
|
103
132
|
|
|
104
133
|
def __setitem__(self, index: Union[SupportsIndex, slice], value: Any) -> None:
|
|
105
|
-
"""Set item with validation."""
|
|
134
|
+
"""Set item or slice with validation."""
|
|
106
135
|
super().__setitem__(index, value)
|
|
107
136
|
self._validate()
|
|
108
137
|
|
|
109
138
|
def __iadd__(self, other: Any) -> "Self":
|
|
110
|
-
"""In-place addition with validation.
|
|
139
|
+
"""In-place addition (+=) with validation.
|
|
140
|
+
|
|
141
|
+
Returns:
|
|
142
|
+
Self: This DocumentList after modification.
|
|
143
|
+
"""
|
|
111
144
|
result = super().__iadd__(other)
|
|
112
145
|
self._validate()
|
|
113
146
|
return result
|
|
114
147
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
def
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
148
|
+
@overload
|
|
149
|
+
def filter_by(self, arg: str) -> "DocumentList": ...
|
|
150
|
+
|
|
151
|
+
@overload
|
|
152
|
+
def filter_by(self, arg: type[Document]) -> "DocumentList": ...
|
|
153
|
+
|
|
154
|
+
@overload
|
|
155
|
+
def filter_by(self, arg: list[type[Document]]) -> "DocumentList": ...
|
|
156
|
+
|
|
157
|
+
def filter_by(self, arg: str | type[Document] | list[type[Document]]) -> "DocumentList":
|
|
158
|
+
"""Filter documents by name or type(s).
|
|
159
|
+
|
|
160
|
+
@public
|
|
161
|
+
|
|
162
|
+
Args:
|
|
163
|
+
arg: Document name (str), single document type, or list of document types.
|
|
164
|
+
|
|
165
|
+
Returns:
|
|
166
|
+
New DocumentList with filtered documents.
|
|
167
|
+
|
|
168
|
+
Raises:
|
|
169
|
+
TypeError: If arg is not a valid type (str, Document type, or list of Document types).
|
|
170
|
+
|
|
171
|
+
Example:
|
|
172
|
+
>>> docs.filter_by("file.txt") # Filter by name
|
|
173
|
+
>>> docs.filter_by(MyDocument) # Filter by type
|
|
174
|
+
>>> docs.filter_by([Doc1, Doc2]) # Filter by multiple types
|
|
175
|
+
"""
|
|
176
|
+
if isinstance(arg, str):
|
|
177
|
+
# Filter by name
|
|
178
|
+
return DocumentList([doc for doc in self if doc.name == arg])
|
|
179
|
+
elif isinstance(arg, type):
|
|
180
|
+
# Filter by single type (including subclasses)
|
|
181
|
+
return DocumentList([doc for doc in self if isinstance(doc, arg)])
|
|
182
|
+
elif isinstance(arg, list): # type: ignore[reportUnnecessaryIsInstance]
|
|
183
|
+
# Filter by multiple types
|
|
184
|
+
documents = DocumentList()
|
|
185
|
+
for document_type in arg:
|
|
186
|
+
documents.extend([doc for doc in self if isinstance(doc, document_type)])
|
|
187
|
+
return documents
|
|
188
|
+
else:
|
|
189
|
+
raise TypeError(f"Invalid argument type for filter_by: {type(arg)}")
|
|
190
|
+
|
|
191
|
+
@overload
|
|
192
|
+
def get_by(self, arg: str) -> Document: ...
|
|
193
|
+
|
|
194
|
+
@overload
|
|
195
|
+
def get_by(self, arg: type[Document]) -> Document: ...
|
|
196
|
+
|
|
197
|
+
@overload
|
|
198
|
+
def get_by(self, arg: str, required: bool = True) -> Document | None: ...
|
|
199
|
+
|
|
200
|
+
@overload
|
|
201
|
+
def get_by(self, arg: type[Document], required: bool = True) -> Document | None: ...
|
|
202
|
+
|
|
203
|
+
def get_by(self, arg: str | type[Document], required: bool = True) -> Document | None:
|
|
204
|
+
"""Get a single document by name or type.
|
|
205
|
+
|
|
206
|
+
@public
|
|
207
|
+
|
|
208
|
+
Args:
|
|
209
|
+
arg: Document name (str) or document type.
|
|
210
|
+
required: If True, raises ValueError when not found. If False, returns None.
|
|
211
|
+
|
|
212
|
+
Returns:
|
|
213
|
+
The first matching document, or None if not found and required=False.
|
|
214
|
+
|
|
215
|
+
Raises:
|
|
216
|
+
ValueError: If required=True and document not found.
|
|
217
|
+
TypeError: If arg is not a string or Document type.
|
|
218
|
+
|
|
219
|
+
Example:
|
|
220
|
+
>>> doc = docs.get_by("file.txt") # Get by name, raises if not found
|
|
221
|
+
>>> doc = docs.get_by(MyDocument, required=False) # Returns None if not found
|
|
222
|
+
"""
|
|
223
|
+
if isinstance(arg, str):
|
|
224
|
+
# Get by name
|
|
225
|
+
for doc in self:
|
|
226
|
+
if doc.name == arg:
|
|
227
|
+
return doc
|
|
228
|
+
if required:
|
|
229
|
+
raise ValueError(f"Document with name '{arg}' not found")
|
|
230
|
+
return None
|
|
231
|
+
elif isinstance(arg, type): # type: ignore[reportUnnecessaryIsInstance]
|
|
232
|
+
# Get by type (including subclasses)
|
|
233
|
+
for doc in self:
|
|
234
|
+
if isinstance(doc, arg):
|
|
235
|
+
return doc
|
|
236
|
+
if required:
|
|
237
|
+
raise ValueError(f"Document of type '{arg.__name__}' not found")
|
|
238
|
+
return None
|
|
239
|
+
else:
|
|
240
|
+
raise TypeError(f"Invalid argument type for get_by: {type(arg)}")
|
|
@@ -1,27 +1,128 @@
|
|
|
1
|
-
"""Flow-specific document base class.
|
|
1
|
+
"""Flow-specific document base class for persistent pipeline data.
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
@public
|
|
4
|
+
|
|
5
|
+
This module provides the FlowDocument abstract base class for documents
|
|
6
|
+
that need to persist across Prefect flow runs and between pipeline steps.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Literal, final
|
|
4
10
|
|
|
5
11
|
from .document import Document
|
|
6
12
|
|
|
7
13
|
|
|
8
14
|
class FlowDocument(Document):
|
|
9
|
-
"""
|
|
10
|
-
|
|
15
|
+
"""Abstract base class for documents that persist across flow runs.
|
|
16
|
+
|
|
17
|
+
@public
|
|
18
|
+
|
|
19
|
+
FlowDocument is used for data that needs to be saved between pipeline
|
|
20
|
+
steps and across multiple flow executions. These documents are typically
|
|
21
|
+
written to the file system using the simple_runner utilities.
|
|
22
|
+
|
|
23
|
+
Key characteristics:
|
|
24
|
+
- Persisted to file system between pipeline steps
|
|
25
|
+
- Survives across multiple flow runs
|
|
26
|
+
- Used for flow inputs and outputs
|
|
27
|
+
- Saved in directories named after the document's canonical name
|
|
28
|
+
|
|
29
|
+
Creating FlowDocuments:
|
|
30
|
+
**Use the `create` classmethod** for most use cases. It handles automatic
|
|
31
|
+
conversion of various content types. Only use __init__ when you have bytes.
|
|
32
|
+
|
|
33
|
+
>>> from enum import StrEnum
|
|
34
|
+
>>>
|
|
35
|
+
>>> # Simple document with pass:
|
|
36
|
+
>>> class MyDoc(FlowDocument):
|
|
37
|
+
... pass
|
|
38
|
+
>>>
|
|
39
|
+
>>> # Document with restricted file names:
|
|
40
|
+
>>> class ConfigDoc(FlowDocument):
|
|
41
|
+
... class FILES(StrEnum):
|
|
42
|
+
... CONFIG = "config.yaml"
|
|
43
|
+
... SETTINGS = "settings.json"
|
|
44
|
+
>>>
|
|
45
|
+
>>> # RECOMMENDED - automatic conversion:
|
|
46
|
+
>>> doc = MyDoc.create(name="data.json", content={"key": "value"})
|
|
47
|
+
>>> doc = ConfigDoc.create(name="config.yaml", content={"host": "localhost"})
|
|
48
|
+
|
|
49
|
+
Persistence:
|
|
50
|
+
Documents are saved to: {output_dir}/{canonical_name}/{filename}
|
|
51
|
+
For example: output/my_doc/data.json
|
|
11
52
|
|
|
12
|
-
|
|
13
|
-
|
|
53
|
+
Note:
|
|
54
|
+
- Cannot instantiate FlowDocument directly - must subclass
|
|
55
|
+
- Used with FlowConfig to define flow input/output types
|
|
56
|
+
- No additional abstract methods to implement
|
|
14
57
|
|
|
15
|
-
|
|
58
|
+
See Also:
|
|
59
|
+
TaskDocument: For temporary documents within task execution
|
|
60
|
+
TemporaryDocument: For documents that are never persisted
|
|
16
61
|
"""
|
|
17
62
|
|
|
18
|
-
def __init__(
|
|
19
|
-
|
|
63
|
+
def __init__(
|
|
64
|
+
self,
|
|
65
|
+
*,
|
|
66
|
+
name: str,
|
|
67
|
+
content: bytes,
|
|
68
|
+
description: str | None = None,
|
|
69
|
+
) -> None:
|
|
70
|
+
"""Initialize a FlowDocument with raw bytes content.
|
|
71
|
+
|
|
72
|
+
Important:
|
|
73
|
+
**Most users should use the `create` classmethod instead of __init__.**
|
|
74
|
+
The create method provides automatic content conversion for various types
|
|
75
|
+
(str, dict, list, Pydantic models) while __init__ only accepts bytes.
|
|
76
|
+
|
|
77
|
+
Prevents direct instantiation of the abstract FlowDocument class.
|
|
78
|
+
FlowDocument must be subclassed for specific document types.
|
|
79
|
+
|
|
80
|
+
Args:
|
|
81
|
+
name: Document filename (required, keyword-only)
|
|
82
|
+
content: Document content as raw bytes (required, keyword-only)
|
|
83
|
+
description: Optional human-readable description (keyword-only)
|
|
84
|
+
|
|
85
|
+
Raises:
|
|
86
|
+
TypeError: If attempting to instantiate FlowDocument directly
|
|
87
|
+
instead of using a concrete subclass.
|
|
88
|
+
|
|
89
|
+
Example:
|
|
90
|
+
>>> from enum import StrEnum
|
|
91
|
+
>>>
|
|
92
|
+
>>> # Simple subclass:
|
|
93
|
+
>>> class MyFlowDoc(FlowDocument):
|
|
94
|
+
... pass
|
|
95
|
+
>>>
|
|
96
|
+
>>> # With FILES restriction:
|
|
97
|
+
>>> class RestrictedDoc(FlowDocument):
|
|
98
|
+
... class FILES(StrEnum):
|
|
99
|
+
... DATA = "data.json"
|
|
100
|
+
... METADATA = "metadata.yaml"
|
|
101
|
+
>>>
|
|
102
|
+
>>> # Direct constructor - only for bytes:
|
|
103
|
+
>>> doc = MyFlowDoc(name="test.bin", content=b"raw data")
|
|
104
|
+
>>>
|
|
105
|
+
>>> # RECOMMENDED - use create for automatic conversion:
|
|
106
|
+
>>> doc = RestrictedDoc.create(name="data.json", content={"key": "value"})
|
|
107
|
+
>>> # This would raise DocumentNameError:
|
|
108
|
+
>>> # doc = RestrictedDoc.create(name="other.json", content={})
|
|
109
|
+
"""
|
|
20
110
|
if type(self) is FlowDocument:
|
|
21
111
|
raise TypeError("Cannot instantiate abstract FlowDocument class directly")
|
|
22
|
-
super().__init__(
|
|
112
|
+
super().__init__(name=name, content=content, description=description)
|
|
23
113
|
|
|
24
114
|
@final
|
|
25
115
|
def get_base_type(self) -> Literal["flow"]:
|
|
26
|
-
"""
|
|
116
|
+
"""Return the base type identifier for flow documents.
|
|
117
|
+
|
|
118
|
+
This method is final and cannot be overridden by subclasses.
|
|
119
|
+
It identifies this document as a flow-persistent document.
|
|
120
|
+
|
|
121
|
+
Returns:
|
|
122
|
+
"flow" - Indicates this document persists across flow runs.
|
|
123
|
+
|
|
124
|
+
Note:
|
|
125
|
+
This determines the document's lifecycle and persistence behavior
|
|
126
|
+
in the pipeline system.
|
|
127
|
+
"""
|
|
27
128
|
return "flow"
|
|
@@ -1,4 +1,10 @@
|
|
|
1
|
-
"""MIME type detection utilities for documents
|
|
1
|
+
"""@internal MIME type detection utilities for documents.
|
|
2
|
+
|
|
3
|
+
This module provides functions for detecting and validating MIME types
|
|
4
|
+
from document content and filenames. It uses a hybrid approach combining
|
|
5
|
+
extension-based detection for known formats and content analysis via
|
|
6
|
+
python-magic for unknown files.
|
|
7
|
+
"""
|
|
2
8
|
|
|
3
9
|
import magic
|
|
4
10
|
|
|
@@ -34,15 +40,45 @@ EXTENSION_MIME_MAP = {
|
|
|
34
40
|
|
|
35
41
|
|
|
36
42
|
def detect_mime_type(content: bytes, name: str) -> str:
|
|
37
|
-
"""Detect MIME type from content and filename
|
|
43
|
+
r"""Detect MIME type from document content and filename.
|
|
38
44
|
|
|
39
|
-
Uses a
|
|
40
|
-
1.
|
|
41
|
-
2.
|
|
42
|
-
3.
|
|
43
|
-
4. Final fallback to application/octet-stream
|
|
44
|
-
|
|
45
|
+
Uses a multi-stage detection strategy for maximum accuracy:
|
|
46
|
+
1. Returns 'application/x-empty' for empty content
|
|
47
|
+
2. Uses extension-based detection for known formats (most reliable)
|
|
48
|
+
3. Falls back to python-magic content analysis
|
|
49
|
+
4. Final fallback to extension or 'application/octet-stream'
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
content: Document content as bytes.
|
|
53
|
+
name: Filename with extension.
|
|
54
|
+
|
|
55
|
+
Returns:
|
|
56
|
+
MIME type string (e.g., 'text/plain', 'application/json').
|
|
57
|
+
Never returns None or empty string.
|
|
58
|
+
|
|
59
|
+
Fallback behavior:
|
|
60
|
+
- Empty content: 'application/x-empty'
|
|
61
|
+
- Unknown extension with binary content: 'application/octet-stream'
|
|
62
|
+
- Magic library failure: Falls back to extension or 'application/octet-stream'
|
|
45
63
|
|
|
64
|
+
Performance:
|
|
65
|
+
Only the first 1024 bytes are analyzed for content detection.
|
|
66
|
+
Extension-based detection is O(1) lookup.
|
|
67
|
+
|
|
68
|
+
Note:
|
|
69
|
+
Extension-based detection is preferred for text formats as
|
|
70
|
+
content analysis can sometimes misidentify structured text.
|
|
71
|
+
|
|
72
|
+
Example:
|
|
73
|
+
>>> detect_mime_type(b'{"key": "value"}', "data.json")
|
|
74
|
+
'application/json'
|
|
75
|
+
>>> detect_mime_type(b'Hello World', "text.txt")
|
|
76
|
+
'text/plain'
|
|
77
|
+
>>> detect_mime_type(b'', "empty.txt")
|
|
78
|
+
'application/x-empty'
|
|
79
|
+
>>> detect_mime_type(b'\x89PNG', "image.xyz")
|
|
80
|
+
'image/png' # Magic detects PNG despite wrong extension
|
|
81
|
+
"""
|
|
46
82
|
# Check for empty content
|
|
47
83
|
if len(content) == 0:
|
|
48
84
|
return "application/x-empty"
|
|
@@ -69,16 +105,60 @@ def detect_mime_type(content: bytes, name: str) -> str:
|
|
|
69
105
|
|
|
70
106
|
|
|
71
107
|
def mime_type_from_extension(name: str) -> str:
|
|
72
|
-
"""Get MIME type based on file extension
|
|
108
|
+
"""Get MIME type based solely on file extension.
|
|
109
|
+
|
|
110
|
+
Simple extension-based MIME type detection without content analysis.
|
|
111
|
+
This is a legacy function maintained for backward compatibility.
|
|
112
|
+
|
|
113
|
+
Args:
|
|
114
|
+
name: Filename with extension.
|
|
73
115
|
|
|
74
|
-
|
|
116
|
+
Returns:
|
|
117
|
+
MIME type based on extension, or 'application/octet-stream'
|
|
118
|
+
if extension is unknown.
|
|
119
|
+
|
|
120
|
+
Note:
|
|
121
|
+
Prefer detect_mime_type() for more accurate detection.
|
|
122
|
+
This function only checks the file extension.
|
|
123
|
+
|
|
124
|
+
Example:
|
|
125
|
+
>>> mime_type_from_extension("document.pdf")
|
|
126
|
+
'application/pdf'
|
|
127
|
+
>>> mime_type_from_extension("unknown.xyz")
|
|
128
|
+
'application/octet-stream'
|
|
75
129
|
"""
|
|
76
130
|
ext = name.lower().split(".")[-1] if "." in name else ""
|
|
77
131
|
return EXTENSION_MIME_MAP.get(ext, "application/octet-stream")
|
|
78
132
|
|
|
79
133
|
|
|
80
134
|
def is_text_mime_type(mime_type: str) -> bool:
|
|
81
|
-
"""Check if MIME type represents text content
|
|
135
|
+
"""Check if MIME type represents text-based content.
|
|
136
|
+
|
|
137
|
+
Determines if content can be safely decoded as text.
|
|
138
|
+
Includes common text formats and structured text like JSON/YAML.
|
|
139
|
+
|
|
140
|
+
Args:
|
|
141
|
+
mime_type: MIME type string to check.
|
|
142
|
+
|
|
143
|
+
Returns:
|
|
144
|
+
True if MIME type indicates text content, False otherwise.
|
|
145
|
+
|
|
146
|
+
Recognized as text:
|
|
147
|
+
- Any type starting with 'text/'
|
|
148
|
+
- application/json
|
|
149
|
+
- application/xml
|
|
150
|
+
- application/javascript
|
|
151
|
+
- application/yaml
|
|
152
|
+
- application/x-yaml
|
|
153
|
+
|
|
154
|
+
Example:
|
|
155
|
+
>>> is_text_mime_type('text/plain')
|
|
156
|
+
True
|
|
157
|
+
>>> is_text_mime_type('application/json')
|
|
158
|
+
True
|
|
159
|
+
>>> is_text_mime_type('image/png')
|
|
160
|
+
False
|
|
161
|
+
"""
|
|
82
162
|
text_types = [
|
|
83
163
|
"text/",
|
|
84
164
|
"application/json",
|
|
@@ -91,20 +171,98 @@ def is_text_mime_type(mime_type: str) -> bool:
|
|
|
91
171
|
|
|
92
172
|
|
|
93
173
|
def is_json_mime_type(mime_type: str) -> bool:
|
|
94
|
-
"""Check if MIME type is JSON
|
|
174
|
+
"""Check if MIME type is JSON.
|
|
175
|
+
|
|
176
|
+
Args:
|
|
177
|
+
mime_type: MIME type string to check.
|
|
178
|
+
|
|
179
|
+
Returns:
|
|
180
|
+
True if MIME type is 'application/json', False otherwise.
|
|
181
|
+
|
|
182
|
+
Note:
|
|
183
|
+
Matches only the exact string 'application/json', not variants like
|
|
184
|
+
'application/ld+json' or 'application/vnd.api+json'.
|
|
185
|
+
|
|
186
|
+
Example:
|
|
187
|
+
>>> is_json_mime_type('application/json')
|
|
188
|
+
True
|
|
189
|
+
>>> is_json_mime_type('text/json') # Not standard JSON MIME
|
|
190
|
+
False
|
|
191
|
+
"""
|
|
95
192
|
return mime_type == "application/json"
|
|
96
193
|
|
|
97
194
|
|
|
98
195
|
def is_yaml_mime_type(mime_type: str) -> bool:
|
|
99
|
-
"""Check if MIME type is YAML
|
|
196
|
+
"""Check if MIME type is YAML.
|
|
197
|
+
|
|
198
|
+
Recognizes both the standard and the legacy YAML MIME type.
|
|
199
|
+
|
|
200
|
+
Args:
|
|
201
|
+
mime_type: MIME type string to check.
|
|
202
|
+
|
|
203
|
+
Returns:
|
|
204
|
+
True if MIME type is YAML, False otherwise.
|
|
205
|
+
|
|
206
|
+
Recognized types:
|
|
207
|
+
- application/yaml (standard)
|
|
208
|
+
- application/x-yaml (legacy)
|
|
209
|
+
|
|
210
|
+
Example:
|
|
211
|
+
>>> is_yaml_mime_type('application/yaml')
|
|
212
|
+
True
|
|
213
|
+
>>> is_yaml_mime_type('application/x-yaml')
|
|
214
|
+
True
|
|
215
|
+
"""
|
|
100
216
|
return mime_type == "application/yaml" or mime_type == "application/x-yaml"
|
|
101
217
|
|
|
102
218
|
|
|
103
219
|
def is_pdf_mime_type(mime_type: str) -> bool:
|
|
104
|
-
"""Check if MIME type is PDF
|
|
220
|
+
"""Check if MIME type is PDF.
|
|
221
|
+
|
|
222
|
+
Args:
|
|
223
|
+
mime_type: MIME type string to check.
|
|
224
|
+
|
|
225
|
+
Returns:
|
|
226
|
+
True if MIME type is 'application/pdf', False otherwise.
|
|
227
|
+
|
|
228
|
+
Note:
|
|
229
|
+
PDF documents require special handling in the LLM module
|
|
230
|
+
and are supported by certain vision-capable models.
|
|
231
|
+
|
|
232
|
+
Example:
|
|
233
|
+
>>> is_pdf_mime_type('application/pdf')
|
|
234
|
+
True
|
|
235
|
+
>>> is_pdf_mime_type('text/plain')
|
|
236
|
+
False
|
|
237
|
+
"""
|
|
105
238
|
return mime_type == "application/pdf"
|
|
106
239
|
|
|
107
240
|
|
|
108
241
|
def is_image_mime_type(mime_type: str) -> bool:
|
|
109
|
-
"""Check if MIME type
|
|
242
|
+
"""Check if MIME type represents an image.
|
|
243
|
+
|
|
244
|
+
Args:
|
|
245
|
+
mime_type: MIME type string to check.
|
|
246
|
+
|
|
247
|
+
Returns:
|
|
248
|
+
True if MIME type starts with 'image/', False otherwise.
|
|
249
|
+
|
|
250
|
+
Recognized formats:
|
|
251
|
+
Any MIME type starting with 'image/' including:
|
|
252
|
+
- image/png
|
|
253
|
+
- image/jpeg
|
|
254
|
+
- image/gif
|
|
255
|
+
- image/webp
|
|
256
|
+
- image/svg+xml
|
|
257
|
+
|
|
258
|
+
Note:
|
|
259
|
+
Image documents are automatically encoded for vision-capable
|
|
260
|
+
LLM models in the AIMessages.document_to_prompt() method.
|
|
261
|
+
|
|
262
|
+
Example:
|
|
263
|
+
>>> is_image_mime_type('image/png')
|
|
264
|
+
True
|
|
265
|
+
>>> is_image_mime_type('application/pdf')
|
|
266
|
+
False
|
|
267
|
+
"""
|
|
110
268
|
return mime_type.startswith("image/")
|