ai-pipeline-core: 0.1.13 (py3-none-any.whl) → 0.2.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. ai_pipeline_core/__init__.py +25 -14
  2. ai_pipeline_core/documents/__init__.py +2 -1
  3. ai_pipeline_core/documents/document.py +317 -49
  4. ai_pipeline_core/documents/document_list.py +136 -33
  5. ai_pipeline_core/documents/flow_document.py +8 -29
  6. ai_pipeline_core/documents/task_document.py +6 -27
  7. ai_pipeline_core/documents/temporary_document.py +6 -27
  8. ai_pipeline_core/documents/utils.py +64 -1
  9. ai_pipeline_core/flow/config.py +174 -5
  10. ai_pipeline_core/flow/options.py +2 -2
  11. ai_pipeline_core/llm/__init__.py +6 -1
  12. ai_pipeline_core/llm/ai_messages.py +14 -7
  13. ai_pipeline_core/llm/client.py +143 -55
  14. ai_pipeline_core/llm/model_options.py +20 -5
  15. ai_pipeline_core/llm/model_response.py +77 -29
  16. ai_pipeline_core/llm/model_types.py +38 -40
  17. ai_pipeline_core/logging/__init__.py +0 -2
  18. ai_pipeline_core/logging/logging_config.py +0 -6
  19. ai_pipeline_core/logging/logging_mixin.py +2 -10
  20. ai_pipeline_core/pipeline.py +68 -65
  21. ai_pipeline_core/prefect.py +12 -3
  22. ai_pipeline_core/prompt_manager.py +6 -7
  23. ai_pipeline_core/settings.py +13 -5
  24. ai_pipeline_core/simple_runner/__init__.py +1 -11
  25. ai_pipeline_core/simple_runner/cli.py +13 -12
  26. ai_pipeline_core/simple_runner/simple_runner.py +34 -172
  27. ai_pipeline_core/storage/__init__.py +8 -0
  28. ai_pipeline_core/storage/storage.py +628 -0
  29. ai_pipeline_core/tracing.py +110 -26
  30. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/METADATA +60 -23
  31. ai_pipeline_core-0.2.0.dist-info/RECORD +38 -0
  32. ai_pipeline_core-0.1.13.dist-info/RECORD +0 -36
  33. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/WHEEL +0 -0
  34. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/licenses/LICENSE +0 -0

ai_pipeline_core/documents/document_list.py

@@ -17,8 +17,8 @@ class DocumentList(list[Document]):

  Specialized list with validation and filtering for documents.

- Best Practice: Use default constructor in 90% of cases. Only enable
- validate_same_type or validate_duplicates when you explicitly need them.
+ Best Practice: Use default constructor by default, unless instructed otherwise.
+ Only enable validate_same_type or validate_duplicates when you explicitly need them.

  Example:
  >>> # RECOMMENDED - default constructor for most cases
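
A minimal sketch of the recommended pattern from the hunk above, assuming DocumentList is importable from ai_pipeline_core.documents, that validate_same_type and validate_duplicates are optional boolean keyword arguments (their exact signature is not shown in this diff), and that doc_a and doc_b are placeholder Document instances:

    from ai_pipeline_core.documents import DocumentList

    # Default constructor - the recommended path for most pipelines
    docs = DocumentList([doc_a, doc_b])

    # Opt-in validation, only when you explicitly need it
    strict = DocumentList([doc_a, doc_b], validate_same_type=True, validate_duplicates=True)
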
@@ -152,41 +152,109 @@ class DocumentList(list[Document]):
  def filter_by(self, arg: type[Document]) -> "DocumentList": ...

  @overload
- def filter_by(self, arg: list[type[Document]]) -> "DocumentList": ...
+ def filter_by(self, arg: Iterable[type[Document]]) -> "DocumentList": ...

- def filter_by(self, arg: str | type[Document] | list[type[Document]]) -> "DocumentList":
- """Filter documents by name or type(s).
+ @overload
+ def filter_by(self, arg: Iterable[str]) -> "DocumentList": ...
+
+ def filter_by(
+ self, arg: str | type[Document] | Iterable[type[Document]] | Iterable[str]
+ ) -> "DocumentList":
+ """Filter documents by name(s) or type(s).

  @public

+ ALWAYS returns a DocumentList (which may be empty), never raises an exception
+ for no matches. Use this when you want to process all matching documents.
+
  Args:
- arg: Document name (str), single document type, or list of document types.
+ arg: Can be one of:
+ - str: Single document name to filter by
+ - type[Document]: Single document type to filter by (includes subclasses)
+ - Iterable[type[Document]]: Multiple document types to filter by
+ (list, tuple, set, generator, or any iterable)
+ - Iterable[str]: Multiple document names to filter by
+ (list, tuple, set, generator, or any iterable)

  Returns:
- New DocumentList with filtered documents.
+ New DocumentList with filtered documents (may be empty).
+ - Returns ALL matching documents
+ - Empty DocumentList if no matches found

  Raises:
- TypeError: If arg is not a valid type (str, Document type, or list of Document types).
+ TypeError: If arg is not a valid type (not str, type, or iterable),
+ or if iterable contains mixed types (strings and types together).
+ AttributeError: If arg is expected to be iterable but doesn't support iteration.

  Example:
- >>> docs.filter_by("file.txt") # Filter by name
- >>> docs.filter_by(MyDocument) # Filter by type
- >>> docs.filter_by([Doc1, Doc2]) # Filter by multiple types
+ >>> # Returns list with all matching documents
+ >>> matching_docs = docs.filter_by("file.txt") # May be empty
+ >>> for doc in matching_docs:
+ ... process(doc)
+ >>>
+ >>> # Filter by type - returns all instances
+ >>> config_docs = docs.filter_by(ConfigDocument)
+ >>> print(f"Found {len(config_docs)} config documents")
+ >>>
+ >>> # Filter by multiple names
+ >>> important_docs = docs.filter_by(["config.yaml", "settings.json"])
+ >>> if not important_docs: # Check if empty
+ ... print("No important documents found")
  """
  if isinstance(arg, str):
- # Filter by name
+ # Filter by single name
  return DocumentList([doc for doc in self if doc.name == arg])
  elif isinstance(arg, type):
  # Filter by single type (including subclasses)
+ # The type system ensures arg is type[Document] due to overloads
  return DocumentList([doc for doc in self if isinstance(doc, arg)])
- elif isinstance(arg, list): # type: ignore[reportUnnecessaryIsInstance]
- # Filter by multiple types
- documents = DocumentList()
- for document_type in arg:
- documents.extend([doc for doc in self if isinstance(doc, document_type)])
- return documents
  else:
- raise TypeError(f"Invalid argument type for filter_by: {type(arg)}")
+ # Try to consume as iterable
+ try:
+ # Convert to list to check the first element and allow reuse
+ items = list(arg) # type: ignore[arg-type]
+ if not items:
+ return DocumentList()
+
+ first_item = items[0]
+ if isinstance(first_item, str):
+ # Iterable of names - validate all items are strings
+ for item in items:
+ if not isinstance(item, str):
+ raise TypeError(
+ "Iterable must contain only strings or only Document types, "
+ "not mixed types"
+ )
+ names_set = set(items)
+ return DocumentList([doc for doc in self if doc.name in names_set])
+ elif isinstance(first_item, type): # type: ignore[reportUnnecessaryIsInstance]
+ # Iterable of document types - validate all items are types
+ for item in items:
+ if not isinstance(item, type):
+ raise TypeError(
+ "Iterable must contain only strings or only Document types, "
+ "not mixed types"
+ )
+ # Convert to set for efficient lookup
+ types_set = set(items)
+ # Filter documents that match any of the requested types
+ matching = [
+ doc
+ for doc in self
+ if any(isinstance(doc, doc_type) for doc_type in types_set) # type: ignore[arg-type]
+ ]
+ return DocumentList(matching)
+ else:
+ raise TypeError(
+ f"Iterable must contain strings or Document types, "
+ f"got {type(first_item).__name__}"
+ )
+ except (TypeError, AttributeError) as e:
+ # If the error message already mentions Iterable, re-raise it
+ if "Iterable" in str(e) or "strings or Document types" in str(e):
+ raise
+ # Otherwise, provide a generic error message
+ raise TypeError(f"Invalid argument type for filter_by: {type(arg).__name__}") from e

  @overload
  def get_by(self, arg: str) -> Document: ...
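
Since filter_by now accepts any iterable of names or of Document types, a short usage sketch of the new call patterns (ConfigDocument and ReportDocument are hypothetical Document subclasses; the behavior follows the implementation shown in the hunk above):

    # A set of names behaves the same as a list of names
    selected = docs.filter_by({"config.yaml", "settings.json"})

    # A generator of Document types is consumed once and converted to a list internally
    reports = docs.filter_by(t for t in (ConfigDocument, ReportDocument))

    # Mixing names and types in one iterable raises TypeError, per the validation above
    try:
        docs.filter_by(["config.yaml", ConfigDocument])
    except TypeError as exc:
        print(exc)  # "Iterable must contain only strings or only Document types, not mixed types"
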
@@ -201,38 +269,73 @@ class DocumentList(list[Document]):
  def get_by(self, arg: type[Document], required: bool = True) -> Document | None: ...

  def get_by(self, arg: str | type[Document], required: bool = True) -> Document | None:
- """Get a single document by name or type.
+ """Get EXACTLY ONE document by name or type.

  @public

+ IMPORTANT: This method expects to find exactly one matching document.
+ - If no matches and required=True: raises ValueError
+ - If no matches and required=False: returns None
+ - If multiple matches: ALWAYS raises ValueError (ambiguous)
+
+ When required=True (default), you do NOT need to check for None:
+ >>> doc = docs.get_by("config.yaml") # Will raise if not found
+ >>> # No need for: if doc is not None <- This is redundant!
+ >>> print(doc.content) # Safe to use directly
+
  Args:
  arg: Document name (str) or document type.
- required: If True, raises ValueError when not found. If False, returns None.
+ required: If True (default), raises ValueError when not found.
+ If False, returns None when not found.

  Returns:
- The first matching document, or None if not found and required=False.
+ The single matching document, or None if not found and required=False.

  Raises:
- ValueError: If required=True and document not found.
+ ValueError: If required=True and document not found, OR if multiple
+ documents match (ambiguous result).
  TypeError: If arg is not a string or Document type.

  Example:
- >>> doc = docs.get_by("file.txt") # Get by name, raises if not found
- >>> doc = docs.get_by(MyDocument, required=False) # Returns None if not found
+ >>> # CORRECT - No need to check for None when required=True (default)
+ >>> doc = docs.get_by("file.txt") # Raises if not found
+ >>> print(doc.content) # Safe to use directly
+ >>>
+ >>> # When using required=False, check for None
+ >>> doc = docs.get_by("optional.txt", required=False)
+ >>> if doc is not None:
+ ... print(doc.content)
+ >>>
+ >>> # Will raise if multiple documents have same type
+ >>> # Use filter_by() instead if you want all matches
+ >>> try:
+ ... doc = docs.get_by(ConfigDocument) # Error if 2+ configs
+ >>> except ValueError as e:
+ ... configs = docs.filter_by(ConfigDocument) # Get all instead
  """
  if isinstance(arg, str):
- # Get by name
- for doc in self:
- if doc.name == arg:
- return doc
+ # Get by name - collect all matches to check for duplicates
+ matches = [doc for doc in self if doc.name == arg]
+ if len(matches) > 1:
+ raise ValueError(
+ f"Multiple documents found with name '{arg}'. "
+ f"Found {len(matches)} matches. Use filter_by() to get all matches."
+ )
+ if matches:
+ return matches[0]
  if required:
  raise ValueError(f"Document with name '{arg}' not found")
  return None
  elif isinstance(arg, type): # type: ignore[reportUnnecessaryIsInstance]
- # Get by type (including subclasses)
- for doc in self:
- if isinstance(doc, arg):
- return doc
+ # Get by type (including subclasses) - collect all matches
+ matches = [doc for doc in self if isinstance(doc, arg)]
+ if len(matches) > 1:
+ raise ValueError(
+ f"Multiple documents found of type '{arg.__name__}'. "
+ f"Found {len(matches)} matches. Use filter_by() to get all matches."
+ )
+ if matches:
+ return matches[0]
  if required:
  raise ValueError(f"Document of type '{arg.__name__}' not found")
  return None
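
A short sketch contrasting the tightened get_by semantics with filter_by (ConfigDocument is again a hypothetical Document subclass):

    # Exactly one match expected - no None check needed with required=True (the default)
    settings = docs.get_by("settings.json")

    # Optional lookup - returns None instead of raising
    readme = docs.get_by("README.md", required=False)

    # Ambiguous lookups now raise ValueError; fall back to filter_by for all matches
    try:
        config = docs.get_by(ConfigDocument)
    except ValueError:
        configs = docs.filter_by(ConfigDocument)
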

ai_pipeline_core/documents/flow_document.py

@@ -24,40 +24,20 @@ class FlowDocument(Document):
  - Persisted to file system between pipeline steps
  - Survives across multiple flow runs
  - Used for flow inputs and outputs
- - Saved in directories named after the document's canonical name
+ - Saved in directories organized by the document's type/name

  Creating FlowDocuments:
- **Use the `create` classmethod** for most use cases. It handles automatic
- conversion of various content types. Only use __init__ when you have bytes.
-
- >>> from enum import StrEnum
- >>>
- >>> # Simple document with pass:
- >>> class MyDoc(FlowDocument):
- ... pass
- >>>
- >>> # Document with restricted file names:
- >>> class ConfigDoc(FlowDocument):
- ... class FILES(StrEnum):
- ... CONFIG = "config.yaml"
- ... SETTINGS = "settings.json"
- >>>
- >>> # RECOMMENDED - automatic conversion:
- >>> doc = MyDoc.create(name="data.json", content={"key": "value"})
- >>> doc = ConfigDoc.create(name="config.yaml", content={"host": "localhost"})
+ Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
+ See Document.create() for detailed usage examples.

  Persistence:
- Documents are saved to: {output_dir}/{canonical_name}/{filename}
+ Documents are saved under an output directory path associated with the document's type/name.
  For example: output/my_doc/data.json

  Note:
  - Cannot instantiate FlowDocument directly - must subclass
  - Used with FlowConfig to define flow input/output types
  - No additional abstract methods to implement
-
- See Also:
- TaskDocument: For temporary documents within task execution
- TemporaryDocument: For documents that are never persisted
  """

  def __init__(
@@ -66,13 +46,11 @@ class FlowDocument(Document):
  name: str,
  content: bytes,
  description: str | None = None,
+ sources: list[str] = [],
  ) -> None:
  """Initialize a FlowDocument with raw bytes content.

- Important:
- **Most users should use the `create` classmethod instead of __init__.**
- The create method provides automatic content conversion for various types
- (str, dict, list, Pydantic models) while __init__ only accepts bytes.
+ See Document.__init__() for parameter details and usage notes.

  Prevents direct instantiation of the abstract FlowDocument class.
  FlowDocument must be subclassed for specific document types.
@@ -81,6 +59,7 @@ class FlowDocument(Document):
  name: Document filename (required, keyword-only)
  content: Document content as raw bytes (required, keyword-only)
  description: Optional human-readable description (keyword-only)
+ sources: Optional list of strings for provenance tracking

  Raises:
  TypeError: If attempting to instantiate FlowDocument directly
@@ -109,7 +88,7 @@ class FlowDocument(Document):
  """
  if type(self) is FlowDocument:
  raise TypeError("Cannot instantiate abstract FlowDocument class directly")
- super().__init__(name=name, content=content, description=description)
+ super().__init__(name=name, content=content, description=description, sources=sources)

  @final
  def get_base_type(self) -> Literal["flow"]:
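
The main FlowDocument API change in 0.2.0 is the new `sources` parameter, which is forwarded to Document.__init__ for provenance tracking. A minimal sketch of a subclass using it, assuming FlowDocument is importable from ai_pipeline_core.documents; the subclass name and the provenance string are illustrative, and `create()` remains the recommended path when the content is not already bytes:

    from ai_pipeline_core.documents import FlowDocument

    class SummaryDocument(FlowDocument):
        pass

    doc = SummaryDocument(
        name="summary.json",
        content=b'{"summary": "..."}',                         # __init__ takes raw bytes
        sources=["https://example.com/source-report"],         # provenance strings
    )
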

ai_pipeline_core/documents/task_document.py

@@ -29,24 +29,8 @@ class TaskDocument(Document):
  - Reduces persistent I/O for temporary data

  Creating TaskDocuments:
- **Use the `create` classmethod** for most use cases. It handles automatic
- conversion of various content types. Only use __init__ when you have bytes.
-
- >>> from enum import StrEnum
- >>>
- >>> # Simple task document:
- >>> class TempDoc(TaskDocument):
- ... pass
- >>>
- >>> # With restricted files:
- >>> class CacheDoc(TaskDocument):
- ... class FILES(StrEnum):
- ... CACHE = "cache.json"
- ... INDEX = "index.dat"
- >>>
- >>> # RECOMMENDED - automatic conversion:
- >>> doc = TempDoc.create(name="temp.json", content={"status": "processing"})
- >>> doc = CacheDoc.create(name="cache.json", content={"data": [1, 2, 3]})
+ Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
+ See Document.create() for detailed usage examples.

  Use Cases:
  - Intermediate transformation results
@@ -59,10 +43,6 @@ class TaskDocument(Document):
  - Not saved by simple_runner utilities
  - Reduces I/O overhead for temporary data
  - No additional abstract methods to implement
-
- See Also:
- FlowDocument: For documents that persist across flow runs
- TemporaryDocument: Alternative for non-persistent documents
  """

  def __init__(
@@ -71,13 +51,11 @@ class TaskDocument(Document):
  name: str,
  content: bytes,
  description: str | None = None,
+ sources: list[str] = [],
  ) -> None:
  """Initialize a TaskDocument with raw bytes content.

- Important:
- **Most users should use the `create` classmethod instead of __init__.**
- The create method provides automatic content conversion for various types
- (str, dict, list, Pydantic models) while __init__ only accepts bytes.
+ See Document.__init__() for parameter details and usage notes.

  Prevents direct instantiation of the abstract TaskDocument class.
  TaskDocument must be subclassed for specific temporary document types.
@@ -86,6 +64,7 @@ class TaskDocument(Document):
  name: Document filename (required, keyword-only)
  content: Document content as raw bytes (required, keyword-only)
  description: Optional human-readable description (keyword-only)
+ sources: Optional list of strings for provenance tracking

  Raises:
  TypeError: If attempting to instantiate TaskDocument directly
@@ -114,7 +93,7 @@ class TaskDocument(Document):
  """
  if type(self) is TaskDocument:
  raise TypeError("Cannot instantiate abstract TaskDocument class directly")
- super().__init__(name=name, content=content, description=description)
+ super().__init__(name=name, content=content, description=description, sources=sources)

  @final
  def get_base_type(self) -> Literal["task"]:

ai_pipeline_core/documents/temporary_document.py

@@ -1,7 +1,5 @@
  """Temporary document implementation for non-persistent data.

- @public
-
  This module provides the TemporaryDocument class for documents that
  are never persisted, regardless of context.
  """
@@ -15,8 +13,6 @@ from .document import Document
  class TemporaryDocument(Document):
  r"""Concrete document class for data that is never persisted.

- @public
-
  TemporaryDocument is a final (non-subclassable) document type for
  data that should never be saved to disk, regardless of whether it's
  used in a flow or task context. Unlike FlowDocument and TaskDocument
@@ -28,27 +24,14 @@ class TemporaryDocument(Document):
  - Cannot be subclassed (annotated with Python's @final decorator in code)
  - Useful for transient data like API responses or intermediate calculations
  - Ignored by simple_runner save operations
+ - Useful for tests and debugging

  Creating TemporaryDocuments:
- **Use the `create` classmethod** for most use cases. It handles automatic
- conversion of various content types. Only use __init__ when you have bytes.
-
- >>> # RECOMMENDED - automatic conversion:
- >>> doc = TemporaryDocument.create(
- ... name="api_response.json",
- ... content={"status": "ok", "data": [1, 2, 3]}
- ... )
- >>> doc = TemporaryDocument.create(
- ... name="credentials.txt",
- ... content="secret_token_xyz"
- ... )
- >>>
- >>> # Direct constructor - only for bytes:
- >>> doc = TemporaryDocument(
- ... name="binary.dat",
- ... content=b"\x00\x01\x02"
- ... )
- >>>
+ Same as Document - use `create()` for automatic conversion, `__init__` for bytes.
+ Unlike abstract document types, TemporaryDocument can be instantiated directly.
+ See Document.create() for detailed usage examples.
+
+ >>> doc = TemporaryDocument.create(name="api.json", content={"status": "ok"})
  >>> doc.is_temporary # Always True

  Use Cases:
@@ -62,10 +45,6 @@ class TemporaryDocument(Document):
  - This is a final class and cannot be subclassed
  - Use when you explicitly want to prevent persistence
  - Useful for sensitive data that shouldn't be written to disk
-
- See Also:
- FlowDocument: For documents that persist across flow runs
- TaskDocument: For documents temporary within task execution
  """

  def __init_subclass__(cls, **kwargs: Any) -> None:

ai_pipeline_core/documents/utils.py

@@ -1,7 +1,7 @@
  """Utility functions for document handling.

  Provides helper functions for URL sanitization, naming conventions,
- and canonical key generation used throughout the document system.
+ canonical key generation, and hash validation used throughout the document system.
  """

  import re
@@ -115,3 +115,66 @@ def canonical_name_key(
  break

  return camel_to_snake(name)
+
+
+ def is_document_sha256(value: str) -> bool:
+ """Check if a string is a valid base32-encoded SHA256 hash with proper entropy.
+
+ @public
+
+ This function validates that a string is not just formatted like a SHA256 hash,
+ but actually has the entropy characteristics of a real hash. It checks:
+ 1. Correct length (52 characters without padding)
+ 2. Valid base32 characters (A-Z, 2-7)
+ 3. Sufficient entropy (at least 8 unique characters)
+
+ The entropy check prevents false positives like 'AAAAAAA...AAA' from being
+ identified as valid document hashes.
+
+ Args:
+ value: String to check if it's a document SHA256 hash.
+
+ Returns:
+ True if the string appears to be a real base32-encoded SHA256 hash,
+ False otherwise.
+
+ Examples:
+ >>> # Real SHA256 hash
+ >>> is_document_sha256("P3AEMA2PSYILKFYVBUALJLMIYWVZIS2QDI3S5VTMD2X7SOODF2YQ")
+ True
+
+ >>> # Too uniform - lacks entropy
+ >>> is_document_sha256("A" * 52)
+ False
+
+ >>> # Wrong length
+ >>> is_document_sha256("ABC123")
+ False
+
+ >>> # Invalid characters
+ >>> is_document_sha256("a" * 52) # lowercase
+ False
+ """
+ # Check basic format: exactly 52 uppercase base32 characters
+ try:
+ if not value or len(value) != 52:
+ return False
+ except (TypeError, AttributeError):
+ return False
+
+ # Check if all characters are valid base32 (A-Z, 2-7)
+ try:
+ if not re.match(r"^[A-Z2-7]{52}$", value):
+ return False
+ except TypeError:
+ # re.match raises TypeError for non-string types like bytes
+ return False
+
+ # Check entropy: real SHA256 hashes have high entropy
+ # Require at least 8 unique characters (out of 32 possible in base32)
+ # This prevents patterns like "AAAAAAA..." from being identified as real hashes
+ unique_chars = len(set(value))
+ if unique_chars < 8:
+ return False
+
+ return True
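
The 52-character requirement matches an unpadded base32 encoding of a 32-byte SHA-256 digest. A quick validation sketch; hashing raw bytes here illustrates the expected format only and is not necessarily how Document computes its own hash:

    import base64
    import hashlib

    from ai_pipeline_core.documents.utils import is_document_sha256

    digest = hashlib.sha256(b"example content").digest()            # 32 bytes
    encoded = base64.b32encode(digest).decode("ascii").rstrip("=")  # 52 chars of A-Z, 2-7

    print(len(encoded))                  # 52
    print(is_document_sha256(encoded))   # True - real hash with enough entropy
    print(is_document_sha256("A" * 52))  # False - fails the entropy check
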