ai-pipeline-core 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. ai_pipeline_core/__init__.py +25 -14
  2. ai_pipeline_core/documents/__init__.py +2 -1
  3. ai_pipeline_core/documents/document.py +317 -49
  4. ai_pipeline_core/documents/document_list.py +136 -33
  5. ai_pipeline_core/documents/flow_document.py +8 -29
  6. ai_pipeline_core/documents/task_document.py +6 -27
  7. ai_pipeline_core/documents/temporary_document.py +6 -27
  8. ai_pipeline_core/documents/utils.py +64 -1
  9. ai_pipeline_core/flow/config.py +174 -5
  10. ai_pipeline_core/flow/options.py +2 -2
  11. ai_pipeline_core/llm/__init__.py +6 -1
  12. ai_pipeline_core/llm/ai_messages.py +14 -7
  13. ai_pipeline_core/llm/client.py +143 -55
  14. ai_pipeline_core/llm/model_options.py +20 -5
  15. ai_pipeline_core/llm/model_response.py +77 -29
  16. ai_pipeline_core/llm/model_types.py +38 -40
  17. ai_pipeline_core/logging/__init__.py +0 -2
  18. ai_pipeline_core/logging/logging_config.py +0 -6
  19. ai_pipeline_core/logging/logging_mixin.py +2 -10
  20. ai_pipeline_core/pipeline.py +68 -65
  21. ai_pipeline_core/prefect.py +12 -3
  22. ai_pipeline_core/prompt_manager.py +6 -7
  23. ai_pipeline_core/settings.py +13 -5
  24. ai_pipeline_core/simple_runner/__init__.py +1 -11
  25. ai_pipeline_core/simple_runner/cli.py +13 -12
  26. ai_pipeline_core/simple_runner/simple_runner.py +34 -172
  27. ai_pipeline_core/storage/__init__.py +8 -0
  28. ai_pipeline_core/storage/storage.py +628 -0
  29. ai_pipeline_core/tracing.py +110 -26
  30. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/METADATA +60 -23
  31. ai_pipeline_core-0.2.0.dist-info/RECORD +38 -0
  32. ai_pipeline_core-0.1.13.dist-info/RECORD +0 -36
  33. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/WHEEL +0 -0
  34. {ai_pipeline_core-0.1.13.dist-info → ai_pipeline_core-0.2.0.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/simple_runner/simple_runner.py

@@ -5,24 +5,24 @@ locally without full Prefect orchestration. It handles document I/O,
 flow sequencing, and error management.
 
 Key components:
-- Document I/O from/to filesystem directories
+- Document I/O from/to filesystem directories via FlowConfig
 - Single and multi-flow execution
 - Automatic document validation and passing between flows
 - Step-based execution control (start/end steps)
 
 Directory structure:
     working_dir/
-    ├── InputDocument/     # Documents of type InputDocument
+    ├── inputdocument/     # Documents of type InputDocument (lowercase)
     │   ├── file1.txt
     │   └── file1.txt.description.md  # Optional description
-    └── OutputDocument/    # Documents of type OutputDocument
+    └── outputdocument/    # Documents of type OutputDocument (lowercase)
         └── result.json
 
 Example:
-    >>> from ai_pipeline_core.simple_runner import run_pipeline
+    >>> from ai_pipeline_core import simple_runner
     >>>
     >>> # Run single flow
-    >>> results = await run_pipeline(
+    >>> results = await simple_runner.run_pipeline(
     ...     flow_func=MyFlow,
     ...     config=MyConfig,
     ...     project_name="test",
@@ -31,15 +31,14 @@ Example:
     ...     )
 
 Note:
-    Document directories are named using the canonical_name() method
-    of each document type for consistent organization.
+    Document directories are organized by document type names (lowercase)
+    for consistent structure and easy access.
 """
 
 from pathlib import Path
-from typing import Any, Callable, Sequence, Type
+from typing import Any, Callable, Sequence
 
-from ai_pipeline_core.documents import Document, DocumentList, FlowDocument
-from ai_pipeline_core.flow.config import FlowConfig
+from ai_pipeline_core.documents import DocumentList
 from ai_pipeline_core.flow.options import FlowOptions
 from ai_pipeline_core.logging import get_pipeline_logger
 
@@ -48,147 +47,9 @@ logger = get_pipeline_logger(__name__)
 FlowSequence = Sequence[Callable[..., Any]]
 """Type alias for a sequence of flow functions."""
 
-ConfigSequence = Sequence[Type[FlowConfig]]
-"""Type alias for a sequence of flow configuration classes."""
-
-
-def load_documents_from_directory(
-    base_dir: Path, document_types: Sequence[Type[FlowDocument]]
-) -> DocumentList:
-    """Load documents from filesystem directories by type.
-
-    Scans subdirectories of base_dir for documents matching the provided
-    types. Each document type has its own subdirectory named after its
-    canonical_name().
-
-    Args:
-        base_dir: Base directory containing document subdirectories.
-        document_types: Sequence of FlowDocument subclasses to load.
-            Each type corresponds to a subdirectory.
-
-    Returns:
-        DocumentList containing all successfully loaded documents.
-        Empty list if no documents found or directories don't exist.
-
-    Directory structure:
-        base_dir/
-        ├── DocumentTypeA/    # canonical_name() of DocumentTypeA
-        │   ├── doc1.txt
-        │   ├── doc1.txt.description.md  # Optional description file
-        │   └── doc2.json
-        └── DocumentTypeB/
-            └── data.csv
-
-    File handling:
-        - Document content is read as bytes
-        - Optional .description.md files provide document descriptions
-        - Failed loads are logged but don't stop processing
-        - Non-file entries are skipped
-
-    Example:
-        >>> from my_docs import InputDoc, ConfigDoc
-        >>> docs = load_documents_from_directory(
-        ...     Path("./data"),
-        ...     [InputDoc, ConfigDoc]
-        ... )
-        >>> print(f"Loaded {len(docs)} documents")
-
-    Note:
-        - Uses canonical_name() for directory names (e.g., "InputDocument")
-        - Descriptions are loaded from "{filename}.description.md" files
-        - All file types are supported (determined by document class)
-    """
-    documents = DocumentList()
-
-    for doc_class in document_types:
-        dir_name = doc_class.canonical_name()
-        type_dir = base_dir / dir_name
-
-        if not type_dir.exists() or not type_dir.is_dir():
-            continue
-
-        logger.info(f"Loading documents from {type_dir.relative_to(base_dir)}")
-
-        for file_path in type_dir.iterdir():
-            if not file_path.is_file() or file_path.name.endswith(Document.DESCRIPTION_EXTENSION):
-                continue
-
-            try:
-                content = file_path.read_bytes()
-                doc = doc_class(name=file_path.name, content=content)
-
-                desc_file = file_path.with_name(file_path.name + Document.DESCRIPTION_EXTENSION)
-                if desc_file.exists():
-                    object.__setattr__(doc, "description", desc_file.read_text(encoding="utf-8"))
-
-                documents.append(doc)
-            except Exception as e:
-                logger.error(
-                    f"  Failed to load {file_path.name} as {doc_class.__name__}: {e}", exc_info=True
-                )
-
-    return documents
-
-
-def save_documents_to_directory(base_dir: Path, documents: DocumentList) -> None:
-    """Save documents to filesystem directories by type.
-
-    Creates subdirectories under base_dir for each document type and
-    saves documents with their original filenames. Only FlowDocument
-    instances are saved (temporary documents are skipped).
-
-    Args:
-        base_dir: Base directory for saving document subdirectories.
-            Created if it doesn't exist.
-        documents: DocumentList containing documents to save.
-            Non-FlowDocument instances are silently skipped.
-
-    Side effects:
-        - Creates base_dir and subdirectories as needed
-        - Overwrites existing files with the same name
-        - Logs each saved document
-        - Creates .description.md files for documents with descriptions
-
-    Directory structure created:
-        base_dir/
-        └── DocumentType/    # canonical_name() of document
-            ├── output.json                 # Document content
-            └── output.json.description.md  # Optional description
-
-    Example:
-        >>> docs = DocumentList([
-        ...     OutputDoc(name="result.txt", content=b"data"),
-        ...     OutputDoc(name="stats.json", content=b'{...}')
-        ... ])
-        >>> save_documents_to_directory(Path("./output"), docs)
-        >>> # Creates ./output/OutputDocument/result.txt
-        >>> # and ./output/OutputDocument/stats.json
-
-    Note:
-        - Only FlowDocument subclasses are saved
-        - TaskDocument and other temporary documents are skipped
-        - Descriptions are saved as separate .description.md files
-    """
-    for document in documents:
-        if not isinstance(document, FlowDocument):
-            continue
-
-        dir_name = document.canonical_name()
-        document_dir = base_dir / dir_name
-        document_dir.mkdir(parents=True, exist_ok=True)
-
-        file_path = document_dir / document.name
-        file_path.write_bytes(document.content)
-        logger.info(f"Saved: {dir_name}/{document.name}")
-
-        if document.description:
-            desc_file = file_path.with_name(file_path.name + Document.DESCRIPTION_EXTENSION)
-            desc_file.write_text(document.description, encoding="utf-8")
-
-
 
 async def run_pipeline(
     flow_func: Callable[..., Any],
-    config: Type[FlowConfig],
     project_name: str,
     output_dir: Path,
     flow_options: FlowOptions,
@@ -202,7 +63,7 @@ async def run_pipeline(
 
     The execution proceeds through these steps:
     1. Load input documents from output_dir subdirectories
-    2. Validate input documents against config requirements
+    2. Validate input documents against flow's config requirements
     3. Execute flow function with documents and options
     4. Validate output documents match config.OUTPUT_DOCUMENT_TYPE
     5. Save output documents to output_dir subdirectories
@@ -210,9 +71,7 @@ async def run_pipeline(
     Args:
         flow_func: Async flow function decorated with @pipeline_flow.
             Must accept (project_name, documents, flow_options).
-
-        config: FlowConfig subclass defining input/output document types.
-            Used for validation and directory organization.
+            The flow must have a config attribute set by @pipeline_flow.
 
         project_name: Name of the project/pipeline for logging and tracking.
 
@@ -229,14 +88,14 @@ async def run_pipeline(
         DocumentList containing the flow's output documents.
 
     Raises:
-        RuntimeError: If required input documents are missing.
+        RuntimeError: If required input documents are missing or if
+            flow doesn't have a config attribute.
 
     Example:
-        >>> from my_flows import AnalysisFlow, AnalysisConfig
+        >>> from my_flows import AnalysisFlow
         >>>
         >>> results = await run_pipeline(
         ...     flow_func=AnalysisFlow,
-        ...     config=AnalysisConfig,
         ...     project_name="analysis_001",
         ...     output_dir=Path("./results"),
         ...     flow_options=FlowOptions(temperature=0.7)
@@ -244,8 +103,8 @@ async def run_pipeline(
         ... )
         >>> print(f"Generated {len(results)} documents")
 
     Note:
-        - Flow must be async (decorated with @pipeline_flow)
-        - Input documents are loaded based on config.INPUT_DOCUMENT_TYPES
+        - Flow must be async (decorated with @pipeline_flow with config)
+        - Input documents are loaded based on flow's config.INPUT_DOCUMENT_TYPES
         - Output is validated against config.OUTPUT_DOCUMENT_TYPE
         - All I/O is logged for debugging
     """
@@ -256,7 +115,16 @@ async def run_pipeline(
 
     logger.info(f"Running Flow: {flow_name}")
 
-    input_documents = load_documents_from_directory(output_dir, config.INPUT_DOCUMENT_TYPES)
+    # Get config from the flow function (attached by @pipeline_flow decorator)
+    config = getattr(flow_func, "config", None)
+    if config is None:
+        raise RuntimeError(
+            f"Flow {flow_name} does not have a config attribute. "
+            "Ensure it's decorated with @pipeline_flow(config=YourConfig)"
+        )
+
+    # Load input documents using FlowConfig's new async method
+    input_documents = await config.load_documents(str(output_dir))
 
     if not config.has_input_documents(input_documents):
         raise RuntimeError(f"Missing input documents for flow {flow_name}")
@@ -265,7 +133,8 @@ async def run_pipeline(
 
     config.validate_output_documents(result_documents)
 
-    save_documents_to_directory(output_dir, result_documents)
+    # Save output documents using FlowConfig's new async method
+    await config.save_documents(str(output_dir), result_documents)
 
     logger.info(f"Completed Flow: {flow_name}")
 
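
Net effect of the two hunks above: the module-level load/save helpers deleted earlier in this file are replaced by async methods on the FlowConfig class (flow/config.py grows by +174 lines in this release, presumably where they now live). A sketch of the equivalence, using only the signatures visible in this diff; note the new methods take the directory as str rather than Path:

    # 0.1.13: sync, module-level helpers keyed off the config class
    docs = load_documents_from_directory(output_dir, config.INPUT_DOCUMENT_TYPES)
    save_documents_to_directory(output_dir, result_documents)

    # 0.2.0: async methods on the flow's own config
    docs = await config.load_documents(str(output_dir))
    await config.save_documents(str(output_dir), result_documents)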
@@ -276,7 +145,6 @@ async def run_pipelines(
     project_name: str,
     output_dir: Path,
     flows: FlowSequence,
-    flow_configs: ConfigSequence,
     flow_options: FlowOptions,
     start_step: int = 1,
     end_step: int | None = None,
@@ -288,7 +156,7 @@ async def run_pipelines(
     for debugging and resuming failed pipelines.
 
     Execution proceeds by:
-    1. Validating step indices and sequence lengths
+    1. Validating step indices
     2. For each flow in range [start_step, end_step]:
        a. Loading input documents from output_dir
        b. Executing flow with documents
@@ -305,9 +173,8 @@ async def run_pipelines(
         output_dir: Directory for document I/O between flows.
             Shared by all flows in the sequence.
         flows: Sequence of flow functions to execute in order.
-            Must all be async functions decorated with @pipeline_flow.
-        flow_configs: Sequence of FlowConfig classes corresponding to flows.
-            Must have same length as flows sequence.
+            Must all be async functions decorated with @pipeline_flow
+            with a config parameter.
         flow_options: Options passed to all flows in the sequence.
             Individual flows can use different fields.
         start_step: First flow to execute (1-based index).
@@ -316,8 +183,8 @@ async def run_pipelines(
             None runs through the last flow.
 
     Raises:
-        ValueError: If flows and configs have different lengths, or if
-            start_step or end_step are out of range.
+        ValueError: If start_step or end_step are out of range.
+        RuntimeError: If any flow doesn't have a config attribute.
 
     Example:
         >>> # Run full pipeline
@@ -325,7 +192,6 @@ async def run_pipelines(
         >>> await run_pipelines(
         ...     project_name="analysis",
         ...     output_dir=Path("./work"),
         ...     flows=[ExtractFlow, AnalyzeFlow, SummarizeFlow],
-        ...     flow_configs=[ExtractConfig, AnalyzeConfig, SummaryConfig],
         ...     flow_options=options
         ... )
         >>>
@@ -337,14 +203,12 @@ async def run_pipelines(
         ...     )
 
     Note:
+        - Each flow must be decorated with @pipeline_flow(config=...)
         - Each flow's output must match the next flow's input types
         - Failed flows stop the entire pipeline
        - Progress is logged with step numbers for debugging
         - Documents persist in output_dir between runs
     """
-    if len(flows) != len(flow_configs):
-        raise ValueError("The number of flows and flow configs must match.")
-
     num_steps = len(flows)
     start_index = start_step - 1
     end_index = (end_step if end_step is not None else num_steps) - 1
@@ -360,7 +224,6 @@ async def run_pipelines(
 
     for i in range(start_index, end_index + 1):
         flow_func = flows[i]
-        config = flow_configs[i]
         # For Prefect Flow objects, use their name attribute; for functions, use __name__
         flow_name = getattr(flow_func, "name", None) or getattr(
             flow_func, "__name__", f"flow_{i + 1}"
@@ -371,7 +234,6 @@ async def run_pipelines(
         try:
             await run_pipeline(
                 flow_func=flow_func,
-                config=config,
                 project_name=project_name,
                 output_dir=output_dir,
                 flow_options=flow_options,
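
The same migration applies at the multi-flow level: the parallel flow_configs sequence and its length check are gone, since each flow now carries its own config. A sketch assembled from the docstring examples above (ExtractFlow/AnalyzeFlow/SummarizeFlow, their config classes, and options are placeholder names):

    # 0.1.13: flows and configs passed as parallel, same-length sequences
    await run_pipelines(
        project_name="analysis",
        output_dir=Path("./work"),
        flows=[ExtractFlow, AnalyzeFlow, SummarizeFlow],
        flow_configs=[ExtractConfig, AnalyzeConfig, SummaryConfig],
        flow_options=options,
    )

    # 0.2.0: each flow was defined with @pipeline_flow(config=...), so only flows is needed
    await run_pipelines(
        project_name="analysis",
        output_dir=Path("./work"),
        flows=[ExtractFlow, AnalyzeFlow, SummarizeFlow],
        flow_options=options,
    )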
ai_pipeline_core/storage/__init__.py (new file)

@@ -0,0 +1,8 @@
+"""Storage module for ai_pipeline_core.
+
+@public
+"""
+
+from ai_pipeline_core.storage.storage import ObjectInfo, RetryPolicy, Storage
+
+__all__ = ["Storage", "ObjectInfo", "RetryPolicy"]
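
Only the re-exports are visible in this hunk; the implementation lives in the new ai_pipeline_core/storage/storage.py (+628 lines, item 28 in the file list, not shown here). The one thing this hunk does confirm is the public import surface:

    # Names confirmed by the __all__ above; their methods are defined in storage.py
    from ai_pipeline_core.storage import ObjectInfo, RetryPolicy, Storage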