ai-pipeline-core 0.1.14__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. ai_pipeline_core/__init__.py +21 -13
  2. ai_pipeline_core/documents/document.py +93 -50
  3. ai_pipeline_core/documents/document_list.py +70 -23
  4. ai_pipeline_core/documents/flow_document.py +2 -6
  5. ai_pipeline_core/documents/task_document.py +0 -4
  6. ai_pipeline_core/documents/temporary_document.py +1 -8
  7. ai_pipeline_core/flow/config.py +174 -5
  8. ai_pipeline_core/llm/__init__.py +1 -1
  9. ai_pipeline_core/llm/ai_messages.py +14 -4
  10. ai_pipeline_core/llm/client.py +116 -59
  11. ai_pipeline_core/llm/model_options.py +2 -5
  12. ai_pipeline_core/llm/model_response.py +17 -16
  13. ai_pipeline_core/llm/model_types.py +0 -4
  14. ai_pipeline_core/logging/__init__.py +0 -2
  15. ai_pipeline_core/logging/logging_config.py +0 -6
  16. ai_pipeline_core/logging/logging_mixin.py +2 -10
  17. ai_pipeline_core/pipeline.py +45 -68
  18. ai_pipeline_core/prefect.py +12 -3
  19. ai_pipeline_core/prompt_manager.py +6 -7
  20. ai_pipeline_core/settings.py +13 -5
  21. ai_pipeline_core/simple_runner/__init__.py +1 -11
  22. ai_pipeline_core/simple_runner/cli.py +13 -12
  23. ai_pipeline_core/simple_runner/simple_runner.py +34 -189
  24. ai_pipeline_core/storage/__init__.py +8 -0
  25. ai_pipeline_core/storage/storage.py +628 -0
  26. ai_pipeline_core/tracing.py +3 -26
  27. {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.0.dist-info}/METADATA +19 -17
  28. ai_pipeline_core-0.2.0.dist-info/RECORD +38 -0
  29. ai_pipeline_core-0.1.14.dist-info/RECORD +0 -36
  30. {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.0.dist-info}/WHEEL +0 -0
  31. {ai_pipeline_core-0.1.14.dist-info → ai_pipeline_core-0.2.0.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/simple_runner/simple_runner.py
@@ -5,24 +5,24 @@ locally without full Prefect orchestration. It handles document I/O,
  flow sequencing, and error management.
 
  Key components:
- - Document I/O from/to filesystem directories
+ - Document I/O from/to filesystem directories via FlowConfig
  - Single and multi-flow execution
  - Automatic document validation and passing between flows
  - Step-based execution control (start/end steps)
 
  Directory structure:
      working_dir/
-     ├── InputDocument/            # Documents of type InputDocument
+     ├── inputdocument/            # Documents of type InputDocument (lowercase)
      │   ├── file1.txt
      │   └── file1.txt.description.md    # Optional description
-     └── OutputDocument/           # Documents of type OutputDocument
+     └── outputdocument/           # Documents of type OutputDocument (lowercase)
          └── result.json
 
  Example:
-     >>> from ai_pipeline_core.simple_runner import run_pipeline
+     >>> from ai_pipeline_core import simple_runner
      >>>
      >>> # Run single flow
-     >>> results = await run_pipeline(
+     >>> results = await simple_runner.run_pipeline(
      ...     flow_func=MyFlow,
      ...     config=MyConfig,
      ...     project_name="test",
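The renamed tree entries above suggest that 0.2.0 derives each document subdirectory by lowercasing the document type's name. A minimal sketch of that assumed mapping follows; the real rule lives in the document classes' canonical_name() (its 0.2.0 implementation is not shown in this diff), so treat the helper below as illustrative only:

```python
# Hypothetical stand-in for a FlowDocument subclass; only the name matters here.
class InputDocument:
    pass


def assumed_directory_name(doc_type: type) -> str:
    """Assumed 0.2.0 naming rule: class name, lowercased."""
    return doc_type.__name__.lower()


print(assumed_directory_name(InputDocument))  # -> "inputdocument"
```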
@@ -31,16 +31,14 @@ Example:
      ...     )
 
  Note:
-     Document directories are named using the canonical_name() method
-     of each document type for consistent organization.
+     Document directories are organized by document type names (lowercase)
+     for consistent structure and easy access.
  """
 
- import json
  from pathlib import Path
- from typing import Any, Callable, Sequence, Type
+ from typing import Any, Callable, Sequence
 
- from ai_pipeline_core.documents import Document, DocumentList, FlowDocument
- from ai_pipeline_core.flow.config import FlowConfig
+ from ai_pipeline_core.documents import DocumentList
  from ai_pipeline_core.flow.options import FlowOptions
  from ai_pipeline_core.logging import get_pipeline_logger
 
@@ -49,163 +47,9 @@ logger = get_pipeline_logger(__name__)
  FlowSequence = Sequence[Callable[..., Any]]
  """Type alias for a sequence of flow functions."""
 
- ConfigSequence = Sequence[Type[FlowConfig]]
- """Type alias for a sequence of flow configuration classes."""
-
-
- def load_documents_from_directory(
-     base_dir: Path, document_types: Sequence[Type[FlowDocument]]
- ) -> DocumentList:
-     """Load documents from filesystem directories by type.
-
-     Scans subdirectories of base_dir for documents matching the provided
-     types. Each document type has its own subdirectory named after its
-     canonical_name().
-
-     Args:
-         base_dir: Base directory containing document subdirectories.
-         document_types: Sequence of FlowDocument subclasses to load.
-             Each type corresponds to a subdirectory.
-
-     Returns:
-         DocumentList containing all successfully loaded documents.
-         Empty list if no documents found or directories don't exist.
-
-     Directory structure:
-         base_dir/
-         ├── DocumentTypeA/    # canonical_name() of DocumentTypeA
-         │   ├── doc1.txt
-         │   ├── doc1.txt.description.md    # Optional description file
-         │   └── doc2.json
-         └── DocumentTypeB/
-             └── data.csv
-
-     File handling:
-         - Document content is read as bytes
-         - Optional .description.md files provide document descriptions
-         - Failed loads are logged but don't stop processing
-         - Non-file entries are skipped
-
-     Example:
-         >>> from my_docs import InputDoc, ConfigDoc
-         >>> docs = load_documents_from_directory(
-         ...     Path("./data"),
-         ...     [InputDoc, ConfigDoc]
-         ... )
-         >>> print(f"Loaded {len(docs)} documents")
-
-     Note:
-         - Uses canonical_name() for directory names (e.g., "InputDocument")
-         - Descriptions are loaded from "{filename}.description.md" files
-         - All file types are supported (determined by document class)
-     """
-     documents = DocumentList()
-
-     for doc_class in document_types:
-         dir_name = doc_class.canonical_name()
-         type_dir = base_dir / dir_name
-
-         if not type_dir.exists() or not type_dir.is_dir():
-             continue
-
-         logger.info(f"Loading documents from {type_dir.relative_to(base_dir)}")
-
-         for file_path in type_dir.iterdir():
-             if not file_path.is_file() or file_path.name.endswith(Document.DESCRIPTION_EXTENSION):
-                 continue
-
-             # Skip .sources.json files - they are metadata, not documents
-             if file_path.name.endswith(".sources.json"):
-                 continue
-
-             try:
-                 content = file_path.read_bytes()
-
-                 # Load sources if .sources.json exists
-                 sources = []
-                 sources_file = file_path.with_name(file_path.name + ".sources.json")
-                 if sources_file.exists():
-                     sources = json.loads(sources_file.read_text(encoding="utf-8"))
-
-                 doc = doc_class(name=file_path.name, content=content, sources=sources)
-
-                 desc_file = file_path.with_name(file_path.name + Document.DESCRIPTION_EXTENSION)
-                 if desc_file.exists():
-                     object.__setattr__(doc, "description", desc_file.read_text(encoding="utf-8"))
-
-                 documents.append(doc)
-             except Exception as e:
-                 logger.error(
-                     f"  Failed to load {file_path.name} as {doc_class.__name__}: {e}", exc_info=True
-                 )
-
-     return documents
-
-
- def save_documents_to_directory(base_dir: Path, documents: DocumentList) -> None:
-     """Save documents to filesystem directories by type.
-
-     Creates subdirectories under base_dir for each document type and
-     saves documents with their original filenames. Only FlowDocument
-     instances are saved (temporary documents are skipped).
-
-     Args:
-         base_dir: Base directory for saving document subdirectories.
-             Created if it doesn't exist.
-         documents: DocumentList containing documents to save.
-             Non-FlowDocument instances are silently skipped.
-
-     Side effects:
-         - Creates base_dir and subdirectories as needed
-         - Overwrites existing files with the same name
-         - Logs each saved document
-         - Creates .description.md files for documents with descriptions
-
-     Directory structure created:
-         base_dir/
-         └── DocumentType/    # canonical_name() of document
-             ├── output.json    # Document content
-             └── output.json.description.md    # Optional description
-
-     Example:
-         >>> docs = DocumentList([
-         ...     OutputDoc(name="result.txt", content=b"data"),
-         ...     OutputDoc(name="stats.json", content=b'{...}')
-         ... ])
-         >>> save_documents_to_directory(Path("./output"), docs)
-         >>> # Creates ./output/OutputDocument/result.txt
-         >>> # and ./output/OutputDocument/stats.json
-
-     Note:
-         - Only FlowDocument subclasses are saved
-         - TaskDocument and other temporary documents are skipped
-         - Descriptions are saved as separate .description.md files
-     """
-     for document in documents:
-         if not isinstance(document, FlowDocument):
-             continue
-
-         dir_name = document.canonical_name()
-         document_dir = base_dir / dir_name
-         document_dir.mkdir(parents=True, exist_ok=True)
-
-         file_path = document_dir / document.name
-         file_path.write_bytes(document.content)
-         logger.info(f"Saved: {dir_name}/{document.name}")
-
-         if document.description:
-             desc_file = file_path.with_name(file_path.name + Document.DESCRIPTION_EXTENSION)
-             desc_file.write_text(document.description, encoding="utf-8")
-
-         # Save sources to .sources.json if present
-         if document.sources:
-             sources_file = file_path.with_name(file_path.name + ".sources.json")
-             sources_file.write_text(json.dumps(document.sources, indent=2), encoding="utf-8")
-
 
  async def run_pipeline(
      flow_func: Callable[..., Any],
-     config: Type[FlowConfig],
      project_name: str,
      output_dir: Path,
      flow_options: FlowOptions,
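The two removed helpers have no direct replacement in this module; the call sites later in this diff route the same I/O through async load_documents/save_documents methods on the flow's config. A hedged migration sketch, with the config passed in generically because only the call shapes (not the FlowConfig method signatures) are visible in this diff:

```python
from pathlib import Path
from typing import Any


async def run_io(config: Any, output_dir: Path, result_docs: Any) -> Any:
    # 0.1.14 (helpers removed above):
    #   docs = load_documents_from_directory(output_dir, config.INPUT_DOCUMENT_TYPES)
    #   save_documents_to_directory(output_dir, result_docs)
    # 0.2.0, matching the call sites in run_pipeline below; note both
    # methods are awaited and take the directory as a string:
    docs = await config.load_documents(str(output_dir))
    await config.save_documents(str(output_dir), result_docs)
    return docs
```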
@@ -219,7 +63,7 @@ async def run_pipeline(
 
      The execution proceeds through these steps:
      1. Load input documents from output_dir subdirectories
-     2. Validate input documents against config requirements
+     2. Validate input documents against flow's config requirements
      3. Execute flow function with documents and options
      4. Validate output documents match config.OUTPUT_DOCUMENT_TYPE
      5. Save output documents to output_dir subdirectories
@@ -227,9 +71,7 @@ async def run_pipeline(
      Args:
          flow_func: Async flow function decorated with @pipeline_flow.
              Must accept (project_name, documents, flow_options).
-
-         config: FlowConfig subclass defining input/output document types.
-             Used for validation and directory organization.
+             The flow must have a config attribute set by @pipeline_flow.
 
          project_name: Name of the project/pipeline for logging and tracking.
 
@@ -246,14 +88,14 @@ async def run_pipeline(
          DocumentList containing the flow's output documents.
 
      Raises:
-         RuntimeError: If required input documents are missing.
+         RuntimeError: If required input documents are missing or if
+             flow doesn't have a config attribute.
 
      Example:
-         >>> from my_flows import AnalysisFlow, AnalysisConfig
+         >>> from my_flows import AnalysisFlow
          >>>
          >>> results = await run_pipeline(
          ...     flow_func=AnalysisFlow,
-         ...     config=AnalysisConfig,
          ...     project_name="analysis_001",
          ...     output_dir=Path("./results"),
          ...     flow_options=FlowOptions(temperature=0.7)
@@ -261,8 +103,8 @@ async def run_pipeline(
          >>> print(f"Generated {len(results)} documents")
 
      Note:
-         - Flow must be async (decorated with @pipeline_flow)
-         - Input documents are loaded based on config.INPUT_DOCUMENT_TYPES
+         - Flow must be async (decorated with @pipeline_flow with config)
+         - Input documents are loaded based on flow's config.INPUT_DOCUMENT_TYPES
          - Output is validated against config.OUTPUT_DOCUMENT_TYPE
          - All I/O is logged for debugging
      """
@@ -273,7 +115,16 @@ async def run_pipeline(
 
      logger.info(f"Running Flow: {flow_name}")
 
-     input_documents = load_documents_from_directory(output_dir, config.INPUT_DOCUMENT_TYPES)
+     # Get config from the flow function (attached by @pipeline_flow decorator)
+     config = getattr(flow_func, "config", None)
+     if config is None:
+         raise RuntimeError(
+             f"Flow {flow_name} does not have a config attribute. "
+             "Ensure it's decorated with @pipeline_flow(config=YourConfig)"
+         )
+
+     # Load input documents using FlowConfig's new async method
+     input_documents = await config.load_documents(str(output_dir))
 
      if not config.has_input_documents(input_documents):
          raise RuntimeError(f"Missing input documents for flow {flow_name}")
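With this change, run_pipeline discovers the config through a plain attribute on the flow callable rather than a parameter. A sketch of the expected wiring, pieced together from the error message above and the docstring examples elsewhere in this diff; the @pipeline_flow import path and MyConfig are assumptions, not confirmed API:

```python
import asyncio
from pathlib import Path

from ai_pipeline_core import simple_runner
from ai_pipeline_core.flow.options import FlowOptions
# Assumed import location; ai_pipeline_core/pipeline.py is in the file list
# above, but its exports are not visible in this excerpt.
from ai_pipeline_core.pipeline import pipeline_flow

from my_project.configs import MyConfig  # placeholder FlowConfig subclass


@pipeline_flow(config=MyConfig)  # attaches MyConfig so run_pipeline can read flow.config
async def my_flow(project_name, documents, flow_options):
    return documents  # stand-in body


async def main() -> None:
    results = await simple_runner.run_pipeline(
        flow_func=my_flow,
        project_name="demo",
        output_dir=Path("./work"),
        flow_options=FlowOptions(temperature=0.7),
    )
    print(f"Generated {len(results)} documents")


asyncio.run(main())
```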
@@ -282,7 +133,8 @@ async def run_pipeline(
 
      config.validate_output_documents(result_documents)
 
-     save_documents_to_directory(output_dir, result_documents)
+     # Save output documents using FlowConfig's new async method
+     await config.save_documents(str(output_dir), result_documents)
 
      logger.info(f"Completed Flow: {flow_name}")
 
@@ -293,7 +145,6 @@ async def run_pipelines(
      project_name: str,
      output_dir: Path,
      flows: FlowSequence,
-     flow_configs: ConfigSequence,
      flow_options: FlowOptions,
      start_step: int = 1,
      end_step: int | None = None,
@@ -305,7 +156,7 @@ async def run_pipelines(
      for debugging and resuming failed pipelines.
 
      Execution proceeds by:
-     1. Validating step indices and sequence lengths
+     1. Validating step indices
      2. For each flow in range [start_step, end_step]:
          a. Loading input documents from output_dir
          b. Executing flow with documents
@@ -322,9 +173,8 @@ async def run_pipelines(
          output_dir: Directory for document I/O between flows.
              Shared by all flows in the sequence.
          flows: Sequence of flow functions to execute in order.
-             Must all be async functions decorated with @pipeline_flow.
-         flow_configs: Sequence of FlowConfig classes corresponding to flows.
-             Must have same length as flows sequence.
+             Must all be async functions decorated with @pipeline_flow
+             with a config parameter.
          flow_options: Options passed to all flows in the sequence.
              Individual flows can use different fields.
          start_step: First flow to execute (1-based index).
@@ -333,8 +183,8 @@ async def run_pipelines(
              None runs through the last flow.
 
      Raises:
-         ValueError: If flows and configs have different lengths, or if
-             start_step or end_step are out of range.
+         ValueError: If start_step or end_step are out of range.
+         RuntimeError: If any flow doesn't have a config attribute.
 
      Example:
          >>> # Run full pipeline
@@ -342,7 +192,6 @@ async def run_pipelines(
          ...     project_name="analysis",
          ...     output_dir=Path("./work"),
          ...     flows=[ExtractFlow, AnalyzeFlow, SummarizeFlow],
-         ...     flow_configs=[ExtractConfig, AnalyzeConfig, SummaryConfig],
          ...     flow_options=options
          ... )
          >>>
@@ -354,14 +203,12 @@ async def run_pipelines(
          ... )
 
      Note:
+         - Each flow must be decorated with @pipeline_flow(config=...)
          - Each flow's output must match the next flow's input types
          - Failed flows stop the entire pipeline
          - Progress is logged with step numbers for debugging
          - Documents persist in output_dir between runs
      """
-     if len(flows) != len(flow_configs):
-         raise ValueError("The number of flows and flow configs must match.")
-
      num_steps = len(flows)
      start_index = start_step - 1
      end_index = (end_step if end_step is not None else num_steps) - 1
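Dropping the length check leaves only the step-range validation before the loop. The 1-based-to-0-based conversion above is easy to misread, so here is the same arithmetic as a standalone sketch; the exact bounds check sits outside this hunk and is an assumption:

```python
def step_range(num_steps: int, start_step: int = 1, end_step: int | None = None) -> range:
    """Standalone mirror of run_pipelines' index arithmetic (bounds check assumed)."""
    start_index = start_step - 1
    end_index = (end_step if end_step is not None else num_steps) - 1
    if not (0 <= start_index <= end_index < num_steps):
        raise ValueError("start_step/end_step out of range")
    return range(start_index, end_index + 1)


assert list(step_range(3)) == [0, 1, 2]              # full pipeline
assert list(step_range(3, start_step=2)) == [1, 2]   # resume from step 2
assert list(step_range(3, end_step=2)) == [0, 1]     # stop early
```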
@@ -377,7 +224,6 @@ async def run_pipelines(
 
      for i in range(start_index, end_index + 1):
          flow_func = flows[i]
-         config = flow_configs[i]
          # For Prefect Flow objects, use their name attribute; for functions, use __name__
          flow_name = getattr(flow_func, "name", None) or getattr(
              flow_func, "__name__", f"flow_{i + 1}"
@@ -388,7 +234,6 @@ async def run_pipelines(
          try:
              await run_pipeline(
                  flow_func=flow_func,
-                 config=config,
                  project_name=project_name,
                  output_dir=output_dir,
                  flow_options=flow_options,
ai_pipeline_core/storage/__init__.py
@@ -0,0 +1,8 @@
+ """Storage module for ai_pipeline_core.
+
+ @public
+ """
+
+ from ai_pipeline_core.storage.storage import ObjectInfo, RetryPolicy, Storage
+
+ __all__ = ["Storage", "ObjectInfo", "RetryPolicy"]
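The new storage package exposes only its import surface in this excerpt; the 628-line implementation in storage/storage.py is not shown, so constructor signatures and methods of these classes are unknown here. The one usage this diff does confirm is the re-export path:

```python
# Names confirmed by __all__ above; anything beyond importing them
# (constructors, methods) is not visible in this diff.
from ai_pipeline_core.storage import ObjectInfo, RetryPolicy, Storage

print(Storage.__name__, ObjectInfo.__name__, RetryPolicy.__name__)
```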