ai-pipeline-core 0.1.12__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. ai_pipeline_core/__init__.py +83 -119
  2. ai_pipeline_core/deployment/__init__.py +34 -0
  3. ai_pipeline_core/deployment/base.py +861 -0
  4. ai_pipeline_core/deployment/contract.py +80 -0
  5. ai_pipeline_core/deployment/deploy.py +561 -0
  6. ai_pipeline_core/deployment/helpers.py +97 -0
  7. ai_pipeline_core/deployment/progress.py +126 -0
  8. ai_pipeline_core/deployment/remote.py +116 -0
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +14 -15
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +349 -1062
  30. ai_pipeline_core/documents/mime_type.py +40 -85
  31. ai_pipeline_core/documents/utils.py +62 -7
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +309 -0
  34. ai_pipeline_core/images/_processing.py +151 -0
  35. ai_pipeline_core/llm/__init__.py +5 -3
  36. ai_pipeline_core/llm/ai_messages.py +284 -73
  37. ai_pipeline_core/llm/client.py +462 -209
  38. ai_pipeline_core/llm/model_options.py +86 -53
  39. ai_pipeline_core/llm/model_response.py +187 -241
  40. ai_pipeline_core/llm/model_types.py +34 -54
  41. ai_pipeline_core/logging/__init__.py +2 -9
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -43
  44. ai_pipeline_core/logging/logging_mixin.py +17 -51
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/observability/_debug/_config.py +95 -0
  49. ai_pipeline_core/observability/_debug/_content.py +764 -0
  50. ai_pipeline_core/observability/_debug/_processor.py +98 -0
  51. ai_pipeline_core/observability/_debug/_summary.py +312 -0
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/observability/_debug/_writer.py +843 -0
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/observability/tracing.py +640 -0
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +26 -105
  70. ai_pipeline_core/settings.py +41 -32
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
  74. {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
  75. ai_pipeline_core/documents/document_list.py +0 -240
  76. ai_pipeline_core/documents/flow_document.py +0 -128
  77. ai_pipeline_core/documents/task_document.py +0 -133
  78. ai_pipeline_core/documents/temporary_document.py +0 -95
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -314
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -717
  83. ai_pipeline_core/prefect.py +0 -54
  84. ai_pipeline_core/simple_runner/__init__.py +0 -24
  85. ai_pipeline_core/simple_runner/cli.py +0 -255
  86. ai_pipeline_core/simple_runner/simple_runner.py +0 -385
  87. ai_pipeline_core/tracing.py +0 -475
  88. ai_pipeline_core-0.1.12.dist-info/METADATA +0 -450
  89. ai_pipeline_core-0.1.12.dist-info/RECORD +0 -36
  90. {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
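The file-level changes above can be checked locally. Below is a minimal sketch using only the standard library, assuming both wheels have already been downloaded into the current directory (e.g., via `pip download ai-pipeline-core==0.1.12` and `pip download ai-pipeline-core==0.4.1`); the wheel filenames are assumptions based on the standard naming convention.

```python
# Compare the file listings of two locally downloaded wheels.
# The filenames below are assumptions; adjust to what pip actually produced.
from zipfile import ZipFile

OLD = "ai_pipeline_core-0.1.12-py3-none-any.whl"
NEW = "ai_pipeline_core-0.4.1-py3-none-any.whl"

with ZipFile(OLD) as old_whl, ZipFile(NEW) as new_whl:
    old_files = set(old_whl.namelist())  # paths inside the 0.1.12 wheel
    new_files = set(new_whl.namelist())  # paths inside the 0.4.1 wheel

for path in sorted(new_files - old_files):
    print(f"added:   {path}")
for path in sorted(old_files - new_files):
    print(f"removed: {path}")
```

The largest single removal, `ai_pipeline_core/simple_runner/simple_runner.py` (−385 lines, entry 86 above), is shown in full below.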
--- a/ai_pipeline_core/simple_runner/simple_runner.py
+++ /dev/null
@@ -1,385 +0,0 @@
- """Simple pipeline runner for local flow execution.
-
- This module provides the core functionality for running AI pipeline flows
- locally without full Prefect orchestration. It handles document I/O,
- flow sequencing, and error management.
-
- Key components:
- - Document I/O from/to filesystem directories
- - Single and multi-flow execution
- - Automatic document validation and passing between flows
- - Step-based execution control (start/end steps)
-
- Directory structure:
-     working_dir/
-     ├── InputDocument/                # Documents of type InputDocument
-     │   ├── file1.txt
-     │   └── file1.txt.description.md  # Optional description
-     └── OutputDocument/               # Documents of type OutputDocument
-         └── result.json
-
- Example:
-     >>> from ai_pipeline_core.simple_runner import run_pipeline
-     >>>
-     >>> # Run single flow
-     >>> results = await run_pipeline(
-     ...     flow_func=MyFlow,
-     ...     config=MyConfig,
-     ...     project_name="test",
-     ...     output_dir=Path("./output"),
-     ...     flow_options=options
-     ... )
-
- Note:
-     Document directories are named using the canonical_name() method
-     of each document type for consistent organization.
- """
-
- from pathlib import Path
- from typing import Any, Callable, Sequence, Type
-
- from ai_pipeline_core.documents import Document, DocumentList, FlowDocument
- from ai_pipeline_core.flow.config import FlowConfig
- from ai_pipeline_core.flow.options import FlowOptions
- from ai_pipeline_core.logging import get_pipeline_logger
-
- logger = get_pipeline_logger(__name__)
-
- FlowSequence = Sequence[Callable[..., Any]]
- """Type alias for a sequence of flow functions."""
-
- ConfigSequence = Sequence[Type[FlowConfig]]
- """Type alias for a sequence of flow configuration classes."""
-
-
- def load_documents_from_directory(
-     base_dir: Path, document_types: Sequence[Type[FlowDocument]]
- ) -> DocumentList:
-     """Load documents from filesystem directories by type.
-
-     Scans subdirectories of base_dir for documents matching the provided
-     types. Each document type has its own subdirectory named after its
-     canonical_name().
-
-     Args:
-         base_dir: Base directory containing document subdirectories.
-         document_types: Sequence of FlowDocument subclasses to load.
-             Each type corresponds to a subdirectory.
-
-     Returns:
-         DocumentList containing all successfully loaded documents.
-         Empty list if no documents found or directories don't exist.
-
-     Directory structure:
-         base_dir/
-         ├── DocumentTypeA/               # canonical_name() of DocumentTypeA
-         │   ├── doc1.txt
-         │   ├── doc1.txt.description.md  # Optional description file
-         │   └── doc2.json
-         └── DocumentTypeB/
-             └── data.csv
-
-     File handling:
-         - Document content is read as bytes
-         - Optional .description.md files provide document descriptions
-         - Failed loads are logged but don't stop processing
-         - Non-file entries are skipped
-
-     Example:
-         >>> from my_docs import InputDoc, ConfigDoc
-         >>> docs = load_documents_from_directory(
-         ...     Path("./data"),
-         ...     [InputDoc, ConfigDoc]
-         ... )
-         >>> print(f"Loaded {len(docs)} documents")
-
-     Note:
-         - Uses canonical_name() for directory names (e.g., "InputDocument")
-         - Descriptions are loaded from "{filename}.description.md" files
-         - All file types are supported (determined by document class)
-     """
-     documents = DocumentList()
-
-     for doc_class in document_types:
-         dir_name = doc_class.canonical_name()
-         type_dir = base_dir / dir_name
-
-         if not type_dir.exists() or not type_dir.is_dir():
-             continue
-
-         logger.info(f"Loading documents from {type_dir.relative_to(base_dir)}")
-
-         for file_path in type_dir.iterdir():
-             if not file_path.is_file() or file_path.name.endswith(Document.DESCRIPTION_EXTENSION):
-                 continue
-
-             try:
-                 content = file_path.read_bytes()
-                 doc = doc_class(name=file_path.name, content=content)
-
-                 desc_file = file_path.with_name(file_path.name + Document.DESCRIPTION_EXTENSION)
-                 if desc_file.exists():
-                     object.__setattr__(doc, "description", desc_file.read_text(encoding="utf-8"))
-
-                 documents.append(doc)
-             except Exception as e:
-                 logger.error(
-                     f" Failed to load {file_path.name} as {doc_class.__name__}: {e}", exc_info=True
-                 )
-
-     return documents
-
-
- def save_documents_to_directory(base_dir: Path, documents: DocumentList) -> None:
-     """Save documents to filesystem directories by type.
-
-     Creates subdirectories under base_dir for each document type and
-     saves documents with their original filenames. Only FlowDocument
-     instances are saved (temporary documents are skipped).
-
-     Args:
-         base_dir: Base directory for saving document subdirectories.
-             Created if it doesn't exist.
-         documents: DocumentList containing documents to save.
-             Non-FlowDocument instances are silently skipped.
-
-     Side effects:
-         - Creates base_dir and subdirectories as needed
-         - Overwrites existing files with the same name
-         - Logs each saved document
-         - Creates .description.md files for documents with descriptions
-
-     Directory structure created:
-         base_dir/
-         └── DocumentType/                   # canonical_name() of document
-             ├── output.json                 # Document content
-             └── output.json.description.md  # Optional description
-
-     Example:
-         >>> docs = DocumentList([
-         ...     OutputDoc(name="result.txt", content=b"data"),
-         ...     OutputDoc(name="stats.json", content=b'{...}')
-         ... ])
-         >>> save_documents_to_directory(Path("./output"), docs)
-         >>> # Creates ./output/OutputDocument/result.txt
-         >>> # and ./output/OutputDocument/stats.json
-
-     Note:
-         - Only FlowDocument subclasses are saved
-         - TaskDocument and other temporary documents are skipped
-         - Descriptions are saved as separate .description.md files
-     """
-     for document in documents:
-         if not isinstance(document, FlowDocument):
-             continue
-
-         dir_name = document.canonical_name()
-         document_dir = base_dir / dir_name
-         document_dir.mkdir(parents=True, exist_ok=True)
-
-         file_path = document_dir / document.name
-         file_path.write_bytes(document.content)
-         logger.info(f"Saved: {dir_name}/{document.name}")
-
-         if document.description:
-             desc_file = file_path.with_name(file_path.name + Document.DESCRIPTION_EXTENSION)
-             desc_file.write_text(document.description, encoding="utf-8")
-
-
- async def run_pipeline(
-     flow_func: Callable[..., Any],
-     config: Type[FlowConfig],
-     project_name: str,
-     output_dir: Path,
-     flow_options: FlowOptions,
-     flow_name: str | None = None,
- ) -> DocumentList:
-     """Execute a single pipeline flow with document I/O.
-
-     Runs a flow function with automatic document loading, validation,
-     and saving. The flow receives input documents from the filesystem
-     and saves its output for subsequent flows.
-
-     The execution proceeds through these steps:
-     1. Load input documents from output_dir subdirectories
-     2. Validate input documents against config requirements
-     3. Execute flow function with documents and options
-     4. Validate output documents match config.OUTPUT_DOCUMENT_TYPE
-     5. Save output documents to output_dir subdirectories
-
-     Args:
-         flow_func: Async flow function decorated with @pipeline_flow.
-             Must accept (project_name, documents, flow_options).
-
-         config: FlowConfig subclass defining input/output document types.
-             Used for validation and directory organization.
-
-         project_name: Name of the project/pipeline for logging and tracking.
-
-         output_dir: Directory for loading input and saving output documents.
-             Document subdirectories are created as needed.
-
-         flow_options: Configuration options passed to the flow function.
-             Can be FlowOptions or any subclass.
-
-         flow_name: Optional display name for logging. If None, uses
-             flow_func.name or flow_func.__name__.
-
-     Returns:
-         DocumentList containing the flow's output documents.
-
-     Raises:
-         RuntimeError: If required input documents are missing.
-
-     Example:
-         >>> from my_flows import AnalysisFlow, AnalysisConfig
-         >>>
-         >>> results = await run_pipeline(
-         ...     flow_func=AnalysisFlow,
-         ...     config=AnalysisConfig,
-         ...     project_name="analysis_001",
-         ...     output_dir=Path("./results"),
-         ...     flow_options=FlowOptions(temperature=0.7)
-         ... )
-         >>> print(f"Generated {len(results)} documents")
-
-     Note:
-         - Flow must be async (decorated with @pipeline_flow)
-         - Input documents are loaded based on config.INPUT_DOCUMENT_TYPES
-         - Output is validated against config.OUTPUT_DOCUMENT_TYPE
-         - All I/O is logged for debugging
-     """
-     if flow_name is None:
-         # For Prefect Flow objects, use their name attribute
-         # For regular functions, fall back to __name__
-         flow_name = getattr(flow_func, "name", None) or getattr(flow_func, "__name__", "flow")
-
-     logger.info(f"Running Flow: {flow_name}")
-
-     input_documents = load_documents_from_directory(output_dir, config.INPUT_DOCUMENT_TYPES)
-
-     if not config.has_input_documents(input_documents):
-         raise RuntimeError(f"Missing input documents for flow {flow_name}")
-
-     result_documents = await flow_func(project_name, input_documents, flow_options)
-
-     config.validate_output_documents(result_documents)
-
-     save_documents_to_directory(output_dir, result_documents)
-
-     logger.info(f"Completed Flow: {flow_name}")
-
-     return result_documents
-
-
- async def run_pipelines(
-     project_name: str,
-     output_dir: Path,
-     flows: FlowSequence,
-     flow_configs: ConfigSequence,
-     flow_options: FlowOptions,
-     start_step: int = 1,
-     end_step: int | None = None,
- ) -> None:
-     """Execute multiple pipeline flows in sequence.
-
-     Runs a series of flows where each flow's output becomes the input
-     for the next flow. Supports partial execution with start/end steps
-     for debugging and resuming failed pipelines.
-
-     Execution proceeds by:
-     1. Validating step indices and sequence lengths
-     2. For each flow in range [start_step, end_step]:
-        a. Loading input documents from output_dir
-        b. Executing flow with documents
-        c. Saving output documents to output_dir
-        d. Output becomes input for next flow
-     3. Logging progress and any failures
-
-     Steps are 1-based for user convenience. Step 1 is the first flow,
-     Step N is the Nth flow. Use start_step > 1 to skip initial flows
-     and end_step < N to stop early.
-
-     Args:
-         project_name: Name of the overall pipeline/project.
-         output_dir: Directory for document I/O between flows.
-             Shared by all flows in the sequence.
-         flows: Sequence of flow functions to execute in order.
-             Must all be async functions decorated with @pipeline_flow.
-         flow_configs: Sequence of FlowConfig classes corresponding to flows.
-             Must have same length as flows sequence.
-         flow_options: Options passed to all flows in the sequence.
-             Individual flows can use different fields.
-         start_step: First flow to execute (1-based index).
-             Default 1 starts from the beginning.
-         end_step: Last flow to execute (1-based index).
-             None runs through the last flow.
-
-     Raises:
-         ValueError: If flows and configs have different lengths, or if
-             start_step or end_step are out of range.
-
-     Example:
-         >>> # Run full pipeline
-         >>> await run_pipelines(
-         ...     project_name="analysis",
-         ...     output_dir=Path("./work"),
-         ...     flows=[ExtractFlow, AnalyzeFlow, SummarizeFlow],
-         ...     flow_configs=[ExtractConfig, AnalyzeConfig, SummaryConfig],
-         ...     flow_options=options
-         ... )
-         >>>
-         >>> # Run only steps 2-3 (skip extraction)
-         >>> await run_pipelines(
-         ...     ...,
-         ...     start_step=2,
-         ...     end_step=3
-         ... )
-
-     Note:
-         - Each flow's output must match the next flow's input types
-         - Failed flows stop the entire pipeline
-         - Progress is logged with step numbers for debugging
-         - Documents persist in output_dir between runs
-     """
-     if len(flows) != len(flow_configs):
-         raise ValueError("The number of flows and flow configs must match.")
-
-     num_steps = len(flows)
-     start_index = start_step - 1
-     end_index = (end_step if end_step is not None else num_steps) - 1
-
-     if (
-         not (0 <= start_index < num_steps)
-         or not (0 <= end_index < num_steps)
-         or start_index > end_index
-     ):
-         raise ValueError("Invalid start/end steps.")
-
-     logger.info(f"Starting pipeline '{project_name}' (Steps {start_step} to {end_index + 1})")
-
-     for i in range(start_index, end_index + 1):
-         flow_func = flows[i]
-         config = flow_configs[i]
-         # For Prefect Flow objects, use their name attribute; for functions, use __name__
-         flow_name = getattr(flow_func, "name", None) or getattr(
-             flow_func, "__name__", f"flow_{i + 1}"
-         )
-
-         logger.info(f"--- [Step {i + 1}/{num_steps}] Running Flow: {flow_name} ---")
-
-         try:
-             await run_pipeline(
-                 flow_func=flow_func,
-                 config=config,
-                 project_name=project_name,
-                 output_dir=output_dir,
-                 flow_options=flow_options,
-                 flow_name=f"[Step {i + 1}/{num_steps}] {flow_name}",
-             )
-
-         except Exception as e:
-             logger.error(
-                 f"--- [Step {i + 1}/{num_steps}] Flow {flow_name} Failed: {e} ---", exc_info=True
-             )
-             raise
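For reference, the removed runner composed as follows. This is a minimal sketch assembled from the signatures and docstring examples in the deleted module above; it assumes `run_pipelines` was re-exported from `ai_pipeline_core.simple_runner` the same way `run_pipeline` was, the flow and config classes are hypothetical placeholders, and none of this API exists in 0.4.1.

```python
# Sketch of the removed 0.1.12 simple_runner API (deleted in the diff above).
# my_flows and its contents are hypothetical; FlowOptions defaults are assumed.
import asyncio
from pathlib import Path

from ai_pipeline_core.flow.options import FlowOptions     # removed in 0.4.1
from ai_pipeline_core.simple_runner import run_pipelines  # removed in 0.4.1

from my_flows import AnalyzeConfig, AnalyzeFlow, ExtractConfig, ExtractFlow


async def main() -> None:
    # Each flow reads its inputs from, and writes its outputs to,
    # per-type subdirectories of output_dir named via canonical_name().
    await run_pipelines(
        project_name="analysis",
        output_dir=Path("./work"),
        flows=[ExtractFlow, AnalyzeFlow],
        flow_configs=[ExtractConfig, AnalyzeConfig],
        flow_options=FlowOptions(),
        start_step=1,   # 1-based; raise to resume after completed steps
        end_step=None,  # None runs through the last flow
    )


asyncio.run(main())
```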