ai-pipeline-core 0.1.12__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. ai_pipeline_core/__init__.py +83 -119
  2. ai_pipeline_core/deployment/__init__.py +34 -0
  3. ai_pipeline_core/deployment/base.py +861 -0
  4. ai_pipeline_core/deployment/contract.py +80 -0
  5. ai_pipeline_core/deployment/deploy.py +561 -0
  6. ai_pipeline_core/deployment/helpers.py +97 -0
  7. ai_pipeline_core/deployment/progress.py +126 -0
  8. ai_pipeline_core/deployment/remote.py +116 -0
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +14 -15
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +349 -1062
  30. ai_pipeline_core/documents/mime_type.py +40 -85
  31. ai_pipeline_core/documents/utils.py +62 -7
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +309 -0
  34. ai_pipeline_core/images/_processing.py +151 -0
  35. ai_pipeline_core/llm/__init__.py +5 -3
  36. ai_pipeline_core/llm/ai_messages.py +284 -73
  37. ai_pipeline_core/llm/client.py +462 -209
  38. ai_pipeline_core/llm/model_options.py +86 -53
  39. ai_pipeline_core/llm/model_response.py +187 -241
  40. ai_pipeline_core/llm/model_types.py +34 -54
  41. ai_pipeline_core/logging/__init__.py +2 -9
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -43
  44. ai_pipeline_core/logging/logging_mixin.py +17 -51
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/observability/_debug/_config.py +95 -0
  49. ai_pipeline_core/observability/_debug/_content.py +764 -0
  50. ai_pipeline_core/observability/_debug/_processor.py +98 -0
  51. ai_pipeline_core/observability/_debug/_summary.py +312 -0
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/observability/_debug/_writer.py +843 -0
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/observability/tracing.py +640 -0
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +26 -105
  70. ai_pipeline_core/settings.py +41 -32
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
  74. {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
  75. ai_pipeline_core/documents/document_list.py +0 -240
  76. ai_pipeline_core/documents/flow_document.py +0 -128
  77. ai_pipeline_core/documents/task_document.py +0 -133
  78. ai_pipeline_core/documents/temporary_document.py +0 -95
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -314
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -717
  83. ai_pipeline_core/prefect.py +0 -54
  84. ai_pipeline_core/simple_runner/__init__.py +0 -24
  85. ai_pipeline_core/simple_runner/cli.py +0 -255
  86. ai_pipeline_core/simple_runner/simple_runner.py +0 -385
  87. ai_pipeline_core/tracing.py +0 -475
  88. ai_pipeline_core-0.1.12.dist-info/METADATA +0 -450
  89. ai_pipeline_core-0.1.12.dist-info/RECORD +0 -36
  90. {ai_pipeline_core-0.1.12.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,807 @@
+ Metadata-Version: 2.4
+ Name: ai-pipeline-core
+ Version: 0.4.1
+ Summary: Core utilities for AI-powered processing pipelines using prefect
+ Project-URL: Homepage, https://github.com/bbarwik/ai-pipeline-core
+ Project-URL: Repository, https://github.com/bbarwik/ai-pipeline-core
+ Project-URL: Issues, https://github.com/bbarwik/ai-pipeline-core/issues
+ Author-email: bbarwik <bbarwik@gmail.com>
+ License: MIT
+ License-File: LICENSE
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Typing :: Typed
+ Requires-Python: >=3.12
+ Requires-Dist: clickhouse-connect>=0.10.0
+ Requires-Dist: httpx>=0.28.1
+ Requires-Dist: jinja2>=3.1.6
+ Requires-Dist: lmnr>=0.7.37
+ Requires-Dist: openai>=2.16.0
+ Requires-Dist: opentelemetry-sdk>=1.39.1
+ Requires-Dist: pillow>=12.1.0
+ Requires-Dist: prefect-gcp>=0.6.15
+ Requires-Dist: prefect>=3.6.15
+ Requires-Dist: pydantic-settings>=2.12.0
+ Requires-Dist: pydantic>=2.12.5
+ Requires-Dist: python-magic>=0.4.27
+ Requires-Dist: ruamel-yaml>=0.19.1
+ Requires-Dist: tiktoken>=0.12.0
+ Provides-Extra: dev
+ Requires-Dist: basedpyright==1.37.3; extra == 'dev'
+ Requires-Dist: bump2version>=1.0.1; extra == 'dev'
+ Requires-Dist: interrogate==1.7.0; extra == 'dev'
+ Requires-Dist: pre-commit>=4.3.0; extra == 'dev'
+ Requires-Dist: pytest-asyncio>=1.1.0; extra == 'dev'
+ Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
+ Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
+ Requires-Dist: pytest-xdist>=3.8.0; extra == 'dev'
+ Requires-Dist: pytest>=8.4.1; extra == 'dev'
+ Requires-Dist: ruff==0.14.14; extra == 'dev'
+ Requires-Dist: testcontainers[clickhouse]>=4.0.0; extra == 'dev'
+ Requires-Dist: vulture==2.14; extra == 'dev'
+ Description-Content-Type: text/markdown
+
+ # AI Pipeline Core
+
+ A high-performance async framework for building type-safe AI pipelines with LLMs, document processing, and workflow orchestration.
+
+ [![Python Version](https://img.shields.io/badge/python-3.12%2B-blue)](https://www.python.org/downloads/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+ [![Code Style: Ruff](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff)
+ [![Type Checked: Basedpyright](https://img.shields.io/badge/type%20checked-basedpyright-blue)](https://github.com/DetachHead/basedpyright)
+
+ ## Overview
+
+ AI Pipeline Core is a production-ready framework that combines document processing, LLM integration, and workflow orchestration into a unified system. Built with strong typing (Pydantic), automatic retries, cost tracking, and distributed tracing, it enforces best practices while keeping application code minimal and straightforward.
+
+ ### Key Features
+
+ - **Document System**: Single `Document` base class with immutable content, SHA256-based identity, automatic MIME type detection, provenance tracking, and multi-part attachments
+ - **Document Store**: Pluggable storage backends (ClickHouse production, local filesystem CLI/debug, in-memory testing) with automatic deduplication
+ - **LLM Integration**: Unified interface to any model via LiteLLM proxy with context caching (default 300s TTL)
+ - **Structured Output**: Type-safe generation with Pydantic model validation via `generate_structured()`
+ - **Workflow Orchestration**: Prefect-based flows and tasks with annotation-driven document types
+ - **Auto-Persistence**: `@pipeline_task` saves returned documents to `DocumentStore` automatically (configurable via `persist` parameter)
+ - **Image Processing**: Automatic image tiling/splitting for LLM vision models with model-specific presets
+ - **Observability**: Built-in distributed tracing via Laminar (LMNR) with cost tracking, local trace debugging, and ClickHouse-based tracking
+ - **Deployment**: Unified pipeline execution for local, CLI, and production environments with per-flow resume
+
+ ## Installation
+
+ ```bash
+ pip install ai-pipeline-core
+ ```
+
+ ### Requirements
+
+ - Python 3.12 or higher
+ - Linux/macOS (Windows via WSL2)
+
+ ### Development Installation
+
+ ```bash
+ git clone https://github.com/bbarwik/ai-pipeline-core.git
+ cd ai-pipeline-core
+ pip install -e ".[dev]"
+ pipx install semgrep # Installed separately due to dependency conflicts
+ make install-dev # Installs pre-commit hooks
+ ```
+
+ ## Quick Start
+
+ ### Basic Pipeline
+
+ ```python
+ from typing import ClassVar
+
+ from pydantic import BaseModel, Field
+
+ from ai_pipeline_core import (
+     Document,
+     DeploymentResult,
+     FlowOptions,
+     PipelineDeployment,
+     pipeline_flow,
+     pipeline_task,
+     setup_logging,
+     get_pipeline_logger,
+ )
+
+ setup_logging(level="INFO")
+ logger = get_pipeline_logger(__name__)
+
+
+ # 1. Define document types (subclass Document)
+ class InputDocument(Document):
+     """Pipeline input."""
+
+ class AnalysisDocument(Document):
+     """Per-document analysis result."""
+
+ class ReportDocument(Document):
+     """Final compiled report."""
+
+
+ # 2. Structured output model
+ class AnalysisSummary(BaseModel):
+     word_count: int
+     top_keywords: list[str] = Field(default_factory=list)
+
+
+ # 3. Pipeline task -- auto-saves returned documents to DocumentStore
+ @pipeline_task
+ async def analyze_document(document: InputDocument) -> AnalysisDocument:
+     return AnalysisDocument.create(
+         name=f"analysis_{document.sha256[:12]}.json",
+         content=AnalysisSummary(word_count=42, top_keywords=["ai", "pipeline"]),
+         sources=(document.sha256,),
+     )
+
+
+ # 4. Pipeline flow -- type contract is in the annotations
+ @pipeline_flow(estimated_minutes=5)
+ async def analysis_flow(
+     project_name: str,
+     documents: list[InputDocument],
+     flow_options: FlowOptions,
+ ) -> list[AnalysisDocument]:
+     results: list[AnalysisDocument] = []
+     for doc in documents:
+         results.append(await analyze_document(doc))
+     return results
+
+
+ @pipeline_flow(estimated_minutes=2)
+ async def report_flow(
+     project_name: str,
+     documents: list[AnalysisDocument],
+     flow_options: FlowOptions,
+ ) -> list[ReportDocument]:
+     report = ReportDocument.create(
+         name="report.md",
+         content="# Report\n\nAnalysis complete.",
+         sources=tuple(doc.sha256 for doc in documents),
+     )
+     return [report]
+
+
+ # 5. Deployment -- ties flows together with type chain validation
+ class MyResult(DeploymentResult):
+     report_count: int = 0
+
+
+ class MyPipeline(PipelineDeployment[FlowOptions, MyResult]):
+     flows: ClassVar = [analysis_flow, report_flow]
+
+     @staticmethod
+     def build_result(
+         project_name: str,
+         documents: list[Document],
+         options: FlowOptions,
+     ) -> MyResult:
+         reports = [d for d in documents if isinstance(d, ReportDocument)]
+         return MyResult(success=True, report_count=len(reports))
+
+
+ # 6. CLI initializer provides project name and initial documents
+ def initialize(options: FlowOptions) -> tuple[str, list[Document]]:
+     docs: list[Document] = [
+         InputDocument.create(name="input.txt", content="Sample data"),
+     ]
+     return "my-project", docs
+
+
+ # Run from CLI (requires positional working_directory arg: python script.py ./output)
+ pipeline = MyPipeline()
+ pipeline.run_cli(initializer=initialize, trace_name="my-pipeline")
+ ```
+
+ ### Structured Output
+
+ ```python
+ from pydantic import BaseModel
+ from ai_pipeline_core import llm
+
+ class Analysis(BaseModel):
+     summary: str
+     sentiment: float
+     key_points: list[str]
+
+ # Generate structured output
+ response = await llm.generate_structured(
+     model="gemini-3-pro",
+     response_format=Analysis,
+     messages="Analyze this product review: ..."
+ )
+
+ # Access parsed result with type safety
+ analysis = response.parsed
+ print(f"Sentiment: {analysis.sentiment}")
+ for point in analysis.key_points:
+     print(f"- {point}")
+ ```
+
+ ### Document Handling
+
+ ```python
+ from ai_pipeline_core import Document
+
+ class MyDocument(Document):
+     """Custom document type -- must subclass Document."""
+
+ # Create documents with automatic conversion
+ doc = MyDocument.create(
+     name="data.json",
+     content={"key": "value"}  # Automatically converted to JSON bytes
+ )
+
+ # Parse back to original type
+ data = doc.parse(dict)  # Returns {"key": "value"}
+
+ # Document provenance tracking
+ source_doc = MyDocument.create(name="source.txt", content="original data")
+ plan_doc = MyDocument.create(name="plan.txt", content="research plan", sources=(source_doc.sha256,))
+ derived = MyDocument.create(
+     name="derived.json",
+     content={"result": "processed"},
+     sources=("https://api.example.com/data",),  # Content came from this URL
+     origins=(plan_doc.sha256,),  # Created because of this plan (causal, not content)
+ )
+
+ # Check provenance
+ for doc_hash in derived.source_documents:
+     print(f"Derived from document: {doc_hash}")
+ for ref in derived.source_references:
+     print(f"External source: {ref}")
+ ```
+
+ ## Core Concepts
+
+ ### Documents
+
+ Documents are immutable Pydantic models that wrap binary content with metadata. There is a single `Document` base class -- subclass it to define your document types:
+
+ ```python
+ class MyDocument(Document):
+     """All documents subclass Document directly."""
+
+ # Use create() for automatic conversion
+ doc = MyDocument.create(
+     name="data.json",
+     content={"key": "value"}  # Auto-converts to JSON
+ )
+
+ # Access content
+ if doc.is_text:
+     print(doc.text)
+
+ # Parse structured data
+ data = doc.as_json()  # or as_yaml()
+ model = doc.as_pydantic_model(MyModel)  # Requires model_type argument
+
+ # Convert between document types
+ other = doc.model_convert(OtherDocType)
+
+ # Content-addressed identity
+ print(doc.sha256)  # Full SHA256 hash (base32)
+ print(doc.id)      # Short 6-char identifier
+ ```
+
+ **Document fields:**
+ - `name`: Filename (validated for security -- no path traversal)
+ - `description`: Optional human-readable description
+ - `content`: Raw bytes (auto-converted from str, dict, list, BaseModel via `create()`)
+ - `sources`: Content provenance -- SHA256 hashes of source documents or external references (URLs, file paths). A SHA256 must not appear in both sources and origins.
+ - `origins`: Causal provenance -- SHA256 hashes of documents that caused this document to be created without contributing to its content.
+ - `attachments`: Tuple of `Attachment` objects for multi-part content
+
+ Documents support:
+ - Automatic content serialization based on file extension: `.json` → JSON, `.yaml`/`.yml` → YAML, others → UTF-8 text. Structured data (dict, list, BaseModel) requires `.json` or `.yaml` extension.
+ - MIME type detection via `mime_type` cached property, with `is_text`/`is_image`/`is_pdf` helpers
+ - SHA256-based identity and deduplication
+ - Source provenance tracking (`sources` for references, `origins` for parent lineage)
+ - `FILES` enum for filename restrictions (definition-time validation) -- see the sketch below
+ - `model_convert()` for type conversion between document subclasses
+ - `canonical_name()` for standardized snake_case class identification
+ - Token count estimation via `approximate_tokens_count`
+
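+ Several of these helpers are easiest to see together. A minimal sketch under stated assumptions: `FILES` is modeled here as a nested enum of permitted filenames, `as_yaml()` returns the parsed data, and the exact `canonical_name()` output and the token-count attribute access follow the descriptions above rather than verbatim API output:
+
+ ```python
+ from enum import StrEnum
+
+ from ai_pipeline_core import Document
+
+ class ConfigDocument(Document):
+     """YAML-backed config document."""
+
+     class FILES(StrEnum):  # assumed shape: allowed filenames, checked at definition time
+         MAIN = "settings.yaml"
+
+ # .yaml extension selects YAML serialization for structured content
+ cfg = ConfigDocument.create(name="settings.yaml", content={"retries": 3})
+ assert cfg.as_yaml() == {"retries": 3}  # round-trips through YAML bytes
+ print(cfg.canonical_name())             # snake_case class identifier, e.g. "config_document"
+ print(cfg.approximate_tokens_count)     # rough token estimate for LLM budgeting
+ ```
+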
+ ### Document Store
+
+ Documents are automatically persisted by `@pipeline_task` to a `DocumentStore`. The store is a protocol with three implementations:
+
+ - **ClickHouseDocumentStore**: Production backend (selected when `CLICKHOUSE_HOST` is configured). Requires `clickhouse-connect` (included in dependencies).
+ - **LocalDocumentStore**: CLI/debug mode (filesystem-based, browsable files on disk)
+ - **MemoryDocumentStore**: Testing (in-memory, zero I/O)
+
+ **Store selection depends on the execution mode:**
+ - `run_cli()`: Always uses `LocalDocumentStore` (files saved to the working directory)
+ - `run_local()`: Always uses `MemoryDocumentStore` (in-memory, no persistence)
+ - `as_prefect_flow()`: Auto-selects based on settings -- `ClickHouseDocumentStore` when `CLICKHOUSE_HOST` is set, `LocalDocumentStore` otherwise
+
+ **Store protocol methods:**
+ - `save(document, run_scope)` -- Save a single document (idempotent)
+ - `save_batch(documents, run_scope)` -- Save multiple documents
+ - `load(run_scope, document_types)` -- Load documents by type
+ - `has_documents(run_scope, document_type)` -- Check if documents exist for a type
+ - `check_existing(sha256s)` -- Check which SHA256 hashes exist in the store
+ - `update_summary(run_scope, document_sha256, summary)` -- Update summary for a stored document
+ - `load_summaries(run_scope, document_sha256s)` -- Load summaries by SHA256
+ - `flush()` -- Block until all pending background work (summaries) is processed
+ - `shutdown()` -- Flush pending work and stop background workers
+
+ **Document summaries:** When a `SummaryGenerator` callable is provided, stores automatically generate LLM-powered summaries in the background after each new document is saved (including empty and binary documents). Summaries are best-effort (failures are logged and skipped) and stored as store-level metadata (not on the `Document` model). Configure via `DOC_SUMMARY_ENABLED` and `DOC_SUMMARY_MODEL` environment variables.
+
+ **Note:** Store implementations are not exported from the top-level package. Import from submodules:
+
+ ```python
+ from ai_pipeline_core.document_store.local import LocalDocumentStore
+ from ai_pipeline_core.document_store.memory import MemoryDocumentStore
+ from ai_pipeline_core.document_store.clickhouse import ClickHouseDocumentStore
+ ```
+
+ ### LLM Integration
+
+ The framework provides a unified interface for LLM interactions with context caching:
+
+ ```python
+ from ai_pipeline_core import llm, AIMessages, ModelOptions
+
+ # Simple generation
+ response = await llm.generate(
+     model="gemini-3-pro",
+     messages="Explain quantum computing"
+ )
+ print(response.content)
+
+ # With context caching (saves 50-90% tokens on repeated calls)
+ static_context = AIMessages([large_document])
+
+ # First call: caches context (default TTL is 300s / 5 minutes)
+ r1 = await llm.generate(
+     model="gemini-3-pro",
+     context=static_context,
+     messages="Summarize"
+ )
+
+ # Second call: reuses cache
+ r2 = await llm.generate(
+     model="gemini-3-pro",
+     context=static_context,
+     messages="Key points?"
+ )
+
+ # Multi-turn conversation
+ messages = AIMessages([
+     "What is Python?",
+     r1,  # ModelResponse from previous call
+     "Can you give an example?"
+ ])
+ response = await llm.generate("gemini-3-pro", messages=messages)
+
+ # Observability: purpose labels traces, expected_cost tracks budget
+ response = await llm.generate(
+     model="gemini-3-pro",
+     messages="Analyze this",
+     purpose="source-verification",
+     expected_cost=0.05,
+ )
+ ```
+
+ **`generate()` signature:**
+ ```python
+ async def generate(
+     model: ModelName,
+     *,
+     context: AIMessages | None = None,    # Static cacheable content
+     messages: AIMessages | str,           # Dynamic query
+     options: ModelOptions | None = None,  # Usually omit (defaults are optimal)
+     purpose: str | None = None,           # Span name for tracing
+     expected_cost: float | None = None,   # Cost tracking attribute
+ ) -> ModelResponse
+ ```
+
+ **`generate_structured()` signature:**
+ ```python
+ async def generate_structured(
+     model: ModelName,
+     response_format: type[T],  # Pydantic model class
+     *,
+     context: AIMessages | None = None,
+     messages: AIMessages | str,
+     options: ModelOptions | None = None,
+     purpose: str | None = None,
+     expected_cost: float | None = None,
+ ) -> StructuredModelResponse[T]
+ ```
+
+ **`ModelOptions` key fields (all optional with sensible defaults):**
+ - `cache_ttl`: Context cache TTL (default `"300s"`, set `None` to disable)
+ - `system_prompt`: System-level instructions
+ - `reasoning_effort`: `"low" | "medium" | "high"` for models with explicit reasoning
+ - `search_context_size`: `"low" | "medium" | "high"` for search-enabled models
+ - `retries`: Retry attempts (default `3`)
+ - `retry_delay_seconds`: Delay between retries (default `20`)
+ - `timeout`: Max wait seconds (default `600`)
+ - `service_tier`: `"auto" | "default" | "flex" | "scale" | "priority"` (OpenAI only)
+ - `max_completion_tokens`: Max output tokens
+ - `temperature`: Generation randomness (usually omit -- use provider defaults)
+
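+ As an example, a sketch that overrides a few of the fields above for a long-running, reasoning-heavy call (the field names come from the list; the values are illustrative, not recommendations):
+
+ ```python
+ from ai_pipeline_core import llm, ModelOptions
+
+ options = ModelOptions(
+     reasoning_effort="high",
+     cache_ttl=None,  # disable context caching for this one-off call
+     timeout=900,     # allow up to 15 minutes
+ )
+ response = await llm.generate(
+     model="gpt-5.1",
+     messages="Derive the closed-form solution step by step: ...",
+     options=options,
+ )
+ ```
+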
+ **ModelName predefined values:** `"gemini-3-pro"`, `"gpt-5.1"`, `"gemini-3-flash"`, `"gpt-5-mini"`, `"grok-4.1-fast"`, `"gemini-3-flash-search"`, `"sonar-pro-search"` (also accepts any string for custom models).
+
+ ### Pipeline Decorators
+
+ #### `@pipeline_task`
+
+ Decorates async functions as traced Prefect tasks with automatic document persistence:
+
+ ```python
+ from ai_pipeline_core import pipeline_task
+
+ @pipeline_task  # No parameters needed for most cases
+ async def process_chunk(document: InputDocument) -> OutputDocument:
+     return OutputDocument.create(
+         name="result.json",
+         content={"processed": True},
+         sources=(document.sha256,),
+     )
+
+ @pipeline_task(retries=3, estimated_minutes=5)
+ async def expensive_task(data: str) -> OutputDocument:
+     # Retries, tracing, and document auto-save handled automatically
+     ...
+
+ @pipeline_task(persist=False)  # Disable auto-save for this task
+ async def transient_task(data: str) -> OutputDocument:
+     ...
+ ```
+
+ Key parameters:
+ - `persist`: Auto-save returned documents to store (default `True`)
+ - `retries`: Retry attempts on failure (default `0` -- no retries unless specified)
+ - `estimated_minutes`: Duration estimate for progress tracking (default `1`, must be >= 1)
+ - `timeout_seconds`: Task execution timeout
+ - `trace_level`: `"always" | "debug" | "off"` (default `"always"`)
+ - `user_summary`: Enable LLM-generated span summaries (default `False`)
+ - `expected_cost`: Expected cost budget for cost tracking
+
+ Key features:
+ - Async-only enforcement (raises `TypeError` if not `async def`)
+ - Laminar tracing (automatic)
+ - Document auto-save to DocumentStore (returned documents are extracted and persisted)
+ - Source validation (warns if referenced SHA256s don't exist in store)
+
+ #### `@pipeline_flow`
+
+ Decorates async flow functions with annotation-driven document type extraction. Always requires parentheses:
+
+ ```python
+ from ai_pipeline_core import pipeline_flow, FlowOptions
+
+ @pipeline_flow(estimated_minutes=10, retries=2, timeout_seconds=1200)
+ async def my_flow(
+     project_name: str,
+     documents: list[InputDoc],    # Input types extracted from annotation
+     flow_options: MyFlowOptions,  # Must be FlowOptions or subclass
+ ) -> list[OutputDoc]:             # Output types extracted from annotation
+     ...
+ ```
+
+ The flow's `documents` parameter annotation determines input types, and the return annotation determines output types. The function must have exactly 3 parameters: `(str, list[...], FlowOptions)`. No separate config class needed -- the type contract is in the function signature.
+
+ **FlowOptions** is a base `BaseSettings` class for pipeline configuration. Subclass it to add flow-specific parameters:
+
+ ```python
+ class ResearchOptions(FlowOptions):
+     analysis_model: ModelName = "gemini-3-pro"
+     verification_model: ModelName = "grok-4.1-fast"
+     synthesis_model: ModelName = "gemini-3-pro"
+     max_sources: int = 10
+ ```
+
+ #### `PipelineDeployment`
+
+ Orchestrates multi-flow pipelines with resume, uploads, and webhooks:
+
+ ```python
+ class MyPipeline(PipelineDeployment[MyOptions, MyResult]):
+     flows: ClassVar = [flow_1, flow_2, flow_3]
+
+     @staticmethod
+     def build_result(
+         project_name: str,
+         documents: list[Document],
+         options: MyOptions,
+     ) -> MyResult:
+         ...
+ ```
+
+ **Execution modes:**
+
+ ```python
+ pipeline = MyPipeline()
+
+ # CLI mode: parses sys.argv, requires positional working_directory argument
+ # Usage: python script.py ./output [--start N] [--end N] [--max-keywords 8]
+ pipeline.run_cli(initializer=init_fn, trace_name="my-pipeline")
+
+ # Local mode: in-memory store, returns result directly (synchronous)
+ result = pipeline.run_local(
+     project_name="test",
+     documents=input_docs,
+     options=MyOptions(),
+ )
+
+ # Production: generates a Prefect flow for deployment
+ prefect_flow = pipeline.as_prefect_flow()
+ ```
+
+ Features:
+ - **Per-flow resume**: Skips flows whose output documents already exist in the store
+ - **Type chain validation**: At class definition time, validates that each flow's input types are producible by preceding flows
+ - **Per-flow uploads**: Upload documents after each flow completes
+ - **CLI mode**: `--start N` / `--end N` for step control, automatic `LocalDocumentStore`
+
+ ### Image Processing
+
+ The `images` module provides image splitting and compression for LLM vision models:
+
+ ```python
+ from ai_pipeline_core.images import process_image, process_image_to_documents, ImagePreset
+
+ # Process an image with model-specific presets
+ result = process_image(screenshot_bytes, preset=ImagePreset.GEMINI)
+ for part in result:
+     print(part.label, len(part.data))
+
+ # Convert to Document objects for AIMessages
+ image_docs = process_image_to_documents(screenshot_bytes, name_prefix="screenshot")
+ ```
+
+ Available presets: `GEMINI` (3000px, 9M pixels), `CLAUDE` (1568px, 1.15M pixels), `GPT4V` (2048px, 4M pixels).
+
+ The LLM client automatically splits oversized images at the model boundary -- you typically don't need to call these functions directly.
+
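+ When you do want explicit control, the document variant composes directly with `AIMessages`. A sketch (assumes `screenshot_bytes` holds raw image bytes, as in the block above):
+
+ ```python
+ from ai_pipeline_core import llm, AIMessages
+ from ai_pipeline_core.images import process_image_to_documents
+
+ # Split/compress the screenshot, then send the parts alongside the question
+ image_docs = process_image_to_documents(screenshot_bytes, name_prefix="screenshot")
+ response = await llm.generate(
+     model="gemini-3-pro",
+     messages=AIMessages([*image_docs, "Describe what this screenshot shows"]),
+ )
+ ```
+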
+ ### Prompt Manager
+
+ Jinja2 template management for structured prompts:
+
+ ```python
+ from ai_pipeline_core import PromptManager
+
+ # Module-level initialization (uses __file__ for relative template discovery)
+ prompts = PromptManager(__file__, prompts_dir="templates")
+
+ # Render a template
+ prompt = prompts.get("analyze.jinja2", source_id="example.com", task="summarize")
+ ```
+
+ Globals available in all templates: `current_date` (formatted as "01 February 2026").
+
+ ### Local Trace Debugging
+
+ When running via `run_cli()`, trace spans are automatically saved to `<working_dir>/.trace/` for
+ LLM-assisted debugging. Disable with `--no-trace`.
+
+ The directory structure mirrors the execution flow:
+
+ ```
+ .trace/
+   20260128_152932_abc12345_my_flow/
+     |-- _trace.yaml       # Trace metadata
+     |-- _tree.yaml        # Lightweight tree structure
+     |-- _llm_calls.yaml   # LLM-specific details (tokens, cost, purpose)
+     |-- _errors.yaml      # Failed spans only (written only if errors exist)
+     |-- _summary.md       # Static execution summary (always generated)
+     |-- artifacts/        # Deduplicated content storage
+     |   +-- sha256/
+     |       +-- ab/cd/    # Sharded by hash prefix
+     |           +-- abcdef...1234.txt
+     +-- 0001_my_flow/     # Root span (numbered for execution order)
+         |-- _span.yaml    # Span metadata (timing, status, attributes, I/O refs)
+         |-- input.yaml
+         |-- output.yaml
+         |-- events.yaml   # OTel span events (log records, etc.)
+         +-- 0002_task_1/
+             +-- 0003_llm_call/
+                 |-- _span.yaml
+                 |-- input.yaml
+                 +-- output.yaml
+ ```
+
+ Up to 20 traces are kept (oldest are automatically cleaned up).
+
+ ## Configuration
+
+ ### Environment Variables
+
+ ```bash
+ # LLM Configuration (via LiteLLM proxy)
+ OPENAI_BASE_URL=http://localhost:4000
+ OPENAI_API_KEY=your-api-key
+
+ # Optional: Observability
+ LMNR_PROJECT_API_KEY=your-lmnr-key
+ LMNR_DEBUG=true # Enable debug traces
+
+ # Optional: Orchestration
+ PREFECT_API_URL=http://localhost:4200/api
+ PREFECT_API_KEY=your-prefect-key
+ PREFECT_API_AUTH_STRING=your-auth-string
+ PREFECT_WORK_POOL_NAME=default
+ PREFECT_WORK_QUEUE_NAME=default
+ PREFECT_GCS_BUCKET=your-gcs-bucket
+
+ # Optional: GCS (for remote storage)
+ GCS_SERVICE_ACCOUNT_FILE=/path/to/service-account.json
+
+ # Optional: Document Store & Tracking (ClickHouse -- omit for local filesystem store)
+ CLICKHOUSE_HOST=your-clickhouse-host
+ CLICKHOUSE_PORT=8443
+ CLICKHOUSE_DATABASE=default
+ CLICKHOUSE_USER=default
+ CLICKHOUSE_PASSWORD=your-password
+ CLICKHOUSE_SECURE=true
+ TRACKING_ENABLED=true
+ TRACKING_SUMMARY_MODEL=gemini-3-flash
+
+ # Optional: Document Summaries (store-level, LLM-generated)
+ DOC_SUMMARY_ENABLED=true
+ DOC_SUMMARY_MODEL=gemini-3-flash
+ ```
+
+ ### Settings Management
+
+ Create custom settings by inheriting from the base Settings class:
+
+ ```python
+ from ai_pipeline_core import Settings
+
+ class ProjectSettings(Settings):
+     """Project-specific configuration."""
+     app_name: str = "my-app"
+     max_retries: int = 3
+
+ # Create singleton instance
+ settings = ProjectSettings()
+
+ # Access configuration (all env vars above are available)
+ print(settings.openai_base_url)
+ print(settings.app_name)
+ ```
+
+ ## Best Practices
+
+ ### Framework Rules
+
+ 1. **Decorators**: Use `@pipeline_task` without parameters for most cases, `@pipeline_flow(estimated_minutes=N)` with annotations (always requires parentheses)
+ 2. **Logging**: Use `get_pipeline_logger(__name__)` -- never `print()` or the `logging` module directly
+ 3. **LLM calls**: Use `AIMessages` or `str` for messages. Wrap Documents in `AIMessages`
+ 4. **Options**: Omit `ModelOptions` unless specifically needed (defaults are production-optimized)
+ 5. **Documents**: Create with just `name` and `content` -- skip `description`. Always subclass `Document`
+ 6. **Flow annotations**: Input/output types are in the function signature -- `list[InputDoc]` and `-> list[OutputDoc]`
+ 7. **Initialization**: `PromptManager` and logger at module scope, not in functions
+ 8. **Document lists**: Use plain `list[Document]` -- no wrapper class needed
+
+ ### Import Convention
+
+ Always import from the top-level package when possible:
+
+ ```python
+ # CORRECT - top-level imports
+ from ai_pipeline_core import Document, pipeline_flow, pipeline_task, llm, AIMessages
+
+ # ALSO CORRECT - store implementations are NOT exported from top-level
+ from ai_pipeline_core.document_store.local import LocalDocumentStore
+ from ai_pipeline_core.document_store.memory import MemoryDocumentStore
+ ```
+
+ ## Development
+
+ ### Running Tests
+
+ ```bash
+ make test             # Run all tests
+ make test-cov         # Run with coverage report
+ make test-clickhouse  # ClickHouse integration tests (requires Docker)
+ ```
+
+ ### Code Quality
+
+ ```bash
+ make check            # Run ALL checks (lint, typecheck, deadcode, semgrep, docstrings, tests)
+ make lint             # Ruff linting (27 rule sets)
+ make format           # Auto-format and auto-fix code with ruff
+ make typecheck        # Type checking with basedpyright (strict mode)
+ make deadcode         # Dead code detection with vulture
+ make semgrep          # Project-specific AST pattern checks (.semgrep/ rules)
+ make docstrings-cover # Docstring coverage (100% required)
+ ```
+
+ **Static analysis tools:**
+ - **Ruff** -- 27 rule sets including bugbear, security (bandit), complexity, async enforcement, exception patterns
+ - **Basedpyright** -- strict mode with `reportUnusedCoroutine`, `reportUnreachable`, `reportImplicitStringConcatenation`
+ - **Vulture** -- dead code detection with framework-aware whitelist
+ - **Semgrep** -- custom rules in `.semgrep/` for frozen model mutable fields, async enforcement, docstring quality, architecture constraints
+ - **Interrogate** -- 100% docstring coverage enforcement
+
+ ### AI Documentation
+
+ ```bash
+ make docs-ai-build # Generate .ai-docs/ from source code
+ make docs-ai-check # Validate .ai-docs/ freshness and completeness
+ ```
+
+ ## Examples
+
+ The `examples/` directory contains:
+
+ - **`showcase.py`** -- Full pipeline demonstrating Document types, `@pipeline_task` auto-save, `@pipeline_flow` annotations, `PipelineDeployment`, and CLI mode
+ - **`showcase_document_store.py`** -- DocumentStore usage patterns: MemoryDocumentStore, LocalDocumentStore, RunContext scoping, pipeline tasks with auto-save, and `run_local()` execution
+
+ Run examples:
+ ```bash
+ # CLI mode with output directory
+ python examples/showcase.py ./output
+
+ # With custom options
+ python examples/showcase.py ./output --max-keywords 8
+
+ # Document store showcase (no arguments needed)
+ python examples/showcase_document_store.py
+ ```
+
+ ## Project Structure
+
+ ```
+ ai-pipeline-core/
+ |-- ai_pipeline_core/
+ |   |-- deployment/        # Pipeline deployment, deploy script, progress, remote
+ |   |-- docs_generator/    # AI-focused documentation generator
+ |   |-- document_store/    # Store protocol and backends (ClickHouse, local, memory)
+ |   |-- documents/         # Document system (Document base class, attachments, context)
+ |   |-- images/            # Image processing for LLM vision models
+ |   |-- llm/               # LLM client, AIMessages, ModelOptions, ModelResponse
+ |   |-- logging/           # Logging infrastructure
+ |   |-- observability/     # Tracing, tracking, and debug trace writer
+ |   |-- pipeline/          # Pipeline decorators and FlowOptions
+ |   |-- prompt_manager.py  # Jinja2 template management
+ |   |-- settings.py        # Configuration management (Pydantic BaseSettings)
+ |   |-- testing.py         # Prefect test harness re-exports
+ |   +-- exceptions.py      # Framework exceptions (LLMError, DocumentNameError, etc.)
+ |-- tests/                 # Comprehensive test suite
+ |-- examples/              # Usage examples
+ |-- .specification/        # Framework requirements and documentation spec
+ +-- pyproject.toml         # Project configuration
+ ```
+
+ ## Contributing
+
+ 1. Fork the repository
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
+ 3. Make changes following the project's style guide
+ 4. Run all checks (`make check`)
+ 5. Commit your changes
+ 6. Push to the branch (`git push origin feature/amazing-feature`)
+ 7. Open a Pull Request
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## Support
+
+ - **Issues**: [GitHub Issues](https://github.com/bbarwik/ai-pipeline-core/issues)
+ - **Discussions**: [GitHub Discussions](https://github.com/bbarwik/ai-pipeline-core/discussions)
+
+ ## Acknowledgments
+
+ - Built on [Prefect](https://www.prefect.io/) for workflow orchestration
+ - Uses [LiteLLM](https://github.com/BerriAI/litellm) for LLM provider abstraction
+ - Integrates [Laminar (LMNR)](https://www.lmnr.ai/) for observability
+ - Type checking with [Pydantic](https://pydantic.dev/) and [basedpyright](https://github.com/DetachHead/basedpyright)