ai-pipeline-core 0.2.6__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. ai_pipeline_core/__init__.py +78 -125
  2. ai_pipeline_core/deployment/__init__.py +34 -0
  3. ai_pipeline_core/deployment/base.py +861 -0
  4. ai_pipeline_core/deployment/contract.py +80 -0
  5. ai_pipeline_core/deployment/deploy.py +561 -0
  6. ai_pipeline_core/deployment/helpers.py +97 -0
  7. ai_pipeline_core/deployment/progress.py +126 -0
  8. ai_pipeline_core/deployment/remote.py +116 -0
  9. ai_pipeline_core/docs_generator/__init__.py +54 -0
  10. ai_pipeline_core/docs_generator/__main__.py +5 -0
  11. ai_pipeline_core/docs_generator/cli.py +196 -0
  12. ai_pipeline_core/docs_generator/extractor.py +324 -0
  13. ai_pipeline_core/docs_generator/guide_builder.py +644 -0
  14. ai_pipeline_core/docs_generator/trimmer.py +35 -0
  15. ai_pipeline_core/docs_generator/validator.py +114 -0
  16. ai_pipeline_core/document_store/__init__.py +13 -0
  17. ai_pipeline_core/document_store/_summary.py +9 -0
  18. ai_pipeline_core/document_store/_summary_worker.py +170 -0
  19. ai_pipeline_core/document_store/clickhouse.py +492 -0
  20. ai_pipeline_core/document_store/factory.py +38 -0
  21. ai_pipeline_core/document_store/local.py +312 -0
  22. ai_pipeline_core/document_store/memory.py +85 -0
  23. ai_pipeline_core/document_store/protocol.py +68 -0
  24. ai_pipeline_core/documents/__init__.py +12 -14
  25. ai_pipeline_core/documents/_context_vars.py +85 -0
  26. ai_pipeline_core/documents/_hashing.py +52 -0
  27. ai_pipeline_core/documents/attachment.py +85 -0
  28. ai_pipeline_core/documents/context.py +128 -0
  29. ai_pipeline_core/documents/document.py +318 -1434
  30. ai_pipeline_core/documents/mime_type.py +37 -82
  31. ai_pipeline_core/documents/utils.py +4 -12
  32. ai_pipeline_core/exceptions.py +10 -62
  33. ai_pipeline_core/images/__init__.py +309 -0
  34. ai_pipeline_core/images/_processing.py +151 -0
  35. ai_pipeline_core/llm/__init__.py +6 -4
  36. ai_pipeline_core/llm/ai_messages.py +130 -81
  37. ai_pipeline_core/llm/client.py +327 -193
  38. ai_pipeline_core/llm/model_options.py +14 -86
  39. ai_pipeline_core/llm/model_response.py +60 -103
  40. ai_pipeline_core/llm/model_types.py +16 -34
  41. ai_pipeline_core/logging/__init__.py +2 -7
  42. ai_pipeline_core/logging/logging.yml +1 -1
  43. ai_pipeline_core/logging/logging_config.py +27 -37
  44. ai_pipeline_core/logging/logging_mixin.py +15 -41
  45. ai_pipeline_core/observability/__init__.py +32 -0
  46. ai_pipeline_core/observability/_debug/__init__.py +30 -0
  47. ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
  48. ai_pipeline_core/observability/_debug/_config.py +95 -0
  49. ai_pipeline_core/observability/_debug/_content.py +764 -0
  50. ai_pipeline_core/observability/_debug/_processor.py +98 -0
  51. ai_pipeline_core/observability/_debug/_summary.py +312 -0
  52. ai_pipeline_core/observability/_debug/_types.py +75 -0
  53. ai_pipeline_core/observability/_debug/_writer.py +843 -0
  54. ai_pipeline_core/observability/_document_tracking.py +146 -0
  55. ai_pipeline_core/observability/_initialization.py +194 -0
  56. ai_pipeline_core/observability/_logging_bridge.py +57 -0
  57. ai_pipeline_core/observability/_summary.py +81 -0
  58. ai_pipeline_core/observability/_tracking/__init__.py +6 -0
  59. ai_pipeline_core/observability/_tracking/_client.py +178 -0
  60. ai_pipeline_core/observability/_tracking/_internal.py +28 -0
  61. ai_pipeline_core/observability/_tracking/_models.py +138 -0
  62. ai_pipeline_core/observability/_tracking/_processor.py +158 -0
  63. ai_pipeline_core/observability/_tracking/_service.py +311 -0
  64. ai_pipeline_core/observability/_tracking/_writer.py +229 -0
  65. ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -283
  66. ai_pipeline_core/pipeline/__init__.py +10 -0
  67. ai_pipeline_core/pipeline/decorators.py +915 -0
  68. ai_pipeline_core/pipeline/options.py +16 -0
  69. ai_pipeline_core/prompt_manager.py +16 -102
  70. ai_pipeline_core/settings.py +26 -31
  71. ai_pipeline_core/testing.py +9 -0
  72. ai_pipeline_core-0.4.1.dist-info/METADATA +807 -0
  73. ai_pipeline_core-0.4.1.dist-info/RECORD +76 -0
  74. {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/WHEEL +1 -1
  75. ai_pipeline_core/documents/document_list.py +0 -420
  76. ai_pipeline_core/documents/flow_document.py +0 -112
  77. ai_pipeline_core/documents/task_document.py +0 -117
  78. ai_pipeline_core/documents/temporary_document.py +0 -74
  79. ai_pipeline_core/flow/__init__.py +0 -9
  80. ai_pipeline_core/flow/config.py +0 -483
  81. ai_pipeline_core/flow/options.py +0 -75
  82. ai_pipeline_core/pipeline.py +0 -718
  83. ai_pipeline_core/prefect.py +0 -63
  84. ai_pipeline_core/simple_runner/__init__.py +0 -14
  85. ai_pipeline_core/simple_runner/cli.py +0 -254
  86. ai_pipeline_core/simple_runner/simple_runner.py +0 -247
  87. ai_pipeline_core/storage/__init__.py +0 -8
  88. ai_pipeline_core/storage/storage.py +0 -628
  89. ai_pipeline_core/utils/__init__.py +0 -8
  90. ai_pipeline_core/utils/deploy.py +0 -373
  91. ai_pipeline_core/utils/remote_deployment.py +0 -269
  92. ai_pipeline_core-0.2.6.dist-info/METADATA +0 -500
  93. ai_pipeline_core-0.2.6.dist-info/RECORD +0 -41
  94. {ai_pipeline_core-0.2.6.dist-info → ai_pipeline_core-0.4.1.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/deployment/base.py
@@ -0,0 +1,861 @@
+ """Core classes for pipeline deployments.
+
+ Provides the PipelineDeployment base class and related types for
+ creating unified, type-safe pipeline deployments with:
+ - Per-flow resume (skip if outputs exist in DocumentStore)
+ - Per-flow uploads (immediate, not just at end)
+ - Prefect state hooks (on_running, on_completion, etc.)
+ - Upload on failure (partial results saved)
+ """
+
+ import asyncio
+ import contextlib
+ import hashlib
+ import os
+ import sys
+ from abc import abstractmethod
+ from collections.abc import Callable
+ from contextlib import ExitStack
+ from dataclasses import dataclass
+ from datetime import UTC, datetime
+ from pathlib import Path
+ from typing import Any, ClassVar, Generic, Protocol, TypeVar, cast, final
+ from uuid import UUID, uuid4
+
+ import httpx
+ from lmnr import Laminar
+ from opentelemetry import trace as otel_trace
+ from prefect import flow, get_client, runtime
+ from pydantic import BaseModel, ConfigDict, Field
+ from pydantic_settings import CliPositionalArg, SettingsConfigDict
+
+ from ai_pipeline_core.document_store import SummaryGenerator, create_document_store, get_document_store, set_document_store
+ from ai_pipeline_core.document_store.local import LocalDocumentStore
+ from ai_pipeline_core.document_store.memory import MemoryDocumentStore
+ from ai_pipeline_core.documents import Document
+ from ai_pipeline_core.documents.context import RunContext, reset_run_context, set_run_context
+ from ai_pipeline_core.logging import get_pipeline_logger, setup_logging
+ from ai_pipeline_core.observability._debug import LocalDebugSpanProcessor, LocalTraceWriter, TraceDebugConfig
+ from ai_pipeline_core.observability._initialization import get_tracking_service, initialize_observability
+ from ai_pipeline_core.observability._tracking._models import RunStatus
+ from ai_pipeline_core.pipeline.options import FlowOptions
+ from ai_pipeline_core.settings import settings
+ from ai_pipeline_core.testing import disable_run_logger, prefect_test_harness
+
+ from .contract import CompletedRun, DeploymentResultData, FailedRun, ProgressRun
+ from .helpers import (
+     StatusPayload,
+     class_name_to_deployment_name,
+     download_documents,
+     extract_generic_params,
+     send_webhook,
+     upload_documents,
+ )
+
+ logger = get_pipeline_logger(__name__)
+
+
+ def _build_summary_generator() -> SummaryGenerator | None:
+     """Build a summary generator callable from settings, or None if disabled/unavailable."""
+     if not settings.doc_summary_enabled:
+         return None
+
+     from ai_pipeline_core.observability._summary import generate_document_summary
+
+     model = settings.doc_summary_model
+
+     async def _generator(name: str, excerpt: str) -> str:
+         return await generate_document_summary(name, excerpt, model=model)
+
+     return _generator
+
+
+ # Fields added by run_cli()'s _CliOptions that should not affect the run scope fingerprint
+ _CLI_FIELDS: set[str] = {"working_directory", "project_name", "start", "end", "no_trace"}
+
+
+ def _compute_run_scope(project_name: str, documents: list[Document], options: FlowOptions) -> str:
+     """Compute a run scope that fingerprints inputs and options.
+
+     Different inputs or options produce a different scope, preventing
+     stale cache hits when re-running with the same project name.
+     Falls back to just project_name when no documents are provided
+     (e.g. --start N resume without initializer).
+     """
+     if not documents:
+         return project_name
+     sha256s = sorted(doc.sha256 for doc in documents)
+     exclude = _CLI_FIELDS & set(type(options).model_fields)
+     options_json = options.model_dump_json(exclude=exclude, exclude_none=True)
+     fingerprint = hashlib.sha256(f"{':'.join(sha256s)}|{options_json}".encode()).hexdigest()[:16]
+     return f"{project_name}:{fingerprint}"
+
+
+ class DeploymentContext(BaseModel):
+     """Infrastructure configuration for deployments.
+
+     Webhooks are optional - provide URLs to enable:
+     - progress_webhook_url: Per-flow progress (started/completed/cached)
+     - status_webhook_url: Prefect state transitions (RUNNING/FAILED/etc)
+     - completion_webhook_url: Final result when deployment ends
+     """
+
+     input_documents_urls: tuple[str, ...] = Field(default_factory=tuple)
+     output_documents_urls: dict[str, str] = Field(default_factory=dict)  # nosemgrep: mutable-field-on-frozen-pydantic-model
+
+     progress_webhook_url: str = ""
+     status_webhook_url: str = ""
+     completion_webhook_url: str = ""
+
+     model_config = ConfigDict(frozen=True, extra="forbid")
+
+
+ class DeploymentResult(BaseModel):
+     """Base class for deployment results."""
+
+     success: bool
+     error: str | None = None
+
+     model_config = ConfigDict(frozen=True, extra="forbid")
+
+
+ TOptions = TypeVar("TOptions", bound=FlowOptions)
+ TResult = TypeVar("TResult", bound=DeploymentResult)
+
+
+ class FlowCallable(Protocol):
+     """Protocol for @pipeline_flow decorated functions."""
+
+     name: str
+     __name__: str
+     input_document_types: list[type[Document]]
+     output_document_types: list[type[Document]]
+     estimated_minutes: int
+
+     def __call__(self, project_name: str, documents: list[Document], flow_options: FlowOptions) -> Any:  # type: ignore[type-arg]
+         """Execute the flow with standard pipeline signature."""
+         ...
+
+     def with_options(self, **kwargs: Any) -> "FlowCallable":
+         """Return a copy with overridden Prefect flow options (e.g., hooks)."""
+         ...
+
+
+ def _reattach_flow_metadata(original: FlowCallable, target: Any) -> None:
+     """Reattach custom flow attributes that Prefect's with_options() may strip."""
+     for attr in ("input_document_types", "output_document_types", "estimated_minutes"):
+         if hasattr(original, attr) and not hasattr(target, attr):
+             setattr(target, attr, getattr(original, attr))
+
+
+ @dataclass(slots=True)
+ class _StatusWebhookHook:
+     """Prefect hook that sends status webhooks on state transitions."""
+
+     webhook_url: str
+     flow_run_id: str
+     project_name: str
+     step: int
+     total_steps: int
+     flow_name: str
+
+     async def __call__(self, flow: Any, flow_run: Any, state: Any) -> None:
+         payload: StatusPayload = {
+             "type": "status",
+             "flow_run_id": str(flow_run.id),
+             "project_name": self.project_name,
+             "step": self.step,
+             "total_steps": self.total_steps,
+             "flow_name": self.flow_name,
+             "state": state.type.value if hasattr(state.type, "value") else str(state.type),
+             "state_name": state.name or "",
+             "timestamp": datetime.now(UTC).isoformat(),
+         }
+         try:
+             async with httpx.AsyncClient(timeout=10) as client:
+                 await client.post(self.webhook_url, json=payload)
+         except Exception as e:
+             logger.warning(f"Status webhook failed: {e}")
+
+
+ def _validate_flow_chain(deployment_name: str, flows: list[Any]) -> None:
+     """Validate that each flow's input types are satisfiable by preceding flows' outputs.
+
+     Simulates a type pool: starts with the first flow's input types, adds each flow's
+     output types after processing. For subsequent flows, each required input type must
+     be satisfiable by at least one type in the pool (via issubclass).
+     """
+     type_pool: set[type[Document]] = set()
+
+     for i, flow_fn in enumerate(flows):
+         input_types: list[type[Document]] = getattr(flow_fn, "input_document_types", [])
+         output_types: list[type[Document]] = getattr(flow_fn, "output_document_types", [])
+         flow_name = getattr(flow_fn, "name", getattr(flow_fn, "__name__", f"flow[{i}]"))
+
+         if i == 0:
+             # First flow: its input types seed the pool
+             type_pool.update(input_types)
+         elif input_types:
+             # Subsequent flows: at least one declared input type must be satisfiable
+             # from the pool (union semantics — flow accepts any of the declared types)
+             any_satisfied = any(any(issubclass(available, t) for available in type_pool) for t in input_types)
+             if not any_satisfied:
+                 input_names = sorted(t.__name__ for t in input_types)
+                 pool_names = sorted(t.__name__ for t in type_pool) if type_pool else ["(empty)"]
+                 raise TypeError(
+                     f"{deployment_name}: flow '{flow_name}' (step {i + 1}) requires input types "
+                     f"{input_names} but none are produced by preceding flows. "
+                     f"Available types: {pool_names}"
+                 )
+
+         type_pool.update(output_types)
+
+
+ class PipelineDeployment(Generic[TOptions, TResult]):
+     """Base class for pipeline deployments.
+
+     Features enabled by default:
+     - Per-flow resume: Skip flows if outputs exist in DocumentStore
+     - Per-flow uploads: Upload documents after each flow
+     - Prefect hooks: Attach state hooks if status_webhook_url provided
+     - Upload on failure: Save partial results if pipeline fails
+     """
+
+     flows: ClassVar[list[FlowCallable]]
+     name: ClassVar[str]
+     options_type: ClassVar[type[FlowOptions]]
+     result_type: ClassVar[type[DeploymentResult]]
+
+     def __init_subclass__(cls, **kwargs: Any) -> None:
+         super().__init_subclass__(**kwargs)
+
+         if not hasattr(cls, "flows"):
+             return
+
+         if cls.__name__.startswith("Test"):
+             raise TypeError(f"Deployment class name cannot start with 'Test': {cls.__name__}")
+
+         cls.name = class_name_to_deployment_name(cls.__name__)
+
+         options_type, result_type = extract_generic_params(cls, PipelineDeployment)
+         if options_type is None or result_type is None:
+             raise TypeError(f"{cls.__name__} must specify Generic parameters: class {cls.__name__}(PipelineDeployment[MyOptions, MyResult])")
+
+         cls.options_type = options_type
+         cls.result_type = result_type
+
+         if not cls.flows:
+             raise TypeError(f"{cls.__name__}.flows cannot be empty")
+
+         # build_result must be implemented (not still abstract from PipelineDeployment)
+         build_result_fn = getattr(cls, "build_result", None)
+         if build_result_fn is None or getattr(build_result_fn, "__isabstractmethod__", False):
+             raise TypeError(f"{cls.__name__} must implement 'build_result' static method")
+
+         # No duplicate flows (by identity)
+         seen_ids: set[int] = set()
+         for flow_fn in cls.flows:
+             fid = id(flow_fn)
+             if fid in seen_ids:
+                 flow_name = getattr(flow_fn, "name", getattr(flow_fn, "__name__", str(flow_fn)))
+                 raise TypeError(f"{cls.__name__}.flows contains duplicate flow '{flow_name}'")
+             seen_ids.add(fid)
+
+         # Flow type chain validation: simulate a type pool
+         _validate_flow_chain(cls.__name__, cls.flows)
+
+     @staticmethod
+     @abstractmethod
+     def build_result(project_name: str, documents: list[Document], options: TOptions) -> TResult:
+         """Extract typed result from pipeline documents loaded from DocumentStore."""
+         ...
+
+     def _all_document_types(self) -> list[type[Document]]:
+         """Collect all document types from all flows (inputs + outputs), deduplicated."""
+         types: dict[str, type[Document]] = {}
+         for flow_fn in self.flows:
+             for t in getattr(flow_fn, "input_document_types", []):
+                 types[t.__name__] = t
+             for t in getattr(flow_fn, "output_document_types", []):
+                 types[t.__name__] = t
+         return list(types.values())
+
+     def _build_status_hooks(
+         self,
+         context: DeploymentContext,
+         flow_run_id: str,
+         project_name: str,
+         step: int,
+         total_steps: int,
+         flow_name: str,
+     ) -> dict[str, list[Callable[..., Any]]]:
+         """Build Prefect hooks for status webhooks."""
+         hook = _StatusWebhookHook(
+             webhook_url=context.status_webhook_url,
+             flow_run_id=flow_run_id,
+             project_name=project_name,
+             step=step,
+             total_steps=total_steps,
+             flow_name=flow_name,
+         )
+         return {
+             "on_running": [hook],
+             "on_completion": [hook],
+             "on_failure": [hook],
+             "on_crashed": [hook],
+             "on_cancellation": [hook],
+         }
+
+     async def _send_progress(
+         self,
+         context: DeploymentContext,
+         flow_run_id: str,
+         project_name: str,
+         step: int,
+         total_steps: int,
+         flow_name: str,
+         status: str,
+         step_progress: float = 0.0,
+         message: str = "",
+     ) -> None:
+         """Send progress webhook and update flow run labels."""
+         # Use estimated_minutes for weighted progress calculation
+         flow_minutes = [getattr(f, "estimated_minutes", 1) for f in self.flows]
+         total_minutes = sum(flow_minutes) or 1
+         completed_minutes = sum(flow_minutes[: max(step - 1, 0)])
+         current_flow_minutes = flow_minutes[step - 1] if step - 1 < len(flow_minutes) else 1
+         progress = round(max(0.0, min(1.0, (completed_minutes + current_flow_minutes * step_progress) / total_minutes)), 4)
+
+         if context.progress_webhook_url:
+             payload = ProgressRun(
+                 flow_run_id=UUID(flow_run_id) if flow_run_id else UUID(int=0),
+                 project_name=project_name,
+                 state="RUNNING",
+                 timestamp=datetime.now(UTC),
+                 step=step,
+                 total_steps=total_steps,
+                 flow_name=flow_name,
+                 status=status,
+                 progress=progress,
+                 step_progress=round(step_progress, 4),
+                 message=message,
+             )
+             try:
+                 await send_webhook(context.progress_webhook_url, payload)
+             except Exception as e:
+                 logger.warning(f"Progress webhook failed: {e}")
+
+         if flow_run_id:
+             try:
+                 async with get_client() as client:
+                     await client.update_flow_run_labels(
+                         flow_run_id=UUID(flow_run_id),
+                         labels={
+                             "progress.step": step,
+                             "progress.total_steps": total_steps,
+                             "progress.flow_name": flow_name,
+                             "progress.status": status,
+                             "progress.progress": progress,
+                             "progress.step_progress": round(step_progress, 4),
+                             "progress.message": message,
+                         },
+                     )
+             except Exception as e:
+                 logger.warning(f"Progress label update failed: {e}")
+
+     async def _send_completion(
+         self,
+         context: DeploymentContext,
+         flow_run_id: str,
+         project_name: str,
+         result: TResult | None,
+         error: str | None,
+     ) -> None:
+         """Send completion webhook."""
+         if not context.completion_webhook_url:
+             return
+         try:
+             now = datetime.now(UTC)
+             frid = UUID(flow_run_id) if flow_run_id else UUID(int=0)
+             payload: CompletedRun | FailedRun
+             if result is not None:
+                 payload = CompletedRun(
+                     flow_run_id=frid,
+                     project_name=project_name,
+                     timestamp=now,
+                     state="COMPLETED",
+                     result=DeploymentResultData.model_validate(result.model_dump()),
+                 )
+             else:
+                 payload = FailedRun(
+                     flow_run_id=frid,
+                     project_name=project_name,
+                     timestamp=now,
+                     state="FAILED",
+                     error=error or "Unknown error",
+                 )
+             await send_webhook(context.completion_webhook_url, payload)
+         except Exception as e:
+             logger.warning(f"Completion webhook failed: {e}")
+
+     @final
+     async def run(
+         self,
+         project_name: str,
+         documents: list[Document],
+         options: TOptions,
+         context: DeploymentContext,
+     ) -> TResult:
+         """Execute all flows with resume, per-flow uploads, and webhooks.
+
+         Args:
+             project_name: Unique identifier for this pipeline run (used as run_scope).
+             documents: Initial input documents for the first flow.
+             options: Flow options passed to each flow.
+             context: Deployment context with webhook URLs and document upload config.
+
+         Returns:
+             Typed deployment result built from all pipeline documents.
+         """
+         store = get_document_store()
+         total_steps = len(self.flows)
+         flow_run_id: str = str(runtime.flow_run.get_id()) if runtime.flow_run else ""  # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType, reportUnknownArgumentType]
+
+         # Write identity labels for polling endpoint
+         if flow_run_id:
+             try:
+                 async with get_client() as client:
+                     await client.update_flow_run_labels(
+                         flow_run_id=UUID(flow_run_id),
+                         labels={"pipeline.project_name": project_name},
+                     )
+             except Exception as e:
+                 logger.warning(f"Identity label update failed: {e}")
+
+         # Download additional input documents
+         input_docs = list(documents)
+         if context.input_documents_urls:
+             downloaded = await download_documents(list(context.input_documents_urls))
+             input_docs.extend(downloaded)
+
+         # Compute run scope AFTER downloads so the fingerprint includes all inputs
+         run_scope = _compute_run_scope(project_name, input_docs, options)
+
+         if not store and total_steps > 1:
+             logger.warning("No DocumentStore configured for multi-step pipeline — intermediate outputs will not accumulate between flows")
+
+         completion_sent = False
+
+         # Tracking lifecycle
+         tracking_svc = None
+         run_uuid: UUID | None = None
+         run_failed = False
+         try:
+             tracking_svc = get_tracking_service()
+             if tracking_svc:
+                 run_uuid = UUID(flow_run_id) if flow_run_id else uuid4()
+                 tracking_svc.set_run_context(run_id=run_uuid, project_name=project_name, flow_name=self.name, run_scope=run_scope)
+                 tracking_svc.track_run_start(run_id=run_uuid, project_name=project_name, flow_name=self.name, run_scope=run_scope)
+         except Exception:
+             tracking_svc = None
+
+         # Set RunContext for the entire pipeline run
+         run_token = set_run_context(RunContext(run_scope=run_scope))
+         try:
+             # Save initial input documents to store
+             if store and input_docs:
+                 await store.save_batch(input_docs, run_scope)
+
+             for step, flow_fn in enumerate(self.flows, start=1):
+                 flow_name = getattr(flow_fn, "name", flow_fn.__name__)
+                 flow_run_id = str(runtime.flow_run.get_id()) if runtime.flow_run else ""  # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType, reportUnknownArgumentType]
+
+                 # Resume check: skip if output documents already exist in store
+                 output_types = getattr(flow_fn, "output_document_types", [])
+                 if store and output_types:
+                     all_outputs_exist = all([await store.has_documents(run_scope, ot) for ot in output_types])
+                     if all_outputs_exist:
+                         logger.info(f"[{step}/{total_steps}] Resume: skipping {flow_name} (outputs exist)")
+                         await self._send_progress(
+                             context,
+                             flow_run_id,
+                             project_name,
+                             step,
+                             total_steps,
+                             flow_name,
+                             "cached",
+                             step_progress=1.0,
+                             message=f"Resumed from store: {flow_name}",
+                         )
+                         continue
+
+                 # Prefect state hooks
+                 active_flow = flow_fn
+                 if context.status_webhook_url:
+                     hooks = self._build_status_hooks(context, flow_run_id, project_name, step, total_steps, flow_name)
+                     active_flow = flow_fn.with_options(**hooks)
+                     _reattach_flow_metadata(flow_fn, active_flow)
+
+                 # Progress: started
+                 await self._send_progress(
+                     context,
+                     flow_run_id,
+                     project_name,
+                     step,
+                     total_steps,
+                     flow_name,
+                     "started",
+                     step_progress=0.0,
+                     message=f"Starting: {flow_name}",
+                 )
+
+                 logger.info(f"[{step}/{total_steps}] Starting: {flow_name}")
+
+                 # Load input documents from store
+                 input_types = getattr(flow_fn, "input_document_types", [])
+                 if store and input_types:
+                     current_docs = await store.load(run_scope, input_types)
+                 else:
+                     current_docs = input_docs
+
+                 try:
+                     await active_flow(project_name, current_docs, options)
+                 except Exception as e:
+                     # Upload partial results on failure
+                     if context.output_documents_urls and store:
+                         all_docs = await store.load(run_scope, self._all_document_types())
+                         await upload_documents(all_docs, context.output_documents_urls)
+                     await self._send_completion(context, flow_run_id, project_name, result=None, error=str(e))
+                     completion_sent = True
+                     raise
+
+                 # Per-flow upload (load from store since @pipeline_flow saves there)
+                 if context.output_documents_urls and store and output_types:
+                     flow_docs = await store.load(run_scope, output_types)
+                     await upload_documents(flow_docs, context.output_documents_urls)
+
+                 # Progress: completed
+                 await self._send_progress(
+                     context,
+                     flow_run_id,
+                     project_name,
+                     step,
+                     total_steps,
+                     flow_name,
+                     "completed",
+                     step_progress=1.0,
+                     message=f"Completed: {flow_name}",
+                 )
+
+                 logger.info(f"[{step}/{total_steps}] Completed: {flow_name}")
+
+             # Build result from all documents in store
+             if store:
+                 all_docs = await store.load(run_scope, self._all_document_types())
+             else:
+                 all_docs = input_docs
+             result = self.build_result(project_name, all_docs, options)
+             await self._send_completion(context, flow_run_id, project_name, result=result, error=None)
+             return result
+
+         except Exception as e:
+             run_failed = True
+             if not completion_sent:
+                 await self._send_completion(context, flow_run_id, project_name, result=None, error=str(e))
+             raise
+         finally:
+             reset_run_context(run_token)
+             store = get_document_store()
+             if store:
+                 with contextlib.suppress(Exception):
+                     store.flush()
+             if (svc := tracking_svc) is not None and run_uuid is not None:
+                 with contextlib.suppress(Exception):
+                     svc.track_run_end(run_id=run_uuid, status=RunStatus.FAILED if run_failed else RunStatus.COMPLETED)
+                     svc.flush()
+
+     @final
+     def run_local(
+         self,
+         project_name: str,
+         documents: list[Document],
+         options: TOptions,
+         context: DeploymentContext | None = None,
+         output_dir: Path | None = None,
+     ) -> TResult:
+         """Run locally with Prefect test harness and in-memory document store.
+
+         Args:
+             project_name: Pipeline run identifier.
+             documents: Initial input documents.
+             options: Flow options.
+             context: Optional deployment context (defaults to empty).
+             output_dir: Optional directory for writing result.json.
+
+         Returns:
+             Typed deployment result.
+         """
+         if context is None:
+             context = DeploymentContext()
+
+         if output_dir:
+             output_dir.mkdir(parents=True, exist_ok=True)
+
+         store = MemoryDocumentStore()
+         set_document_store(store)
+         try:
+             with prefect_test_harness(), disable_run_logger():
+                 result = asyncio.run(self.run(project_name, documents, options, context))
+         finally:
+             store.shutdown()
+             set_document_store(None)
+
+         if output_dir:
+             (output_dir / "result.json").write_text(result.model_dump_json(indent=2))
+
+         return result
+
+     @final
+     def run_cli(
+         self,
+         initializer: Callable[[TOptions], tuple[str, list[Document]]] | None = None,
+         trace_name: str | None = None,
+     ) -> None:
+         """Execute pipeline from CLI arguments with --start/--end step control.
+
+         Args:
+             initializer: Optional callback returning (project_name, documents) from options.
+             trace_name: Optional Laminar trace span name prefix.
+         """
+         if len(sys.argv) == 1:
+             sys.argv.append("--help")
+
+         setup_logging()
+         try:
+             initialize_observability()
+             logger.info("Observability initialized.")
+         except Exception as e:
+             logger.warning(f"Failed to initialize observability: {e}")
+             with contextlib.suppress(Exception):
+                 Laminar.initialize(export_timeout_seconds=15)
+
+         deployment = self
+
+         class _CliOptions(
+             deployment.options_type,
+             cli_parse_args=True,
+             cli_kebab_case=True,
+             cli_exit_on_error=True,
+             cli_prog_name=deployment.name,
+             cli_use_class_docs_for_groups=True,
+         ):
+             working_directory: CliPositionalArg[Path]
+             project_name: str | None = None
+             start: int = 1
+             end: int | None = None
+             no_trace: bool = False
+
+             model_config = SettingsConfigDict(frozen=True, extra="ignore")
+
+         opts = cast(TOptions, _CliOptions())  # type: ignore[reportCallIssue]
+
+         wd = cast(Path, opts.working_directory)  # pyright: ignore[reportAttributeAccessIssue]
+         wd.mkdir(parents=True, exist_ok=True)
+
+         project_name = cast(str, opts.project_name or wd.name)  # pyright: ignore[reportAttributeAccessIssue]
+         start_step = getattr(opts, "start", 1)
+         end_step = getattr(opts, "end", None)
+         no_trace = getattr(opts, "no_trace", False)
+
+         # Set up local debug tracing (writes to <working_dir>/.trace)
+         debug_processor: LocalDebugSpanProcessor | None = None
+         if not no_trace:
+             try:
+                 trace_path = wd / ".trace"
+                 trace_path.mkdir(parents=True, exist_ok=True)
+                 debug_config = TraceDebugConfig(path=trace_path, max_traces=20)
+                 debug_writer = LocalTraceWriter(debug_config)
+                 debug_processor = LocalDebugSpanProcessor(debug_writer)
+                 provider: Any = otel_trace.get_tracer_provider()
+                 if hasattr(provider, "add_span_processor"):
+                     provider.add_span_processor(debug_processor)
+                 logger.info(f"Local debug tracing enabled at {trace_path}")
+             except Exception as e:
+                 logger.warning(f"Failed to set up local debug tracing: {e}")
+                 debug_processor = None
+
+         # Initialize document store — ClickHouse when configured, local filesystem otherwise
+         summary_generator = _build_summary_generator()
+         if settings.clickhouse_host:
+             store = create_document_store(settings, summary_generator=summary_generator)
+         else:
+             store = LocalDocumentStore(base_path=wd, summary_generator=summary_generator)
+         set_document_store(store)
+
+         # Initialize documents (always run initializer for run scope fingerprinting,
+         # even when start_step > 1, so --start N resumes find the correct scope)
+         initial_documents: list[Document] = []
+         if initializer:
+             _, initial_documents = initializer(opts)
+
+         context = DeploymentContext()
+
+         with ExitStack() as stack:
+             if trace_name:
+                 stack.enter_context(
+                     Laminar.start_as_current_span(
+                         name=f"{trace_name}-{project_name}",
+                         input=[opts.model_dump_json()],
+                     )
+                 )
+
+             under_pytest = "PYTEST_CURRENT_TEST" in os.environ or "pytest" in sys.modules
+             if not settings.prefect_api_key and not under_pytest:
+                 stack.enter_context(prefect_test_harness())
+                 stack.enter_context(disable_run_logger())
+
+             result = asyncio.run(
+                 self._run_with_steps(
+                     project_name=project_name,
+                     options=opts,
+                     context=context,
+                     start_step=start_step,
+                     end_step=end_step,
+                     initial_documents=initial_documents,
+                 )
+             )
+
+         result_file = wd / "result.json"
+         result_file.write_text(result.model_dump_json(indent=2))
+         logger.info(f"Result saved to {result_file}")
+
+         # Shutdown background workers (debug tracing, document summaries, tracking)
+         if debug_processor is not None:
+             debug_processor.shutdown()
+         store = get_document_store()
+         if store:
+             store.shutdown()
+         tracking_svc = get_tracking_service()
+         if tracking_svc:
+             tracking_svc.shutdown()
+
+     async def _run_with_steps(
+         self,
+         project_name: str,
+         options: TOptions,
+         context: DeploymentContext,
+         start_step: int = 1,
+         end_step: int | None = None,
+         initial_documents: list[Document] | None = None,
+     ) -> TResult:
+         """Run pipeline with start/end step control and DocumentStore-based resume."""
+         store = get_document_store()
+         if end_step is None:
+             end_step = len(self.flows)
+
+         total_steps = len(self.flows)
+         run_scope = _compute_run_scope(project_name, initial_documents or [], options)
+
+         # Tracking lifecycle for CLI path
+         tracking_svc = None
+         run_uuid: UUID | None = None
+         run_failed = False
+         try:
+             tracking_svc = get_tracking_service()
+             if tracking_svc:
+                 run_uuid = uuid4()
+                 tracking_svc.set_run_context(run_id=run_uuid, project_name=project_name, flow_name=self.name, run_scope=run_scope)
+                 tracking_svc.track_run_start(run_id=run_uuid, project_name=project_name, flow_name=self.name, run_scope=run_scope)
+         except Exception:
+             tracking_svc = None
+
+         # Set RunContext for the entire pipeline run
+         run_token = set_run_context(RunContext(run_scope=run_scope))
+         try:
+             # Save initial documents to store
+             if store and initial_documents:
+                 await store.save_batch(initial_documents, run_scope)
+
+             for i in range(start_step - 1, end_step):
+                 step = i + 1
+                 flow_fn = self.flows[i]
+                 flow_name = getattr(flow_fn, "name", flow_fn.__name__)
+                 logger.info(f"--- [Step {step}/{total_steps}] {flow_name} ---")
+
+                 # Resume check: skip if output documents already exist
+                 output_types = getattr(flow_fn, "output_document_types", [])
+                 if store and output_types:
+                     all_outputs_exist = all([await store.has_documents(run_scope, ot) for ot in output_types])
+                     if all_outputs_exist:
+                         logger.info(f"--- [Step {step}/{total_steps}] Skipping {flow_name} (outputs exist) ---")
+                         continue
+
+                 # Load inputs from store
+                 input_types = getattr(flow_fn, "input_document_types", [])
+                 if store and input_types:
+                     current_docs = await store.load(run_scope, input_types)
+                 else:
+                     current_docs = initial_documents or []
+
+                 await flow_fn(project_name, current_docs, options)
+
+             # Build result from all documents in store
+             if store:
+                 all_docs = await store.load(run_scope, self._all_document_types())
+             else:
+                 all_docs = initial_documents or []
+             return self.build_result(project_name, all_docs, options)
+         except Exception:
+             run_failed = True
+             raise
+         finally:
+             reset_run_context(run_token)
+             store = get_document_store()
+             if store:
+                 with contextlib.suppress(Exception):
+                     store.flush()
+             if (svc := tracking_svc) is not None and run_uuid is not None:
+                 with contextlib.suppress(Exception):
+                     svc.track_run_end(run_id=run_uuid, status=RunStatus.FAILED if run_failed else RunStatus.COMPLETED)
+                     svc.flush()
+
+     @final
+     def as_prefect_flow(self) -> Callable[..., Any]:
+         """Generate a Prefect flow for production deployment.
+
+         Returns:
+             Async Prefect flow callable that initializes DocumentStore from settings.
+         """
+         deployment = self
+
+         @flow(
+             name=self.name,
+             flow_run_name=f"{self.name}-{{project_name}}",
+             persist_result=True,
+             result_serializer="json",
+         )
+         async def _deployment_flow(
+             project_name: str,
+             documents: list[Document],
+             options: FlowOptions,
+             context: DeploymentContext,
+         ) -> DeploymentResult:
+             store = create_document_store(
+                 settings,
+                 summary_generator=_build_summary_generator(),
+             )
+             set_document_store(store)
+             try:
+                 return await deployment.run(project_name, documents, cast(Any, options), context)
+             finally:
+                 store.shutdown()
+                 set_document_store(None)
+
+         return _deployment_flow
+
+
+ __all__ = [
+     "DeploymentContext",
+     "DeploymentResult",
+     "PipelineDeployment",
+ ]
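
Illustrative usage sketch (not part of the packaged diff): the new deployment module above requires a subclass to declare a non-empty flows list, concrete generic parameters, and a static build_result, and then exposes run_cli(), run_local(), run(), and as_prefect_flow(). The sketch below assumes that DeploymentResult and PipelineDeployment are re-exported from ai_pipeline_core.deployment, that a pipeline_flow decorator is exposed by ai_pipeline_core.pipeline, and that flow and Document constructor signatures follow the FlowCallable protocol shown above; SummaryDocument, SummarizeOptions, SummarizeResult, and summarize_flow are hypothetical names.

    # Sketch only — names and import paths marked above as assumptions are not confirmed by this diff.
    from ai_pipeline_core.deployment import DeploymentResult, PipelineDeployment  # assumed re-exports
    from ai_pipeline_core.documents import Document
    from ai_pipeline_core.pipeline import pipeline_flow  # assumed location of the flow decorator
    from ai_pipeline_core.pipeline.options import FlowOptions


    class SummaryDocument(Document):
        """Hypothetical output document type produced by the flow below."""


    class SummarizeOptions(FlowOptions):
        temperature: float = 0.0  # hypothetical option field


    class SummarizeResult(DeploymentResult):
        summary_count: int = 0


    @pipeline_flow()  # decorator call form assumed; signature follows the FlowCallable protocol
    async def summarize_flow(project_name: str, documents: list[Document], flow_options: SummarizeOptions) -> list[Document]:
        # Document construction details are assumed for illustration only.
        return [SummaryDocument(name=f"{doc.name}.summary.md", content=b"...") for doc in documents]


    class SummarizePipeline(PipelineDeployment[SummarizeOptions, SummarizeResult]):
        flows = [summarize_flow]

        @staticmethod
        def build_result(project_name: str, documents: list[Document], options: SummarizeOptions) -> SummarizeResult:
            return SummarizeResult(success=True, summary_count=len(documents))


    if __name__ == "__main__":
        # run_cli() parses the working directory plus --start/--end/--no-trace and writes result.json.
        SummarizePipeline().run_cli(trace_name="summarize")

For tests, run_local() drives the same pipeline against an in-memory document store under the Prefect test harness, and as_prefect_flow() wraps it for production deployment.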