ai-pipeline-core 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_pipeline_core/__init__.py +64 -158
- ai_pipeline_core/deployment/__init__.py +6 -18
- ai_pipeline_core/deployment/base.py +392 -212
- ai_pipeline_core/deployment/contract.py +6 -10
- ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
- ai_pipeline_core/deployment/helpers.py +16 -17
- ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
- ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
- ai_pipeline_core/docs_generator/__init__.py +54 -0
- ai_pipeline_core/docs_generator/__main__.py +5 -0
- ai_pipeline_core/docs_generator/cli.py +196 -0
- ai_pipeline_core/docs_generator/extractor.py +324 -0
- ai_pipeline_core/docs_generator/guide_builder.py +644 -0
- ai_pipeline_core/docs_generator/trimmer.py +35 -0
- ai_pipeline_core/docs_generator/validator.py +114 -0
- ai_pipeline_core/document_store/__init__.py +13 -0
- ai_pipeline_core/document_store/_summary.py +9 -0
- ai_pipeline_core/document_store/_summary_worker.py +170 -0
- ai_pipeline_core/document_store/clickhouse.py +492 -0
- ai_pipeline_core/document_store/factory.py +38 -0
- ai_pipeline_core/document_store/local.py +312 -0
- ai_pipeline_core/document_store/memory.py +85 -0
- ai_pipeline_core/document_store/protocol.py +68 -0
- ai_pipeline_core/documents/__init__.py +12 -14
- ai_pipeline_core/documents/_context_vars.py +85 -0
- ai_pipeline_core/documents/_hashing.py +52 -0
- ai_pipeline_core/documents/attachment.py +85 -0
- ai_pipeline_core/documents/context.py +128 -0
- ai_pipeline_core/documents/document.py +318 -1434
- ai_pipeline_core/documents/mime_type.py +11 -84
- ai_pipeline_core/documents/utils.py +4 -12
- ai_pipeline_core/exceptions.py +10 -62
- ai_pipeline_core/images/__init__.py +32 -85
- ai_pipeline_core/images/_processing.py +5 -11
- ai_pipeline_core/llm/__init__.py +6 -4
- ai_pipeline_core/llm/ai_messages.py +102 -90
- ai_pipeline_core/llm/client.py +229 -183
- ai_pipeline_core/llm/model_options.py +12 -84
- ai_pipeline_core/llm/model_response.py +53 -99
- ai_pipeline_core/llm/model_types.py +8 -23
- ai_pipeline_core/logging/__init__.py +2 -7
- ai_pipeline_core/logging/logging.yml +1 -1
- ai_pipeline_core/logging/logging_config.py +27 -37
- ai_pipeline_core/logging/logging_mixin.py +15 -41
- ai_pipeline_core/observability/__init__.py +32 -0
- ai_pipeline_core/observability/_debug/__init__.py +30 -0
- ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
- ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
- ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +133 -75
- ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
- ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
- ai_pipeline_core/observability/_debug/_types.py +75 -0
- ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
- ai_pipeline_core/observability/_document_tracking.py +146 -0
- ai_pipeline_core/observability/_initialization.py +194 -0
- ai_pipeline_core/observability/_logging_bridge.py +57 -0
- ai_pipeline_core/observability/_summary.py +81 -0
- ai_pipeline_core/observability/_tracking/__init__.py +6 -0
- ai_pipeline_core/observability/_tracking/_client.py +178 -0
- ai_pipeline_core/observability/_tracking/_internal.py +28 -0
- ai_pipeline_core/observability/_tracking/_models.py +138 -0
- ai_pipeline_core/observability/_tracking/_processor.py +158 -0
- ai_pipeline_core/observability/_tracking/_service.py +311 -0
- ai_pipeline_core/observability/_tracking/_writer.py +229 -0
- ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
- ai_pipeline_core/pipeline/__init__.py +10 -0
- ai_pipeline_core/pipeline/decorators.py +915 -0
- ai_pipeline_core/pipeline/options.py +16 -0
- ai_pipeline_core/prompt_manager.py +16 -102
- ai_pipeline_core/settings.py +26 -31
- ai_pipeline_core/testing.py +9 -0
- ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
- ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
- ai_pipeline_core/debug/__init__.py +0 -26
- ai_pipeline_core/documents/document_list.py +0 -420
- ai_pipeline_core/documents/flow_document.py +0 -112
- ai_pipeline_core/documents/task_document.py +0 -117
- ai_pipeline_core/documents/temporary_document.py +0 -74
- ai_pipeline_core/flow/__init__.py +0 -9
- ai_pipeline_core/flow/config.py +0 -494
- ai_pipeline_core/flow/options.py +0 -75
- ai_pipeline_core/pipeline.py +0 -718
- ai_pipeline_core/prefect.py +0 -63
- ai_pipeline_core/prompt_builder/__init__.py +0 -5
- ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
- ai_pipeline_core/prompt_builder/global_cache.py +0 -78
- ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
- ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
- ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
- ai_pipeline_core/storage/__init__.py +0 -8
- ai_pipeline_core/storage/storage.py +0 -628
- ai_pipeline_core/utils/__init__.py +0 -8
- ai_pipeline_core-0.3.4.dist-info/METADATA +0 -569
- ai_pipeline_core-0.3.4.dist-info/RECORD +0 -57
- {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
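The remainder of the diff shows `ai_pipeline_core/deployment/base.py`, where GCS bucket provisioning and filesystem storage give way to a pluggable DocumentStore. As a reading aid, here is a minimal sketch of the 0.4.0 deployment surface reconstructed from the hunks below; the subclass, option and result fields, and flow names are invented, and the exact import paths are assumptions:

```python
# Hypothetical usage sketch; only the base-class shapes come from this diff.
from ai_pipeline_core.deployment.base import DeploymentResult, PipelineDeployment  # assumed path
from ai_pipeline_core.pipeline.options import FlowOptions


class MyOptions(FlowOptions):
    temperature: float = 0.2  # invented option


class MyResult(DeploymentResult):
    report: str = ""  # invented field; success/error come from DeploymentResult


class MyDeployment(PipelineDeployment[MyOptions, MyResult]):
    # @pipeline_flow-decorated functions carrying input_document_types,
    # output_document_types and estimated_minutes (flow names invented)
    flows = [ingest_flow, report_flow]

    @staticmethod
    def build_result(project_name, documents, options) -> MyResult:
        # Receives every document loaded back from the DocumentStore
        return MyResult(success=True, report=f"{project_name}: {len(documents)} documents")


if __name__ == "__main__":
    MyDeployment().run_cli()  # CLI entry; see run_cli and _CliOptions below
```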
--- ai_pipeline_core/deployment/base.py (0.3.4)
+++ ai_pipeline_core/deployment/base.py (0.4.0)
@@ -1,40 +1,46 @@
 """Core classes for pipeline deployments.
 
-@public
-
 Provides the PipelineDeployment base class and related types for
 creating unified, type-safe pipeline deployments with:
-- Per-flow
+- Per-flow resume (skip if outputs exist in DocumentStore)
 - Per-flow uploads (immediate, not just at end)
 - Prefect state hooks (on_running, on_completion, etc.)
-- Smart storage provisioning (override provision_storage)
 - Upload on failure (partial results saved)
 """
 
 import asyncio
+import contextlib
+import hashlib
 import os
-import re
 import sys
 from abc import abstractmethod
+from collections.abc import Callable
 from contextlib import ExitStack
 from dataclasses import dataclass
-from datetime import
-from hashlib import sha256
+from datetime import UTC, datetime
 from pathlib import Path
-from typing import Any,
-from uuid import UUID
+from typing import Any, ClassVar, Generic, Protocol, TypeVar, cast, final
+from uuid import UUID, uuid4
 
 import httpx
 from lmnr import Laminar
-from
+from opentelemetry import trace as otel_trace
+from prefect import flow, get_client, runtime
 from pydantic import BaseModel, ConfigDict, Field
 from pydantic_settings import CliPositionalArg, SettingsConfigDict
 
-from ai_pipeline_core.
-from ai_pipeline_core.
+from ai_pipeline_core.document_store import SummaryGenerator, create_document_store, get_document_store, set_document_store
+from ai_pipeline_core.document_store.local import LocalDocumentStore
+from ai_pipeline_core.document_store.memory import MemoryDocumentStore
+from ai_pipeline_core.documents import Document
+from ai_pipeline_core.documents.context import RunContext, reset_run_context, set_run_context
 from ai_pipeline_core.logging import get_pipeline_logger, setup_logging
-from ai_pipeline_core.
+from ai_pipeline_core.observability._debug import LocalDebugSpanProcessor, LocalTraceWriter, TraceDebugConfig
+from ai_pipeline_core.observability._initialization import get_tracking_service, initialize_observability
+from ai_pipeline_core.observability._tracking._models import RunStatus
+from ai_pipeline_core.pipeline.options import FlowOptions
 from ai_pipeline_core.settings import settings
+from ai_pipeline_core.testing import disable_run_logger, prefect_test_harness
 
 from .contract import CompletedRun, DeploymentResultData, FailedRun, ProgressRun
 from .helpers import (
@@ -49,8 +55,44 @@ from .helpers import (
 logger = get_pipeline_logger(__name__)
 
 
+def _build_summary_generator() -> SummaryGenerator | None:
+    """Build a summary generator callable from settings, or None if disabled/unavailable."""
+    if not settings.doc_summary_enabled:
+        return None
+
+    from ai_pipeline_core.observability._summary import generate_document_summary
+
+    model = settings.doc_summary_model
+
+    async def _generator(name: str, excerpt: str) -> str:
+        return await generate_document_summary(name, excerpt, model=model)
+
+    return _generator
+
+
+# Fields added by run_cli()'s _CliOptions that should not affect the run scope fingerprint
+_CLI_FIELDS: set[str] = {"working_directory", "project_name", "start", "end", "no_trace"}
+
+
+def _compute_run_scope(project_name: str, documents: list[Document], options: FlowOptions) -> str:
+    """Compute a run scope that fingerprints inputs and options.
+
+    Different inputs or options produce a different scope, preventing
+    stale cache hits when re-running with the same project name.
+    Falls back to just project_name when no documents are provided
+    (e.g. --start N resume without initializer).
+    """
+    if not documents:
+        return project_name
+    sha256s = sorted(doc.sha256 for doc in documents)
+    exclude = _CLI_FIELDS & set(type(options).model_fields)
+    options_json = options.model_dump_json(exclude=exclude, exclude_none=True)
+    fingerprint = hashlib.sha256(f"{':'.join(sha256s)}|{options_json}".encode()).hexdigest()[:16]
+    return f"{project_name}:{fingerprint}"
+
+
 class DeploymentContext(BaseModel):
-    """
+    """Infrastructure configuration for deployments.
 
     Webhooks are optional - provide URLs to enable:
     - progress_webhook_url: Per-flow progress (started/completed/cached)
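The new `_compute_run_scope` is what keys resume and caching: the same inputs and options yield the same scope, anything else gets a fresh one. A standalone sketch of the same fingerprint arithmetic, with made-up document hashes and options JSON:

```python
import hashlib

# Made-up inputs: sorted per-document sha256 hex digests plus the options dump
sha256s = sorted(["aa11bb...", "cc22dd..."])
options_json = '{"temperature":0.2}'
fingerprint = hashlib.sha256(f"{':'.join(sha256s)}|{options_json}".encode()).hexdigest()[:16]
print(f"my-project:{fingerprint}")  # stable for identical inputs, new scope otherwise
```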
@@ -58,8 +100,8 @@ class DeploymentContext(BaseModel):
     - completion_webhook_url: Final result when deployment ends
     """
 
-    input_documents_urls:
-    output_documents_urls: dict[str, str] = Field(default_factory=dict)
+    input_documents_urls: tuple[str, ...] = Field(default_factory=tuple)
+    output_documents_urls: dict[str, str] = Field(default_factory=dict)  # nosemgrep: mutable-field-on-frozen-pydantic-model
 
     progress_webhook_url: str = ""
     status_webhook_url: str = ""
@@ -69,7 +111,7 @@ class DeploymentContext(BaseModel):
 
 
 class DeploymentResult(BaseModel):
-    """
+    """Base class for deployment results."""
 
     success: bool
     error: str | None = None
@@ -84,19 +126,28 @@ TResult = TypeVar("TResult", bound=DeploymentResult)
 class FlowCallable(Protocol):
     """Protocol for @pipeline_flow decorated functions."""
 
-    config: Any
     name: str
     __name__: str
+    input_document_types: list[type[Document]]
+    output_document_types: list[type[Document]]
+    estimated_minutes: int
 
-    def __call__(
-
-
+    def __call__(self, project_name: str, documents: list[Document], flow_options: FlowOptions) -> Any:  # type: ignore[type-arg]
+        """Execute the flow with standard pipeline signature."""
+        ...
 
     def with_options(self, **kwargs: Any) -> "FlowCallable":
-        """Return a copy with overridden Prefect flow options."""
+        """Return a copy with overridden Prefect flow options (e.g., hooks)."""
         ...
 
 
+def _reattach_flow_metadata(original: FlowCallable, target: Any) -> None:
+    """Reattach custom flow attributes that Prefect's with_options() may strip."""
+    for attr in ("input_document_types", "output_document_types", "estimated_minutes"):
+        if hasattr(original, attr) and not hasattr(target, attr):
+            setattr(target, attr, getattr(original, attr))
+
+
 @dataclass(slots=True)
 class _StatusWebhookHook:
     """Prefect hook that sends status webhooks on state transitions."""
@@ -118,7 +169,7 @@ class _StatusWebhookHook:
             "flow_name": self.flow_name,
             "state": state.type.value if hasattr(state.type, "value") else str(state.type),
             "state_name": state.name or "",
-            "timestamp": datetime.now(
+            "timestamp": datetime.now(UTC).isoformat(),
         }
         try:
             async with httpx.AsyncClient(timeout=10) as client:
@@ -127,11 +178,44 @@ class _StatusWebhookHook:
                 logger.warning(f"Status webhook failed: {e}")
 
 
+def _validate_flow_chain(deployment_name: str, flows: list[Any]) -> None:
+    """Validate that each flow's input types are satisfiable by preceding flows' outputs.
+
+    Simulates a type pool: starts with the first flow's input types, adds each flow's
+    output types after processing. For subsequent flows, each required input type must
+    be satisfiable by at least one type in the pool (via issubclass).
+    """
+    type_pool: set[type[Document]] = set()
+
+    for i, flow_fn in enumerate(flows):
+        input_types: list[type[Document]] = getattr(flow_fn, "input_document_types", [])
+        output_types: list[type[Document]] = getattr(flow_fn, "output_document_types", [])
+        flow_name = getattr(flow_fn, "name", getattr(flow_fn, "__name__", f"flow[{i}]"))
+
+        if i == 0:
+            # First flow: its input types seed the pool
+            type_pool.update(input_types)
+        elif input_types:
+            # Subsequent flows: at least one declared input type must be satisfiable
+            # from the pool (union semantics — flow accepts any of the declared types)
+            any_satisfied = any(any(issubclass(available, t) for available in type_pool) for t in input_types)
+            if not any_satisfied:
+                input_names = sorted(t.__name__ for t in input_types)
+                pool_names = sorted(t.__name__ for t in type_pool) if type_pool else ["(empty)"]
+                raise TypeError(
+                    f"{deployment_name}: flow '{flow_name}' (step {i + 1}) requires input types "
+                    f"{input_names} but none are produced by preceding flows. "
+                    f"Available types: {pool_names}"
+                )
+
+        type_pool.update(output_types)
+
+
 class PipelineDeployment(Generic[TOptions, TResult]):
-    """
+    """Base class for pipeline deployments.
 
-    Features enabled by default
-    - Per-flow
+    Features enabled by default:
+    - Per-flow resume: Skip flows if outputs exist in DocumentStore
     - Per-flow uploads: Upload documents after each flow
     - Prefect hooks: Attach state hooks if status_webhook_url provided
     - Upload on failure: Save partial results if pipeline fails
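The chain check uses union semantics: a later flow passes if at least one of its declared input types can be produced upstream. A toy, self-contained rendition of the same pool simulation (all names invented):

```python
# Minimal stand-ins for Document subclasses and @pipeline_flow functions
class Document: ...
class RawDoc(Document): ...
class CleanDoc(Document): ...
class ReportDoc(Document): ...


class FlowStub:
    def __init__(self, name, inputs, outputs):
        self.name = name
        self.input_document_types = inputs
        self.output_document_types = outputs


flows = [
    FlowStub("ingest", [RawDoc], [CleanDoc]),
    FlowStub("report", [CleanDoc], [ReportDoc]),  # CleanDoc is already in the pool
]

pool: set[type] = set(flows[0].input_document_types)  # first flow seeds the pool
for i, f in enumerate(flows):
    if i and f.input_document_types:
        # union semantics: one satisfiable input type is enough
        assert any(issubclass(a, t) for t in f.input_document_types for a in pool), f.name
    pool.update(f.output_document_types)
print("chain OK")  # swapping the two flows would trip the assert
```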
@@ -153,12 +237,9 @@ class PipelineDeployment(Generic[TOptions, TResult]):
 
         cls.name = class_name_to_deployment_name(cls.__name__)
 
-        options_type, result_type = extract_generic_params(cls)
+        options_type, result_type = extract_generic_params(cls, PipelineDeployment)
         if options_type is None or result_type is None:
-            raise TypeError(
-                f"{cls.__name__} must specify Generic parameters: "
-                f"class {cls.__name__}(PipelineDeployment[MyOptions, MyResult])"
-            )
+            raise TypeError(f"{cls.__name__} must specify Generic parameters: class {cls.__name__}(PipelineDeployment[MyOptions, MyResult])")
 
         cls.options_type = options_type
         cls.result_type = result_type
@@ -166,70 +247,38 @@ class PipelineDeployment(Generic[TOptions, TResult]):
         if not cls.flows:
             raise TypeError(f"{cls.__name__}.flows cannot be empty")
 
-
-
-
-
-        ...
-
-    async def provision_storage(
-        self,
-        project_name: str,
-        documents: DocumentList,
-        options: TOptions,
-        context: DeploymentContext,
-    ) -> str:
-        """Provision GCS storage bucket based on project name and content hash.
-
-        Default: Creates `{project}-{date}-{hash}` bucket on GCS.
-        Returns empty string if GCS is unavailable or creation fails.
-        Override for custom storage provisioning logic.
-        """
-        if not documents:
-            return ""
+        # build_result must be implemented (not still abstract from PipelineDeployment)
+        build_result_fn = getattr(cls, "build_result", None)
+        if build_result_fn is None or getattr(build_result_fn, "__isabstractmethod__", False):
+            raise TypeError(f"{cls.__name__} must implement 'build_result' static method")
 
-
-
-
-
-
-
-
-
-        yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%y-%m-%d")
+        # No duplicate flows (by identity)
+        seen_ids: set[int] = set()
+        for flow_fn in cls.flows:
+            fid = id(flow_fn)
+            if fid in seen_ids:
+                flow_name = getattr(flow_fn, "name", getattr(flow_fn, "__name__", str(flow_fn)))
+                raise TypeError(f"{cls.__name__}.flows contains duplicate flow '{flow_name}'")
+            seen_ids.add(fid)
 
-
-
+        # Flow type chain validation: simulate a type pool
+        _validate_flow_chain(cls.__name__, cls.flows)
 
-
-
-
-
-
-                    logger.info(f"Using existing bucket: {bucket_name}")
-                    return f"gs://{bucket_name}"
-            except Exception:
-                continue
+    @staticmethod
+    @abstractmethod
+    def build_result(project_name: str, documents: list[Document], options: TOptions) -> TResult:
+        """Extract typed result from pipeline documents loaded from DocumentStore."""
+        ...
 
-
-
-
-
-
-
-
-
-
-    async def _load_cached_output(
-        self, flow_fn: FlowCallable, storage_uri: str
-    ) -> DocumentList | None:
-        """Load cached outputs if they exist. Override for custom cache logic."""
-        try:
-            output_type = flow_fn.config.OUTPUT_DOCUMENT_TYPE
-            docs = await flow_fn.config.load_documents_by_type(storage_uri, [output_type])
-            return docs if docs else None
-        except Exception:
-            return None
+    def _all_document_types(self) -> list[type[Document]]:
+        """Collect all document types from all flows (inputs + outputs), deduplicated."""
+        types: dict[str, type[Document]] = {}
+        for flow_fn in self.flows:
+            for t in getattr(flow_fn, "input_document_types", []):
+                types[t.__name__] = t
+            for t in getattr(flow_fn, "output_document_types", []):
+                types[t.__name__] = t
+        return list(types.values())
 
     def _build_status_hooks(
         self,
@@ -262,7 +311,6 @@
         context: DeploymentContext,
         flow_run_id: str,
         project_name: str,
-        storage_uri: str,
         step: int,
         total_steps: int,
         flow_name: str,
@@ -271,15 +319,19 @@
         message: str = "",
     ) -> None:
         """Send progress webhook and update flow run labels."""
-
+        # Use estimated_minutes for weighted progress calculation
+        flow_minutes = [getattr(f, "estimated_minutes", 1) for f in self.flows]
+        total_minutes = sum(flow_minutes) or 1
+        completed_minutes = sum(flow_minutes[: max(step - 1, 0)])
+        current_flow_minutes = flow_minutes[step - 1] if step - 1 < len(flow_minutes) else 1
+        progress = round(max(0.0, min(1.0, (completed_minutes + current_flow_minutes * step_progress) / total_minutes)), 4)
 
         if context.progress_webhook_url:
             payload = ProgressRun(
                 flow_run_id=UUID(flow_run_id) if flow_run_id else UUID(int=0),
                 project_name=project_name,
                 state="RUNNING",
-                timestamp=datetime.now(
-                storage_uri=storage_uri,
+                timestamp=datetime.now(UTC),
                 step=step,
                 total_steps=total_steps,
                 flow_name=flow_name,
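A worked instance of the weighted-progress arithmetic above, with invented minute estimates:

```python
flow_minutes = [10, 30, 20]   # estimated_minutes per flow (invented)
step, step_progress = 2, 0.5  # halfway through the second flow
total_minutes = sum(flow_minutes) or 1
completed_minutes = sum(flow_minutes[: step - 1])
progress = (completed_minutes + flow_minutes[step - 1] * step_progress) / total_minutes
print(round(progress, 4))     # 0.4167 rather than 0.5: longer flows weigh more
```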
@@ -316,7 +368,6 @@
         context: DeploymentContext,
         flow_run_id: str,
         project_name: str,
-        storage_uri: str,
         result: TResult | None,
         error: str | None,
     ) -> None:
@@ -324,7 +375,7 @@
         if not context.completion_webhook_url:
             return
         try:
-            now = datetime.now(
+            now = datetime.now(UTC)
            frid = UUID(flow_run_id) if flow_run_id else UUID(int=0)
             payload: CompletedRun | FailedRun
             if result is not None:
@@ -332,7 +383,6 @@
                     flow_run_id=frid,
                     project_name=project_name,
                     timestamp=now,
-                    storage_uri=storage_uri,
                     state="COMPLETED",
                     result=DeploymentResultData.model_validate(result.model_dump()),
                 )
@@ -341,7 +391,6 @@
                     flow_run_id=frid,
                     project_name=project_name,
                     timestamp=now,
-                    storage_uri=storage_uri,
                     state="FAILED",
                     error=error or "Unknown error",
                 )
@@ -353,27 +402,24 @@
     async def run(
         self,
         project_name: str,
-        documents:
+        documents: list[Document],
         options: TOptions,
         context: DeploymentContext,
     ) -> TResult:
-        """Execute flows with
-        from prefect import runtime  # noqa: PLC0415
+        """Execute all flows with resume, per-flow uploads, and webhooks.
 
-
-
+        Args:
+            project_name: Unique identifier for this pipeline run (used as run_scope).
+            documents: Initial input documents for the first flow.
+            options: Flow options passed to each flow.
+            context: Deployment context with webhook URLs and document upload config.
 
-
-
-
-
-
-
-        storage_uri = await self.provision_storage(project_name, docs, options, context)
-        if storage_uri and docs:
-            await self.flows[0].config.save_documents(
-                storage_uri, docs, validate_output_type=False
-            )
+        Returns:
+            Typed deployment result built from all pipeline documents.
+        """
+        store = get_document_store()
+        total_steps = len(self.flows)
+        flow_run_id: str = str(runtime.flow_run.get_id()) if runtime.flow_run else ""  # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType, reportUnknownArgumentType]
 
         # Write identity labels for polling endpoint
         if flow_run_id:
@@ -381,62 +427,80 @@
                 async with get_client() as client:
                     await client.update_flow_run_labels(
                         flow_run_id=UUID(flow_run_id),
-                        labels={
-                            "pipeline.project_name": project_name,
-                            "pipeline.storage_uri": storage_uri,
-                        },
+                        labels={"pipeline.project_name": project_name},
                     )
             except Exception as e:
                 logger.warning(f"Identity label update failed: {e}")
 
         # Download additional input documents
+        input_docs = list(documents)
         if context.input_documents_urls:
-
-            downloaded
-
+            downloaded = await download_documents(list(context.input_documents_urls))
+            input_docs.extend(downloaded)
+
+        # Compute run scope AFTER downloads so the fingerprint includes all inputs
+        run_scope = _compute_run_scope(project_name, input_docs, options)
+
+        if not store and total_steps > 1:
+            logger.warning("No DocumentStore configured for multi-step pipeline — intermediate outputs will not accumulate between flows")
 
-        accumulated_docs = docs
         completion_sent = False
 
+        # Tracking lifecycle
+        tracking_svc = None
+        run_uuid: UUID | None = None
+        run_failed = False
         try:
+            tracking_svc = get_tracking_service()
+            if tracking_svc:
+                run_uuid = UUID(flow_run_id) if flow_run_id else uuid4()
+                tracking_svc.set_run_context(run_id=run_uuid, project_name=project_name, flow_name=self.name, run_scope=run_scope)
+                tracking_svc.track_run_start(run_id=run_uuid, project_name=project_name, flow_name=self.name, run_scope=run_scope)
+        except Exception:
+            tracking_svc = None
+
+        # Set RunContext for the entire pipeline run
+        run_token = set_run_context(RunContext(run_scope=run_scope))
+        try:
+            # Save initial input documents to store
+            if store and input_docs:
+                await store.save_batch(input_docs, run_scope)
+
             for step, flow_fn in enumerate(self.flows, start=1):
                 flow_name = getattr(flow_fn, "name", flow_fn.__name__)
-                flow_run_id = str(runtime.flow_run.get_id()) if runtime.flow_run else ""  # pyright: ignore[reportAttributeAccessIssue]
-
-                #
-
-
-
-
-
+                flow_run_id = str(runtime.flow_run.get_id()) if runtime.flow_run else ""  # pyright: ignore[reportAttributeAccessIssue, reportUnknownMemberType, reportUnknownArgumentType]
+
+                # Resume check: skip if output documents already exist in store
+                output_types = getattr(flow_fn, "output_document_types", [])
+                if store and output_types:
+                    all_outputs_exist = all([await store.has_documents(run_scope, ot) for ot in output_types])
+                    if all_outputs_exist:
+                        logger.info(f"[{step}/{total_steps}] Resume: skipping {flow_name} (outputs exist)")
                         await self._send_progress(
                             context,
                             flow_run_id,
                             project_name,
-                            storage_uri,
                             step,
                             total_steps,
                             flow_name,
                             "cached",
                             step_progress=1.0,
-                            message=f"
+                            message=f"Resumed from store: {flow_name}",
                         )
                         continue
 
                 # Prefect state hooks
                 active_flow = flow_fn
                 if context.status_webhook_url:
-                    hooks = self._build_status_hooks(
-                        context, flow_run_id, project_name, step, total_steps, flow_name
-                    )
+                    hooks = self._build_status_hooks(context, flow_run_id, project_name, step, total_steps, flow_name)
                     active_flow = flow_fn.with_options(**hooks)
+                    _reattach_flow_metadata(flow_fn, active_flow)
 
                 # Progress: started
                 await self._send_progress(
                     context,
                     flow_run_id,
                     project_name,
-                    storage_uri,
                     step,
                     total_steps,
                     flow_name,
@@ -447,40 +511,34 @@
 
                 logger.info(f"[{step}/{total_steps}] Starting: {flow_name}")
 
-                # Load documents
-
-
+                # Load input documents from store
+                input_types = getattr(flow_fn, "input_document_types", [])
+                if store and input_types:
+                    current_docs = await store.load(run_scope, input_types)
                 else:
-                    current_docs =
+                    current_docs = input_docs
 
                 try:
-
+                    await active_flow(project_name, current_docs, options)
                 except Exception as e:
                     # Upload partial results on failure
-                    if context.output_documents_urls:
-                        await
-
-
-                        )
+                    if context.output_documents_urls and store:
+                        all_docs = await store.load(run_scope, self._all_document_types())
+                        await upload_documents(all_docs, context.output_documents_urls)
+                    await self._send_completion(context, flow_run_id, project_name, result=None, error=str(e))
                     completion_sent = True
                     raise
 
-                #
-                if
-                    await
-
-                accumulated_docs = DocumentList(list(accumulated_docs) + list(new_docs))
-
-                # Per-flow upload
-                if context.output_documents_urls:
-                    await upload_documents(new_docs, context.output_documents_urls)
+                # Per-flow upload (load from store since @pipeline_flow saves there)
+                if context.output_documents_urls and store and output_types:
+                    flow_docs = await store.load(run_scope, output_types)
+                    await upload_documents(flow_docs, context.output_documents_urls)
 
                 # Progress: completed
                 await self._send_progress(
                     context,
                     flow_run_id,
                     project_name,
-                    storage_uri,
                     step,
                     total_steps,
                     flow_name,
@@ -491,43 +549,68 @@
 
                 logger.info(f"[{step}/{total_steps}] Completed: {flow_name}")
 
-            result
-
-
-
+            # Build result from all documents in store
+            if store:
+                all_docs = await store.load(run_scope, self._all_document_types())
+            else:
+                all_docs = input_docs
+            result = self.build_result(project_name, all_docs, options)
+            await self._send_completion(context, flow_run_id, project_name, result=result, error=None)
             return result
 
         except Exception as e:
+            run_failed = True
             if not completion_sent:
-                await self._send_completion(
-                    context, flow_run_id, project_name, storage_uri, result=None, error=str(e)
-                )
+                await self._send_completion(context, flow_run_id, project_name, result=None, error=str(e))
             raise
+        finally:
+            reset_run_context(run_token)
+            store = get_document_store()
+            if store:
+                with contextlib.suppress(Exception):
+                    store.flush()
+            if (svc := tracking_svc) is not None and run_uuid is not None:
+                with contextlib.suppress(Exception):
+                    svc.track_run_end(run_id=run_uuid, status=RunStatus.FAILED if run_failed else RunStatus.COMPLETED)
+                    svc.flush()
 
     @final
     def run_local(
         self,
         project_name: str,
-        documents:
+        documents: list[Document],
         options: TOptions,
         context: DeploymentContext | None = None,
         output_dir: Path | None = None,
     ) -> TResult:
-        """Run locally with Prefect test harness.
+        """Run locally with Prefect test harness and in-memory document store.
+
+        Args:
+            project_name: Pipeline run identifier.
+            documents: Initial input documents.
+            options: Flow options.
+            context: Optional deployment context (defaults to empty).
+            output_dir: Optional directory for writing result.json.
+
+        Returns:
+            Typed deployment result.
+        """
         if context is None:
             context = DeploymentContext()
 
-
-        if output_dir and isinstance(documents, DocumentList):
+        if output_dir:
             output_dir.mkdir(parents=True, exist_ok=True)
-            documents = str(output_dir)
 
-
-
+        store = MemoryDocumentStore()
+        set_document_store(store)
+        try:
+            with prefect_test_harness(), disable_run_logger():
                 result = asyncio.run(self.run(project_name, documents, options, context))
+        finally:
+            store.shutdown()
+            set_document_store(None)
 
         if output_dir:
-            output_dir.mkdir(parents=True, exist_ok=True)
             (output_dir / "result.json").write_text(result.model_dump_json(indent=2))
 
         return result
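`run_local` now owns the whole store lifecycle: it installs a `MemoryDocumentStore`, runs under the Prefect test harness, and tears the store down afterwards. Reusing the hypothetical `MyDeployment` sketched after the file list:

```python
from pathlib import Path

result = MyDeployment().run_local(
    project_name="demo",
    documents=[],            # initial inputs; later flows load from the in-memory store
    options=MyOptions(),
    output_dir=Path("out"),  # result.json lands here
)
print(result.success)
```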
@@ -535,19 +618,26 @@
     @final
     def run_cli(
         self,
-        initializer: Callable[[TOptions], tuple[str,
+        initializer: Callable[[TOptions], tuple[str, list[Document]]] | None = None,
         trace_name: str | None = None,
     ) -> None:
-        """Execute pipeline from CLI arguments with --start/--end step control.
+        """Execute pipeline from CLI arguments with --start/--end step control.
+
+        Args:
+            initializer: Optional callback returning (project_name, documents) from options.
+            trace_name: Optional Laminar trace span name prefix.
+        """
         if len(sys.argv) == 1:
             sys.argv.append("--help")
 
         setup_logging()
         try:
-
-            logger.info("
+            initialize_observability()
+            logger.info("Observability initialized.")
         except Exception as e:
-            logger.warning(f"Failed to initialize
+            logger.warning(f"Failed to initialize observability: {e}")
+        with contextlib.suppress(Exception):
+            Laminar.initialize(export_timeout_seconds=15)
 
         deployment = self
@@ -563,27 +653,50 @@
             project_name: str | None = None
             start: int = 1
             end: int | None = None
+            no_trace: bool = False
 
             model_config = SettingsConfigDict(frozen=True, extra="ignore")
 
         opts = cast(TOptions, _CliOptions())  # type: ignore[reportCallIssue]
 
-        wd
+        wd = cast(Path, opts.working_directory)  # pyright: ignore[reportAttributeAccessIssue]
         wd.mkdir(parents=True, exist_ok=True)
 
-        project_name =
+        project_name = cast(str, opts.project_name or wd.name)  # pyright: ignore[reportAttributeAccessIssue]
         start_step = getattr(opts, "start", 1)
         end_step = getattr(opts, "end", None)
+        no_trace = getattr(opts, "no_trace", False)
 
-        #
-
-
-
-
-
-
-
+        # Set up local debug tracing (writes to <working_dir>/.trace)
+        debug_processor: LocalDebugSpanProcessor | None = None
+        if not no_trace:
+            try:
+                trace_path = wd / ".trace"
+                trace_path.mkdir(parents=True, exist_ok=True)
+                debug_config = TraceDebugConfig(path=trace_path, max_traces=20)
+                debug_writer = LocalTraceWriter(debug_config)
+                debug_processor = LocalDebugSpanProcessor(debug_writer)
+                provider: Any = otel_trace.get_tracer_provider()
+                if hasattr(provider, "add_span_processor"):
+                    provider.add_span_processor(debug_processor)
+                logger.info(f"Local debug tracing enabled at {trace_path}")
+            except Exception as e:
+                logger.warning(f"Failed to set up local debug tracing: {e}")
+                debug_processor = None
+
+        # Initialize document store — ClickHouse when configured, local filesystem otherwise
+        summary_generator = _build_summary_generator()
+        if settings.clickhouse_host:
+            store = create_document_store(settings, summary_generator=summary_generator)
+        else:
+            store = LocalDocumentStore(base_path=wd, summary_generator=summary_generator)
+        set_document_store(store)
+
+        # Initialize documents (always run initializer for run scope fingerprinting,
+        # even when start_step > 1, so --start N resumes find the correct scope)
+        initial_documents: list[Document] = []
+        if initializer:
+            _, initial_documents = initializer(opts)
 
         context = DeploymentContext()
 
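Putting the CLI options together, a local run would look roughly like the sketch below; the module name is a placeholder and the exact flag spelling is decided by pydantic-settings CLI parsing:

```python
# Shell (hypothetical module name):
#   python -m my_pipeline ./work-dir --project_name demo --start 2 --no_trace
# which is roughly equivalent to:
MyDeployment().run_cli(initializer=lambda opts: ("demo", []))  # initializer optional
```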
@@ -604,11 +717,11 @@
         result = asyncio.run(
             self._run_with_steps(
                 project_name=project_name,
-                storage_uri=str(wd),
                 options=opts,
                 context=context,
                 start_step=start_step,
                 end_step=end_step,
+                initial_documents=initial_documents,
             )
         )
 
@@ -616,48 +729,106 @@
         result_file.write_text(result.model_dump_json(indent=2))
         logger.info(f"Result saved to {result_file}")
 
+        # Shutdown background workers (debug tracing, document summaries, tracking)
+        if debug_processor is not None:
+            debug_processor.shutdown()
+        store = get_document_store()
+        if store:
+            store.shutdown()
+        tracking_svc = get_tracking_service()
+        if tracking_svc:
+            tracking_svc.shutdown()
+
     async def _run_with_steps(
         self,
         project_name: str,
-        storage_uri: str,
         options: TOptions,
         context: DeploymentContext,
         start_step: int = 1,
         end_step: int | None = None,
+        initial_documents: list[Document] | None = None,
     ) -> TResult:
-        """Run pipeline with start/end step control
+        """Run pipeline with start/end step control and DocumentStore-based resume."""
+        store = get_document_store()
         if end_step is None:
             end_step = len(self.flows)
 
         total_steps = len(self.flows)
-
+        run_scope = _compute_run_scope(project_name, initial_documents or [], options)
 
-
-
-
-
-
+        # Tracking lifecycle for CLI path
+        tracking_svc = None
+        run_uuid: UUID | None = None
+        run_failed = False
+        try:
+            tracking_svc = get_tracking_service()
+            if tracking_svc:
+                run_uuid = uuid4()
+                tracking_svc.set_run_context(run_id=run_uuid, project_name=project_name, flow_name=self.name, run_scope=run_scope)
+                tracking_svc.track_run_start(run_id=run_uuid, project_name=project_name, flow_name=self.name, run_scope=run_scope)
+        except Exception:
+            tracking_svc = None
 
-
-
-
-
-
-
+        # Set RunContext for the entire pipeline run
+        run_token = set_run_context(RunContext(run_scope=run_scope))
+        try:
+            # Save initial documents to store
+            if store and initial_documents:
+                await store.save_batch(initial_documents, run_scope)
 
-
-
-
-
+            for i in range(start_step - 1, end_step):
+                step = i + 1
+                flow_fn = self.flows[i]
+                flow_name = getattr(flow_fn, "name", flow_fn.__name__)
+                logger.info(f"--- [Step {step}/{total_steps}] {flow_name} ---")
+
+                # Resume check: skip if output documents already exist
+                output_types = getattr(flow_fn, "output_document_types", [])
+                if store and output_types:
+                    all_outputs_exist = all([await store.has_documents(run_scope, ot) for ot in output_types])
+                    if all_outputs_exist:
+                        logger.info(f"--- [Step {step}/{total_steps}] Skipping {flow_name} (outputs exist) ---")
+                        continue
+
+                # Load inputs from store
+                input_types = getattr(flow_fn, "input_document_types", [])
+                if store and input_types:
+                    current_docs = await store.load(run_scope, input_types)
+                else:
+                    current_docs = initial_documents or []
 
-
+                await flow_fn(project_name, current_docs, options)
+
+            # Build result from all documents in store
+            if store:
+                all_docs = await store.load(run_scope, self._all_document_types())
+            else:
+                all_docs = initial_documents or []
+            return self.build_result(project_name, all_docs, options)
+        except Exception:
+            run_failed = True
+            raise
+        finally:
+            reset_run_context(run_token)
+            store = get_document_store()
+            if store:
+                with contextlib.suppress(Exception):
+                    store.flush()
+            if (svc := tracking_svc) is not None and run_uuid is not None:
+                with contextlib.suppress(Exception):
+                    svc.track_run_end(run_id=run_uuid, status=RunStatus.FAILED if run_failed else RunStatus.COMPLETED)
+                    svc.flush()
 
     @final
     def as_prefect_flow(self) -> Callable[..., Any]:
-        """Generate Prefect flow for production deployment.
+        """Generate a Prefect flow for production deployment.
+
+        Returns:
+            Async Prefect flow callable that initializes DocumentStore from settings.
+        """
         deployment = self
 
-        @flow(
+        @flow(
             name=self.name,
             flow_run_name=f"{self.name}-{{project_name}}",
             persist_result=True,
@@ -665,11 +836,20 @@
         )
         async def _deployment_flow(
             project_name: str,
-            documents:
+            documents: list[Document],
             options: FlowOptions,
             context: DeploymentContext,
         ) -> DeploymentResult:
-
+            store = create_document_store(
+                settings,
+                summary_generator=_build_summary_generator(),
+            )
+            set_document_store(store)
+            try:
+                return await deployment.run(project_name, documents, cast(Any, options), context)
+            finally:
+                store.shutdown()
+                set_document_store(None)
 
         return _deployment_flow
 
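`as_prefect_flow` wraps the whole run in a Prefect flow that provisions the DocumentStore per invocation; serving it is ordinary Prefect from there (names below are placeholders):

```python
prefect_flow = MyDeployment().as_prefect_flow()
prefect_flow.serve(name="my-deployment")  # or deploy against a work pool in production
```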