pulse-engine 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pulse_engine/__init__.py +0 -0
- pulse_engine/adapters/__init__.py +58 -0
- pulse_engine/adapters/audio_transcription.py +167 -0
- pulse_engine/adapters/batcher.py +36 -0
- pulse_engine/adapters/digital_news.py +128 -0
- pulse_engine/adapters/digital_news_metadata.py +536 -0
- pulse_engine/adapters/exceptions.py +10 -0
- pulse_engine/adapters/models.py +134 -0
- pulse_engine/adapters/opensearch_storage.py +160 -0
- pulse_engine/adapters/speech_content.py +130 -0
- pulse_engine/adapters/speech_metadata.py +374 -0
- pulse_engine/adapters/twitter.py +423 -0
- pulse_engine/adapters/youtube_downloader.py +186 -0
- pulse_engine/adapters/youtube_metadata.py +261 -0
- pulse_engine/api/__init__.py +0 -0
- pulse_engine/api/v1/__init__.py +0 -0
- pulse_engine/api/v1/auth.py +91 -0
- pulse_engine/api/v1/health.py +62 -0
- pulse_engine/api/v1/router.py +16 -0
- pulse_engine/chain_recovery.py +131 -0
- pulse_engine/cli/__init__.py +0 -0
- pulse_engine/cli/main.py +169 -0
- pulse_engine/cli/templates/cookiecutter.json +4 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
- pulse_engine/client.py +95 -0
- pulse_engine/config.py +157 -0
- pulse_engine/core/__init__.py +0 -0
- pulse_engine/core/error_handlers.py +64 -0
- pulse_engine/core/exceptions.py +67 -0
- pulse_engine/core/job_token.py +109 -0
- pulse_engine/core/logging.py +45 -0
- pulse_engine/core/scope.py +23 -0
- pulse_engine/core/security.py +130 -0
- pulse_engine/database.py +30 -0
- pulse_engine/dependencies.py +166 -0
- pulse_engine/deployment/__init__.py +0 -0
- pulse_engine/deployment/backend_deployment_repository.py +83 -0
- pulse_engine/deployment/backends/__init__.py +0 -0
- pulse_engine/deployment/backends/base.py +50 -0
- pulse_engine/deployment/backends/exceptions.py +20 -0
- pulse_engine/deployment/backends/native_lambda.py +125 -0
- pulse_engine/deployment/backends/prefect_ecs.py +116 -0
- pulse_engine/deployment/backends/prefect_k8s.py +131 -0
- pulse_engine/deployment/backends/registry.py +50 -0
- pulse_engine/deployment/infra_provisioner.py +285 -0
- pulse_engine/deployment/job_launcher.py +178 -0
- pulse_engine/deployment/models.py +48 -0
- pulse_engine/deployment/repository.py +54 -0
- pulse_engine/deployment/router.py +22 -0
- pulse_engine/deployment/schemas.py +18 -0
- pulse_engine/deployment/service.py +65 -0
- pulse_engine/extractor/__init__.py +0 -0
- pulse_engine/extractor/adapters/__init__.py +0 -0
- pulse_engine/extractor/base.py +48 -0
- pulse_engine/extractor/models.py +50 -0
- pulse_engine/extractor/orchestrator/__init__.py +15 -0
- pulse_engine/extractor/orchestrator/base.py +34 -0
- pulse_engine/extractor/orchestrator/noop.py +37 -0
- pulse_engine/extractor/orchestrator/prefect.py +163 -0
- pulse_engine/extractor/repository.py +163 -0
- pulse_engine/extractor/router.py +102 -0
- pulse_engine/extractor/schemas.py +93 -0
- pulse_engine/extractor/service.py +431 -0
- pulse_engine/extractor/stage_models.py +36 -0
- pulse_engine/extractor/stage_repository.py +109 -0
- pulse_engine/main.py +195 -0
- pulse_engine/mcp/__init__.py +0 -0
- pulse_engine/mcp/__main__.py +5 -0
- pulse_engine/mcp/server.py +108 -0
- pulse_engine/mcp/tools_jobs.py +159 -0
- pulse_engine/mcp/tools_kb.py +88 -0
- pulse_engine/mcp/tools_modules.py +115 -0
- pulse_engine/mcp/tools_pipelines.py +215 -0
- pulse_engine/mcp/tools_processor.py +208 -0
- pulse_engine/middleware/__init__.py +0 -0
- pulse_engine/middleware/rate_limit.py +144 -0
- pulse_engine/middleware/request_id.py +16 -0
- pulse_engine/middleware/security_headers.py +25 -0
- pulse_engine/middleware/tenant.py +90 -0
- pulse_engine/pipeline/__init__.py +0 -0
- pulse_engine/pipeline/config_parser.py +148 -0
- pulse_engine/pipeline/expression.py +268 -0
- pulse_engine/pipeline/models.py +98 -0
- pulse_engine/pipeline/repositories.py +224 -0
- pulse_engine/pipeline/router_modules.py +66 -0
- pulse_engine/pipeline/router_pipelines.py +198 -0
- pulse_engine/pipeline/schemas.py +200 -0
- pulse_engine/pipeline/service.py +250 -0
- pulse_engine/pipeline/translators/__init__.py +44 -0
- pulse_engine/pipeline/translators/airflow_status.py +11 -0
- pulse_engine/pipeline/translators/airflow_translator.py +22 -0
- pulse_engine/pipeline/translators/base.py +42 -0
- pulse_engine/pipeline/translators/prefect_status.py +93 -0
- pulse_engine/pipeline/translators/prefect_translator.py +195 -0
- pulse_engine/processor/__init__.py +0 -0
- pulse_engine/processor/base.py +36 -0
- pulse_engine/processor/core/__init__.py +0 -0
- pulse_engine/processor/core/analysis.py +148 -0
- pulse_engine/processor/core/chunking.py +158 -0
- pulse_engine/processor/core/prompts.py +340 -0
- pulse_engine/processor/core/topic_splitter.py +105 -0
- pulse_engine/processor/defaults/__init__.py +11 -0
- pulse_engine/processor/defaults/core_processor.py +12 -0
- pulse_engine/processor/defaults/postprocessor.py +12 -0
- pulse_engine/processor/defaults/preprocessor.py +12 -0
- pulse_engine/processor/llm/__init__.py +0 -0
- pulse_engine/processor/llm/provider.py +58 -0
- pulse_engine/processor/ocr/gemini.py +52 -0
- pulse_engine/processor/pipeline.py +107 -0
- pulse_engine/processor/postprocessor/__init__.py +0 -0
- pulse_engine/processor/postprocessor/embeddings.py +34 -0
- pulse_engine/processor/postprocessor/tasks.py +180 -0
- pulse_engine/processor/preprocessor/__init__.py +0 -0
- pulse_engine/processor/preprocessor/tasks.py +71 -0
- pulse_engine/processor/router.py +192 -0
- pulse_engine/processor/schemas.py +167 -0
- pulse_engine/registry.py +117 -0
- pulse_engine/runners/__init__.py +0 -0
- pulse_engine/runners/lambda_runner.py +26 -0
- pulse_engine/runners/pipeline_runner.py +43 -0
- pulse_engine/runners/prefect_pipeline_flow.py +904 -0
- pulse_engine/runners/prefect_runner.py +33 -0
- pulse_engine/s3.py +72 -0
- pulse_engine/secrets.py +46 -0
- pulse_engine/services/__init__.py +0 -0
- pulse_engine/services/bootstrap.py +211 -0
- pulse_engine/services/opensearch.py +84 -0
- pulse_engine/storage/__init__.py +0 -0
- pulse_engine/storage/connectors/__init__.py +0 -0
- pulse_engine/storage/connectors/athena.py +226 -0
- pulse_engine/storage/connectors/base.py +32 -0
- pulse_engine/storage/connectors/opensearch.py +344 -0
- pulse_engine/storage/knowledge_base.py +68 -0
- pulse_engine/storage/router.py +78 -0
- pulse_engine/storage/schemas.py +93 -0
- pulse_engine/testing/__init__.py +13 -0
- pulse_engine/testing/fixtures.py +50 -0
- pulse_engine/testing/mocks.py +104 -0
- pulse_engine/worker.py +53 -0
- pulse_engine-0.2.0.dist-info/METADATA +654 -0
- pulse_engine-0.2.0.dist-info/RECORD +150 -0
- pulse_engine-0.2.0.dist-info/WHEEL +4 -0
- pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from pulse_engine.pipeline.models import PipelineRunModel
|
|
7
|
+
from pulse_engine.pipeline.repositories import (
|
|
8
|
+
ModuleRegistryRepository,
|
|
9
|
+
PipelineRunRepository,
|
|
10
|
+
PipelineStepRunRepository,
|
|
11
|
+
)
|
|
12
|
+
from pulse_engine.pipeline.schemas import PipelineConfig, StepStatus
|
|
13
|
+
from pulse_engine.pipeline.translators.base import (
|
|
14
|
+
BaseStatusProvider,
|
|
15
|
+
BaseTranslator,
|
|
16
|
+
PipelineStatus,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class PipelineServiceError(Exception):
    """Base class for errors raised by the pipeline service layer."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PipelineSubmissionError(PipelineServiceError):
    """Raised when handing a pipeline run over to the orchestrator fails."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class PipelineService:
|
|
29
|
+
def __init__(
    self,
    module_repo: ModuleRegistryRepository,
    run_repo: PipelineRunRepository,
    translators: dict[str, BaseTranslator],
    status_providers: dict[str, BaseStatusProvider],
    settings: Any,
    token_issuer: Any,
    step_repo: PipelineStepRunRepository | None = None,
) -> None:
    """Store the collaborators the service delegates to.

    Args:
        module_repo: Resolves module images for a tenant/product pair.
        run_repo: Persistence for pipeline run records.
        translators: Submission adapters keyed by orchestrator name.
        status_providers: Status/cancel adapters keyed by orchestrator name.
        settings: Object exposing ``pulse_engine_url``.
        token_issuer: Object exposing
            ``issue_token(pipeline_run_id=..., tenant_id=...)``.
        step_repo: Optional per-step persistence. When ``None``, step
            callbacks are ignored and status is served without DB steps.
    """
    # Persistence
    self._run_repo = run_repo
    self._step_repo = step_repo
    self._module_repo = module_repo
    # Orchestrator adapters
    self._status_providers = status_providers
    self._translators = translators
    # Runtime configuration and credentials
    self._token_issuer = token_issuer
    self._settings = settings
|
|
46
|
+
|
|
47
|
+
async def trigger(
    self,
    tenant_id: str,
    product: str,
    orchestrator: str,
    config: PipelineConfig,
    global_config: dict[str, Any],
) -> str:
    """Create a run record and submit the pipeline to an orchestrator.

    Args:
        tenant_id: Tenant that owns the run.
        product: Product whose registered module images are used.
        orchestrator: Key into the configured translators.
        config: Parsed pipeline definition; a JSON snapshot is stored on the run.
        global_config: Passed to the run record and to the translator as-is.

    Returns:
        The ID of the newly created pipeline run record.

    Raises:
        PipelineServiceError: The orchestrator is unknown, or a module used
            by ``config`` has no registered image for ``product``.
        PipelineSubmissionError: The translator raised while submitting; the
            run is marked ``"submission_failed"`` first.
    """
    if orchestrator not in self._translators:
        raise PipelineServiceError(f"Unsupported orchestrator: '{orchestrator}'")

    # Every module referenced by the config needs a registered image.
    images = await self._module_repo.get_images_map(tenant_id, product)
    unregistered = []
    for mod in config.modules:
        if mod.module not in images:
            unregistered.append(mod.name)
    if unregistered:
        raise PipelineServiceError(
            f"Modules not registered for product '{product}': {unregistered}"
        )

    # Persist the run before submitting so a failure can be recorded on it.
    snapshot = config.model_dump(mode="json", by_alias=True)
    run = await self._run_repo.create(
        tenant_id=tenant_id,
        product=product,
        orchestrator=orchestrator,
        config_snapshot=snapshot,
        global_config=global_config,
    )

    # Token scoped to this run; it is handed to the translator below.
    run_token = self._token_issuer.issue_token(
        pipeline_run_id=run.id,
        tenant_id=tenant_id,
    )

    adapter = self._translators[orchestrator]
    try:
        orchestrator_run_id = await adapter.submit(
            pipeline_run_id=run.id,
            parsed_config=config,
            module_images=images,
            global_config=global_config,
            tenant_id=tenant_id,
            pulse_engine_url=self._settings.pulse_engine_url,
            pulse_api_token=run_token,
        )
    except Exception as exc:
        await self._run_repo.update_status(run.id, "submission_failed")
        raise PipelineSubmissionError(
            f"Failed to submit pipeline to {orchestrator}: {exc}"
        ) from exc
    else:
        await self._run_repo.set_orchestrator_run_id(run.id, orchestrator_run_id)
        await self._run_repo.update_status(run.id, "running")

    return run.id
|
|
104
|
+
|
|
105
|
+
async def get_run(self, tenant_id: str, run_id: str) -> PipelineRunModel | None:
    """Fetch one pipeline run record for a tenant via the run repository."""
    record = await self._run_repo.get(run_id, tenant_id)
    return record
|
|
108
|
+
|
|
109
|
+
async def get_status(self, tenant_id: str, run_id: str) -> PipelineStatus | None:
    """Return the normalized status of a run, combining orchestrator and DB data.

    The orchestrator's view is preferred. Persisted step rows are used to
    enrich it, and as the only source when the run has no orchestrator run
    ID, no status provider is configured, or the provider call raises.

    Returns:
        ``None`` when the run does not exist for this tenant, otherwise the
        status object. When the orchestrator's top-level status differs from
        the stored one, the stored status is updated as a side effect.
    """
    run = await self._run_repo.get(run_id, tenant_id)
    if run is None:
        return None

    # Step rows recorded through callbacks; empty without a step repository.
    persisted: list[StepStatus] = []
    if self._step_repo is not None:
        for row in await self._step_repo.list_by_run(run_id, tenant_id):
            persisted.append(
                StepStatus(
                    step=row.step_name,
                    module=row.module_type,
                    status=row.status,
                    started_at=row.started_at,
                    completed_at=row.completed_at,
                    error_message=row.error_message,
                    output_ref=row.output_ref,
                )
            )

    provider = None
    if run.orchestrator_run_id is not None:
        provider = self._status_providers.get(run.orchestrator)
    if provider is None:
        # Never submitted, or nothing configured to ask: serve from DB.
        return PipelineStatus(status=run.status, steps=persisted)

    try:
        status = await provider.get_status(run.orchestrator_run_id)
    except Exception:
        # Orchestrator unreachable — serve from DB
        return PipelineStatus(status=run.status, steps=persisted)

    # Orchestrator steps win; DB rows supply completed_at, error_message and
    # output_ref, plus started_at when the orchestrator has none.
    stored_by_name = {step.step: step for step in persisted}
    live_names = set()
    combined: list[StepStatus] = []
    for live in status.steps:
        live_names.add(live.step)
        stored = stored_by_name.get(live.step)
        if stored is None:
            combined.append(live)
            continue
        combined.append(
            StepStatus(
                step=live.step,
                module=live.module,
                status=live.status,
                fan_out_count=live.fan_out_count,
                completed_count=live.completed_count,
                failed_count=live.failed_count,
                started_at=live.started_at or stored.started_at,
                completed_at=stored.completed_at,
                error_message=stored.error_message,
                output_ref=stored.output_ref,
            )
        )

    # Keep steps only the DB knows about (e.g. after an orchestrator purge).
    combined.extend(step for step in persisted if step.step not in live_names)
    status.steps = combined

    # Cache the orchestrator's top-level status on the run record.
    if status.status != run.status:
        await self._run_repo.update_status(run_id, status.status)

    return status
|
|
181
|
+
|
|
182
|
+
async def record_step_status(
|
|
183
|
+
self,
|
|
184
|
+
tenant_id: str,
|
|
185
|
+
run_id: str,
|
|
186
|
+
step_name: str,
|
|
187
|
+
status: str,
|
|
188
|
+
output_ref: dict[str, Any] | None = None,
|
|
189
|
+
error_message: str | None = None,
|
|
190
|
+
) -> None:
|
|
191
|
+
"""Persist a step status callback from the orchestrator runner."""
|
|
192
|
+
run = await self._run_repo.get(run_id, tenant_id)
|
|
193
|
+
if run is None or self._step_repo is None:
|
|
194
|
+
return
|
|
195
|
+
|
|
196
|
+
now = datetime.now(tz=UTC)
|
|
197
|
+
started_at = now if status == "running" else None
|
|
198
|
+
completed_at = now if status in {"completed", "failed"} else None
|
|
199
|
+
|
|
200
|
+
# Derive module_type from config snapshot if possible
|
|
201
|
+
module_type = ""
|
|
202
|
+
dag = run.config_snapshot.get("dag", [])
|
|
203
|
+
for step in dag:
|
|
204
|
+
base_name = step_name.split("[")[0]
|
|
205
|
+
if step.get("step") == base_name:
|
|
206
|
+
module_type = step.get("module_type") or step.get("module", "")
|
|
207
|
+
break
|
|
208
|
+
|
|
209
|
+
await self._step_repo.upsert(
|
|
210
|
+
pipeline_run_id=run_id,
|
|
211
|
+
tenant_id=tenant_id,
|
|
212
|
+
step_name=step_name,
|
|
213
|
+
module_type=module_type,
|
|
214
|
+
status=status,
|
|
215
|
+
started_at=started_at,
|
|
216
|
+
completed_at=completed_at,
|
|
217
|
+
error_message=error_message,
|
|
218
|
+
output_ref=output_ref,
|
|
219
|
+
)
|
|
220
|
+
|
|
221
|
+
async def cancel(self, tenant_id: str, run_id: str) -> bool:
    """Cancel a pipeline run and record the cancellation.

    A run that has no orchestrator run ID is marked cancelled locally.
    Otherwise the orchestrator's status provider is asked to cancel, and the
    run is marked cancelled only when the provider reports success.

    Returns:
        ``False`` when the run is unknown or no provider is configured for
        its orchestrator; otherwise the outcome of the cancellation.
    """
    run = await self._run_repo.get(run_id, tenant_id)
    if run is None:
        return False

    if run.orchestrator_run_id is not None:
        provider = self._status_providers.get(run.orchestrator)
        if provider is None:
            return False
        outcome = await provider.cancel(run.orchestrator_run_id)
        if not outcome:
            return outcome
    else:
        # Nothing was submitted, so only the local record needs updating.
        outcome = True

    await self._run_repo.update_status(run_id, "cancelled")
    return outcome
|
|
239
|
+
|
|
240
|
+
async def list_runs(
    self,
    tenant_id: str,
    product: str | None = None,
    status: str | None = None,
    limit: int = 20,
    offset: int = 0,
) -> tuple[list[PipelineRunModel], int]:
    """List a tenant's pipeline runs by delegating to the run repository.

    ``product``, ``status``, ``limit`` and ``offset`` are forwarded
    unchanged as keyword arguments; the repository's result is returned
    as-is.
    """
    query = {
        "product": product,
        "status": status,
        "limit": limit,
        "offset": offset,
    }
    return await self._run_repo.list_runs(tenant_id, **query)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pulse_engine.pipeline.translators.base import BaseStatusProvider, BaseTranslator
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"BaseTranslator",
|
|
9
|
+
"BaseStatusProvider",
|
|
10
|
+
"get_translator",
|
|
11
|
+
"get_status_provider",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_translator(orchestrator: str, **kwargs: Any) -> BaseTranslator:
    """Build the translator for the named orchestrator.

    Implementations are imported lazily inside each branch. ``kwargs`` are
    forwarded to the Prefect translator only; the Airflow translator is
    constructed without arguments.

    Raises:
        ValueError: ``orchestrator`` is neither ``"prefect"`` nor ``"airflow"``.
    """
    if orchestrator == "airflow":
        from pulse_engine.pipeline.translators.airflow_translator import (
            AirflowTranslator,
        )

        return AirflowTranslator()

    if orchestrator == "prefect":
        from pulse_engine.pipeline.translators.prefect_translator import (
            PrefectTranslator,
        )

        return PrefectTranslator(**kwargs)

    raise ValueError(f"Unknown orchestrator: {orchestrator}")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_status_provider(orchestrator: str, **kwargs: Any) -> BaseStatusProvider:
    """Build the status provider for the named orchestrator.

    Implementations are imported lazily inside each branch. ``kwargs`` are
    forwarded to the Prefect provider only; the Airflow provider is
    constructed without arguments.

    Raises:
        ValueError: ``orchestrator`` is neither ``"prefect"`` nor ``"airflow"``.
    """
    if orchestrator == "airflow":
        from pulse_engine.pipeline.translators.airflow_status import (
            AirflowStatusProvider,
        )

        return AirflowStatusProvider()

    if orchestrator == "prefect":
        from pulse_engine.pipeline.translators.prefect_status import (
            PrefectStatusProvider,
        )

        return PrefectStatusProvider(**kwargs)

    raise ValueError(f"Unknown orchestrator: {orchestrator}")
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pulse_engine.pipeline.translators.base import BaseStatusProvider, PipelineStatus
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class AirflowStatusProvider(BaseStatusProvider):
    """Placeholder status provider for Airflow; every operation raises."""

    async def get_status(self, orchestrator_run_id: str) -> PipelineStatus:
        """Always raise ``NotImplementedError``: status lookup is not available."""
        raise NotImplementedError("Airflow status provider is not yet implemented.")

    async def cancel(self, orchestrator_run_id: str) -> bool:
        """Always raise ``NotImplementedError``: cancellation is not available."""
        raise NotImplementedError("Airflow cancel is not yet implemented.")
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pulse_engine.pipeline.schemas import PipelineConfig
|
|
6
|
+
from pulse_engine.pipeline.translators.base import BaseTranslator
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class AirflowTranslator(BaseTranslator):
    """Placeholder translator for Airflow; submission always raises."""

    async def submit(
        self,
        pipeline_run_id: str,
        parsed_config: PipelineConfig,
        module_images: dict[str, str],
        global_config: dict[str, Any],
        tenant_id: str,
        pulse_engine_url: str,
        pulse_api_token: str,
    ) -> str:
        """Always raise ``NotImplementedError``; no argument is used."""
        raise NotImplementedError(
            "Airflow translator is not yet implemented. Use orchestrator='prefect'."
        )
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from pulse_engine.pipeline.schemas import PipelineConfig, StepStatus
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class PipelineStatus:
    """Orchestrator-independent status of a pipeline run and its steps."""

    # One of: pending, running, completed, failed, cancelled
    status: str
    # Start time as reported by the source; None when not known.
    started_at: str | None = None
    # Per-step statuses; each instance gets its own fresh list.
    steps: list[StepStatus] = field(default_factory=list)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BaseTranslator(ABC):
    """Interface for adapters that hand a pipeline run to an orchestrator."""

    @abstractmethod
    async def submit(
        self,
        pipeline_run_id: str,
        parsed_config: PipelineConfig,
        module_images: dict[str, str],
        global_config: dict[str, Any],
        tenant_id: str,
        pulse_engine_url: str,
        pulse_api_token: str,
    ) -> str:
        """Submit the pipeline and return the orchestrator's native run ID."""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class BaseStatusProvider(ABC):
    """Interface for adapters that query and cancel runs on an orchestrator."""

    @abstractmethod
    async def get_status(self, orchestrator_run_id: str) -> PipelineStatus:
        """Query the orchestrator and return the run's normalized status."""

    @abstractmethod
    async def cancel(self, orchestrator_run_id: str) -> bool:
        """Cancel a running pipeline; return True when it was cancelled."""
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
|
|
5
|
+
import httpx
|
|
6
|
+
|
|
7
|
+
from pulse_engine.pipeline.schemas import StepStatus
|
|
8
|
+
from pulse_engine.pipeline.translators.base import BaseStatusProvider, PipelineStatus
|
|
9
|
+
|
|
10
|
+
# Maps Prefect state types to the normalized pipeline statuses
# (pending, running, completed, failed, cancelled). Lookups elsewhere in
# this module fall back to "pending" for any type not listed here.
_PREFECT_STATE_MAP: dict[str, str] = {
    "COMPLETED": "completed",
    "FAILED": "failed",
    # A crashed run is terminal; leaving it unmapped reported it as
    # "pending" forever.
    "CRASHED": "failed",
    "CANCELLED": "cancelled",
    "CANCELLING": "cancelled",
    "RUNNING": "running",
    "PENDING": "pending",
    "SCHEDULED": "pending",
    # Same value as the lookup fallback; listed so the choice is explicit.
    "PAUSED": "pending",
}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PrefectStatusProvider(BaseStatusProvider):
|
|
22
|
+
def __init__(
    self,
    prefect_api_url: str,
    prefect_api_key: str,
    http_client: httpx.AsyncClient | None = None,
) -> None:
    """Configure access to the Prefect API.

    Args:
        prefect_api_url: Base URL of the Prefect API; trailing slashes are
            removed.
        prefect_api_key: Key used to build the Authorization header when no
            client is injected.
        http_client: Optional client to reuse. When given it is used as-is
            and never closed by this provider.
    """
    self._client = http_client
    self._api_key = prefect_api_key
    # Endpoint paths are appended as "/...", so drop trailing slashes once.
    self._api_url = prefect_api_url.rstrip("/")
|
|
31
|
+
|
|
32
|
+
def _get_client(self) -> httpx.AsyncClient:
    """Return the injected HTTP client, or build a new one.

    A newly built client carries a Basic Authorization header made from an
    empty user name and the API key, and a 30 second timeout. Callers close
    clients built here; an injected client is left open.
    """
    injected = self._client
    if injected is not None:
        return injected

    credentials = f":{self._api_key}".encode()
    encoded = base64.b64encode(credentials).decode()
    headers = {"Authorization": f"Basic {encoded}"}
    return httpx.AsyncClient(headers=headers, timeout=30.0)
|
|
40
|
+
|
|
41
|
+
async def get_status(self, orchestrator_run_id: str) -> PipelineStatus:
    """Fetch a flow run and its task runs from Prefect and normalize them.

    Args:
        orchestrator_run_id: Prefect flow run ID.

    Returns:
        The flow run's normalized status and start time, with one step per
        task run. The task run's name is used for both ``step`` and
        ``module``.

    Raises:
        httpx.HTTPStatusError: Either Prefect request returned an error status.
    """
    client = self._get_client()
    try:
        resp = await client.get(f"{self._api_url}/flow_runs/{orchestrator_run_id}")
        resp.raise_for_status()
        flow_run = resp.json()

        # "state" can be present but null, so `.get("state", {})` is not
        # enough: fall back to an empty mapping for any falsy value.
        prefect_state = (flow_run.get("state") or {}).get("type", "PENDING")
        status = _PREFECT_STATE_MAP.get(prefect_state, "pending")
        started_at = flow_run.get("start_time")

        # NOTE(review): no limit/offset is sent, so only the API's default
        # page of task runs is read — confirm this is enough for fan-outs.
        task_resp = await client.post(
            f"{self._api_url}/task_runs/filter",
            json={"flow_runs": {"id": {"any_": [orchestrator_run_id]}}},
        )
        task_resp.raise_for_status()
        task_runs = task_resp.json()

        steps: list[StepStatus] = []
        for task in task_runs:
            task_state = (task.get("state") or {}).get("type", "PENDING")
            steps.append(
                StepStatus(
                    step=task.get("name", "unknown"),
                    module=task.get("name", "unknown"),
                    status=_PREFECT_STATE_MAP.get(task_state, "pending"),
                    started_at=task.get("start_time"),
                )
            )

        return PipelineStatus(
            status=status,
            started_at=started_at,
            steps=steps,
        )
    finally:
        # Only close a client created for this call; an injected one is
        # owned by the caller.
        if self._client is None:
            await client.aclose()
|
|
79
|
+
|
|
80
|
+
async def cancel(self, orchestrator_run_id: str) -> bool:
    """Ask Prefect to move a flow run to the CANCELLED state.

    Args:
        orchestrator_run_id: Prefect flow run ID.

    Returns:
        ``True`` when Prefect accepted the state change. ``False`` when the
        request returned an error status, or when the response carries an
        orchestration ``status`` other than ``"ACCEPT"``.
    """
    client = self._get_client()
    try:
        resp = await client.post(
            f"{self._api_url}/flow_runs/{orchestrator_run_id}/set_state",
            json={"state": {"type": "CANCELLED"}},
        )
        resp.raise_for_status()

        # A 2xx answer does not by itself mean the transition happened: the
        # body reports whether it was accepted. A body without a readable
        # "status" is treated as accepted, matching the earlier behaviour.
        try:
            outcome = resp.json().get("status")
        except (ValueError, AttributeError):
            outcome = None
        return outcome in (None, "ACCEPT")
    except httpx.HTTPStatusError:
        return False
    finally:
        # Only close a client created for this call.
        if self._client is None:
            await client.aclose()
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
from pulse_engine.pipeline.schemas import PipelineConfig, SecretRef
|
|
9
|
+
from pulse_engine.pipeline.translators.base import BaseTranslator
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class PrefectTranslator(BaseTranslator):
|
|
13
|
+
"""Translates a PipelineConfig into a Prefect flow run submission."""
|
|
14
|
+
|
|
15
|
+
FLOW_ENTRYPOINT = "pulse_engine.runners.prefect_pipeline_flow:pipeline_flow"
|
|
16
|
+
|
|
17
|
+
def __init__(
    self,
    prefect_api_url: str,
    prefect_api_key: str,
    work_pool_name: str,
    engine_image: str = "",
    http_client: httpx.AsyncClient | None = None,
) -> None:
    """Configure how pipelines are submitted to Prefect.

    Args:
        prefect_api_url: Base URL of the Prefect API; trailing slashes are
            removed.
        prefect_api_key: Key used to build the Authorization header when no
            client is injected.
        work_pool_name: Work pool assigned to deployments created here.
        engine_image: Image set as the deployment's ``image`` job variable.
        http_client: Optional client to reuse. When given it is used as-is
            and never closed by this translator.
    """
    self._client = http_client
    self._api_key = prefect_api_key
    # Endpoint paths are appended as "/...", so drop trailing slashes once.
    self._api_url = prefect_api_url.rstrip("/")
    self._engine_image = engine_image
    self._work_pool_name = work_pool_name
|
|
30
|
+
|
|
31
|
+
def _get_client(self) -> httpx.AsyncClient:
    """Return the injected HTTP client, or build a new one.

    A newly built client carries a Basic Authorization header made from an
    empty user name and the API key, and a 30 second timeout. Callers close
    clients built here; an injected client is left open.
    """
    injected = self._client
    if injected is not None:
        return injected

    credentials = f":{self._api_key}".encode()
    encoded = base64.b64encode(credentials).decode()
    headers = {"Authorization": f"Basic {encoded}"}
    return httpx.AsyncClient(headers=headers, timeout=30.0)
|
|
39
|
+
|
|
40
|
+
def _serialize_dag(self, config: PipelineConfig) -> list[dict[str, Any]]:
    """Turn the config's DAG into plain dicts usable as flow parameters.

    Each step is combined with the module definition it names. Optional
    step fields (``for_each``, ``collect_from``, ``when``, ``trigger_rule``,
    ``depends_on``) are emitted only when set.

    Raises:
        KeyError: A step names a module missing from ``config.modules``.
    """
    modules_by_name = {mod.name: mod for mod in config.modules}
    serialized = []
    for node in config.dag:
        spec = modules_by_name[node.module]
        entry: dict[str, Any] = {
            "step": node.step,
            "module": node.module,
            "module_type": spec.module,
            "retries": spec.retries,
            "retry_delay": spec.retry_delay,
            "timeout": spec.timeout,
            "max_concurrency": spec.max_concurrency,
            "args": spec.args,
        }

        # Secret references are dumped to dicts; other values pass through.
        env = {}
        for key, value in spec.env.items():
            env[key] = value.model_dump() if isinstance(value, SecretRef) else value
        entry["env"] = env

        # Compute target falls back: module setting, pipeline default, "ecs".
        resources = dict(spec.resources.model_dump())
        resources["compute"] = (
            spec.resources.compute or config.infra.default_compute or "ecs"
        )
        resources["cpu_units"] = spec.resources.ecs_cpu_units
        entry["resources"] = resources

        if node.for_each is not None:
            entry["for_each"] = node.for_each
        if node.collect_from is not None:
            # Always emit a list, whether one name or several were given.
            if isinstance(node.collect_from, str):
                entry["collect_from"] = [node.collect_from]
            else:
                entry["collect_from"] = list(node.collect_from)
        if node.when is not None:
            entry["when"] = node.when
        if node.trigger_rule is not None:
            entry["trigger_rule"] = node.trigger_rule
        if node.depends_on:
            entry["depends_on"] = [{"step": dep.step} for dep in node.depends_on]

        serialized.append(entry)
    return serialized
|
|
83
|
+
|
|
84
|
+
async def _ensure_deployment(
    self,
    client: httpx.AsyncClient,
    pipeline_name: str,
    cron_schedule: str | None = None,
) -> str:
    """Create or update the Prefect deployment for this pipeline.

    Args:
        client: HTTP client used for both requests; it is not closed here.
        pipeline_name: Name the flow and deployment names are derived from.
        cron_schedule: Optional cron expression; when set, one active UTC
            schedule is attached to the deployment.

    Returns:
        The deployment ID.

    Raises:
        httpx.HTTPStatusError: Either Prefect request returned an error status.
    """
    deployment_name = self._get_deployment_name(pipeline_name)

    # Get or create the flow (idempotent — Prefect returns existing if name matches)
    flow_resp = await client.post(
        f"{self._api_url}/flows/",
        json={"name": deployment_name},
    )
    flow_resp.raise_for_status()
    flow_id = str(flow_resp.json()["id"])

    # Send the image only when one is configured: an empty "image" job
    # variable would override whatever default the work pool defines.
    job_variables = {"image": self._engine_image} if self._engine_image else {}

    deploy_body: dict[str, Any] = {
        "name": deployment_name,
        "flow_id": flow_id,
        "entrypoint": self.FLOW_ENTRYPOINT,
        "work_pool_name": self._work_pool_name,
        "job_variables": job_variables,
        # path="/app" required for Prefect process worker to locate the flow
        # when the entrypoint is a package import (not a relative file path).
        "path": "/app",
    }
    if cron_schedule:
        deploy_body["schedules"] = [
            {"schedule": {"cron": cron_schedule, "timezone": "UTC"}, "active": True}
        ]

    # Upsert deployment
    deploy_resp = await client.post(
        f"{self._api_url}/deployments/",
        json=deploy_body,
    )
    deploy_resp.raise_for_status()
    return str(deploy_resp.json()["id"])
|
|
126
|
+
|
|
127
|
+
async def submit(
    self,
    pipeline_run_id: str,
    parsed_config: PipelineConfig,
    module_images: dict[str, str],
    global_config: dict[str, Any],
    tenant_id: str,
    pulse_engine_url: str,
    pulse_api_token: str,
) -> str:
    """Submit a pipeline as a Prefect flow run, upserting the deployment first.

    Args:
        pipeline_run_id: Pulse run ID, passed as a parameter and used in a tag.
        parsed_config: Pipeline definition; supplies the DAG, the deployment
            name, the schedule and the results backend.
        module_images: Passed through as a flow parameter.
        global_config: Passed through as a flow parameter.
        tenant_id: Passed through as a flow parameter.
        pulse_engine_url: Passed through as a flow parameter.
        pulse_api_token: Passed through as a flow parameter.

    Returns:
        The ID of the created Prefect flow run.

    Raises:
        httpx.HTTPStatusError: A Prefect request returned an error status.
    """
    # Build the payload before acquiring the HTTP client, so a failure while
    # serializing the DAG cannot leave a newly created client unclosed.
    # NOTE(review): the API token travels as a plain flow-run parameter —
    # presumably readable by anyone who can view run parameters; confirm.
    parameters = {
        "pipeline_run_id": pipeline_run_id,
        "tenant_id": tenant_id,
        "dag": self._serialize_dag(parsed_config),
        "module_images": module_images,
        "global_config": global_config,
        "pulse_engine_url": pulse_engine_url,
        "pulse_api_token": pulse_api_token,
        "results_backend": parsed_config.infra.results_backend.model_dump(),
    }

    body = {
        "parameters": parameters,
        "state": {"type": "SCHEDULED"},
        "tags": [f"product:{parsed_config.name}", f"pipeline:{pipeline_run_id}"],
    }

    client = self._get_client()
    try:
        deployment_id = await self._ensure_deployment(
            client, parsed_config.deployment_name, parsed_config.infra.schedule
        )

        response = await client.post(
            f"{self._api_url}/deployments/{deployment_id}/create_flow_run",
            json=body,
        )
        response.raise_for_status()
        data = response.json()
        return str(data["id"])
    finally:
        # Only close a client created for this call.
        if self._client is None:
            await client.aclose()
|
|
172
|
+
|
|
173
|
+
async def _resolve_deployment_id(
    self, client: httpx.AsyncClient, deployment_name: str
) -> str:
    """Find the ID of the Prefect deployment with the given name.

    Raises:
        httpx.HTTPStatusError: The filter request returned an error status.
        RuntimeError: No deployment with that name exists.
    """
    query = {
        "deployments": {"name": {"any_": [deployment_name]}},
        "limit": 1,
    }
    response = await client.post(f"{self._api_url}/deployments/filter", json=query)
    response.raise_for_status()

    matches = response.json()
    if matches:
        return str(matches[0]["id"])

    raise RuntimeError(
        f"Prefect deployment '{deployment_name}' not found. "
        f"Create it first with: prefect deployment build ..."
    )
|
|
192
|
+
|
|
193
|
+
def _get_deployment_name(self, pipeline_name: str) -> str:
    """Return the deployment name used for ``pipeline_name``."""
    return "pulse-pipeline-{}".format(pipeline_name)
|
|
File without changes
|