pulse-engine 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pulse_engine/__init__.py +0 -0
- pulse_engine/adapters/__init__.py +58 -0
- pulse_engine/adapters/audio_transcription.py +167 -0
- pulse_engine/adapters/batcher.py +36 -0
- pulse_engine/adapters/digital_news.py +128 -0
- pulse_engine/adapters/digital_news_metadata.py +536 -0
- pulse_engine/adapters/exceptions.py +10 -0
- pulse_engine/adapters/models.py +134 -0
- pulse_engine/adapters/opensearch_storage.py +160 -0
- pulse_engine/adapters/speech_content.py +130 -0
- pulse_engine/adapters/speech_metadata.py +374 -0
- pulse_engine/adapters/twitter.py +423 -0
- pulse_engine/adapters/youtube_downloader.py +186 -0
- pulse_engine/adapters/youtube_metadata.py +261 -0
- pulse_engine/api/__init__.py +0 -0
- pulse_engine/api/v1/__init__.py +0 -0
- pulse_engine/api/v1/auth.py +91 -0
- pulse_engine/api/v1/health.py +62 -0
- pulse_engine/api/v1/router.py +16 -0
- pulse_engine/chain_recovery.py +131 -0
- pulse_engine/cli/__init__.py +0 -0
- pulse_engine/cli/main.py +169 -0
- pulse_engine/cli/templates/cookiecutter.json +4 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/.gitignore +13 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/Dockerfile +32 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pipeline.yaml +17 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/pyproject.toml +25 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/src/pulse_{{cookiecutter.product_slug}}/__init__.py +8 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/__init__.py +0 -0
- pulse_engine/cli/templates/pulse-{{cookiecutter.product_name}}/tests/unit/test_manifest.py +15 -0
- pulse_engine/client.py +95 -0
- pulse_engine/config.py +157 -0
- pulse_engine/core/__init__.py +0 -0
- pulse_engine/core/error_handlers.py +64 -0
- pulse_engine/core/exceptions.py +67 -0
- pulse_engine/core/job_token.py +109 -0
- pulse_engine/core/logging.py +45 -0
- pulse_engine/core/scope.py +23 -0
- pulse_engine/core/security.py +130 -0
- pulse_engine/database.py +30 -0
- pulse_engine/dependencies.py +166 -0
- pulse_engine/deployment/__init__.py +0 -0
- pulse_engine/deployment/backend_deployment_repository.py +83 -0
- pulse_engine/deployment/backends/__init__.py +0 -0
- pulse_engine/deployment/backends/base.py +50 -0
- pulse_engine/deployment/backends/exceptions.py +20 -0
- pulse_engine/deployment/backends/native_lambda.py +125 -0
- pulse_engine/deployment/backends/prefect_ecs.py +116 -0
- pulse_engine/deployment/backends/prefect_k8s.py +131 -0
- pulse_engine/deployment/backends/registry.py +50 -0
- pulse_engine/deployment/infra_provisioner.py +285 -0
- pulse_engine/deployment/job_launcher.py +178 -0
- pulse_engine/deployment/models.py +48 -0
- pulse_engine/deployment/repository.py +54 -0
- pulse_engine/deployment/router.py +22 -0
- pulse_engine/deployment/schemas.py +18 -0
- pulse_engine/deployment/service.py +65 -0
- pulse_engine/extractor/__init__.py +0 -0
- pulse_engine/extractor/adapters/__init__.py +0 -0
- pulse_engine/extractor/base.py +48 -0
- pulse_engine/extractor/models.py +50 -0
- pulse_engine/extractor/orchestrator/__init__.py +15 -0
- pulse_engine/extractor/orchestrator/base.py +34 -0
- pulse_engine/extractor/orchestrator/noop.py +37 -0
- pulse_engine/extractor/orchestrator/prefect.py +163 -0
- pulse_engine/extractor/repository.py +163 -0
- pulse_engine/extractor/router.py +102 -0
- pulse_engine/extractor/schemas.py +93 -0
- pulse_engine/extractor/service.py +431 -0
- pulse_engine/extractor/stage_models.py +36 -0
- pulse_engine/extractor/stage_repository.py +109 -0
- pulse_engine/main.py +195 -0
- pulse_engine/mcp/__init__.py +0 -0
- pulse_engine/mcp/__main__.py +5 -0
- pulse_engine/mcp/server.py +108 -0
- pulse_engine/mcp/tools_jobs.py +159 -0
- pulse_engine/mcp/tools_kb.py +88 -0
- pulse_engine/mcp/tools_modules.py +115 -0
- pulse_engine/mcp/tools_pipelines.py +215 -0
- pulse_engine/mcp/tools_processor.py +208 -0
- pulse_engine/middleware/__init__.py +0 -0
- pulse_engine/middleware/rate_limit.py +144 -0
- pulse_engine/middleware/request_id.py +16 -0
- pulse_engine/middleware/security_headers.py +25 -0
- pulse_engine/middleware/tenant.py +90 -0
- pulse_engine/pipeline/__init__.py +0 -0
- pulse_engine/pipeline/config_parser.py +148 -0
- pulse_engine/pipeline/expression.py +268 -0
- pulse_engine/pipeline/models.py +98 -0
- pulse_engine/pipeline/repositories.py +224 -0
- pulse_engine/pipeline/router_modules.py +66 -0
- pulse_engine/pipeline/router_pipelines.py +198 -0
- pulse_engine/pipeline/schemas.py +200 -0
- pulse_engine/pipeline/service.py +250 -0
- pulse_engine/pipeline/translators/__init__.py +44 -0
- pulse_engine/pipeline/translators/airflow_status.py +11 -0
- pulse_engine/pipeline/translators/airflow_translator.py +22 -0
- pulse_engine/pipeline/translators/base.py +42 -0
- pulse_engine/pipeline/translators/prefect_status.py +93 -0
- pulse_engine/pipeline/translators/prefect_translator.py +195 -0
- pulse_engine/processor/__init__.py +0 -0
- pulse_engine/processor/base.py +36 -0
- pulse_engine/processor/core/__init__.py +0 -0
- pulse_engine/processor/core/analysis.py +148 -0
- pulse_engine/processor/core/chunking.py +158 -0
- pulse_engine/processor/core/prompts.py +340 -0
- pulse_engine/processor/core/topic_splitter.py +105 -0
- pulse_engine/processor/defaults/__init__.py +11 -0
- pulse_engine/processor/defaults/core_processor.py +12 -0
- pulse_engine/processor/defaults/postprocessor.py +12 -0
- pulse_engine/processor/defaults/preprocessor.py +12 -0
- pulse_engine/processor/llm/__init__.py +0 -0
- pulse_engine/processor/llm/provider.py +58 -0
- pulse_engine/processor/ocr/gemini.py +52 -0
- pulse_engine/processor/pipeline.py +107 -0
- pulse_engine/processor/postprocessor/__init__.py +0 -0
- pulse_engine/processor/postprocessor/embeddings.py +34 -0
- pulse_engine/processor/postprocessor/tasks.py +180 -0
- pulse_engine/processor/preprocessor/__init__.py +0 -0
- pulse_engine/processor/preprocessor/tasks.py +71 -0
- pulse_engine/processor/router.py +192 -0
- pulse_engine/processor/schemas.py +167 -0
- pulse_engine/registry.py +117 -0
- pulse_engine/runners/__init__.py +0 -0
- pulse_engine/runners/lambda_runner.py +26 -0
- pulse_engine/runners/pipeline_runner.py +43 -0
- pulse_engine/runners/prefect_pipeline_flow.py +904 -0
- pulse_engine/runners/prefect_runner.py +33 -0
- pulse_engine/s3.py +72 -0
- pulse_engine/secrets.py +46 -0
- pulse_engine/services/__init__.py +0 -0
- pulse_engine/services/bootstrap.py +211 -0
- pulse_engine/services/opensearch.py +84 -0
- pulse_engine/storage/__init__.py +0 -0
- pulse_engine/storage/connectors/__init__.py +0 -0
- pulse_engine/storage/connectors/athena.py +226 -0
- pulse_engine/storage/connectors/base.py +32 -0
- pulse_engine/storage/connectors/opensearch.py +344 -0
- pulse_engine/storage/knowledge_base.py +68 -0
- pulse_engine/storage/router.py +78 -0
- pulse_engine/storage/schemas.py +93 -0
- pulse_engine/testing/__init__.py +13 -0
- pulse_engine/testing/fixtures.py +50 -0
- pulse_engine/testing/mocks.py +104 -0
- pulse_engine/worker.py +53 -0
- pulse_engine-0.2.0.dist-info/METADATA +654 -0
- pulse_engine-0.2.0.dist-info/RECORD +150 -0
- pulse_engine-0.2.0.dist-info/WHEEL +4 -0
- pulse_engine-0.2.0.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Base extractor ABC and supporting types for product extractors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class ExtractorConfig:
    """Static settings describing one product extractor.

    ``name`` and ``job_type`` are required; every other field falls back to
    the default declared on it.
    """

    # Required identifiers, in positional order.
    name: str
    job_type: str
    # Optional schedule string; ``None`` (the default) means none was given.
    schedule: str | None = None
    # Time budget in seconds (default 3600) and retry ceiling (default 3).
    timeout_seconds: int = 3600
    max_retries: int = 3
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ExtractionResult:
    """One item pulled from a source by an extractor.

    Carries the raw content, the identifier and type of the source it came
    from, and a free-form metadata mapping.
    """

    raw_content: str
    source_id: str
    source_type: str
    # ``default_factory`` gives each instance its own empty dict.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class BaseExtractor(ABC):
    """Contract every product extractor has to fulfil.

    Subclasses must provide :meth:`get_config` and :meth:`extract`. The two
    ``on_*`` hooks are optional: their base implementations do nothing.
    """

    @abstractmethod
    def get_config(self) -> ExtractorConfig:
        """Return this extractor's static configuration."""

    @abstractmethod
    async def extract(self, tenant_id: str, parameters: dict[str, Any]) -> list[ExtractionResult]:
        """Perform the extraction for ``tenant_id`` and return the extracted items."""

    async def on_success(self, tenant_id: str, results: list[ExtractionResult]) -> None:
        """Post-success hook; no-op unless a subclass overrides it."""
        return None

    async def on_failure(self, tenant_id: str, error: Exception) -> None:
        """Post-failure hook; no-op unless a subclass overrides it."""
        return None
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Job record ORM model."""
|
|
2
|
+
|
|
3
|
+
import uuid
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import sqlalchemy as sa
|
|
8
|
+
from sqlalchemy.orm import Mapped, mapped_column
|
|
9
|
+
|
|
10
|
+
from pulse_engine.database import Base
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class JobRecordModel(Base):
    """ORM mapping for rows of the ``job_records`` table."""

    __tablename__ = "job_records"

    # Composite indexes on (tenant_id, status) and (tenant_id, product).
    __table_args__ = (
        sa.Index("ix_job_records_tenant_status", "tenant_id", "status"),
        sa.Index("ix_job_records_tenant_product", "tenant_id", "product"),
    )

    # Primary key; a UUID4 string is generated when none is supplied.
    job_id: Mapped[str] = mapped_column(
        sa.String,
        primary_key=True,
        default=lambda: str(uuid.uuid4()),
    )

    # Required classification columns.
    job_type: Mapped[str] = mapped_column(sa.String, nullable=False)
    product: Mapped[str] = mapped_column(sa.String, nullable=False)
    tenant_id: Mapped[str] = mapped_column(sa.String, index=True, nullable=False)

    # Lifecycle columns; new rows default to "pending" / "normal".
    status: Mapped[str] = mapped_column(sa.String, default="pending", nullable=False)
    priority: Mapped[str] = mapped_column(sa.String, default="normal", nullable=False)

    # JSON payload; ``dict`` is passed as the default callable.
    parameters: Mapped[dict[str, Any]] = mapped_column(sa.JSON, default=dict, nullable=False)

    # Optional references, all NULL by default.
    orchestrator_run_id: Mapped[str | None] = mapped_column(sa.String, default=None, nullable=True)
    callback_url: Mapped[str | None] = mapped_column(sa.String, default=None, nullable=True)

    # Timestamps are timezone-aware; ``created_at`` is filled in by the
    # database through ``server_default``.
    created_at: Mapped[datetime] = mapped_column(
        sa.DateTime(timezone=True),
        nullable=False,
        server_default=sa.func.now(),
    )
    started_at: Mapped[datetime | None] = mapped_column(
        sa.DateTime(timezone=True),
        default=None,
        nullable=True,
    )
    completed_at: Mapped[datetime | None] = mapped_column(
        sa.DateTime(timezone=True),
        default=None,
        nullable=True,
    )

    # Outcome columns: a JSON summary and/or an error string.
    result_summary: Mapped[dict[str, Any] | None] = mapped_column(
        sa.JSON,
        default=None,
        nullable=True,
    )
    error: Mapped[str | None] = mapped_column(sa.String, default=None, nullable=True)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from pulse_engine.config import Settings
|
|
2
|
+
from pulse_engine.extractor.orchestrator.base import BaseOrchestratorAdapter
|
|
3
|
+
from pulse_engine.extractor.orchestrator.noop import NoopAdapter
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_orchestrator_adapter(settings: Settings) -> BaseOrchestratorAdapter:
    """Build the orchestrator adapter selected by ``settings``.

    The backend name is compared case-insensitively. ``"prefect"`` yields a
    ``PrefectAdapter``; any other value yields a ``NoopAdapter``.
    """
    if settings.pulse_orchestrator_backend.lower() != "prefect":
        return NoopAdapter()

    # Imported only on this path, so the Prefect adapter module is not
    # loaded when a different backend is configured.
    from pulse_engine.extractor.orchestrator.prefect import PrefectAdapter

    # An empty API key is normalised to None.
    key = settings.prefect_api_key or None
    return PrefectAdapter(api_url=settings.prefect_api_url, api_key=key)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class OrchestratorRunStatus:
    """Status snapshot of one orchestrator run."""

    run_id: str
    # One of: pending, running, completed, failed, cancelled, unknown.
    status: str
    # State name as reported by the orchestrator, when one is available.
    raw_state: str | None = None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BaseOrchestratorAdapter(ABC):
    """Common interface over an external job orchestrator (Prefect, Airflow, ...).

    All four operations are abstract coroutines; concrete adapters must
    implement each of them.
    """

    @abstractmethod
    async def get_run_status(self, run_id: str) -> OrchestratorRunStatus:
        """Look up the current status of the run with external ID ``run_id``."""
        ...

    @abstractmethod
    async def cancel_run(self, run_id: str) -> bool:
        """Ask the orchestrator to cancel the run ``run_id``."""
        ...

    @abstractmethod
    async def health_check(self) -> bool:
        """Report whether the orchestrator can be reached (``True`` if so)."""
        ...

    @abstractmethod
    async def create_flow_run(self, deployment_id: str, parameters: dict[str, Any] | None = None) -> str:
        """Start a flow run for ``deployment_id`` and return the new run's ID."""
        ...
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import uuid
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from pulse_engine.extractor.orchestrator.base import (
|
|
5
|
+
BaseOrchestratorAdapter,
|
|
6
|
+
OrchestratorRunStatus,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class NoopAdapter(BaseOrchestratorAdapter):
    """Stand-in adapter used when no orchestrator is configured.

    Every method answers locally with a fixed or generated value; nothing is
    sent to an external system.
    """

    async def get_run_status(self, run_id: str) -> OrchestratorRunStatus:
        """Report the run as ``"unknown"``; there is no backend to query."""
        snapshot = OrchestratorRunStatus(run_id=run_id, status="unknown")
        return snapshot

    async def cancel_run(self, run_id: str) -> bool:
        """Always return ``False``: nothing was cancelled."""
        return False

    async def health_check(self) -> bool:
        """Always return ``True``."""
        return True

    async def create_flow_run(self, deployment_id: str, parameters: dict[str, Any] | None = None) -> str:
        """Return an empty run ID; no flow run is created."""
        return ""

    async def create_or_update_deployment(
        self,
        name: str,
        flow_entrypoint: str,
        image: str,
        work_pool_name: str = "products-worker-pool",
    ) -> tuple[str, str]:
        """Return ``(generated UUID4 string, name)`` without registering anything."""
        generated_id = str(uuid.uuid4())
        return generated_id, name
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
import structlog
|
|
8
|
+
|
|
9
|
+
from pulse_engine.extractor.base import BaseExtractor
|
|
10
|
+
from pulse_engine.extractor.orchestrator.base import (
|
|
11
|
+
BaseOrchestratorAdapter,
|
|
12
|
+
OrchestratorRunStatus,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
logger = structlog.get_logger(__name__)
|
|
16
|
+
|
|
17
|
+
# Map Prefect flow-run state types to canonical job statuses
|
|
18
|
+
_PREFECT_STATE_MAP: dict[str, str] = {
|
|
19
|
+
"COMPLETED": "completed",
|
|
20
|
+
"FAILED": "failed",
|
|
21
|
+
"CRASHED": "failed",
|
|
22
|
+
"CANCELLED": "cancelled",
|
|
23
|
+
"CANCELLING": "cancelled",
|
|
24
|
+
"RUNNING": "running",
|
|
25
|
+
"PENDING": "pending",
|
|
26
|
+
"SCHEDULED": "pending",
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class PrefectAdapter(BaseOrchestratorAdapter):
    """Adapter that talks to a Prefect server through its REST API.

    The adapter owns a single ``httpx.AsyncClient``. Call :meth:`aclose`, or
    use the adapter as an async context manager, to release the client's
    connections once the adapter is no longer needed.
    """

    def __init__(self, api_url: str, api_key: str | None = None) -> None:
        """Create the HTTP client.

        Args:
            api_url: Base URL of the Prefect API; a trailing ``/`` is removed.
            api_key: Optional credential. When given it is base64-encoded and
                sent in a ``Basic`` ``Authorization`` header.
        """
        self._api_url = api_url.rstrip("/")
        headers: dict[str, str] = {"Content-Type": "application/json"}
        if api_key:
            # NOTE(review): the key is sent as HTTP Basic credentials; confirm
            # this is the scheme the target Prefect API expects.
            encoded = base64.b64encode(api_key.encode()).decode()
            headers["Authorization"] = f"Basic {encoded}"
        self._client = httpx.AsyncClient(
            base_url=self._api_url, headers=headers, timeout=10.0
        )

    async def aclose(self) -> None:
        """Close the underlying HTTP client and its pooled connections."""
        await self._client.aclose()

    async def __aenter__(self) -> PrefectAdapter:
        """Return the adapter itself so it can be used with ``async with``."""
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        """Close the HTTP client when the ``async with`` block exits."""
        await self.aclose()

    async def get_run_status(self, run_id: str) -> OrchestratorRunStatus:
        """Fetch a flow run and translate its state type to a canonical status.

        Never raises: any failure is logged and reported as status
        ``"unknown"`` with no raw state.
        """
        try:
            resp = await self._client.get(f"/flow_runs/{run_id}")
            resp.raise_for_status()
            data = resp.json()
            # A payload whose ``state`` is null, or whose state has no type,
            # is treated as UNKNOWN instead of failing on ``None.get``.
            state = data.get("state") or {}
            raw_state = str(state.get("type") or "UNKNOWN")
            canonical = _PREFECT_STATE_MAP.get(raw_state.upper(), "unknown")
            return OrchestratorRunStatus(
                run_id=run_id, status=canonical, raw_state=raw_state
            )
        except Exception:
            logger.warning("prefect_status_fetch_failed", run_id=run_id, exc_info=True)
            return OrchestratorRunStatus(run_id=run_id, status="unknown")

    async def cancel_run(self, run_id: str) -> bool:
        """Ask Prefect to move the run to ``CANCELLED``.

        Returns ``True`` when the API answers with a success status, ``False``
        on a non-success status or on any exception (which is logged).
        """
        try:
            resp = await self._client.post(
                f"/flow_runs/{run_id}/set_state",
                json={"state": {"type": "CANCELLED"}},
            )
            return resp.is_success
        except Exception:
            logger.warning("prefect_cancel_failed", run_id=run_id, exc_info=True)
            return False

    async def health_check(self) -> bool:
        """Return ``True`` when ``GET /health`` succeeds, ``False`` otherwise."""
        try:
            resp = await self._client.get("/health")
            return resp.is_success
        except Exception:
            return False

    async def register_extractors(
        self,
        extractors: list[type[BaseExtractor]],
        pipeline: Any = None,
    ) -> list[str]:
        """Create a Prefect deployment for each extractor class.

        Registration is best-effort per extractor: a failed request is logged
        and skipped, and the remaining extractors are still processed.

        Args:
            extractors: Extractor classes; each is instantiated with no
                arguments to read its configuration.
            pipeline: Accepted for interface compatibility; not used here.

        Returns:
            The IDs of the deployments that were created. Responses that
            carry no ID contribute nothing to the list.
        """
        deployment_ids: list[str] = []
        for ext_cls in extractors:
            config = ext_cls().get_config()
            # NOTE(review): unlike create_or_update_deployment, this payload
            # sends ``flow_name`` rather than a ``flow_id`` — confirm the
            # deployments endpoint accepts it.
            payload: dict[str, Any] = {
                "name": config.name,
                "flow_name": f"extract_{config.job_type}",
                "parameters": {
                    "extractor_cls": f"{ext_cls.__module__}.{ext_cls.__qualname__}",
                },
                "tags": ["pulse-engine", config.job_type],
            }
            if config.schedule:
                payload["schedule"] = {"cron": config.schedule}

            try:
                resp = await self._client.post("/deployments/", json=payload)
                resp.raise_for_status()
                dep_id = resp.json().get("id")
            except Exception:
                logger.warning(
                    "extractor_registration_failed",
                    extractor=config.name,
                    exc_info=True,
                )
                continue

            if not dep_id:
                # A success response without an ID is not a usable deployment;
                # do not report an empty string as a created deployment ID.
                logger.warning(
                    "extractor_registration_failed",
                    extractor=config.name,
                    reason="response contained no deployment id",
                )
                continue

            dep_id = str(dep_id)
            deployment_ids.append(dep_id)
            logger.info(
                "extractor_registered",
                extractor=config.name,
                job_type=config.job_type,
                deployment_id=dep_id,
            )

        return deployment_ids

    async def _get_or_create_flow_id(self, flow_name: str) -> str:
        """POST ``flow_name`` to ``/flows/`` and return the ``id`` from the response.

        Raises the httpx status error on a non-success response.
        """
        resp = await self._client.post("/flows/", json={"name": flow_name})
        resp.raise_for_status()
        return str(resp.json()["id"])

    async def create_or_update_deployment(
        self,
        name: str,
        flow_entrypoint: str,
        image: str,
        work_pool_name: str = "products-worker-pool",
    ) -> tuple[str, str]:
        """Create or update a Prefect deployment.

        The flow ID is resolved first by posting ``name`` to ``/flows/``; the
        deployment is then posted with that flow ID, the entrypoint, the work
        pool name and the container image as a job variable.

        Returns:
            ``(deployment_id, deployment_name)``; the name falls back to
            ``name`` when the response does not include one.

        Raises the httpx status error if either request fails.
        """
        flow_id = await self._get_or_create_flow_id(name)
        payload: dict[str, Any] = {
            "name": name,
            "flow_id": flow_id,
            "entrypoint": flow_entrypoint,
            "work_pool_name": work_pool_name,
            "job_variables": {"image": image},
        }
        resp = await self._client.post("/deployments/", json=payload)
        resp.raise_for_status()
        data: dict[str, Any] = resp.json()
        return str(data["id"]), str(data.get("name", name))

    async def create_flow_run(
        self,
        deployment_id: str,
        parameters: dict[str, Any] | None = None,
    ) -> str:
        """Trigger a flow run from a deployment and return the flow run ID.

        ``parameters`` is only included in the request body when non-empty.
        Raises the httpx status error on a non-success response.
        """
        payload: dict[str, Any] = {}
        if parameters:
            payload["parameters"] = parameters
        resp = await self._client.post(
            f"/deployments/{deployment_id}/create_flow_run",
            json=payload,
        )
        resp.raise_for_status()
        return str(resp.json()["id"])
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""Job repository — data access layer for job records."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import uuid
|
|
6
|
+
from datetime import UTC, datetime
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import sqlalchemy as sa
|
|
10
|
+
from sqlalchemy.engine import CursorResult
|
|
11
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
12
|
+
|
|
13
|
+
from pulse_engine.core.exceptions import BadRequestError
|
|
14
|
+
from pulse_engine.extractor.models import JobRecordModel
|
|
15
|
+
|
|
16
|
+
# Explicit allowlist of columns permitted for sort_by queries
|
|
17
|
+
_ALLOWED_SORT_FIELDS: set[str] = {
|
|
18
|
+
"created_at",
|
|
19
|
+
"status",
|
|
20
|
+
"product",
|
|
21
|
+
"job_type",
|
|
22
|
+
"priority",
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class JobRepository:
    """Data-access layer for ``JobRecordModel`` rows on top of an ``AsyncSession``."""

    def __init__(self, session: AsyncSession) -> None:
        """Keep the session that all queries in this repository run on."""
        self._session = session

    async def create(
        self,
        tenant_id: str,
        job_type: str,
        product: str,
        priority: str = "normal",
        parameters: dict[str, Any] | None = None,
        orchestrator_run_id: str | None = None,
        callback_url: str | None = None,
    ) -> JobRecordModel:
        """Insert a new job with status ``"pending"`` and return the stored row.

        A UUID4 job ID and a UTC creation timestamp are assigned here. The
        insert is committed and the record refreshed before it is returned.
        """
        record = JobRecordModel(
            job_id=str(uuid.uuid4()),
            tenant_id=tenant_id,
            job_type=job_type,
            product=product,
            priority=priority,
            parameters=parameters or {},
            orchestrator_run_id=orchestrator_run_id,
            callback_url=callback_url,
            status="pending",
            created_at=datetime.now(UTC),
        )
        self._session.add(record)
        await self._session.commit()
        await self._session.refresh(record)
        return record

    async def get(self, job_id: str, tenant_id: str) -> JobRecordModel | None:
        """Return the job with ``job_id`` owned by ``tenant_id``, or ``None``."""
        stmt = sa.select(JobRecordModel).where(
            JobRecordModel.job_id == job_id,
            JobRecordModel.tenant_id == tenant_id,
        )
        result = await self._session.execute(stmt)
        return result.scalar_one_or_none()

    async def list_jobs(
        self,
        tenant_id: str,
        status: str | None = None,
        product: str | None = None,
        job_type: str | None = None,
        limit: int = 20,
        offset: int = 0,
        sort_by: str = "created_at",
        order: str = "desc",
    ) -> tuple[list[JobRecordModel], int]:
        """Return one page of a tenant's jobs together with the total match count.

        Args:
            tenant_id: Only jobs of this tenant are considered.
            status, product, job_type: Optional equality filters; falsy
                values are ignored.
            limit, offset: Page size and starting position.
            sort_by: Column to sort on; must be in ``_ALLOWED_SORT_FIELDS``.
            order: ``"asc"`` sorts ascending; any other value sorts descending.

        Returns:
            ``(jobs, total)`` where ``total`` counts every row matching the
            filters, ignoring ``limit`` and ``offset``.

        Raises:
            BadRequestError: If ``sort_by`` is not an allowed column.
        """
        # Validate against the allowlist to prevent column-name injection.
        if sort_by not in _ALLOWED_SORT_FIELDS:
            raise BadRequestError(
                f"Invalid sort field: {sort_by!r}. "
                f"Allowed: {', '.join(sorted(_ALLOWED_SORT_FIELDS))}"
            )

        # Build the predicates once so the page query and the count query
        # can never drift apart.
        filters = [JobRecordModel.tenant_id == tenant_id]
        if status:
            filters.append(JobRecordModel.status == status)
        if product:
            filters.append(JobRecordModel.product == product)
        if job_type:
            filters.append(JobRecordModel.job_type == job_type)

        direction = sa.asc if order == "asc" else sa.desc
        sort_col = getattr(JobRecordModel, sort_by)
        page_stmt = (
            sa.select(JobRecordModel)
            .where(*filters)
            # The sort column may hold duplicate values; the primary key is
            # added as a tie-breaker so OFFSET/LIMIT pages are deterministic
            # and rows are neither repeated nor skipped between pages.
            .order_by(direction(sort_col), direction(JobRecordModel.job_id))
            .offset(offset)
            .limit(limit)
        )
        count_stmt = (
            sa.select(sa.func.count()).select_from(JobRecordModel).where(*filters)
        )

        result = await self._session.execute(page_stmt)
        jobs = list(result.scalars().all())

        count_result = await self._session.execute(count_stmt)
        total = count_result.scalar_one()

        return jobs, total

    async def update_status(
        self,
        job_id: str,
        tenant_id: str,
        status: str,
        **fields: object,
    ) -> JobRecordModel | None:
        """Set a job's status, plus any extra attributes, and commit.

        Extra keyword arguments are assigned only when the record already has
        an attribute of that name; other keys are silently ignored.

        Returns the refreshed record, or ``None`` when the job does not exist
        for this tenant.
        """
        record = await self.get(job_id, tenant_id)
        if record is None:
            return None

        record.status = status
        for key, value in fields.items():
            if hasattr(record, key):
                setattr(record, key, value)

        await self._session.commit()
        await self._session.refresh(record)
        return record

    async def delete(self, job_id: str, tenant_id: str) -> bool:
        """Delete the tenant's job and report whether a row was removed."""
        stmt = sa.delete(JobRecordModel).where(
            JobRecordModel.job_id == job_id,
            JobRecordModel.tenant_id == tenant_id,
        )
        result: CursorResult[Any] = await self._session.execute(stmt)  # type: ignore[assignment]
        await self._session.commit()
        return bool(result.rowcount > 0)

    async def count_active(self, tenant_id: str) -> int:
        """Count the tenant's jobs whose status is ``pending`` or ``running``."""
        stmt = (
            sa.select(sa.func.count())
            .select_from(JobRecordModel)
            .where(
                JobRecordModel.tenant_id == tenant_id,
                JobRecordModel.status.in_(["pending", "running"]),
            )
        )
        result = await self._session.execute(stmt)
        return result.scalar_one()

    async def get_by_id(self, job_id: str) -> JobRecordModel | None:
        """Return the job with ``job_id`` regardless of tenant, or ``None``.

        Unlike :meth:`get`, this lookup applies no tenant filter.
        """
        stmt = sa.select(JobRecordModel).where(JobRecordModel.job_id == job_id)
        result = await self._session.execute(stmt)
        return result.scalar_one_or_none()
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Jobs API router."""
|
|
2
|
+
|
|
3
|
+
from typing import Literal
|
|
4
|
+
|
|
5
|
+
from fastapi import APIRouter, Depends, Query
|
|
6
|
+
from starlette.responses import JSONResponse
|
|
7
|
+
|
|
8
|
+
from pulse_engine.core.scope import require_scope
|
|
9
|
+
from pulse_engine.dependencies import get_job_service
|
|
10
|
+
from pulse_engine.extractor.schemas import (
|
|
11
|
+
CreateJobRequest,
|
|
12
|
+
CreateJobResponse,
|
|
13
|
+
JobListResponse,
|
|
14
|
+
JobResponse,
|
|
15
|
+
StatusUpdateRequest,
|
|
16
|
+
StatusUpdateResponse,
|
|
17
|
+
)
|
|
18
|
+
from pulse_engine.extractor.service import JobService
|
|
19
|
+
from pulse_engine.middleware.tenant import get_tenant_id
|
|
20
|
+
|
|
21
|
+
router = APIRouter(prefix="/jobs", tags=["Jobs"])
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@router.post(
    "/",
    status_code=202,
    response_model=CreateJobResponse,
    dependencies=[require_scope("jobs:trigger_next")],
)
async def register_job(
    body: CreateJobRequest,
    tenant_id: str = Depends(get_tenant_id),
    service: JobService = Depends(get_job_service),
) -> CreateJobResponse:
    # POST /jobs/ — requires the "jobs:trigger_next" scope and answers 202.
    # The request body is handed to the job service for the resolved tenant.
    created = await service.register_job(tenant_id, body)
    return created
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@router.get("/{job_id}", status_code=200, response_model=JobResponse)
async def get_job(
    job_id: str,
    tenant_id: str = Depends(get_tenant_id),
    service: JobService = Depends(get_job_service),
) -> JobResponse:
    # GET /jobs/{job_id} — the lookup is delegated to the job service and is
    # scoped to the resolved tenant.
    job = await service.get_job(tenant_id, job_id)
    return job
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@router.get("/", status_code=200, response_model=JobListResponse)
async def list_jobs(
    tenant_id: str = Depends(get_tenant_id),
    service: JobService = Depends(get_job_service),
    status: str | None = Query(None),
    product: str | None = Query(None),
    job_type: str | None = Query(None),
    limit: int = Query(20, ge=1, le=100),
    offset: int = Query(0, ge=0),
    sort_by: str = Query("created_at"),
    order: Literal["asc", "desc"] = Query("desc"),
) -> JobListResponse:
    # GET /jobs/ — optional filters plus paging (limit 1..100, offset >= 0)
    # and sorting; every query parameter is forwarded to the job service
    # unchanged.
    page = await service.list_jobs(
        tenant_id,
        status=status,
        product=product,
        job_type=job_type,
        limit=limit,
        offset=offset,
        sort_by=sort_by,
        order=order,
    )
    return page
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@router.post(
    "/{job_id}/status",
    status_code=200,
    response_model=StatusUpdateResponse,
    dependencies=[require_scope("jobs:status")],
)
async def push_status(
    job_id: str,
    body: StatusUpdateRequest,
    tenant_id: str = Depends(get_tenant_id),
    service: JobService = Depends(get_job_service),
) -> StatusUpdateResponse:
    # POST /jobs/{job_id}/status — requires the "jobs:status" scope; the
    # status payload is passed to the job service for the resolved tenant.
    outcome = await service.push_status(tenant_id, job_id, body)
    return outcome
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@router.post("/{job_id}/cancel", status_code=200, response_model=JobResponse)
async def cancel_job(
    job_id: str,
    tenant_id: str = Depends(get_tenant_id),
    service: JobService = Depends(get_job_service),
) -> JobResponse:
    # POST /jobs/{job_id}/cancel — cancellation is delegated to the job
    # service, which returns the job representation sent back to the client.
    cancelled = await service.cancel_job(tenant_id, job_id)
    return cancelled
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@router.delete("/{job_id}", status_code=204)
async def delete_job(
    job_id: str,
    tenant_id: str = Depends(get_tenant_id),
    service: JobService = Depends(get_job_service),
) -> None:
    # DELETE /jobs/{job_id} — deletion is delegated to the job service.
    #
    # Nothing is returned on purpose: a 204 response must not carry a body,
    # and building JSONResponse(status_code=204, content=None) would serialise
    # None to the four-byte body "null". With status_code=204 declared on the
    # route, FastAPI sends the response without a body.
    await service.delete_job(tenant_id, job_id)
|