temporal-ewma-worker 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. temporal_ewma_worker-0.1.1/PKG-INFO +21 -0
  2. temporal_ewma_worker-0.1.1/README.md +278 -0
  3. temporal_ewma_worker-0.1.1/pyproject.toml +39 -0
  4. temporal_ewma_worker-0.1.1/setup.cfg +4 -0
  5. temporal_ewma_worker-0.1.1/src/api/index.py +41 -0
  6. temporal_ewma_worker-0.1.1/src/api/rest/v1/app.py +33 -0
  7. temporal_ewma_worker-0.1.1/src/features/ewma_compute/index.py +3 -0
  8. temporal_ewma_worker-0.1.1/src/features/ewma_compute/service.py +40 -0
  9. temporal_ewma_worker-0.1.1/src/infra/adapters/clickhouse/clickhouse_adapter.py +95 -0
  10. temporal_ewma_worker-0.1.1/src/infra/adapters/kafka/kafka_alert_adapter.py +20 -0
  11. temporal_ewma_worker-0.1.1/src/infra/adapters/postgres/postgres_adapter.py +58 -0
  12. temporal_ewma_worker-0.1.1/src/infra/adapters/redis/redis_adapter.py +21 -0
  13. temporal_ewma_worker-0.1.1/src/shared/contracts/validator.py +53 -0
  14. temporal_ewma_worker-0.1.1/src/shared/errors/base.py +14 -0
  15. temporal_ewma_worker-0.1.1/src/shared/ports/alert_publisher_port.py +6 -0
  16. temporal_ewma_worker-0.1.1/src/shared/ports/clickhouse_port.py +16 -0
  17. temporal_ewma_worker-0.1.1/src/shared/ports/postgres_port.py +10 -0
  18. temporal_ewma_worker-0.1.1/src/shared/ports/redis_port.py +9 -0
  19. temporal_ewma_worker-0.1.1/src/shared/types/ewma_types.py +32 -0
  20. temporal_ewma_worker-0.1.1/src/temporal_ewma_worker.egg-info/PKG-INFO +21 -0
  21. temporal_ewma_worker-0.1.1/src/temporal_ewma_worker.egg-info/SOURCES.txt +27 -0
  22. temporal_ewma_worker-0.1.1/src/temporal_ewma_worker.egg-info/dependency_links.txt +1 -0
  23. temporal_ewma_worker-0.1.1/src/temporal_ewma_worker.egg-info/requires.txt +17 -0
  24. temporal_ewma_worker-0.1.1/src/temporal_ewma_worker.egg-info/top_level.txt +5 -0
  25. temporal_ewma_worker-0.1.1/src/worker/activities.py +62 -0
  26. temporal_ewma_worker-0.1.1/src/worker/config.py +53 -0
  27. temporal_ewma_worker-0.1.1/src/worker/index.py +69 -0
  28. temporal_ewma_worker-0.1.1/src/worker/registry.py +22 -0
  29. temporal_ewma_worker-0.1.1/src/worker/workflows.py +102 -0
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.4
2
+ Name: temporal-ewma-worker
3
+ Version: 0.1.1
4
+ Summary: Temporal scheduled workflow worker for EWMA baseline updating and anomaly checking
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: temporalio>=1.5.0
7
+ Requires-Dist: clickhouse-connect>=0.7.0
8
+ Requires-Dist: redis>=5.0.0
9
+ Requires-Dist: psycopg[binary]>=3.1.0
10
+ Requires-Dist: confluent-kafka>=2.3.0
11
+ Requires-Dist: opentelemetry-sdk>=1.26.0
12
+ Requires-Dist: pydantic>=2.0.0
13
+ Requires-Dist: fastapi>=0.100.0
14
+ Requires-Dist: uvicorn>=0.22.0
15
+ Provides-Extra: dev
16
+ Requires-Dist: pytest>=8.0; extra == "dev"
17
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
18
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
19
+ Requires-Dist: mypy>=1.8.0; extra == "dev"
20
+ Requires-Dist: ruff>=0.2.0; extra == "dev"
21
+ Requires-Dist: httpx>=0.24.0; extra == "dev"
@@ -0,0 +1,278 @@
1
+ # Temporal EWMA Worker
2
+
3
+ Temporal worker package for scheduled EWMA baseline updates and cost anomaly detection.
4
+
5
+ ---
6
+
7
+ ## Folder Structure
8
+
9
+ ```
10
+ .
11
+ ├── build/
12
+ │ └── Dockerfile
13
+ ├── contracts/
14
+ │ ├── asyncapi/
15
+ │ │ └── v1.yaml
16
+ │ ├── changelog.md
17
+ │ └── workflows/
18
+ │ └── ewma_baseline_update.yaml
19
+ ├── database/
20
+ │ ├── migrations/
21
+ │ │ ├── 0001_init.rollback.sql
22
+ │ │ └── 0001_init.sql
23
+ │ └── schema.lock
24
+ ├── deploy/
25
+ │ └── docker/
26
+ │ └── docker-compose.yaml
27
+ ├── feature-registry.yaml
28
+ ├── pyproject.toml
29
+ ├── README.md
30
+ ├── scripts/
31
+ │ ├── deploy_docker.sh
32
+ │ ├── migrate.py
33
+ │ ├── migrate.sh
34
+ │ ├── run.sh
35
+ │ └── test.sh
36
+ ├── src/
37
+ │ ├── features/
38
+ │ │ └── ewma_compute/
39
+ │ │ ├── index.py
40
+ │ │ └── service.py
41
+ │ ├── infra/
42
+ │ │ └── adapters/
43
+ │ │ ├── clickhouse/
44
+ │ │ │ └── clickhouse_adapter.py
45
+ │ │ ├── kafka/
46
+ │ │ │ └── kafka_alert_adapter.py
47
+ │ │ ├── postgres/
48
+ │ │ │ └── postgres_adapter.py
49
+ │ │ └── redis/
50
+ │ │ └── redis_adapter.py
51
+ │ ├── shared/
52
+ │ │ ├── contracts/
53
+ │ │ │ └── validator.py
54
+ │ │ ├── errors/
55
+ │ │ │ └── base.py
56
+ │ │ ├── ports/
57
+ │ │ │ ├── clickhouse_port.py
58
+ │ │ │ ├── postgres_port.py
59
+ │ │ │ └── redis_port.py
60
+ │ │ └── types/
61
+ │ │ └── ewma_types.py
62
+ │ └── worker/
63
+ │ ├── activities.py
64
+ │ ├── config.py
65
+ │ ├── index.py
66
+ │ ├── registry.py
67
+ │ └── workflows.py
68
+ ├── tests/
69
+ │ ├── integration/
70
+ │ │ └── test_adapters.py
71
+ │ └── unit/
72
+ │ ├── test_config.py
73
+ │ ├── test_contract.py
74
+ │ ├── test_ewma_service.py
75
+ │ └── test_workflow.py
76
+ └── worker-registry.yaml
77
+ ```
78
+
79
+ ---
80
+
81
+ ## Work Execution & Decision Flow
82
+
83
+ The following detailed decision tree outlines how the hourly workflow updates baselines and flags anomalies, with justification for each design choice:
84
+
85
+ ```
86
+ [Hourly Cron Trigger (0 * * * *)]
87
+ └── EwmaBaselineUpdate Workflow Starts
88
+
89
+ │ ► RATIONALE: The cron schedule (0 * * * *) starts the workflow at the top of every hour.
90
+
91
+ └── Activity: fetch_active_pairs()
92
+
93
+ │ ► RATIONALE: Scans ClickHouse log volumes for active (service, model) pairs
94
+ │ in the last 7 days. This filters out millions of historical combinations,
95
+ │ focusing computation ONLY on active traffic to minimize cost and execution time.
96
+
97
+ └── Loop over active (service, model) pairs concurrently:
98
+
99
+ │ ► RATIONALE: Temporal workflows run loops concurrently. Concurrency allows
100
+ │ thousands of pairs to be evaluated in parallel without blocking.
101
+
102
+ ├── Activity: get_baseline(service, model, hour_of_week)
103
+ │ │
104
+ │ │ ► RATIONALE: Reads the current baseline record from PostgreSQL. PostgreSQL is
105
+ │ │ used here because it provides ACID compliance for historical baselines.
106
+ │ │
107
+ │ ├── Existing Baseline NOT found (Cold Start)
108
+ │ │ ├── Activity: fetch_global_model_avg(model, hour_of_week)
109
+ │ │ │
110
+ │ │ │ ► RATIONALE: Lacking historical service/model pairing, we seed the baseline
111
+ │ │ │ using the global average cost for this specific model (e.g. gpt-4o) across
112
+ │ │ │ all services. This prevents false positive anomaly triggers during cold starts.
113
+ │ │ │
114
+ │ │ └── Seed EWMA baseline value = Global Model Average
115
+ │ │
116
+ │ └── Existing Baseline found (Warm Status)
117
+ │ ├── Activity: fetch_cost_history(service, model, hour_of_week)
118
+ │ │ │
119
+ │ │ │ ► RATIONALE: Queries ClickHouse for the cost of the same hour_of_week (0-167)
120
+ │ │ │ over the last 4 weeks. ClickHouse is selected here because column-oriented
121
+ │ │ │ storage allows ultra-fast aggregation of historical logs.
122
+ │ │ │
123
+ │ │ └── Fetch last 4 occurrences from ClickHouse
124
+ │ └── Compute EWMA baseline value using α=0.1:
125
+ │ EWMA_new = (1 - α) * EWMA_prev + α * Cost_current
126
+
127
+ ├── Activity: fetch_current_cost_1h(service, model)
128
+
129
+ ├── Activity: upsert_baseline(EwmaRecord)
130
+ │ │
131
+ │ │ ► RATIONALE: Persists the calculated baseline to PostgreSQL for persistent audit trail.
132
+ │ │
133
+ │ └── Persist updated baseline to PostgreSQL
134
+
135
+ ├── Write updated EWMA value to Redis Cache
136
+ │ │
137
+ │ │ ► RATIONALE: Anomaly-detection gateways on the ingestion path need ultra-low latency.
138
+ │ │ Redis caches the calculated baseline under: ewma:cost:{service}:{model}:{hour_of_week}
139
+ │ │
140
+ │ └── Key: ewma:cost:{service}:{model}:{hour_of_week}
141
+
142
+ └── Decision: Is Cost_current > (3 * EWMA_baseline)?
143
+
144
+ ├── YES (Anomaly Detected)
145
+ │ ├── Activity: fetch_cost_by_cluster_1h(service, model)
146
+ │ │ │
147
+ │ │ │ ► RATIONALE: If cost spikes, we query ClickHouse to break down the cost
148
+ │ │ │ contributions by Kubernetes cluster/namespace to locate the root cause.
149
+ │ │ │
150
+ │ │ └── Get cluster drilldown metrics
151
+ │ │
152
+ │ └── Activity: publish_anomaly_alert(AnomalyPayload)
153
+ │ │
154
+ │ │ ► RATIONALE: Publishes to Kafka topic 'alerts.cost.anomaly'. Using Kafka
155
+ │ │ decouples anomaly detection from notification delivery (Slack, pager).
156
+ │ │
157
+ │ └── Emit alert JSON to Kafka topic
158
+
159
+ └── NO (Normal State)
160
+ └── Do nothing
161
+ ```
162
+
163
+ ---
164
+
165
+ ## Sequencing & Dependency Map
166
+
167
+ To run the worker successfully, you MUST spin up and configure dependencies in the following strict order:
168
+
169
+ ```
170
+ [Step 1: Docker Containers] ---> [Step 2: Configuration] ---> [Step 3: DB Migrations] ---> [Step 4: Verification] ---> [Step 5: Start Worker]
171
+ • ClickHouse (8123) • Copy .env.example • ./scripts/migrate.sh • ./scripts/test.sh • ./scripts/run.sh
172
+ • PostgreSQL (5432) • Set hosts & ports (Applies SQL schemas) (Ensures integrations (Starts polling
173
+ • Redis Cache (6379) and mock runs pass) Temporal task queue)
174
+ • Kafka & Zookeeper (9092)
175
+ • Temporal Server (7233)
176
+ ```
177
+
178
+ ---
179
+
180
+ ## Setup & Running
181
+
182
+ Follow these steps to set up the local development environment and run the worker:
183
+
184
+ ### 1. Prerequisites
185
+ Ensure you have the following installed:
186
+ - Python 3.11+
187
+ - Docker & Docker Compose
188
+ - Git
189
+
190
+ ### 2. Configure Virtual Environment & Dependencies
191
+ Create a virtual environment and install the package along with development requirements:
192
+ ```bash
193
+ # Create virtual environment
194
+ python3 -m venv .venv
195
+
196
+ # Activate virtual environment
197
+ source .venv/bin/activate
198
+
199
+ # Install package in editable mode with development dependencies
200
+ pip install -e ".[dev]"
201
+ ```
202
+
203
+ ### 3. Spin Up Infrastructure
204
+ Use the provided `docker-compose` file to run ClickHouse, Postgres, Redis, Kafka, and Temporal locally:
205
+ ```bash
206
+ docker compose -f deploy/docker/docker-compose.yaml up -d
207
+ ```
208
+
209
+ ### 4. Configure Environment Variables
210
+ Copy the template `.env.example` to `.env` and fill in custom connection strings if necessary:
211
+ ```bash
212
+ cp .env.example .env
213
+ ```
214
+
215
+ ---
216
+
217
+ ## Database Migrations Guide
218
+
219
+ The database schema is managed via light-weight migration scripts tracked under `database/migrations/` and verified using a `schema.lock` file.
220
+
221
+ ### How it Works
222
+ The migration status is tracked in `database/schema.lock`, which contains the active version tag (e.g. `0001` or `0000`).
223
+
224
+ ### Apply Migrations (UP)
225
+ To apply pending database schemas, run:
226
+ ```bash
227
+ ./scripts/migrate.sh up
228
+ ```
229
+ This runs `0001_init.sql` against the configured PostgreSQL database and writes `0001` to `schema.lock`.
230
+
231
+ ### Rollback Migrations
232
+ To revert schemas and return to baseline state, run:
233
+ ```bash
234
+ ./scripts/migrate.sh rollback
235
+ ```
236
+ This executes the rollback SQL scripts and sets the `schema.lock` version to `0000`.
237
+
238
+ ### Creating a New Migration
239
+ 1. Add your SQL changes inside `database/migrations/` using a sequential identifier (e.g., `0002_add_index.sql` and `0002_add_index.rollback.sql`).
240
+ 2. Update the transition mappings inside `scripts/migrate.py` to support applying and rolling back your new script file.
241
+
242
+ ---
243
+
244
+ ## Running Verification & Worker
245
+
246
+ ### 1. Run Tests
247
+ Verify configuration, domain services, and workflow behavior using the test script:
248
+ ```bash
249
+ ./scripts/test.sh
250
+ ```
251
+
252
+ ### 2. Run Worker
253
+ Start the Temporal worker polling queue `ewma-tasks`:
254
+ ```bash
255
+ ./scripts/run.sh
256
+ ```
257
+
258
+ ---
259
+
260
+ ## Remote Management API (REST)
261
+
262
+ The worker now includes a FastAPI management layer (port 8000 in prod).
263
+
264
+ | Endpoint | Method | Description |
265
+ | :--- | :--- | :--- |
266
+ | `/health` | GET | Check worker status and config. |
267
+ | `/trigger` | POST | Trigger the EWMA baseline update workflow on-demand. |
268
+
269
+ ### Example Execution
270
+
271
+ ```bash
272
+ curl -X POST http://localhost:8000/trigger \
273
+ -H "Content-Type: application/json" \
274
+ -d '{
275
+ "force_hour": 42
276
+ }'
277
+ ```
278
+
@@ -0,0 +1,39 @@
1
+ [project]
2
+ name = "temporal-ewma-worker"
3
+ version = "0.1.1"
4
+ description = "Temporal scheduled workflow worker for EWMA baseline updating and anomaly checking"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "temporalio>=1.5.0",
8
+ "clickhouse-connect>=0.7.0",
9
+ "redis>=5.0.0",
10
+ "psycopg[binary]>=3.1.0",
11
+ "confluent-kafka>=2.3.0",
12
+ "opentelemetry-sdk>=1.26.0",
13
+ "pydantic>=2.0.0",
14
+ "fastapi>=0.100.0",
15
+ "uvicorn>=0.22.0"
16
+ ]
17
+
18
+ [project.optional-dependencies]
19
+ dev = [
20
+ "pytest>=8.0",
21
+ "pytest-cov>=4.1.0",
22
+ "pytest-asyncio>=0.23.0",
23
+ "mypy>=1.8.0",
24
+ "ruff>=0.2.0",
25
+ "httpx>=0.24.0"
26
+ ]
27
+
28
+
29
+ [tool.pytest.ini_options]
30
+ pythonpath = ["src"]
31
+ testpaths = ["tests"]
32
+
33
+ [build-system]
34
+ requires = ["setuptools>=61.0", "wheel"]
35
+ build-backend = "setuptools.build_meta"
36
+
37
+ [tool.setuptools.packages.find]
38
+ where = ["src"]
39
+
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,41 @@
1
+ import time
2
+ from temporalio.client import Client
3
+ from worker.config import load_config
4
+ from worker.workflows import EwmaWorkflowInput
5
+
6
+
7
async def health(env: dict[str, str] | None = None) -> dict:
    """Return a status payload describing the worker's effective configuration.

    Args:
        env: Optional mapping handed to ``load_config`` in place of the
            process environment.

    Returns:
        A dict with ``status`` set to ``"ok"`` plus the Temporal host,
        namespace and task queue, the Redis URL and the Kafka bootstrap
        servers.  A password embedded in the Redis URL is replaced with
        ``***`` so the payload can be served without leaking credentials.
    """
    from urllib.parse import urlsplit, urlunsplit

    cfg = load_config(env)

    redis_url = cfg.redis_url
    parts = urlsplit(redis_url)
    if parts.password is not None:
        # netloc is "user:password@host:port"; keep the user and host parts,
        # mask only the secret.
        userinfo, _, hostinfo = parts.netloc.rpartition("@")
        user = userinfo.partition(":")[0]
        redis_url = urlunsplit(parts._replace(netloc=f"{user}:***@{hostinfo}"))

    return {
        "status": "ok",
        "temporal_host": cfg.temporal_host,
        "temporal_namespace": cfg.temporal_namespace,
        "temporal_task_queue": cfg.temporal_task_queue,
        "redis_url": redis_url,
        "kafka_bootstrap_servers": cfg.kafka_bootstrap_servers,
    }
17
+
18
+
19
async def trigger_workflow(
    force_hour: int | None = None, env: dict[str, str] | None = None
) -> dict:
    """Start the ``EwmaBaselineUpdate`` workflow on demand.

    Args:
        force_hour: Optional hour-of-week override forwarded to the workflow
            inside an ``EwmaWorkflowInput``.  When ``None`` the workflow is
            started without arguments.
        env: Optional mapping handed to ``load_config`` in place of the
            process environment.

    Returns:
        A dict with ``status`` (``"triggered"``), the generated
        ``workflow_id`` and the ``run_id`` of the first execution.
    """
    import uuid

    cfg = load_config(env)
    client = await Client.connect(cfg.temporal_host, namespace=cfg.temporal_namespace)

    # The timestamp alone has one-second resolution, so two triggers within
    # the same second would produce the same id; the random suffix keeps the
    # id unique while preserving the readable prefix.
    workflow_id = f"ewma-update-manual-{int(time.time())}-{uuid.uuid4().hex[:8]}"

    workflow_args = []
    if force_hour is not None:
        workflow_args.append(EwmaWorkflowInput(force_hour=force_hour))

    handle = await client.start_workflow(
        "EwmaBaselineUpdate",
        id=workflow_id,
        task_queue=cfg.temporal_task_queue,
        args=workflow_args,
    )

    return {
        "status": "triggered",
        "workflow_id": workflow_id,
        "run_id": handle.first_execution_run_id,
    }
@@ -0,0 +1,33 @@
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel, Field
3
+ from api.index import health as get_health, trigger_workflow as trigger_ewma_workflow
4
+
5
+ app = FastAPI(title="Temporal EWMA Worker API", version="1.0.0")
6
+
7
+
8
class TriggerRequest(BaseModel):
    """Optional JSON body accepted by ``POST /trigger``."""

    # Hour-of-week override; validation restricts it to 0..167 inclusive.
    force_hour: int | None = Field(default=None, ge=0, le=167)
10
+
11
+
12
@app.get("/health")
async def health() -> dict:
    """Serve the worker status payload produced by ``api.index.health``.

    Raises:
        HTTPException: With status 500 and the error text as detail when the
            underlying call fails.
    """
    try:
        return await get_health()
    except Exception as exc:
        # Chain the cause so the original traceback survives in server logs.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
18
+
19
+
20
@app.post("/trigger")
async def trigger_workflow(request: TriggerRequest | None = None) -> dict:
    """Trigger the EWMA baseline update workflow on demand.

    Args:
        request: Optional body; when present its ``force_hour`` is forwarded
            to the workflow trigger.

    Raises:
        HTTPException: With status 500 and the error text as detail when the
            trigger fails.
    """
    force_hour = request.force_hour if request is not None else None
    try:
        return await trigger_ewma_workflow(force_hour=force_hour)
    except Exception as exc:
        # Chain the cause so the original traceback survives in server logs.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
28
+
29
+
30
if __name__ == "__main__":
    # Imported lazily so importing this module (e.g. from tests or an ASGI
    # server) does not require uvicorn to be loaded.
    import uvicorn

    # Listens on all interfaces, port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
@@ -0,0 +1,3 @@
1
+ from features.ewma_compute.service import EwmaService
2
+
3
+ __all__ = ["EwmaService"]
@@ -0,0 +1,40 @@
1
+ from shared.types.ewma_types import EwmaRecord
2
+
3
+
4
class EwmaService:
    """Pure EWMA arithmetic and baseline-record transitions (no I/O)."""

    # A baseline stays flagged as cold-start until it has absorbed this many
    # samples.
    COLD_START_MIN_SAMPLES = 7

    @staticmethod
    def calculate_new_ewma(
        current_value: float, previous_ewma: float, alpha: float = 0.1
    ) -> float:
        """Blend a new observation into an existing EWMA.

        Computes ``alpha * current_value + (1 - alpha) * previous_ewma``.

        Args:
            current_value: The latest observation.
            previous_ewma: The EWMA before this observation.
            alpha: Smoothing factor in ``[0, 1]``; higher weights the new
                observation more.

        Raises:
            ValueError: If ``alpha`` lies outside ``[0, 1]`` (including NaN).
        """
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f"alpha must be within [0, 1], got {alpha!r}")
        return (alpha * current_value) + ((1.0 - alpha) * previous_ewma)

    @staticmethod
    def process_update(
        current_value: float,
        existing_record: "EwmaRecord | None",
        global_model_avg: float,
        *,
        service: str = "",
        model: str = "",
        hour_of_week: int = 0,
        alpha: float = 0.1,
    ) -> "EwmaRecord":
        """Produce the next baseline record for one (service, model, hour) slot.

        Args:
            current_value: The latest observed cost.
            existing_record: The stored baseline, or ``None`` on cold start.
            global_model_avg: Seed value used when there is no stored baseline.
            service: Identity to stamp on a cold-start record.  Defaults to
                ``""``, which matches the previous behaviour where the caller
                had to fill it in afterwards.
            model: Same as ``service``, for the model name.
            hour_of_week: Same as ``service``, for the hour-of-week slot.
            alpha: Smoothing factor forwarded to ``calculate_new_ewma``.

        Returns:
            On cold start, a record seeded with ``global_model_avg``,
            ``sample_count=1`` and ``is_cold_start=True``.  Otherwise a record
            carrying the existing identity, the blended EWMA, an incremented
            sample count, and ``is_cold_start`` true while the count is below
            ``COLD_START_MIN_SAMPLES``.
        """
        if existing_record is None:
            return EwmaRecord(
                service=service,
                model=model,
                hour_of_week=hour_of_week,
                ewma_value=global_model_avg,
                sample_count=1,
                is_cold_start=True,
            )

        samples = existing_record.sample_count + 1
        return EwmaRecord(
            service=existing_record.service,
            model=existing_record.model,
            hour_of_week=existing_record.hour_of_week,
            ewma_value=EwmaService.calculate_new_ewma(
                current_value, existing_record.ewma_value, alpha
            ),
            sample_count=samples,
            is_cold_start=samples < EwmaService.COLD_START_MIN_SAMPLES,
        )
@@ -0,0 +1,95 @@
1
+ from typing import List, Tuple
2
+ import clickhouse_connect
3
+ from shared.ports.clickhouse_port import ClickHousePort
4
+ from shared.types.ewma_types import ClusterCost
5
+
6
+
7
class ClickHouseAdapter(ClickHousePort):
    """ClickHouse-backed implementation of ``ClickHousePort``.

    All queries read the ``cost_by_dimension`` table.  Hour-of-week is derived
    in SQL as ``(toDayOfWeek(timestamp) - 1) * 24 + toHour(timestamp)``.

    NOTE(review): ``now()``, ``toDayOfWeek`` and ``toHour`` are evaluated by
    the server; confirm the server/column timezone matches whatever the
    workflow uses to compute ``hour_of_week``.
    """

    def __init__(
        self, host: str, port: int, username: str, password: str, database: str
    ):
        """Open a ClickHouse client for the given connection settings."""
        self.client = clickhouse_connect.get_client(
            host=host,
            port=port,
            username=username,
            password=password,
            database=database,
        )

    @staticmethod
    def _first_value_or_zero(rows) -> float:
        """Return the first cell of the first row as a float, or 0.0 if absent/NULL."""
        if rows and rows[0][0] is not None:
            return float(rows[0][0])
        return 0.0

    def get_active_pairs(self) -> List[Tuple[str, str]]:
        """Return the distinct (service, model) pairs seen in the last 7 days."""
        query = """
            SELECT DISTINCT service, model
            FROM cost_by_dimension
            WHERE timestamp >= now() - INTERVAL 7 DAY
        """
        rows = self.client.query(query).result_rows
        return [(str(service), str(model)) for service, model in rows]

    def get_cost_history(
        self, service: str, model: str, hour_of_week: int
    ) -> List[float]:
        """Return up to four hourly cost totals for this slot, newest first.

        Only rows from the last 28 days whose derived hour-of-week equals
        ``hour_of_week`` are considered; NULL totals are dropped.
        """
        query = """
            SELECT sum(cost) as hourly_cost
            FROM cost_by_dimension
            WHERE service = %(service)s
              AND model = %(model)s
              AND ((toDayOfWeek(timestamp) - 1) * 24 + toHour(timestamp)) = %(hour_of_week)s
              AND timestamp >= now() - INTERVAL 28 DAY
            GROUP BY toStartOfHour(timestamp)
            ORDER BY toStartOfHour(timestamp) DESC
            LIMIT 4
        """
        params = {"service": service, "model": model, "hour_of_week": hour_of_week}
        rows = self.client.query(query, params).result_rows
        return [float(row[0]) for row in rows if row[0] is not None]

    def get_global_model_avg(self, model: str, hour_of_week: int) -> float:
        """Return the mean hourly cost of ``model`` for this slot over 28 days.

        Each hourly bucket sums cost over every row for the model (there is no
        service filter), and the buckets are then averaged.  Returns 0.0 when
        there is no data.
        """
        query = """
            SELECT avg(hourly_cost)
            FROM (
                SELECT sum(cost) as hourly_cost
                FROM cost_by_dimension
                WHERE model = %(model)s
                  AND ((toDayOfWeek(timestamp) - 1) * 24 + toHour(timestamp)) = %(hour_of_week)s
                  AND timestamp >= now() - INTERVAL 28 DAY
                GROUP BY toStartOfHour(timestamp)
            )
        """
        params = {"model": model, "hour_of_week": hour_of_week}
        return self._first_value_or_zero(self.client.query(query, params).result_rows)

    def get_current_cost_1h(self, service: str, model: str) -> float:
        """Return total cost for the pair over the trailing hour (0.0 if none).

        The window is ``now() - 1 hour`` to now, i.e. a rolling hour rather
        than a calendar-hour bucket.
        """
        query = """
            SELECT sum(cost)
            FROM cost_by_dimension
            WHERE service = %(service)s
              AND model = %(model)s
              AND timestamp >= now() - INTERVAL 1 HOUR
        """
        params = {"service": service, "model": model}
        return self._first_value_or_zero(self.client.query(query, params).result_rows)

    def get_cost_by_cluster_1h(self, service: str, model: str) -> List[ClusterCost]:
        """Return trailing-hour cost for the pair broken down by ``cluster_id``.

        Rows with a NULL cluster id or NULL cost are skipped.
        """
        query = """
            SELECT cluster_id, sum(cost) as cost
            FROM cost_by_dimension
            WHERE service = %(service)s
              AND model = %(model)s
              AND timestamp >= now() - INTERVAL 1 HOUR
            GROUP BY cluster_id
        """
        params = {"service": service, "model": model}
        rows = self.client.query(query, params).result_rows
        return [
            ClusterCost(cluster_id=str(row[0]), cost=float(row[1]))
            for row in rows
            if row[0] is not None and row[1] is not None
        ]
@@ -0,0 +1,20 @@
1
+ import json
2
+ from dataclasses import asdict
3
+ from confluent_kafka import Producer
4
+ from shared.ports.alert_publisher_port import AlertPublisherPort
5
+ from shared.types.ewma_types import AnomalyPayload
6
+
7
+
8
class KafkaAlertAdapter(AlertPublisherPort):
    """Publishes anomaly alerts as JSON messages to a Kafka topic."""

    # Topic every anomaly alert is written to.
    TOPIC = "alerts.cost.anomaly"

    def __init__(self, bootstrap_servers: str, flush_timeout_s: float = 10.0):
        """Create the underlying producer.

        Args:
            bootstrap_servers: Comma-separated broker list for
                ``bootstrap.servers``.
            flush_timeout_s: Maximum seconds ``publish_anomaly`` waits for the
                broker to acknowledge the message.
        """
        self.producer = Producer({"bootstrap.servers": bootstrap_servers})
        self.flush_timeout_s = flush_timeout_s

    def publish_anomaly(self, payload: AnomalyPayload) -> None:
        """Send one alert, keyed by ``service:model``, and wait for delivery.

        Raises:
            RuntimeError: If the broker reports a delivery error or the
                message is still undelivered when the flush timeout expires.
                In the timeout case the message may still be delivered later,
                so a retry by the caller can produce a duplicate alert.
        """
        delivery_errors: list[str] = []

        def _on_delivery(err, _msg) -> None:
            # Invoked from flush(); err is None on success.
            if err is not None:
                delivery_errors.append(str(err))

        self.producer.produce(
            topic=self.TOPIC,
            key=f"{payload.service}:{payload.model}",
            value=json.dumps(asdict(payload)).encode("utf-8"),
            on_delivery=_on_delivery,
        )

        # flush() returns the number of messages still queued after the timeout.
        undelivered = self.producer.flush(self.flush_timeout_s)

        if delivery_errors:
            raise RuntimeError(
                f"Kafka delivery failed for topic {self.TOPIC}: {delivery_errors[0]}"
            )
        if undelivered:
            raise RuntimeError(
                f"Kafka delivery to topic {self.TOPIC} not confirmed within "
                f"{self.flush_timeout_s}s"
            )
@@ -0,0 +1,58 @@
1
+ import psycopg
2
+ from shared.ports.postgres_port import PostgresPort
3
+ from shared.types.ewma_types import EwmaRecord
4
+
5
+
6
class PostgresAdapter(PostgresPort):
    """PostgreSQL-backed store for rows of the ``ewma_baselines`` table.

    A fresh connection is opened for every call and closed when the call
    returns.
    """

    _SELECT_BASELINE = """
        SELECT service, model, hour_of_week, ewma_value, sample_count, is_cold_start, updated_at
        FROM ewma_baselines
        WHERE service = %s AND model = %s AND hour_of_week = %s
    """

    _UPSERT_BASELINE = """
        INSERT INTO ewma_baselines (service, model, hour_of_week, ewma_value, sample_count, is_cold_start, updated_at)
        VALUES (%s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP)
        ON CONFLICT (service, model, hour_of_week)
        DO UPDATE SET
            ewma_value = EXCLUDED.ewma_value,
            sample_count = EXCLUDED.sample_count,
            is_cold_start = EXCLUDED.is_cold_start,
            updated_at = CURRENT_TIMESTAMP
    """

    def __init__(self, dsn: str):
        """Remember the connection string; no connection is opened here."""
        self.dsn = dsn

    def get_baseline(
        self, service: str, model: str, hour_of_week: int
    ) -> EwmaRecord | None:
        """Fetch the stored baseline for one slot, or ``None`` if there is none."""
        with psycopg.connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.execute(self._SELECT_BASELINE, (service, model, hour_of_week))
                row = cur.fetchone()

        if not row:
            return None
        return EwmaRecord(
            service=row[0],
            model=row[1],
            hour_of_week=row[2],
            ewma_value=float(row[3]),
            sample_count=int(row[4]),
            is_cold_start=bool(row[5]),
            updated_at=row[6],
        )

    def upsert_baseline(self, record: EwmaRecord) -> None:
        """Insert the record, or overwrite the row with the same slot key.

        ``updated_at`` is always set by the database to the current
        timestamp; ``record.updated_at`` is not written.
        """
        values = (
            record.service,
            record.model,
            record.hour_of_week,
            record.ewma_value,
            record.sample_count,
            record.is_cold_start,
        )
        # Leaving the psycopg 3 connection context without an exception
        # commits the transaction (and rolls back on error), so no explicit
        # commit is needed.
        with psycopg.connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.execute(self._UPSERT_BASELINE, values)
@@ -0,0 +1,21 @@
1
+ import redis
2
+ from shared.ports.redis_port import RedisPort
3
+
4
+
5
class RedisAdapter(RedisPort):
    """Redis cache for EWMA baselines, one string value per slot."""

    def __init__(self, url: str):
        """Create a client from a Redis URL."""
        self.client = redis.from_url(url)

    @staticmethod
    def _key(service: str, model: str, hour_of_week: int) -> str:
        """Build the cache key shared by the getter and the setter."""
        return f"ewma:cost:{service}:{model}:{hour_of_week}"

    def get_ewma(self, service: str, model: str, hour_of_week: int) -> float | None:
        """Return the cached EWMA for the slot, or ``None`` when not cached.

        A cached value that is neither ``str`` nor ``bytes`` is also reported
        as ``None``.
        """
        cached = self.client.get(self._key(service, model, hour_of_week))
        if isinstance(cached, (str, bytes)):
            return float(cached)
        return None

    def set_ewma(
        self, service: str, model: str, hour_of_week: int, value: float
    ) -> None:
        """Store the EWMA for the slot as its string form (no expiry is set)."""
        self.client.set(self._key(service, model, hour_of_week), str(value))
@@ -0,0 +1,53 @@
1
+ import os
2
+ import re
3
+ from pathlib import Path
4
+ from shared.errors.base import ValidationError
5
+
6
+ BASE_DIR = Path(__file__).resolve().parents[3]
7
+ CONTRACT_FILE: Path = Path(
8
+ os.getenv(
9
+ "CONTRACTS_PATH",
10
+ BASE_DIR / "contracts" / "workflows" / "ewma_baseline_update.yaml",
11
+ )
12
+ )
13
+
14
+ if not CONTRACT_FILE.exists():
15
+ CONTRACT_FILE = Path("/app/contracts/workflows/ewma_baseline_update.yaml")
16
+
17
+
18
def _extract_val(text: str, pattern: str) -> str:
    """Return the stripped first capture group of ``pattern`` found in ``text``.

    The search runs in multiline mode, so ``^`` and ``$`` anchor at line
    boundaries.

    Raises:
        ValidationError: If ``pattern`` does not match anywhere in ``text``.
    """
    match = re.search(pattern, text, re.MULTILINE)
    if match is None:
        raise ValidationError(f"Missing expected pattern in contract: {pattern}")
    return match.group(1).strip()
23
+
24
+
25
def load_workflow_contract() -> dict:
    """Read, validate and summarise the workflow contract file.

    Returns:
        A dict with the fixed ``workflow`` name, the integer ``version`` and
        the ``cron`` expression extracted from the contract text.

    Raises:
        ValidationError: If the file is missing, a required fragment is
            absent, or an expected field cannot be extracted.
    """
    if not CONTRACT_FILE.exists():
        raise ValidationError(f"Contract file not found at {CONTRACT_FILE}")

    # Decode explicitly so parsing does not depend on the platform locale.
    text = CONTRACT_FILE.read_text(encoding="utf-8")
    validate_workflow_contract(text)

    version = int(_extract_val(text, r"^version:\s*(\d+)\s*$"))
    cron = _extract_val(text, r"^schedule:\s*\n\s*cron:\s*\"(.*)\"\s*$")
    return {
        "workflow": "ewma_baseline_update",
        "version": version,
        "cron": cron,
    }
36
+
37
+
38
def validate_workflow_contract(text: str) -> None:
    """Check that the contract text contains every required declaration.

    The check is a plain substring search for each required fragment,
    followed by a check that the ``version:`` line holds an integer >= 1.

    Raises:
        ValidationError: On the first missing fragment, a missing
            ``version:`` line, or a version below 1.
    """
    required_fragments = (
        "workflow: ewma_baseline_update",
        'cron: "0 * * * *"',
        "activities:",
        "fetch_active_pairs:",
        "upsert_baseline:",
        "publish_anomaly_alert:",
    )
    for fragment in required_fragments:
        if fragment not in text:
            raise ValidationError(f"Missing required contract fragment: {fragment}")

    if int(_extract_val(text, r"^version:\s*(\d+)\s*$")) < 1:
        raise ValidationError("Contract version must be positive integer")
@@ -0,0 +1,14 @@
1
class WorkerError(Exception):
    """Base class for every error raised by this package."""


class ValidationError(WorkerError):
    """Raised when configuration or contract content fails validation."""


class ClickHouseQueryError(WorkerError):
    """Error type reserved for ClickHouse query failures."""


class DatabaseConnectionError(WorkerError):
    """Error type reserved for database connection failures."""
@@ -0,0 +1,6 @@
1
+ from typing import Protocol
2
+ from shared.types.ewma_types import AnomalyPayload
3
+
4
+
5
class AlertPublisherPort(Protocol):
    """Structural interface for anything that can publish anomaly alerts."""

    def publish_anomaly(self, payload: AnomalyPayload) -> None:
        """Publish one anomaly alert."""
        ...
@@ -0,0 +1,16 @@
1
+ from typing import Protocol, Tuple, List
2
+ from shared.types.ewma_types import ClusterCost
3
+
4
+
5
class ClickHousePort(Protocol):
    """Structural interface for the cost queries the worker needs."""

    def get_active_pairs(self) -> List[Tuple[str, str]]:
        """Return (service, model) pairs considered active."""
        ...

    def get_cost_history(
        self, service: str, model: str, hour_of_week: int
    ) -> List[float]:
        """Return historical hourly costs for one (service, model, hour) slot."""
        ...

    def get_global_model_avg(self, model: str, hour_of_week: int) -> float:
        """Return the model-wide average hourly cost for one hour-of-week."""
        ...

    def get_current_cost_1h(self, service: str, model: str) -> float:
        """Return the pair's cost over the most recent hour."""
        ...

    def get_cost_by_cluster_1h(self, service: str, model: str) -> List[ClusterCost]:
        """Return the pair's most-recent-hour cost split by cluster."""
        ...
@@ -0,0 +1,10 @@
1
+ from typing import Protocol
2
+ from shared.types.ewma_types import EwmaRecord
3
+
4
+
5
class PostgresPort(Protocol):
    """Structural interface for the persistent baseline store."""

    def get_baseline(
        self, service: str, model: str, hour_of_week: int
    ) -> EwmaRecord | None:
        """Return the stored baseline for the slot, or ``None`` if absent."""
        ...

    def upsert_baseline(self, record: EwmaRecord) -> None:
        """Insert or overwrite the baseline for the record's slot."""
        ...
@@ -0,0 +1,9 @@
1
+ from typing import Protocol
2
+
3
+
4
+ class RedisPort(Protocol):
5
+ def get_ewma(self, service: str, model: str, hour_of_week: int) -> float | None: ...
6
+
7
+ def set_ewma(
8
+ self, service: str, model: str, hour_of_week: int, value: float
9
+ ) -> None: ...
@@ -0,0 +1,32 @@
1
+ from dataclasses import dataclass
2
+ from datetime import datetime
3
+
4
+
5
+ @dataclass
6
+ class EwmaRecord:
7
+ service: str
8
+ model: str
9
+ hour_of_week: int
10
+ ewma_value: float
11
+ sample_count: int
12
+ is_cold_start: bool
13
+ updated_at: datetime | None = None
14
+
15
+
16
@dataclass
class ClusterCost:
    """Cost attributed to a single cluster."""

    cluster_id: str
    cost: float
20
+
21
+
22
@dataclass
class AnomalyPayload:
    """Alert body describing one detected cost anomaly."""

    service: str
    model: str
    hour_of_week: int
    # Observed cost that triggered the alert.
    current_cost: float
    # Baseline the observed cost was compared against.
    ewma_value: float
    # Cost level the observation had to exceed to count as an anomaly.
    threshold_value: float
    sample_count: int
    # Kept as a string so the payload serialises to JSON without conversion.
    timestamp: str
    # Per-cluster breakdown of the anomalous cost.
    cluster_drilldown: list[ClusterCost]
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.4
2
+ Name: temporal-ewma-worker
3
+ Version: 0.1.1
4
+ Summary: Temporal scheduled workflow worker for EWMA baseline updating and anomaly checking
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: temporalio>=1.5.0
7
+ Requires-Dist: clickhouse-connect>=0.7.0
8
+ Requires-Dist: redis>=5.0.0
9
+ Requires-Dist: psycopg[binary]>=3.1.0
10
+ Requires-Dist: confluent-kafka>=2.3.0
11
+ Requires-Dist: opentelemetry-sdk>=1.26.0
12
+ Requires-Dist: pydantic>=2.0.0
13
+ Requires-Dist: fastapi>=0.100.0
14
+ Requires-Dist: uvicorn>=0.22.0
15
+ Provides-Extra: dev
16
+ Requires-Dist: pytest>=8.0; extra == "dev"
17
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
18
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
19
+ Requires-Dist: mypy>=1.8.0; extra == "dev"
20
+ Requires-Dist: ruff>=0.2.0; extra == "dev"
21
+ Requires-Dist: httpx>=0.24.0; extra == "dev"
@@ -0,0 +1,27 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/api/index.py
4
+ src/api/rest/v1/app.py
5
+ src/features/ewma_compute/index.py
6
+ src/features/ewma_compute/service.py
7
+ src/infra/adapters/clickhouse/clickhouse_adapter.py
8
+ src/infra/adapters/kafka/kafka_alert_adapter.py
9
+ src/infra/adapters/postgres/postgres_adapter.py
10
+ src/infra/adapters/redis/redis_adapter.py
11
+ src/shared/contracts/validator.py
12
+ src/shared/errors/base.py
13
+ src/shared/ports/alert_publisher_port.py
14
+ src/shared/ports/clickhouse_port.py
15
+ src/shared/ports/postgres_port.py
16
+ src/shared/ports/redis_port.py
17
+ src/shared/types/ewma_types.py
18
+ src/temporal_ewma_worker.egg-info/PKG-INFO
19
+ src/temporal_ewma_worker.egg-info/SOURCES.txt
20
+ src/temporal_ewma_worker.egg-info/dependency_links.txt
21
+ src/temporal_ewma_worker.egg-info/requires.txt
22
+ src/temporal_ewma_worker.egg-info/top_level.txt
23
+ src/worker/activities.py
24
+ src/worker/config.py
25
+ src/worker/index.py
26
+ src/worker/registry.py
27
+ src/worker/workflows.py
@@ -0,0 +1,17 @@
1
+ temporalio>=1.5.0
2
+ clickhouse-connect>=0.7.0
3
+ redis>=5.0.0
4
+ psycopg[binary]>=3.1.0
5
+ confluent-kafka>=2.3.0
6
+ opentelemetry-sdk>=1.26.0
7
+ pydantic>=2.0.0
8
+ fastapi>=0.100.0
9
+ uvicorn>=0.22.0
10
+
11
+ [dev]
12
+ pytest>=8.0
13
+ pytest-cov>=4.1.0
14
+ pytest-asyncio>=0.23.0
15
+ mypy>=1.8.0
16
+ ruff>=0.2.0
17
+ httpx>=0.24.0
@@ -0,0 +1,5 @@
1
+ api
2
+ features
3
+ infra
4
+ shared
5
+ worker
@@ -0,0 +1,62 @@
1
+ from typing import List, Tuple
2
+ from temporalio import activity
3
+ from shared.ports.clickhouse_port import ClickHousePort
4
+ from shared.ports.redis_port import RedisPort
5
+ from shared.ports.postgres_port import PostgresPort
6
+ from shared.ports.alert_publisher_port import AlertPublisherPort
7
+ from shared.types.ewma_types import EwmaRecord, ClusterCost, AnomalyPayload
8
+
9
+
10
class EwmaActivities:
    """Temporal activities that delegate to the injected ports.

    NOTE(review): every activity is declared ``async def`` but calls its
    (synchronous) port method directly, so the call runs on the event loop
    thread.  Confirm this is acceptable for the worker's concurrency settings
    before moving the calls to threads; the adapters share one client each.
    """

    def __init__(
        self,
        clickhouse: ClickHousePort,
        redis: RedisPort,
        postgres: PostgresPort,
        alert_publisher: AlertPublisherPort,
    ):
        """Store the ports the activities delegate to."""
        self.clickhouse = clickhouse
        self.redis = redis
        self.postgres = postgres
        self.alert_publisher = alert_publisher

    @activity.defn(name="fetch_active_pairs")
    async def fetch_active_pairs(self) -> List[Tuple[str, str]]:
        """Return the active (service, model) pairs from ClickHouse."""
        return self.clickhouse.get_active_pairs()

    @activity.defn(name="fetch_cost_history")
    async def fetch_cost_history(
        self, service: str, model: str, hour_of_week: int
    ) -> List[float]:
        """Return historical hourly costs for one slot from ClickHouse."""
        return self.clickhouse.get_cost_history(service, model, hour_of_week)

    @activity.defn(name="fetch_global_model_avg")
    async def fetch_global_model_avg(self, model: str, hour_of_week: int) -> float:
        """Return the model-wide average hourly cost for one hour-of-week."""
        return self.clickhouse.get_global_model_avg(model, hour_of_week)

    @activity.defn(name="fetch_current_cost_1h")
    async def fetch_current_cost_1h(self, service: str, model: str) -> float:
        """Return the pair's cost over the most recent hour."""
        return self.clickhouse.get_current_cost_1h(service, model)

    @activity.defn(name="fetch_cost_by_cluster_1h")
    async def fetch_cost_by_cluster_1h(
        self, service: str, model: str
    ) -> List[ClusterCost]:
        """Return the pair's most-recent-hour cost split by cluster."""
        return self.clickhouse.get_cost_by_cluster_1h(service, model)

    @activity.defn(name="get_baseline")
    async def get_baseline(
        self, service: str, model: str, hour_of_week: int
    ) -> EwmaRecord | None:
        """Return the stored baseline for one slot, or ``None`` if absent."""
        return self.postgres.get_baseline(service, model, hour_of_week)

    @activity.defn(name="upsert_baseline")
    async def upsert_baseline(self, record: EwmaRecord) -> None:
        """Persist the baseline to Postgres, then mirror its value into Redis.

        Postgres is written first, so a failure there leaves the cache
        untouched.
        """
        self.postgres.upsert_baseline(record)
        self.redis.set_ewma(
            record.service, record.model, record.hour_of_week, record.ewma_value
        )

    @activity.defn(name="publish_anomaly_alert")
    async def publish_anomaly_alert(self, payload: AnomalyPayload) -> None:
        """Publish one anomaly alert through the alert publisher port."""
        self.alert_publisher.publish_anomaly(payload)
@@ -0,0 +1,53 @@
1
+ import os
2
+ from dataclasses import dataclass
3
+ from shared.errors.base import ValidationError
4
+
5
+
6
@dataclass(frozen=True)
class WorkerConfig:
    """Immutable connection settings consumed by the worker bootstrap."""

    # Temporal connection.
    temporal_host: str
    temporal_namespace: str
    temporal_task_queue: str

    # ClickHouse connection.
    clickhouse_host: str
    clickhouse_port: int
    clickhouse_username: str
    clickhouse_password: str
    clickhouse_database: str

    # Redis, Postgres and Kafka endpoints.
    redis_url: str
    postgres_dsn: str
    kafka_bootstrap_servers: str
19
+
20
+
21
+ def _int_val(raw: str, key: str) -> int:
22
+ try:
23
+ return int(raw)
24
+ except ValueError as exc:
25
+ raise ValidationError(f"{key} must be an integer") from exc
26
+
27
+
28
def load_config(env: dict[str, str] | None = None) -> WorkerConfig:
    """Build a ``WorkerConfig`` from a mapping of environment-style settings.

    Args:
        env: Mapping to read settings from. When ``None`` the process
            environment (``os.environ``) is used. An explicitly supplied
            mapping is always honoured, even when it is empty, in which case
            every setting takes its default.

    Returns:
        A ``WorkerConfig`` populated from the mapping, with defaults for any
        missing key. ``POSTGRES_DSN`` wins over the DSN assembled from the
        individual ``POSTGRES_*`` settings.

    Raises:
        ValidationError: If ``CLICKHOUSE_PORT`` is not an integer.
    """
    # `env or os.environ` would treat an empty mapping as "not provided" and
    # silently read the real process environment; only None means "use it".
    source = os.environ if env is None else env

    postgres_user = source.get("POSTGRES_USER", "postgres")
    postgres_password = source.get("POSTGRES_PASSWORD", "postgres")
    postgres_host = source.get("POSTGRES_HOST", "localhost")
    postgres_port = source.get("POSTGRES_PORT", "5439")
    postgres_db = source.get("POSTGRES_DB", "ewma_db")

    # NOTE(review): credentials are interpolated verbatim; a password containing
    # characters such as '@' or '/' would presumably need percent-encoding.
    postgres_dsn = f"postgresql://{postgres_user}:{postgres_password}@{postgres_host}:{postgres_port}/{postgres_db}"

    return WorkerConfig(
        temporal_host=source.get("TEMPORAL_HOST", "localhost:7239"),
        temporal_namespace=source.get("TEMPORAL_NAMESPACE", "default"),
        temporal_task_queue=source.get("TEMPORAL_TASK_QUEUE", "ewma-tasks"),
        clickhouse_host=source.get("CLICKHOUSE_HOST", "localhost"),
        clickhouse_port=_int_val(
            source.get("CLICKHOUSE_PORT", "8129"), "CLICKHOUSE_PORT"
        ),
        clickhouse_username=source.get("CLICKHOUSE_USERNAME", "default"),
        clickhouse_password=source.get("CLICKHOUSE_PASSWORD", ""),
        clickhouse_database=source.get("CLICKHOUSE_DATABASE", "default"),
        redis_url=source.get("REDIS_URL", "redis://localhost:6389/0"),
        # A full DSN, when supplied, takes precedence over the assembled one.
        postgres_dsn=source.get("POSTGRES_DSN", postgres_dsn),
        kafka_bootstrap_servers=source.get("KAFKA_BOOTSTRAP_SERVERS", "localhost:9099"),
    )
@@ -0,0 +1,69 @@
1
+ import asyncio
2
+ import uvicorn
3
+ from api.rest.v1.app import app
4
+ from temporalio.client import Client
5
+ from temporalio.worker import Worker
6
+ from worker.config import load_config
7
+ from worker.activities import EwmaActivities
8
+ from worker.workflows import EwmaBaselineUpdate
9
+ from infra.adapters.clickhouse.clickhouse_adapter import ClickHouseAdapter
10
+ from infra.adapters.redis.redis_adapter import RedisAdapter
11
+ from infra.adapters.postgres.postgres_adapter import PostgresAdapter
12
+ from infra.adapters.kafka.kafka_alert_adapter import KafkaAlertAdapter
13
+
14
+
15
async def main() -> None:
    """Wire up adapters, then run the Temporal worker and the HTTP app together.

    Loads configuration, constructs the ClickHouse, Redis, Postgres and Kafka
    adapters, connects to Temporal, registers the workflow and its activities
    on the configured task queue, and finally awaits the worker alongside a
    uvicorn server bound to ``0.0.0.0:8000``.
    """
    cfg = load_config()

    # Infrastructure adapters, built in the same order as their dependencies
    # are listed in the activities constructor.
    clickhouse_adapter = ClickHouseAdapter(
        host=cfg.clickhouse_host,
        port=cfg.clickhouse_port,
        username=cfg.clickhouse_username,
        password=cfg.clickhouse_password,
        database=cfg.clickhouse_database,
    )
    redis_adapter = RedisAdapter(url=cfg.redis_url)
    postgres_adapter = PostgresAdapter(dsn=cfg.postgres_dsn)
    kafka_adapter = KafkaAlertAdapter(bootstrap_servers=cfg.kafka_bootstrap_servers)

    ewma = EwmaActivities(
        clickhouse=clickhouse_adapter,
        redis=redis_adapter,
        postgres=postgres_adapter,
        alert_publisher=kafka_adapter,
    )

    temporal_client = await Client.connect(
        cfg.temporal_host,
        namespace=cfg.temporal_namespace,
    )

    # Bound activity methods exposed to the task queue.
    registered_activities = [
        ewma.fetch_active_pairs,
        ewma.fetch_cost_history,
        ewma.fetch_global_model_avg,
        ewma.fetch_current_cost_1h,
        ewma.fetch_cost_by_cluster_1h,
        ewma.get_baseline,
        ewma.upsert_baseline,
        ewma.publish_anomaly_alert,
    ]
    temporal_worker = Worker(
        temporal_client,
        task_queue=cfg.temporal_task_queue,
        workflows=[EwmaBaselineUpdate],
        activities=registered_activities,
    )

    http_server = uvicorn.Server(
        uvicorn.Config(app, host="0.0.0.0", port=8000, log_level="info")
    )

    # Both long-running coroutines share this event loop.
    await asyncio.gather(temporal_worker.run(), http_server.serve())
66
+
67
+
68
if __name__ == "__main__":
    # Script entry point: drive the async bootstrap to completion.
    bootstrap = main()
    asyncio.run(bootstrap)
@@ -0,0 +1,22 @@
1
+ from dataclasses import dataclass
2
+ from shared.contracts.validator import load_workflow_contract
3
+ from worker.workflows import EwmaBaselineUpdate
4
+
5
+
6
@dataclass(frozen=True)
class WorkflowDefinition:
    """Immutable registry entry tying a workflow name to its handler and contract."""

    # Key under which the workflow is registered.
    name: str
    # Class implementing the workflow.
    handler: type
    # Contract mapping associated with the workflow.
    contract: dict
11
+
12
+
13
def build_registry() -> dict[str, WorkflowDefinition]:
    """Create the workflow registry.

    Returns:
        A one-entry mapping from ``"ewma_baseline_update"`` to a
        ``WorkflowDefinition`` carrying the ``EwmaBaselineUpdate`` handler and
        the contract returned by ``load_workflow_contract()``.
    """
    loaded_contract = load_workflow_contract()
    definition = WorkflowDefinition(
        name="ewma_baseline_update",
        handler=EwmaBaselineUpdate,
        contract=loaded_contract,
    )
    return {"ewma_baseline_update": definition}
20
+
21
+
22
# Built once at import time; maps workflow names to their definitions.
WORKFLOW_REGISTRY: dict[str, WorkflowDefinition] = build_registry()
@@ -0,0 +1,102 @@
1
+ import asyncio
2
+ from dataclasses import dataclass
3
+ from datetime import timedelta
4
+ from temporalio import workflow
5
+ from shared.types.ewma_types import EwmaRecord, AnomalyPayload
6
+ from features.ewma_compute.service import EwmaService
7
+
8
+
9
+ @dataclass
10
+ class EwmaWorkflowInput:
11
+ force_hour: int | None = None
12
+
13
+
14
@workflow.defn(name="EwmaBaselineUpdate")
class EwmaBaselineUpdate:
    """Workflow that refreshes the EWMA baseline of every active service/model pair.

    For each pair returned by the ``fetch_active_pairs`` activity it either
    seeds a new baseline (no stored record) or compares the last-hour cost
    with the stored EWMA, publishes an alert when the cost exceeds three times
    the EWMA of a non-cold-start record, and upserts the updated record.
    """

    @workflow.run
    async def run(self, workflow_input: EwmaWorkflowInput | None = None) -> None:
        """Process all active pairs for one hour-of-week slot.

        Args:
            workflow_input: Optional input; a non-``None`` ``force_hour``
                overrides the slot derived from the workflow clock.
        """
        slot = self._resolve_hour_of_week(workflow_input)

        pairs = await workflow.execute_activity(
            "fetch_active_pairs",
            start_to_close_timeout=timedelta(seconds=60),
        )

        # One coroutine per pair, created in list order and awaited together.
        await asyncio.gather(
            *(self._process_pair(service, model, slot) for service, model in pairs)
        )

    @staticmethod
    def _resolve_hour_of_week(workflow_input: EwmaWorkflowInput | None) -> int:
        """Return the forced slot if given, else ``weekday * 24 + hour`` of the workflow clock."""
        if workflow_input and workflow_input.force_hour is not None:
            return workflow_input.force_hour
        moment = workflow.now()
        return moment.weekday() * 24 + moment.hour

    async def _process_pair(self, service: str, model: str, hour_of_week: int) -> None:
        """Seed or refresh the baseline of a single service/model pair."""
        baseline = await workflow.execute_activity(
            "get_baseline",
            args=[service, model, hour_of_week],
            result_type=EwmaRecord,
            start_to_close_timeout=timedelta(seconds=10),
        )

        if baseline is None:
            await self._seed_baseline(service, model, hour_of_week)
            return

        await self._refresh_baseline(service, model, hour_of_week, baseline)

    async def _seed_baseline(self, service: str, model: str, hour_of_week: int) -> None:
        """Create the first record for a pair from the global model average."""
        fallback_avg = await workflow.execute_activity(
            "fetch_global_model_avg",
            args=[model, hour_of_week],
            start_to_close_timeout=timedelta(seconds=30),
        )

        seeded = EwmaService.process_update(0.0, None, fallback_avg)
        # The service does not receive the identifying fields; they are set
        # here before the record is stored.
        seeded.service = service
        seeded.model = model
        seeded.hour_of_week = hour_of_week

        await self._store_baseline(seeded)

    async def _refresh_baseline(
        self, service: str, model: str, hour_of_week: int, baseline: EwmaRecord
    ) -> None:
        """Check the last-hour cost against the baseline, alert if needed, then update it."""
        observed = await workflow.execute_activity(
            "fetch_current_cost_1h",
            args=[service, model],
            start_to_close_timeout=timedelta(seconds=30),
        )

        # Cold-start records never alert; the comparison is skipped for them.
        if not baseline.is_cold_start and observed > 3.0 * baseline.ewma_value:
            await self._publish_alert(service, model, hour_of_week, observed, baseline)

        # The baseline is updated whether or not an alert was published.
        refreshed = EwmaService.process_update(observed, baseline, 0.0)
        await self._store_baseline(refreshed)

    async def _publish_alert(
        self,
        service: str,
        model: str,
        hour_of_week: int,
        observed: float,
        baseline: EwmaRecord,
    ) -> None:
        """Fetch the per-cluster breakdown and publish an anomaly payload."""
        drilldown = await workflow.execute_activity(
            "fetch_cost_by_cluster_1h",
            args=[service, model],
            start_to_close_timeout=timedelta(seconds=30),
        )

        alert = AnomalyPayload(
            service=service,
            model=model,
            hour_of_week=hour_of_week,
            current_cost=observed,
            ewma_value=baseline.ewma_value,
            threshold_value=3.0 * baseline.ewma_value,
            sample_count=baseline.sample_count,
            timestamp=workflow.now().isoformat(),
            cluster_drilldown=drilldown,
        )

        await workflow.execute_activity(
            "publish_anomaly_alert",
            args=[alert],
            start_to_close_timeout=timedelta(seconds=20),
        )

    async def _store_baseline(self, record: EwmaRecord) -> None:
        """Run the ``upsert_baseline`` activity for one record."""
        await workflow.execute_activity(
            "upsert_baseline",
            args=[record],
            start_to_close_timeout=timedelta(seconds=15),
        )