temporal-ewma-worker 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- temporal_ewma_worker-0.1.1/PKG-INFO +21 -0
- temporal_ewma_worker-0.1.1/README.md +278 -0
- temporal_ewma_worker-0.1.1/pyproject.toml +39 -0
- temporal_ewma_worker-0.1.1/setup.cfg +4 -0
- temporal_ewma_worker-0.1.1/src/api/index.py +41 -0
- temporal_ewma_worker-0.1.1/src/api/rest/v1/app.py +33 -0
- temporal_ewma_worker-0.1.1/src/features/ewma_compute/index.py +3 -0
- temporal_ewma_worker-0.1.1/src/features/ewma_compute/service.py +40 -0
- temporal_ewma_worker-0.1.1/src/infra/adapters/clickhouse/clickhouse_adapter.py +95 -0
- temporal_ewma_worker-0.1.1/src/infra/adapters/kafka/kafka_alert_adapter.py +20 -0
- temporal_ewma_worker-0.1.1/src/infra/adapters/postgres/postgres_adapter.py +58 -0
- temporal_ewma_worker-0.1.1/src/infra/adapters/redis/redis_adapter.py +21 -0
- temporal_ewma_worker-0.1.1/src/shared/contracts/validator.py +53 -0
- temporal_ewma_worker-0.1.1/src/shared/errors/base.py +14 -0
- temporal_ewma_worker-0.1.1/src/shared/ports/alert_publisher_port.py +6 -0
- temporal_ewma_worker-0.1.1/src/shared/ports/clickhouse_port.py +16 -0
- temporal_ewma_worker-0.1.1/src/shared/ports/postgres_port.py +10 -0
- temporal_ewma_worker-0.1.1/src/shared/ports/redis_port.py +9 -0
- temporal_ewma_worker-0.1.1/src/shared/types/ewma_types.py +32 -0
- temporal_ewma_worker-0.1.1/src/temporal_ewma_worker.egg-info/PKG-INFO +21 -0
- temporal_ewma_worker-0.1.1/src/temporal_ewma_worker.egg-info/SOURCES.txt +27 -0
- temporal_ewma_worker-0.1.1/src/temporal_ewma_worker.egg-info/dependency_links.txt +1 -0
- temporal_ewma_worker-0.1.1/src/temporal_ewma_worker.egg-info/requires.txt +17 -0
- temporal_ewma_worker-0.1.1/src/temporal_ewma_worker.egg-info/top_level.txt +5 -0
- temporal_ewma_worker-0.1.1/src/worker/activities.py +62 -0
- temporal_ewma_worker-0.1.1/src/worker/config.py +53 -0
- temporal_ewma_worker-0.1.1/src/worker/index.py +69 -0
- temporal_ewma_worker-0.1.1/src/worker/registry.py +22 -0
- temporal_ewma_worker-0.1.1/src/worker/workflows.py +102 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: temporal-ewma-worker
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Temporal scheduled workflow worker for EWMA baseline updating and anomaly checking
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: temporalio>=1.5.0
|
|
7
|
+
Requires-Dist: clickhouse-connect>=0.7.0
|
|
8
|
+
Requires-Dist: redis>=5.0.0
|
|
9
|
+
Requires-Dist: psycopg[binary]>=3.1.0
|
|
10
|
+
Requires-Dist: confluent-kafka>=2.3.0
|
|
11
|
+
Requires-Dist: opentelemetry-sdk>=1.26.0
|
|
12
|
+
Requires-Dist: pydantic>=2.0.0
|
|
13
|
+
Requires-Dist: fastapi>=0.100.0
|
|
14
|
+
Requires-Dist: uvicorn>=0.22.0
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
17
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
18
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
|
|
19
|
+
Requires-Dist: mypy>=1.8.0; extra == "dev"
|
|
20
|
+
Requires-Dist: ruff>=0.2.0; extra == "dev"
|
|
21
|
+
Requires-Dist: httpx>=0.24.0; extra == "dev"
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
# Temporal EWMA Worker
|
|
2
|
+
|
|
3
|
+
Temporal worker package for scheduled EWMA baseline updates and cost anomaly detection.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Folder Structure
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
.
|
|
11
|
+
├── build/
|
|
12
|
+
│ └── Dockerfile
|
|
13
|
+
├── contracts/
|
|
14
|
+
│ ├── asyncapi/
|
|
15
|
+
│ │ └── v1.yaml
|
|
16
|
+
│ ├── changelog.md
|
|
17
|
+
│ └── workflows/
|
|
18
|
+
│ └── ewma_baseline_update.yaml
|
|
19
|
+
├── database/
|
|
20
|
+
│ ├── migrations/
|
|
21
|
+
│ │ ├── 0001_init.rollback.sql
|
|
22
|
+
│ │ └── 0001_init.sql
|
|
23
|
+
│ └── schema.lock
|
|
24
|
+
├── deploy/
|
|
25
|
+
│ └── docker/
|
|
26
|
+
│ └── docker-compose.yaml
|
|
27
|
+
├── feature-registry.yaml
|
|
28
|
+
├── pyproject.toml
|
|
29
|
+
├── README.md
|
|
30
|
+
├── scripts/
|
|
31
|
+
│ ├── deploy_docker.sh
|
|
32
|
+
│ ├── migrate.py
|
|
33
|
+
│ ├── migrate.sh
|
|
34
|
+
│ ├── run.sh
|
|
35
|
+
│ └── test.sh
|
|
36
|
+
├── src/
|
|
37
|
+
│ ├── features/
|
|
38
|
+
│ │ └── ewma_compute/
|
|
39
|
+
│ │ ├── index.py
|
|
40
|
+
│ │ └── service.py
|
|
41
|
+
│ ├── infra/
|
|
42
|
+
│ │ └── adapters/
|
|
43
|
+
│ │ ├── clickhouse/
|
|
44
|
+
│ │ │ └── clickhouse_adapter.py
|
|
45
|
+
│ │ ├── kafka/
|
|
46
|
+
│ │ │ └── kafka_alert_adapter.py
|
|
47
|
+
│ │ ├── postgres/
|
|
48
|
+
│ │ │ └── postgres_adapter.py
|
|
49
|
+
│ │ └── redis/
|
|
50
|
+
│ │ └── redis_adapter.py
|
|
51
|
+
│ ├── shared/
|
|
52
|
+
│ │ ├── contracts/
|
|
53
|
+
│ │ │ └── validator.py
|
|
54
|
+
│ │ ├── errors/
|
|
55
|
+
│ │ │ └── base.py
|
|
56
|
+
│ │ ├── ports/
|
|
57
|
+
│ │ │ ├── clickhouse_port.py
|
|
58
|
+
│ │ │ ├── postgres_port.py
|
|
59
|
+
│ │ │ └── redis_port.py
|
|
60
|
+
│ │ └── types/
|
|
61
|
+
│ │ └── ewma_types.py
|
|
62
|
+
│ └── worker/
|
|
63
|
+
│ ├── activities.py
|
|
64
|
+
│ ├── config.py
|
|
65
|
+
│ ├── index.py
|
|
66
|
+
│ ├── registry.py
|
|
67
|
+
│ └── workflows.py
|
|
68
|
+
├── tests/
|
|
69
|
+
│ ├── integration/
|
|
70
|
+
│ │ └── test_adapters.py
|
|
71
|
+
│ └── unit/
|
|
72
|
+
│ ├── test_config.py
|
|
73
|
+
│ ├── test_contract.py
|
|
74
|
+
│ ├── test_ewma_service.py
|
|
75
|
+
│ └── test_workflow.py
|
|
76
|
+
└── worker-registry.yaml
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Work Execution & Decision Flow
|
|
82
|
+
|
|
83
|
+
The following detailed decision tree outlines how the hourly workflow updates baselines and flags anomalies, with justification for each design choice:
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
[Hourly Cron Trigger (0 * * * *)]
|
|
87
|
+
└── EwmaBaselineUpdate Workflow Starts
|
|
88
|
+
│
|
|
89
|
+
│ ► RATIONALE: The cron schedule (0 * * * *) triggers the workflow at the top of every hour.
|
|
90
|
+
│
|
|
91
|
+
└── Activity: fetch_active_pairs()
|
|
92
|
+
│
|
|
93
|
+
│ ► RATIONALE: Scans ClickHouse log volumes for active (service, model) pairs
|
|
94
|
+
│ in the last 7 days. This filters out millions of historical combinations,
|
|
95
|
+
│ focusing computation ONLY on active traffic to minimize cost and execution time.
|
|
96
|
+
│
|
|
97
|
+
└── Loop over active (service, model) pairs concurrently:
|
|
98
|
+
│
|
|
99
|
+
│ ► RATIONALE: Temporal workflows run loops concurrently. Concurrency allows
|
|
100
|
+
│ thousands of pairs to be evaluated in parallel without blocking.
|
|
101
|
+
│
|
|
102
|
+
├── Activity: get_baseline(service, model, hour_of_week)
|
|
103
|
+
│ │
|
|
104
|
+
│ │ ► RATIONALE: Reads the current baseline record from PostgreSQL. PostgreSQL is
|
|
105
|
+
│ │ used here because it provides ACID compliance for historical baselines.
|
|
106
|
+
│ │
|
|
107
|
+
│ ├── Existing Baseline NOT found (Cold Start)
|
|
108
|
+
│ │ ├── Activity: fetch_global_model_avg(model)
|
|
109
|
+
│ │ │
|
|
110
|
+
│ │ │ ► RATIONALE: Lacking historical service/model pairing, we seed the baseline
|
|
111
|
+
│ │ │ using the global average cost for this specific model (e.g. gpt-4o) across
|
|
112
|
+
│ │ │ all services. This prevents false positive anomaly triggers during cold starts.
|
|
113
|
+
│ │ │
|
|
114
|
+
│ │ └── Seed EWMA baseline value = Global Model Average
|
|
115
|
+
│ │
|
|
116
|
+
│ └── Existing Baseline found (Warm Status)
|
|
117
|
+
│ ├── Activity: fetch_cost_history(service, model, hour_of_week)
|
|
118
|
+
│ │ │
|
|
119
|
+
│ │ │ ► RATIONALE: Queries ClickHouse for the cost of the same hour_of_week (0-167)
|
|
120
|
+
│ │ │ over the last 4 weeks. ClickHouse is selected here because column-oriented
|
|
121
|
+
│ │ │ storage allows ultra-fast aggregation of historical logs.
|
|
122
|
+
│ │ │
|
|
123
|
+
│ │ └── Fetch last 4 occurrences from ClickHouse
|
|
124
|
+
│ └── Compute EWMA baseline value using α=0.1:
|
|
125
|
+
│ EWMA_new = (1 - α) * EWMA_prev + α * Cost_current
|
|
126
|
+
│
|
|
127
|
+
├── Activity: fetch_current_cost_1h(service, model)
|
|
128
|
+
│
|
|
129
|
+
├── Activity: upsert_baseline(EwmaRecord)
|
|
130
|
+
│ │
|
|
131
|
+
│ │ ► RATIONALE: Persists the calculated baseline to PostgreSQL to keep a persistent audit trail.
|
|
132
|
+
│ │
|
|
133
|
+
│ └── Persist updated baseline to PostgreSQL
|
|
134
|
+
│
|
|
135
|
+
├── Write updated EWMA value to Redis Cache
|
|
136
|
+
│ │
|
|
137
|
+
│ │ ► RATIONALE: Anomaly-detection gateways on the ingestion path need ultra-low latency.
|
|
138
|
+
│ │ Redis caches the calculated baseline under: ewma:cost:{service}:{model}:{hour_of_week}
|
|
139
|
+
│ │
|
|
140
|
+
│ └── Key: ewma:cost:{service}:{model}:{hour_of_week}
|
|
141
|
+
│
|
|
142
|
+
└── Decision: Is Cost_current > (3 * EWMA_baseline)?
|
|
143
|
+
│
|
|
144
|
+
├── YES (Anomaly Detected)
|
|
145
|
+
│ ├── Activity: fetch_cost_by_cluster_1h(service, model)
|
|
146
|
+
│ │ │
|
|
147
|
+
│ │ │ ► RATIONALE: If cost spikes, we query ClickHouse to break down the cost
|
|
148
|
+
│ │ │ contributions by Kubernetes cluster/namespace to locate the root cause.
|
|
149
|
+
│ │ │
|
|
150
|
+
│ │ └── Get cluster drilldown metrics
|
|
151
|
+
│ │
|
|
152
|
+
│ └── Activity: publish_anomaly_alert(AnomalyPayload)
|
|
153
|
+
│ │
|
|
154
|
+
│ │ ► RATIONALE: Publishes to Kafka topic 'alerts.cost.anomaly'. Using Kafka
|
|
155
|
+
│ │ decouples anomaly detection from notification delivery (Slack, pager).
|
|
156
|
+
│ │
|
|
157
|
+
│ └── Emit alert JSON to Kafka topic
|
|
158
|
+
│
|
|
159
|
+
└── NO (Normal State)
|
|
160
|
+
└── Do nothing
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## Sequencing & Dependency Map
|
|
166
|
+
|
|
167
|
+
To run the worker successfully, you MUST spin up and configure dependencies in the following strict order:
|
|
168
|
+
|
|
169
|
+
```
|
|
170
|
+
[Step 1: Docker Containers] ---> [Step 2: Configuration] ---> [Step 3: DB Migrations] ---> [Step 4: Verification] ---> [Step 5: Start Worker]
|
|
171
|
+
• ClickHouse (8123) • Copy .env.example • ./scripts/migrate.sh • ./scripts/test.sh • ./scripts/run.sh
|
|
172
|
+
• PostgreSQL (5432) • Set hosts & ports (Applies SQL schemas) (Ensures integrations (Starts polling
|
|
173
|
+
• Redis Cache (6379) and mock runs pass) Temporal task queue)
|
|
174
|
+
• Kafka & Zookeeper (9092)
|
|
175
|
+
• Temporal Server (7233)
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Setup & Running
|
|
181
|
+
|
|
182
|
+
Follow these steps to set up the local development environment and run the worker:
|
|
183
|
+
|
|
184
|
+
### 1. Prerequisites
|
|
185
|
+
Ensure you have the following installed:
|
|
186
|
+
- Python 3.11+
|
|
187
|
+
- Docker & Docker Compose
|
|
188
|
+
- Git
|
|
189
|
+
|
|
190
|
+
### 2. Configure Virtual Environment & Dependencies
|
|
191
|
+
Create a virtual environment and install the package along with development requirements:
|
|
192
|
+
```bash
|
|
193
|
+
# Create virtual environment
|
|
194
|
+
python3 -m venv .venv
|
|
195
|
+
|
|
196
|
+
# Activate virtual environment
|
|
197
|
+
source .venv/bin/activate
|
|
198
|
+
|
|
199
|
+
# Install package in editable mode with development dependencies
|
|
200
|
+
pip install -e ".[dev]"
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### 3. Spin Up Infrastructure
|
|
204
|
+
Use the provided `docker-compose` file to run ClickHouse, Postgres, Redis, Kafka, and Temporal locally:
|
|
205
|
+
```bash
|
|
206
|
+
docker compose -f deploy/docker/docker-compose.yaml up -d
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### 4. Configure Environment Variables
|
|
210
|
+
Copy the template `.env.example` to `.env` and fill in custom connection strings if necessary:
|
|
211
|
+
```bash
|
|
212
|
+
cp .env.example .env
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
## Database Migrations Guide
|
|
218
|
+
|
|
219
|
+
The database schema is managed via light-weight migration scripts tracked under `database/migrations/` and verified using a `schema.lock` file.
|
|
220
|
+
|
|
221
|
+
### How it Works
|
|
222
|
+
The migration status is tracked in `database/schema.lock`, which contains the active version tag (e.g. `0001` or `0000`).
|
|
223
|
+
|
|
224
|
+
### Apply Migrations (UP)
|
|
225
|
+
To apply pending database schemas, run:
|
|
226
|
+
```bash
|
|
227
|
+
./scripts/migrate.sh up
|
|
228
|
+
```
|
|
229
|
+
This runs `0001_init.sql` against the configured PostgreSQL database and writes `0001` to `schema.lock`.
|
|
230
|
+
|
|
231
|
+
### Rollback Migrations
|
|
232
|
+
To revert schemas and return to baseline state, run:
|
|
233
|
+
```bash
|
|
234
|
+
./scripts/migrate.sh rollback
|
|
235
|
+
```
|
|
236
|
+
This executes the rollback SQL scripts and sets the `schema.lock` version to `0000`.
|
|
237
|
+
|
|
238
|
+
### Creating a New Migration
|
|
239
|
+
1. Add your SQL changes inside `database/migrations/` using a sequential identifier (e.g., `0002_add_index.sql` and `0002_add_index.rollback.sql`).
|
|
240
|
+
2. Update the transition mappings inside `scripts/migrate.py` to support applying and rolling back your new script file.
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## Running Verification & Worker
|
|
245
|
+
|
|
246
|
+
### 1. Run Tests
|
|
247
|
+
Verify configuration, domain services, and workflow behavior using the test script:
|
|
248
|
+
```bash
|
|
249
|
+
./scripts/test.sh
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
### 2. Run Worker
|
|
253
|
+
Start the Temporal worker polling queue `ewma-tasks`:
|
|
254
|
+
```bash
|
|
255
|
+
./scripts/run.sh
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## Remote Management API (REST)
|
|
261
|
+
|
|
262
|
+
The worker now includes a FastAPI management layer (port 8000 in prod).
|
|
263
|
+
|
|
264
|
+
| Endpoint | Method | Description |
|
|
265
|
+
| :--- | :--- | :--- |
|
|
266
|
+
| `/health` | GET | Check worker status and config. |
|
|
267
|
+
| `/trigger` | POST | Trigger the EWMA baseline update workflow on-demand. |
|
|
268
|
+
|
|
269
|
+
### Example Execution
|
|
270
|
+
|
|
271
|
+
```bash
|
|
272
|
+
curl -X POST http://localhost:8000/trigger \
|
|
273
|
+
-H "Content-Type: application/json" \
|
|
274
|
+
-d '{
|
|
275
|
+
"force_hour": 42
|
|
276
|
+
}'
|
|
277
|
+
```
|
|
278
|
+
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "temporal-ewma-worker"
|
|
3
|
+
version = "0.1.1"
|
|
4
|
+
description = "Temporal scheduled workflow worker for EWMA baseline updating and anomaly checking"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"temporalio>=1.5.0",
|
|
8
|
+
"clickhouse-connect>=0.7.0",
|
|
9
|
+
"redis>=5.0.0",
|
|
10
|
+
"psycopg[binary]>=3.1.0",
|
|
11
|
+
"confluent-kafka>=2.3.0",
|
|
12
|
+
"opentelemetry-sdk>=1.26.0",
|
|
13
|
+
"pydantic>=2.0.0",
|
|
14
|
+
"fastapi>=0.100.0",
|
|
15
|
+
"uvicorn>=0.22.0"
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.optional-dependencies]
|
|
19
|
+
dev = [
|
|
20
|
+
"pytest>=8.0",
|
|
21
|
+
"pytest-cov>=4.1.0",
|
|
22
|
+
"pytest-asyncio>=0.23.0",
|
|
23
|
+
"mypy>=1.8.0",
|
|
24
|
+
"ruff>=0.2.0",
|
|
25
|
+
"httpx>=0.24.0"
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
[tool.pytest.ini_options]
|
|
30
|
+
pythonpath = ["src"]
|
|
31
|
+
testpaths = ["tests"]
|
|
32
|
+
|
|
33
|
+
[build-system]
|
|
34
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
35
|
+
build-backend = "setuptools.build_meta"
|
|
36
|
+
|
|
37
|
+
[tool.setuptools.packages.find]
|
|
38
|
+
where = ["src"]
|
|
39
|
+
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from temporalio.client import Client
|
|
3
|
+
from worker.config import load_config
|
|
4
|
+
from worker.workflows import EwmaWorkflowInput
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
async def health(env: dict[str, str] | None = None) -> dict:
    """Report a static "ok" status together with the resolved settings.

    Args:
        env: Optional environment mapping forwarded unchanged to
            ``load_config``.

    Returns:
        A dict holding ``"status": "ok"`` plus the Temporal, Redis and Kafka
        settings read from the loaded configuration. No connectivity check is
        performed here.
    """
    settings = load_config(env)
    # NOTE(review): redis_url is echoed verbatim; if it can embed credentials
    # this endpoint exposes them -- confirm before exposing publicly.
    exposed_settings = (
        "temporal_host",
        "temporal_namespace",
        "temporal_task_queue",
        "redis_url",
        "kafka_bootstrap_servers",
    )
    report: dict = {"status": "ok"}
    for setting_name in exposed_settings:
        report[setting_name] = getattr(settings, setting_name)
    return report
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
async def trigger_workflow(
    force_hour: int | None = None, env: dict[str, str] | None = None
) -> dict:
    """Start an on-demand run of the ``EwmaBaselineUpdate`` workflow.

    Args:
        force_hour: Optional hour override. When given (including ``0``) it is
            wrapped in ``EwmaWorkflowInput`` and passed as the single workflow
            argument; when ``None`` the workflow is started without arguments.
        env: Optional environment mapping forwarded to ``load_config``.

    Returns:
        A dict with ``status`` (always ``"triggered"``), the generated
        ``workflow_id`` and the ``run_id`` of the started execution.

    Raises:
        Whatever ``Client.connect`` / ``start_workflow`` raise when Temporal is
        unreachable or rejects the start request.
    """
    import uuid  # local import: only this function needs it

    cfg = load_config(env)
    client = await Client.connect(cfg.temporal_host, namespace=cfg.temporal_namespace)

    # A second-resolution timestamp alone yields the same ID for two triggers
    # arriving within one second; the random suffix keeps every manual run's
    # ID unique while preserving the original prefix.
    workflow_id = f"ewma-update-manual-{int(time.time())}-{uuid.uuid4().hex[:8]}"

    # Decide on the explicit None check rather than on object truthiness.
    workflow_args = (
        [EwmaWorkflowInput(force_hour=force_hour)] if force_hour is not None else []
    )

    handle = await client.start_workflow(
        "EwmaBaselineUpdate",
        id=workflow_id,
        task_queue=cfg.temporal_task_queue,
        args=workflow_args,
    )

    return {
        "status": "triggered",
        "workflow_id": workflow_id,
        "run_id": handle.first_execution_run_id,
    }
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from fastapi import FastAPI, HTTPException
|
|
2
|
+
from pydantic import BaseModel, Field
|
|
3
|
+
from api.index import health as get_health, trigger_workflow as trigger_ewma_workflow
|
|
4
|
+
|
|
5
|
+
app = FastAPI(title="Temporal EWMA Worker API", version="1.0.0")
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TriggerRequest(BaseModel):
    """Optional JSON body accepted by the ``/trigger`` endpoint."""

    # Hour override; validated to lie in 0..167 inclusive, absent by default.
    force_hour: int | None = Field(ge=0, le=167, default=None)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@app.get("/health")
async def health() -> dict:
    """Serve ``GET /health`` by delegating to the API-layer health helper.

    Returns:
        The dict produced by ``get_health()``.

    Raises:
        HTTPException: With status 500 and the error text as ``detail`` when
            the helper raises for any reason.
    """
    try:
        return await get_health()
    except Exception as exc:
        # HTTP boundary: translate any failure into a 500 while keeping the
        # original exception chained for tracebacks and logs.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@app.post("/trigger")
async def trigger_workflow(request: TriggerRequest | None = None) -> dict:
    """Serve ``POST /trigger`` by starting the EWMA workflow on demand.

    Args:
        request: Optional body; when present its ``force_hour`` is forwarded,
            otherwise the workflow is triggered without an hour override.

    Returns:
        The dict returned by the API-layer trigger helper.

    Raises:
        HTTPException: With status 500 and the error text as ``detail`` when
            triggering fails for any reason.
    """
    try:
        # Explicit None check: presence of a body, not its truthiness, decides.
        force_hour = request.force_hour if request is not None else None
        return await trigger_ewma_workflow(force_hour=force_hour)
    except Exception as exc:
        # HTTP boundary: surface the failure as a 500 and keep the cause chained.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
if __name__ == "__main__":
    # Script entry point: uvicorn is imported lazily so importing this module
    # as a library does not require it at import time.
    from uvicorn import run as serve

    # Bind on 0.0.0.0, port 8000.
    serve(app, host="0.0.0.0", port=8000)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from shared.types.ewma_types import EwmaRecord
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class EwmaService:
    """Stateless EWMA arithmetic and baseline-record transitions (no I/O)."""

    # Default smoothing factor: the weight given to the newest observation.
    DEFAULT_ALPHA = 0.1
    # A baseline remains flagged as cold-start until it has this many samples.
    COLD_START_MIN_SAMPLES = 7

    @staticmethod
    def calculate_new_ewma(
        current_value: float, previous_ewma: float, alpha: float = DEFAULT_ALPHA
    ) -> float:
        """Return ``alpha * current_value + (1 - alpha) * previous_ewma``.

        Args:
            current_value: The newest observation.
            previous_ewma: The EWMA value before this observation.
            alpha: Smoothing factor applied to ``current_value``.
        """
        return (alpha * current_value) + ((1.0 - alpha) * previous_ewma)

    @staticmethod
    def process_update(
        current_value: float,
        existing_record: "EwmaRecord | None",
        global_model_avg: float,
        *,
        service: str = "",
        model: str = "",
        hour_of_week: int = 0,
        alpha: float = DEFAULT_ALPHA,
    ) -> "EwmaRecord":
        """Produce the next baseline record for one observation.

        Args:
            current_value: The newest observed value.
            existing_record: The stored baseline, or ``None`` on cold start.
            global_model_avg: Seed value used when no baseline exists yet.
            service: Identity to stamp on a cold-start record. Defaults to
                ``""`` (the historical behaviour), in which case the caller
                must fill it in before persisting.
            model: Same as ``service``, for the model field.
            hour_of_week: Same as ``service``, for the hour-of-week field.
            alpha: Smoothing factor used when updating an existing baseline.

        Returns:
            On cold start, a record seeded with ``global_model_avg``,
            ``sample_count=1`` and ``is_cold_start=True``. Otherwise a record
            carrying the existing identity, the blended EWMA, an incremented
            sample count and a cold-start flag that stays true while the count
            is below ``COLD_START_MIN_SAMPLES``.
        """
        if existing_record is None:
            # No history: seed from the global average. The identity keyword
            # arguments let callers get a complete record in one step instead
            # of patching blank fields afterwards.
            return EwmaRecord(
                service=service,
                model=model,
                hour_of_week=hour_of_week,
                ewma_value=global_model_avg,
                sample_count=1,
                is_cold_start=True,
            )

        new_sample_count = existing_record.sample_count + 1
        new_ewma = EwmaService.calculate_new_ewma(
            current_value, existing_record.ewma_value, alpha
        )

        # Identity always comes from the stored record on the warm path; the
        # identity keyword arguments are intentionally ignored here.
        return EwmaRecord(
            service=existing_record.service,
            model=existing_record.model,
            hour_of_week=existing_record.hour_of_week,
            ewma_value=new_ewma,
            sample_count=new_sample_count,
            is_cold_start=new_sample_count < EwmaService.COLD_START_MIN_SAMPLES,
        )
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
from typing import List, Tuple
|
|
2
|
+
import clickhouse_connect
|
|
3
|
+
from shared.ports.clickhouse_port import ClickHousePort
|
|
4
|
+
from shared.types.ewma_types import ClusterCost
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ClickHouseAdapter(ClickHousePort):
    """``ClickHousePort`` implementation querying the ``cost_by_dimension`` table."""

    def __init__(
        self, host: str, port: int, username: str, password: str, database: str
    ):
        """Create the underlying ``clickhouse_connect`` client."""
        connection_settings = {
            "host": host,
            "port": port,
            "username": username,
            "password": password,
            "database": database,
        }
        self.client = clickhouse_connect.get_client(**connection_settings)

    @staticmethod
    def _first_cell_or_zero(rows) -> float:
        """Return the first cell of the first row as float, or 0.0 if absent/NULL."""
        if not rows:
            return 0.0
        cell = rows[0][0]
        return 0.0 if cell is None else float(cell)

    def get_active_pairs(self) -> List[Tuple[str, str]]:
        """Return distinct (service, model) pairs seen in the last 7 days."""
        sql = """
            SELECT DISTINCT service, model
            FROM cost_by_dimension
            WHERE timestamp >= now() - INTERVAL 7 DAY
        """
        pairs: List[Tuple[str, str]] = []
        for record in self.client.query(sql).result_rows:
            pairs.append((str(record[0]), str(record[1])))
        return pairs

    def get_cost_history(
        self, service: str, model: str, hour_of_week: int
    ) -> List[float]:
        """Return up to four hourly cost sums for the given hour-of-week slot.

        Rows are restricted to the last 28 days, summed per clock hour and
        ordered newest first; NULL sums are dropped.
        """
        sql = """
            SELECT sum(cost) as hourly_cost
            FROM cost_by_dimension
            WHERE service = %(service)s
              AND model = %(model)s
              AND ((toDayOfWeek(timestamp) - 1) * 24 + toHour(timestamp)) = %(hour_of_week)s
              AND timestamp >= now() - INTERVAL 28 DAY
            GROUP BY toStartOfHour(timestamp)
            ORDER BY toStartOfHour(timestamp) DESC
            LIMIT 4
        """
        bindings = {"service": service, "model": model, "hour_of_week": hour_of_week}
        history: List[float] = []
        for record in self.client.query(sql, bindings).result_rows:
            hourly_cost = record[0]
            if hourly_cost is not None:
                history.append(float(hourly_cost))
        return history

    def get_global_model_avg(self, model: str, hour_of_week: int) -> float:
        """Return the average hourly cost of ``model`` for an hour-of-week slot.

        The inner query sums cost per clock hour over the last 28 days without
        filtering or grouping by service; the outer query averages those sums.
        Returns 0.0 when there is no data.

        NOTE(review): because services are not grouped, each hourly sum is the
        total across all services -- confirm this is the intended seed for a
        per-service baseline.
        """
        sql = """
            SELECT avg(hourly_cost)
            FROM (
                SELECT sum(cost) as hourly_cost
                FROM cost_by_dimension
                WHERE model = %(model)s
                  AND ((toDayOfWeek(timestamp) - 1) * 24 + toHour(timestamp)) = %(hour_of_week)s
                  AND timestamp >= now() - INTERVAL 28 DAY
                GROUP BY toStartOfHour(timestamp)
            )
        """
        bindings = {"model": model, "hour_of_week": hour_of_week}
        outcome = self.client.query(sql, bindings)
        return self._first_cell_or_zero(outcome.result_rows)

    def get_current_cost_1h(self, service: str, model: str) -> float:
        """Return the summed cost of the pair over the trailing hour (0.0 if none)."""
        sql = """
            SELECT sum(cost)
            FROM cost_by_dimension
            WHERE service = %(service)s
              AND model = %(model)s
              AND timestamp >= now() - INTERVAL 1 HOUR
        """
        outcome = self.client.query(sql, {"service": service, "model": model})
        return self._first_cell_or_zero(outcome.result_rows)

    def get_cost_by_cluster_1h(self, service: str, model: str) -> List[ClusterCost]:
        """Return the trailing-hour cost of the pair broken down by ``cluster_id``.

        Rows whose cluster id or cost is NULL are skipped.
        """
        sql = """
            SELECT cluster_id, sum(cost) as cost
            FROM cost_by_dimension
            WHERE service = %(service)s
              AND model = %(model)s
              AND timestamp >= now() - INTERVAL 1 HOUR
            GROUP BY cluster_id
        """
        outcome = self.client.query(sql, {"service": service, "model": model})
        breakdown: List[ClusterCost] = []
        for record in outcome.result_rows:
            if record[0] is None or record[1] is None:
                continue
            breakdown.append(
                ClusterCost(cluster_id=str(record[0]), cost=float(record[1]))
            )
        return breakdown
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from dataclasses import asdict
|
|
3
|
+
from confluent_kafka import Producer
|
|
4
|
+
from shared.ports.alert_publisher_port import AlertPublisherPort
|
|
5
|
+
from shared.types.ewma_types import AnomalyPayload
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class KafkaAlertAdapter(AlertPublisherPort):
    """``AlertPublisherPort`` implementation that emits anomaly alerts to Kafka."""

    # Class-level default so the topic is defined even when no override is given.
    topic = "alerts.cost.anomaly"

    def __init__(self, bootstrap_servers: str, topic: str | None = None):
        """Create the producer.

        Args:
            bootstrap_servers: Value for the ``bootstrap.servers`` setting.
            topic: Optional topic override; defaults to ``alerts.cost.anomaly``.
        """
        self.producer = Producer({"bootstrap.servers": bootstrap_servers})
        if topic is not None:
            self.topic = topic

    def publish_anomaly(self, payload: AnomalyPayload) -> None:
        """Serialize ``payload`` to JSON and publish it, keyed by service:model.

        Blocks in ``flush()`` until the message has been handled.

        Raises:
            RuntimeError: If the delivery report carries an error. Previously
                such failures were silently dropped because no delivery
                callback was registered.
        """
        delivery_errors: list[str] = []

        def _on_delivery(err, _msg) -> None:
            # Invoked by the client while flush() serves delivery reports.
            if err is not None:
                delivery_errors.append(str(err))

        payload_dict = asdict(payload)
        self.producer.produce(
            topic=self.topic,
            key=f"{payload.service}:{payload.model}",
            value=json.dumps(payload_dict).encode("utf-8"),
            on_delivery=_on_delivery,
        )
        self.producer.flush()

        if delivery_errors:
            raise RuntimeError(
                f"Failed to deliver anomaly alert to Kafka topic {self.topic!r}: "
                + "; ".join(delivery_errors)
            )
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import psycopg
|
|
2
|
+
from shared.ports.postgres_port import PostgresPort
|
|
3
|
+
from shared.types.ewma_types import EwmaRecord
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class PostgresAdapter(PostgresPort):
    """``PostgresPort`` implementation over the ``ewma_baselines`` table.

    A new connection is opened for every call using the stored DSN.
    """

    def __init__(self, dsn: str):
        """Remember the connection string; no connection is opened here."""
        self.dsn = dsn

    def get_baseline(
        self, service: str, model: str, hour_of_week: int
    ) -> EwmaRecord | None:
        """Fetch the baseline for (service, model, hour_of_week), or ``None``."""
        select_sql = """
            SELECT service, model, hour_of_week, ewma_value, sample_count, is_cold_start, updated_at
            FROM ewma_baselines
            WHERE service = %s AND model = %s AND hour_of_week = %s
        """
        lookup_key = (service, model, hour_of_week)
        with psycopg.connect(self.dsn) as conn, conn.cursor() as cur:
            cur.execute(select_sql, lookup_key)
            found = cur.fetchone()
            if not found:
                return None
            # Column order mirrors the SELECT list above.
            svc, mdl, slot, value, samples, cold, stamped = found
            return EwmaRecord(
                service=svc,
                model=mdl,
                hour_of_week=slot,
                ewma_value=float(value),
                sample_count=int(samples),
                is_cold_start=bool(cold),
                updated_at=stamped,
            )

    def upsert_baseline(self, record: EwmaRecord) -> None:
        """Insert the record, or update it when its key already exists.

        ``updated_at`` is always set by the database (``CURRENT_TIMESTAMP``);
        the value carried on ``record`` is not written.
        """
        upsert_sql = """
            INSERT INTO ewma_baselines (service, model, hour_of_week, ewma_value, sample_count, is_cold_start, updated_at)
            VALUES (%s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP)
            ON CONFLICT (service, model, hour_of_week)
            DO UPDATE SET
                ewma_value = EXCLUDED.ewma_value,
                sample_count = EXCLUDED.sample_count,
                is_cold_start = EXCLUDED.is_cold_start,
                updated_at = CURRENT_TIMESTAMP
        """
        row_values = (
            record.service,
            record.model,
            record.hour_of_week,
            record.ewma_value,
            record.sample_count,
            record.is_cold_start,
        )
        with psycopg.connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.execute(upsert_sql, row_values)
            conn.commit()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import redis
|
|
2
|
+
from shared.ports.redis_port import RedisPort
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class RedisAdapter(RedisPort):
    """``RedisPort`` implementation storing EWMA values as string keys."""

    def __init__(self, url: str):
        """Build the Redis client from a connection URL."""
        self.client = redis.from_url(url)

    @staticmethod
    def _cache_key(service: str, model: str, hour_of_week: int) -> str:
        """Compose the cache key shared by the getter and the setter."""
        return f"ewma:cost:{service}:{model}:{hour_of_week}"

    def get_ewma(self, service: str, model: str, hour_of_week: int) -> float | None:
        """Return the cached EWMA as float, or ``None`` when the key is absent.

        Values that are neither ``str`` nor ``bytes`` also yield ``None``.
        """
        cached = self.client.get(self._cache_key(service, model, hour_of_week))
        if isinstance(cached, (str, bytes)):
            return float(cached)
        return None

    def set_ewma(
        self, service: str, model: str, hour_of_week: int, value: float
    ) -> None:
        """Store ``value`` (as its string form) under the slot's key, without expiry."""
        self.client.set(self._cache_key(service, model, hour_of_week), str(value))
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from shared.errors.base import ValidationError
|
|
5
|
+
|
|
6
|
+
# Directory three levels above this file's own directory
# (presumably the project root -- confirm against the package layout).
BASE_DIR = Path(__file__).resolve().parents[3]

# Contract location: the CONTRACTS_PATH environment variable wins; otherwise
# the workflow contract under BASE_DIR is used.
CONTRACT_FILE: Path = Path(
    os.environ.get(
        "CONTRACTS_PATH",
        BASE_DIR.joinpath("contracts", "workflows", "ewma_baseline_update.yaml"),
    )
)

# Fall back to a fixed absolute path when the resolved file is missing
# (looks like a container image layout -- TODO confirm).
if not CONTRACT_FILE.exists():
    CONTRACT_FILE = Path("/app/contracts/workflows/ewma_baseline_update.yaml")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _extract_val(text: str, pattern: str) -> str:
|
|
19
|
+
m = re.search(pattern, text, re.MULTILINE)
|
|
20
|
+
if not m:
|
|
21
|
+
raise ValidationError(f"Missing expected pattern in contract: {pattern}")
|
|
22
|
+
return m.group(1).strip()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def load_workflow_contract() -> dict:
    """Read, validate and summarize the workflow contract file.

    Returns:
        A dict with the fixed ``workflow`` name, the integer ``version`` and
        the ``cron`` expression extracted from the contract text.

    Raises:
        ValidationError: If the file is missing, fails validation, or lacks
            the version / cron entries.
    """
    path = Path(CONTRACT_FILE)
    if not path.exists():
        raise ValidationError(f"Contract file not found at {path}")
    # Decode explicitly as UTF-8 so parsing does not depend on the host locale.
    text = path.read_text(encoding="utf-8")
    validate_workflow_contract(text)
    return {
        "workflow": "ewma_baseline_update",
        "version": int(_extract_val(text, r"^version:\s*(\d+)\s*$")),
        "cron": _extract_val(text, r"^schedule:\s*\n\s*cron:\s*\"(.*)\"\s*$"),
    }
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def validate_workflow_contract(text: str) -> None:
    """Check that the contract text has the mandatory entries and a version >= 1.

    Raises:
        ValidationError: Naming the first mandatory fragment that is absent,
            or when the version line is missing or the version is below 1.
    """
    mandatory = (
        "workflow: ewma_baseline_update",
        'cron: "0 * * * *"',
        "activities:",
        "fetch_active_pairs:",
        "upsert_baseline:",
        "publish_anomaly_alert:",
    )
    # Plain substring checks, reported in declaration order.
    absent = next((piece for piece in mandatory if piece not in text), None)
    if absent is not None:
        raise ValidationError(f"Missing required contract fragment: {absent}")

    declared_version = int(_extract_val(text, r"^version:\s*(\d+)\s*$"))
    if declared_version <= 0:
        raise ValidationError("Contract version must be positive integer")
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from typing import Protocol, Tuple, List
|
|
2
|
+
from shared.types.ewma_types import ClusterCost
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ClickHousePort(Protocol):
    """Structural interface for the cost queries the worker needs."""

    def get_active_pairs(self) -> List[Tuple[str, str]]:
        """Return the (service, model) pairs to process."""
        ...

    def get_cost_history(
        self, service: str, model: str, hour_of_week: int
    ) -> List[float]:
        """Return past cost values of the pair for the given hour-of-week slot."""
        ...

    def get_global_model_avg(self, model: str, hour_of_week: int) -> float:
        """Return the model-wide average cost for the given hour-of-week slot."""
        ...

    def get_current_cost_1h(self, service: str, model: str) -> float:
        """Return the pair's cost over the most recent hour."""
        ...

    def get_cost_by_cluster_1h(self, service: str, model: str) -> List[ClusterCost]:
        """Return the pair's most-recent-hour cost broken down per cluster."""
        ...
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from typing import Protocol
|
|
2
|
+
from shared.types.ewma_types import EwmaRecord
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class PostgresPort(Protocol):
    """Structural interface for reading and writing stored EWMA baselines."""

    def get_baseline(
        self, service: str, model: str, hour_of_week: int
    ) -> EwmaRecord | None:
        """Return the stored baseline for the key, or ``None`` if there is none."""
        ...

    def upsert_baseline(self, record: EwmaRecord) -> None:
        """Persist ``record``, creating or replacing the entry for its key."""
        ...
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
|
|
6
|
+
class EwmaRecord:
|
|
7
|
+
service: str
|
|
8
|
+
model: str
|
|
9
|
+
hour_of_week: int
|
|
10
|
+
ewma_value: float
|
|
11
|
+
sample_count: int
|
|
12
|
+
is_cold_start: bool
|
|
13
|
+
updated_at: datetime | None = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class ClusterCost:
    """Cost attributed to a single cluster."""

    # Identifier of the cluster the cost belongs to.
    cluster_id: str
    # Cost amount for that cluster (unit not specified here).
    cost: float
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class AnomalyPayload:
    """Alert body describing one detected cost anomaly."""

    # --- which baseline slot triggered the alert ------------------------------
    service: str
    model: str
    hour_of_week: int
    # --- observed value versus baseline --------------------------------------
    current_cost: float
    ewma_value: float
    threshold_value: float
    sample_count: int
    # Time of detection as text (exact format not fixed here -- TODO confirm).
    timestamp: str
    # Per-cluster cost breakdown accompanying the alert.
    cluster_drilldown: list[ClusterCost]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: temporal-ewma-worker
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Temporal scheduled workflow worker for EWMA baseline updating and anomaly checking
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: temporalio>=1.5.0
|
|
7
|
+
Requires-Dist: clickhouse-connect>=0.7.0
|
|
8
|
+
Requires-Dist: redis>=5.0.0
|
|
9
|
+
Requires-Dist: psycopg[binary]>=3.1.0
|
|
10
|
+
Requires-Dist: confluent-kafka>=2.3.0
|
|
11
|
+
Requires-Dist: opentelemetry-sdk>=1.26.0
|
|
12
|
+
Requires-Dist: pydantic>=2.0.0
|
|
13
|
+
Requires-Dist: fastapi>=0.100.0
|
|
14
|
+
Requires-Dist: uvicorn>=0.22.0
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
17
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
18
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
|
|
19
|
+
Requires-Dist: mypy>=1.8.0; extra == "dev"
|
|
20
|
+
Requires-Dist: ruff>=0.2.0; extra == "dev"
|
|
21
|
+
Requires-Dist: httpx>=0.24.0; extra == "dev"
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/api/index.py
|
|
4
|
+
src/api/rest/v1/app.py
|
|
5
|
+
src/features/ewma_compute/index.py
|
|
6
|
+
src/features/ewma_compute/service.py
|
|
7
|
+
src/infra/adapters/clickhouse/clickhouse_adapter.py
|
|
8
|
+
src/infra/adapters/kafka/kafka_alert_adapter.py
|
|
9
|
+
src/infra/adapters/postgres/postgres_adapter.py
|
|
10
|
+
src/infra/adapters/redis/redis_adapter.py
|
|
11
|
+
src/shared/contracts/validator.py
|
|
12
|
+
src/shared/errors/base.py
|
|
13
|
+
src/shared/ports/alert_publisher_port.py
|
|
14
|
+
src/shared/ports/clickhouse_port.py
|
|
15
|
+
src/shared/ports/postgres_port.py
|
|
16
|
+
src/shared/ports/redis_port.py
|
|
17
|
+
src/shared/types/ewma_types.py
|
|
18
|
+
src/temporal_ewma_worker.egg-info/PKG-INFO
|
|
19
|
+
src/temporal_ewma_worker.egg-info/SOURCES.txt
|
|
20
|
+
src/temporal_ewma_worker.egg-info/dependency_links.txt
|
|
21
|
+
src/temporal_ewma_worker.egg-info/requires.txt
|
|
22
|
+
src/temporal_ewma_worker.egg-info/top_level.txt
|
|
23
|
+
src/worker/activities.py
|
|
24
|
+
src/worker/config.py
|
|
25
|
+
src/worker/index.py
|
|
26
|
+
src/worker/registry.py
|
|
27
|
+
src/worker/workflows.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
temporalio>=1.5.0
|
|
2
|
+
clickhouse-connect>=0.7.0
|
|
3
|
+
redis>=5.0.0
|
|
4
|
+
psycopg[binary]>=3.1.0
|
|
5
|
+
confluent-kafka>=2.3.0
|
|
6
|
+
opentelemetry-sdk>=1.26.0
|
|
7
|
+
pydantic>=2.0.0
|
|
8
|
+
fastapi>=0.100.0
|
|
9
|
+
uvicorn>=0.22.0
|
|
10
|
+
|
|
11
|
+
[dev]
|
|
12
|
+
pytest>=8.0
|
|
13
|
+
pytest-cov>=4.1.0
|
|
14
|
+
pytest-asyncio>=0.23.0
|
|
15
|
+
mypy>=1.8.0
|
|
16
|
+
ruff>=0.2.0
|
|
17
|
+
httpx>=0.24.0
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from typing import List, Tuple
|
|
2
|
+
from temporalio import activity
|
|
3
|
+
from shared.ports.clickhouse_port import ClickHousePort
|
|
4
|
+
from shared.ports.redis_port import RedisPort
|
|
5
|
+
from shared.ports.postgres_port import PostgresPort
|
|
6
|
+
from shared.ports.alert_publisher_port import AlertPublisherPort
|
|
7
|
+
from shared.types.ewma_types import EwmaRecord, ClusterCost, AnomalyPayload
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class EwmaActivities:
    """Temporal activities that forward each call to one of the injected ports.

    NOTE(review): every activity is ``async def`` yet calls its port without
    ``await``. If the adapters do blocking I/O, each call holds the event
    loop for its duration - confirm this is acceptable for the worker.
    """

    def __init__(
        self,
        clickhouse: ClickHousePort,
        redis: RedisPort,
        postgres: PostgresPort,
        alert_publisher: AlertPublisherPort,
    ):
        """Keep a reference to each port for later use by the activities."""
        self.clickhouse, self.redis = clickhouse, redis
        self.postgres, self.alert_publisher = postgres, alert_publisher

    @activity.defn(name="fetch_active_pairs")
    async def fetch_active_pairs(self) -> List[Tuple[str, str]]:
        """Return whatever the ClickHouse port reports as active pairs."""
        pairs = self.clickhouse.get_active_pairs()
        return pairs

    @activity.defn(name="fetch_cost_history")
    async def fetch_cost_history(
        self, service: str, model: str, hour_of_week: int
    ) -> List[float]:
        """Return the cost history for the given service, model and slot."""
        history = self.clickhouse.get_cost_history(service, model, hour_of_week)
        return history

    @activity.defn(name="fetch_global_model_avg")
    async def fetch_global_model_avg(self, model: str, hour_of_week: int) -> float:
        """Return the global average for ``model`` in the given slot."""
        average = self.clickhouse.get_global_model_avg(model, hour_of_week)
        return average

    @activity.defn(name="fetch_current_cost_1h")
    async def fetch_current_cost_1h(self, service: str, model: str) -> float:
        """Return the current one-hour cost for the service/model pair."""
        cost = self.clickhouse.get_current_cost_1h(service, model)
        return cost

    @activity.defn(name="fetch_cost_by_cluster_1h")
    async def fetch_cost_by_cluster_1h(
        self, service: str, model: str
    ) -> List[ClusterCost]:
        """Return the one-hour cost per cluster for the service/model pair."""
        per_cluster = self.clickhouse.get_cost_by_cluster_1h(service, model)
        return per_cluster

    @activity.defn(name="get_baseline")
    async def get_baseline(
        self, service: str, model: str, hour_of_week: int
    ) -> EwmaRecord | None:
        """Return the baseline the Postgres port holds for this key, or ``None``."""
        stored = self.postgres.get_baseline(service, model, hour_of_week)
        return stored

    @activity.defn(name="upsert_baseline")
    async def upsert_baseline(self, record: EwmaRecord) -> None:
        """Write ``record`` through the Postgres port, then push its value to Redis."""
        self.postgres.upsert_baseline(record)
        # Redis is updated only after the Postgres call has returned.
        slot_key = (record.service, record.model, record.hour_of_week)
        self.redis.set_ewma(*slot_key, record.ewma_value)

    @activity.defn(name="publish_anomaly_alert")
    async def publish_anomaly_alert(self, payload: AnomalyPayload) -> None:
        """Pass ``payload`` to the alert publisher port."""
        self.alert_publisher.publish_anomaly(payload)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from shared.errors.base import ValidationError
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass(frozen=True)
class WorkerConfig:
    """Immutable connection settings for the worker process."""

    # Temporal connection.
    temporal_host: str
    temporal_namespace: str
    temporal_task_queue: str

    # ClickHouse connection.
    clickhouse_host: str
    clickhouse_port: int
    clickhouse_username: str
    clickhouse_password: str
    clickhouse_database: str

    # Redis, Postgres and Kafka endpoints.
    redis_url: str
    postgres_dsn: str
    kafka_bootstrap_servers: str
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _int_val(raw: str, key: str) -> int:
|
|
22
|
+
try:
|
|
23
|
+
return int(raw)
|
|
24
|
+
except ValueError as exc:
|
|
25
|
+
raise ValidationError(f"{key} must be an integer") from exc
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def load_config(env: dict[str, str] | None = None) -> WorkerConfig:
    """Assemble a ``WorkerConfig`` from environment-style key/value pairs.

    Args:
        env: Mapping to read settings from. ``None`` (the default) means
            ``os.environ``; any mapping that is passed, including an empty
            one, is used exactly as given.

    Returns:
        A ``WorkerConfig`` in which every key missing from the mapping takes
        the default written inline below.

    Raises:
        ValidationError: if ``CLICKHOUSE_PORT`` is not an integer.
    """
    # Test for None explicitly: ``env or os.environ`` treated an empty dict as
    # "not provided" and silently read the process environment instead.
    source = os.environ if env is None else env

    postgres_user = source.get("POSTGRES_USER", "postgres")
    postgres_password = source.get("POSTGRES_PASSWORD", "postgres")
    postgres_host = source.get("POSTGRES_HOST", "localhost")
    postgres_port = source.get("POSTGRES_PORT", "5439")
    postgres_db = source.get("POSTGRES_DB", "ewma_db")

    # DSN built from the individual POSTGRES_* parts; used only when
    # POSTGRES_DSN itself is absent.
    postgres_dsn = f"postgresql://{postgres_user}:{postgres_password}@{postgres_host}:{postgres_port}/{postgres_db}"

    return WorkerConfig(
        temporal_host=source.get("TEMPORAL_HOST", "localhost:7239"),
        temporal_namespace=source.get("TEMPORAL_NAMESPACE", "default"),
        temporal_task_queue=source.get("TEMPORAL_TASK_QUEUE", "ewma-tasks"),
        clickhouse_host=source.get("CLICKHOUSE_HOST", "localhost"),
        clickhouse_port=_int_val(
            source.get("CLICKHOUSE_PORT", "8129"), "CLICKHOUSE_PORT"
        ),
        clickhouse_username=source.get("CLICKHOUSE_USERNAME", "default"),
        clickhouse_password=source.get("CLICKHOUSE_PASSWORD", ""),
        clickhouse_database=source.get("CLICKHOUSE_DATABASE", "default"),
        redis_url=source.get("REDIS_URL", "redis://localhost:6389/0"),
        postgres_dsn=source.get("POSTGRES_DSN", postgres_dsn),
        kafka_bootstrap_servers=source.get("KAFKA_BOOTSTRAP_SERVERS", "localhost:9099"),
    )
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import uvicorn
|
|
3
|
+
from api.rest.v1.app import app
|
|
4
|
+
from temporalio.client import Client
|
|
5
|
+
from temporalio.worker import Worker
|
|
6
|
+
from worker.config import load_config
|
|
7
|
+
from worker.activities import EwmaActivities
|
|
8
|
+
from worker.workflows import EwmaBaselineUpdate
|
|
9
|
+
from infra.adapters.clickhouse.clickhouse_adapter import ClickHouseAdapter
|
|
10
|
+
from infra.adapters.redis.redis_adapter import RedisAdapter
|
|
11
|
+
from infra.adapters.postgres.postgres_adapter import PostgresAdapter
|
|
12
|
+
from infra.adapters.kafka.kafka_alert_adapter import KafkaAlertAdapter
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
async def main() -> None:
    """Wire the adapters into the activities, then run the worker and HTTP app.

    NOTE(review): ``asyncio.gather`` waits for both coroutines to finish;
    confirm the Temporal worker is stopped when the HTTP server exits on its
    own (for example after a signal).
    """
    cfg = load_config()

    # Keyword arguments are evaluated left to right, so the adapters are
    # created in the order ClickHouse, Redis, Postgres, Kafka.
    ewma_activities = EwmaActivities(
        clickhouse=ClickHouseAdapter(
            host=cfg.clickhouse_host,
            port=cfg.clickhouse_port,
            username=cfg.clickhouse_username,
            password=cfg.clickhouse_password,
            database=cfg.clickhouse_database,
        ),
        redis=RedisAdapter(url=cfg.redis_url),
        postgres=PostgresAdapter(dsn=cfg.postgres_dsn),
        alert_publisher=KafkaAlertAdapter(
            bootstrap_servers=cfg.kafka_bootstrap_servers
        ),
    )

    temporal_client = await Client.connect(
        cfg.temporal_host, namespace=cfg.temporal_namespace
    )

    registered_activities = [
        ewma_activities.fetch_active_pairs,
        ewma_activities.fetch_cost_history,
        ewma_activities.fetch_global_model_avg,
        ewma_activities.fetch_current_cost_1h,
        ewma_activities.fetch_cost_by_cluster_1h,
        ewma_activities.get_baseline,
        ewma_activities.upsert_baseline,
        ewma_activities.publish_anomaly_alert,
    ]
    temporal_worker = Worker(
        temporal_client,
        task_queue=cfg.temporal_task_queue,
        workflows=[EwmaBaselineUpdate],
        activities=registered_activities,
    )

    http_server = uvicorn.Server(
        uvicorn.Config(app, host="0.0.0.0", port=8000, log_level="info")
    )

    # Both run on the same event loop until they complete.
    await asyncio.gather(temporal_worker.run(), http_server.serve())
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
if __name__ == "__main__":
|
|
69
|
+
asyncio.run(main())
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from shared.contracts.validator import load_workflow_contract
|
|
3
|
+
from worker.workflows import EwmaBaselineUpdate
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass(frozen=True)
class WorkflowDefinition:
    """Immutable registry entry tying a workflow name to its class and contract."""

    # Key under which the workflow is registered.
    name: str
    # Class that implements the workflow.
    handler: type
    # Contract data loaded for the workflow.
    contract: dict
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def build_registry() -> dict[str, WorkflowDefinition]:
    """Return the workflow registry: one entry, keyed by the workflow's name."""
    workflow_name = "ewma_baseline_update"
    definition = WorkflowDefinition(
        name=workflow_name,
        handler=EwmaBaselineUpdate,
        contract=load_workflow_contract(),
    )
    return {workflow_name: definition}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
WORKFLOW_REGISTRY = build_registry()
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from datetime import timedelta
|
|
4
|
+
from temporalio import workflow
|
|
5
|
+
from shared.types.ewma_types import EwmaRecord, AnomalyPayload
|
|
6
|
+
from features.ewma_compute.service import EwmaService
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
|
|
10
|
+
class EwmaWorkflowInput:
|
|
11
|
+
force_hour: int | None = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@workflow.defn(name="EwmaBaselineUpdate")
class EwmaBaselineUpdate:
    """Workflow that updates the EWMA baseline of every active pair for one slot."""

    @workflow.run
    async def run(self, workflow_input: EwmaWorkflowInput | None = None) -> None:
        """Pick the hour-of-week slot, then process all active pairs concurrently.

        The slot is ``workflow_input.force_hour`` when that is provided;
        otherwise it is ``weekday * 24 + hour`` taken from ``workflow.now()``.
        """
        forced_slot = workflow_input.force_hour if workflow_input else None
        if forced_slot is None:
            now = workflow.now()
            slot = now.weekday() * 24 + now.hour
        else:
            slot = forced_slot

        pairs = await workflow.execute_activity(
            "fetch_active_pairs", start_to_close_timeout=timedelta(seconds=60)
        )

        # One coroutine per (service, model) pair, awaited together.
        await asyncio.gather(
            *(self._process_pair(service, model, slot) for service, model in pairs)
        )

    async def _store_baseline(self, record: EwmaRecord) -> None:
        """Run the ``upsert_baseline`` activity for ``record``."""
        await workflow.execute_activity(
            "upsert_baseline",
            args=[record],
            start_to_close_timeout=timedelta(seconds=15),
        )

    async def _process_pair(self, service: str, model: str, hour_of_week: int) -> None:
        """Seed or refresh the baseline of one pair, alerting on an anomaly.

        With no stored baseline, a new record is built from the global model
        average and stored. Otherwise the current one-hour cost is fetched, an
        alert is published if the baseline is not in cold start and the cost
        exceeds three times its EWMA value, and the updated record is stored.
        """
        baseline = await workflow.execute_activity(
            "get_baseline",
            args=[service, model, hour_of_week],
            result_type=EwmaRecord,
            start_to_close_timeout=timedelta(seconds=10),
        )

        if baseline is None:
            seed_average = await workflow.execute_activity(
                "fetch_global_model_avg",
                args=[model, hour_of_week],
                start_to_close_timeout=timedelta(seconds=30),
            )

            seeded = EwmaService.process_update(0.0, None, seed_average)
            # The identifying fields are filled in here, after the service call.
            seeded.service = service
            seeded.model = model
            seeded.hour_of_week = hour_of_week

            await self._store_baseline(seeded)
            return

        observed_cost = await workflow.execute_activity(
            "fetch_current_cost_1h",
            args=[service, model],
            start_to_close_timeout=timedelta(seconds=30),
        )

        # The threshold is only evaluated once the baseline has left cold start.
        anomalous = False
        if not baseline.is_cold_start:
            threshold = 3.0 * baseline.ewma_value
            anomalous = observed_cost > threshold

        if anomalous:
            drilldown = await workflow.execute_activity(
                "fetch_cost_by_cluster_1h",
                args=[service, model],
                start_to_close_timeout=timedelta(seconds=30),
            )

            alert = AnomalyPayload(
                service=service,
                model=model,
                hour_of_week=hour_of_week,
                current_cost=observed_cost,
                ewma_value=baseline.ewma_value,
                threshold_value=threshold,
                sample_count=baseline.sample_count,
                timestamp=workflow.now().isoformat(),
                cluster_drilldown=drilldown,
            )

            await workflow.execute_activity(
                "publish_anomaly_alert",
                args=[alert],
                start_to_close_timeout=timedelta(seconds=20),
            )

        # The observed cost is folded into the baseline whether or not it alerted.
        refreshed = EwmaService.process_update(observed_cost, baseline, 0.0)
        await self._store_baseline(refreshed)
|