@gravito/zenith 1.1.3 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -10
- package/dist/bin.js +43235 -76691
- package/dist/client/index.html +13 -0
- package/dist/server/index.js +43235 -76691
- package/package.json +16 -7
- package/CHANGELOG.md +0 -62
- package/Dockerfile +0 -46
- package/Dockerfile.demo-worker +0 -29
- package/bin/flux-console.ts +0 -2
- package/doc/ECOSYSTEM_EXPANSION_RFC.md +0 -130
- package/docker-compose.yml +0 -40
- package/docs/ALERTING_GUIDE.md +0 -71
- package/docs/DEPLOYMENT.md +0 -157
- package/docs/DOCS_INTERNAL.md +0 -73
- package/docs/LARAVEL_ZENITH_ROADMAP.md +0 -109
- package/docs/QUASAR_MASTER_PLAN.md +0 -140
- package/docs/QUICK_TEST_GUIDE.md +0 -72
- package/docs/ROADMAP.md +0 -85
- package/docs/integrations/LARAVEL.md +0 -207
- package/postcss.config.js +0 -6
- package/scripts/debug_redis_keys.ts +0 -24
- package/scripts/flood-logs.ts +0 -21
- package/scripts/seed.ts +0 -213
- package/scripts/verify-throttle.ts +0 -49
- package/scripts/worker.ts +0 -124
- package/specs/PULSE_SPEC.md +0 -86
- package/src/bin.ts +0 -6
- package/src/client/App.tsx +0 -72
- package/src/client/Layout.tsx +0 -669
- package/src/client/Sidebar.tsx +0 -112
- package/src/client/ThroughputChart.tsx +0 -158
- package/src/client/WorkerStatus.tsx +0 -202
- package/src/client/components/BrandIcons.tsx +0 -168
- package/src/client/components/ConfirmDialog.tsx +0 -134
- package/src/client/components/JobInspector.tsx +0 -487
- package/src/client/components/LogArchiveModal.tsx +0 -432
- package/src/client/components/NotificationBell.tsx +0 -212
- package/src/client/components/PageHeader.tsx +0 -47
- package/src/client/components/Toaster.tsx +0 -90
- package/src/client/components/UserProfileDropdown.tsx +0 -186
- package/src/client/contexts/AuthContext.tsx +0 -105
- package/src/client/contexts/NotificationContext.tsx +0 -128
- package/src/client/index.css +0 -172
- package/src/client/main.tsx +0 -15
- package/src/client/pages/LoginPage.tsx +0 -164
- package/src/client/pages/MetricsPage.tsx +0 -445
- package/src/client/pages/OverviewPage.tsx +0 -519
- package/src/client/pages/PulsePage.tsx +0 -409
- package/src/client/pages/QueuesPage.tsx +0 -378
- package/src/client/pages/SchedulesPage.tsx +0 -535
- package/src/client/pages/SettingsPage.tsx +0 -1001
- package/src/client/pages/WorkersPage.tsx +0 -380
- package/src/client/pages/index.ts +0 -8
- package/src/client/utils.ts +0 -15
- package/src/server/config/ServerConfigManager.ts +0 -90
- package/src/server/index.ts +0 -860
- package/src/server/middleware/auth.ts +0 -127
- package/src/server/services/AlertService.ts +0 -321
- package/src/server/services/CommandService.ts +0 -136
- package/src/server/services/LogStreamProcessor.ts +0 -93
- package/src/server/services/MaintenanceScheduler.ts +0 -78
- package/src/server/services/PulseService.ts +0 -148
- package/src/server/services/QueueMetricsCollector.ts +0 -138
- package/src/server/services/QueueService.ts +0 -924
- package/src/shared/types.ts +0 -223
- package/tailwind.config.js +0 -80
- package/tests/placeholder.test.ts +0 -7
- package/tsconfig.json +0 -29
- package/tsconfig.node.json +0 -10
- package/vite.config.ts +0 -27
package/package.json
CHANGED
|
@@ -1,14 +1,22 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gravito/zenith",
|
|
3
|
-
"
|
|
3
|
+
"sideEffects": false,
|
|
4
|
+
"version": "1.1.6",
|
|
4
5
|
"description": "Gravito Zenith: Zero-config control plane for Gravito Flux & Stream",
|
|
5
6
|
"type": "module",
|
|
6
7
|
"bin": {
|
|
7
8
|
"zenith": "dist/bin.js",
|
|
8
9
|
"flux-console": "dist/bin.js"
|
|
9
10
|
},
|
|
10
|
-
"main": "./dist/index.js",
|
|
11
|
-
"
|
|
11
|
+
"main": "./dist/server/index.js",
|
|
12
|
+
"exports": {
|
|
13
|
+
".": "./dist/server/index.js",
|
|
14
|
+
"./bin": "./dist/bin.js"
|
|
15
|
+
},
|
|
16
|
+
"files": [
|
|
17
|
+
"dist",
|
|
18
|
+
"README.md"
|
|
19
|
+
],
|
|
12
20
|
"scripts": {
|
|
13
21
|
"dev:server": "bun run --watch src/server/index.ts",
|
|
14
22
|
"dev:client": "vite",
|
|
@@ -22,10 +30,10 @@
|
|
|
22
30
|
"test:integration": "test $(find tests -name '*.integration.test.ts' 2>/dev/null | wc -l) -gt 0 && find tests -name '*.integration.test.ts' -print0 | xargs -0 bun test --timeout=10000 || echo 'No integration tests found'"
|
|
23
31
|
},
|
|
24
32
|
"dependencies": {
|
|
25
|
-
"@gravito/atlas": "^
|
|
26
|
-
"@gravito/photon": "^1.
|
|
27
|
-
"@gravito/quasar": "^1.3.
|
|
28
|
-
"@gravito/stream": "^2.
|
|
33
|
+
"@gravito/atlas": "^2.5.2",
|
|
34
|
+
"@gravito/photon": "^1.1.3",
|
|
35
|
+
"@gravito/quasar": "^1.3.2",
|
|
36
|
+
"@gravito/stream": "^2.1.1",
|
|
29
37
|
"@tanstack/react-query": "^5.0.0",
|
|
30
38
|
"clsx": "^2.1.1",
|
|
31
39
|
"date-fns": "^4.1.0",
|
|
@@ -43,6 +51,7 @@
|
|
|
43
51
|
"@types/nodemailer": "^7.0.4",
|
|
44
52
|
"@types/react": "^19.0.0",
|
|
45
53
|
"@types/react-dom": "^19.0.0",
|
|
54
|
+
"@types/node": "latest",
|
|
46
55
|
"@vitejs/plugin-react": "^5.1.2",
|
|
47
56
|
"autoprefixer": "^10.4.0",
|
|
48
57
|
"postcss": "^8.4.0",
|
package/CHANGELOG.md
DELETED
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
# @gravito/zenith
|
|
2
|
-
|
|
3
|
-
## 1.1.3
|
|
4
|
-
|
|
5
|
-
### Patch Changes
|
|
6
|
-
|
|
7
|
-
- Convert all workspace:\* dependencies to version numbers for npm publishing
|
|
8
|
-
|
|
9
|
-
- Fixed 144 workspace:\* dependencies across 58 packages
|
|
10
|
-
- Ensures all packages work properly when installed from npm
|
|
11
|
-
- Resolves issues with bunx and npm installation of CLI tools
|
|
12
|
-
- All internal dependencies now use explicit version constraints
|
|
13
|
-
|
|
14
|
-
- Updated dependencies
|
|
15
|
-
- @gravito/photon@1.0.1
|
|
16
|
-
- @gravito/stream@2.0.2
|
|
17
|
-
|
|
18
|
-
## 1.1.2
|
|
19
|
-
|
|
20
|
-
### Patch Changes
|
|
21
|
-
|
|
22
|
-
- Updated dependencies [905588f]
|
|
23
|
-
- @gravito/stream@2.0.1
|
|
24
|
-
|
|
25
|
-
## 1.1.1
|
|
26
|
-
|
|
27
|
-
### Patch Changes
|
|
28
|
-
|
|
29
|
-
- Updated dependencies
|
|
30
|
-
- @gravito/atlas@2.1.0
|
|
31
|
-
- @gravito/stream@1.0.3
|
|
32
|
-
|
|
33
|
-
## 1.1.0
|
|
34
|
-
|
|
35
|
-
### Minor Changes
|
|
36
|
-
|
|
37
|
-
- Implement several more examples and fix module issues, including:
|
|
38
|
-
- Support middleware in core route definitions.
|
|
39
|
-
- Improve Atlas driver loading and dependency injection.
|
|
40
|
-
- Add PostgreSQL support to Ecommerce MVC example.
|
|
41
|
-
- Fix internal type resolution issues across packages.
|
|
42
|
-
|
|
43
|
-
### Patch Changes
|
|
44
|
-
|
|
45
|
-
- Updated dependencies
|
|
46
|
-
- @gravito/atlas@1.2.0
|
|
47
|
-
- @gravito/quasar@1.2.0
|
|
48
|
-
- @gravito/stream@1.0.2
|
|
49
|
-
|
|
50
|
-
## 1.0.1
|
|
51
|
-
|
|
52
|
-
### Patch Changes
|
|
53
|
-
|
|
54
|
-
- @gravito/stream@1.0.1
|
|
55
|
-
|
|
56
|
-
## 1.0.0
|
|
57
|
-
|
|
58
|
-
### Patch Changes
|
|
59
|
-
|
|
60
|
-
- Updated dependencies
|
|
61
|
-
- @gravito/atlas@1.0.1
|
|
62
|
-
- @gravito/stream@1.0.0
|
package/Dockerfile
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
# Use Bun official image
|
|
2
|
-
FROM oven/bun:1.1.26 AS base
|
|
3
|
-
WORKDIR /usr/src/app
|
|
4
|
-
|
|
5
|
-
# ---- 1. Install Dependencies ----
|
|
6
|
-
FROM base AS install
|
|
7
|
-
# Copy root files
|
|
8
|
-
COPY package.json bun.lock ./
|
|
9
|
-
# Copy package.json files for workspace resolution
|
|
10
|
-
COPY packages/photon/package.json ./packages/photon/
|
|
11
|
-
COPY packages/stream/package.json ./packages/stream/
|
|
12
|
-
COPY packages/flux-console/package.json ./packages/flux-console/
|
|
13
|
-
|
|
14
|
-
# Install dependencies
|
|
15
|
-
RUN bun install --frozen-lockfile
|
|
16
|
-
|
|
17
|
-
# ---- 2. Build Stage ----
|
|
18
|
-
FROM base AS build
|
|
19
|
-
COPY --from=install /usr/src/app/node_modules ./node_modules
|
|
20
|
-
COPY --from=install /usr/src/app/packages ./packages
|
|
21
|
-
COPY . .
|
|
22
|
-
|
|
23
|
-
# Build the console
|
|
24
|
-
# This bundles the server and builds the client (Vite)
|
|
25
|
-
RUN cd packages/flux-console && bun run build
|
|
26
|
-
|
|
27
|
-
# ---- 3. Production Runner ----
|
|
28
|
-
FROM base AS release
|
|
29
|
-
WORKDIR /app
|
|
30
|
-
|
|
31
|
-
# Copy built artifacts
|
|
32
|
-
# Note: server and bin are bundled into dist/
|
|
33
|
-
COPY --from=build /usr/src/app/packages/flux-console/dist ./dist
|
|
34
|
-
COPY --from=build /usr/src/app/packages/flux-console/package.json ./package.json
|
|
35
|
-
# Client source/assets are needed for the server to serve them
|
|
36
|
-
COPY --from=build /usr/src/app/packages/flux-console/src/client ./src/client
|
|
37
|
-
|
|
38
|
-
# Expose port
|
|
39
|
-
EXPOSE 3000
|
|
40
|
-
|
|
41
|
-
# Environment defaults
|
|
42
|
-
ENV PORT=3000
|
|
43
|
-
ENV NODE_ENV=production
|
|
44
|
-
|
|
45
|
-
# Start the console
|
|
46
|
-
CMD ["bun", "run", "dist/bin.js"]
|
package/Dockerfile.demo-worker
DELETED
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
# Use Bun official image
|
|
2
|
-
FROM oven/bun:1.1.26 AS base
|
|
3
|
-
WORKDIR /usr/src/app
|
|
4
|
-
|
|
5
|
-
# ---- 1. Install Dependencies ----
|
|
6
|
-
FROM base AS install
|
|
7
|
-
COPY package.json bun.lock ./
|
|
8
|
-
COPY packages/photon/package.json ./packages/photon/
|
|
9
|
-
COPY packages/stream/package.json ./packages/stream/
|
|
10
|
-
COPY packages/flux-console/package.json ./packages/flux-console/
|
|
11
|
-
RUN bun install --frozen-lockfile
|
|
12
|
-
|
|
13
|
-
# ---- 2. Copy Source ----
|
|
14
|
-
FROM base AS build
|
|
15
|
-
COPY --from=install /usr/src/app/node_modules ./node_modules
|
|
16
|
-
COPY --from=install /usr/src/app/packages ./packages
|
|
17
|
-
COPY . .
|
|
18
|
-
|
|
19
|
-
# ---- 3. Runner ----
|
|
20
|
-
FROM base AS release
|
|
21
|
-
WORKDIR /usr/src/app
|
|
22
|
-
COPY --from=build /usr/src/app ./
|
|
23
|
-
|
|
24
|
-
# Env defaults
|
|
25
|
-
ENV NODE_ENV=production
|
|
26
|
-
|
|
27
|
-
# Start the demo worker
|
|
28
|
-
# It uses the local packages/dist if available, but Bun can run TS directly
|
|
29
|
-
CMD ["bun", "run", "packages/flux-console/scripts/demo-worker.ts"]
|
package/bin/flux-console.ts
DELETED
|
@@ -1,130 +0,0 @@
|
|
|
1
|
-
# Zenith Ecosystem Expansion RFC
|
|
2
|
-
|
|
3
|
-
**Status**: Draft
|
|
4
|
-
**Date**: 2026-01-10
|
|
5
|
-
**Goal**: Expand Zenith monitoring capabilities beyond Gravito/Laravel to Python, Node.js, and Go ecosystems.
|
|
6
|
-
|
|
7
|
-
---
|
|
8
|
-
|
|
9
|
-
## 1. Executive Summary
|
|
10
|
-
|
|
11
|
-
Gravito Zenith (Flux Console) is a unified control plane for background job processing. Currently, it supports **Gravito Stream** (Native) and **Laravel Queues** (via `laravel-zenith`). To become a true polyglot observability platform, we need to implement connectors for other popular queue systems.
|
|
12
|
-
|
|
13
|
-
This RFC defines the **Universal Zenith Protocol (UZP)** and proposes implementation roadmaps for Python (Celery) and Node.js (BullMQ).
|
|
14
|
-
|
|
15
|
-
---
|
|
16
|
-
|
|
17
|
-
## 2. The Universal Zenith Protocol (UZP)
|
|
18
|
-
|
|
19
|
-
Any background job system can be monitored by Zenith if it implements the following Redis-based interfaces.
|
|
20
|
-
|
|
21
|
-
### 2.1. Discovery (Heartbeat)
|
|
22
|
-
Workers must announce their presence every 30 seconds to avoid being marked as "Offline".
|
|
23
|
-
|
|
24
|
-
* **Command**: `SETEX flux_console:worker:<worker_id> 60 <payload>`
|
|
25
|
-
* **Payload (JSON)**:
|
|
26
|
-
```json
|
|
27
|
-
{
|
|
28
|
-
"id": "celery@worker-1",
|
|
29
|
-
"hostname": "pod-xyz",
|
|
30
|
-
"pid": 1234,
|
|
31
|
-
"uptime": 3600,
|
|
32
|
-
"queues": ["high", "default"],
|
|
33
|
-
"concurrency": 4,
|
|
34
|
-
"memory": { "rss": "50MB", "heapUsed": "N/A" },
|
|
35
|
-
"framework": "celery", // "laravel", "bullmq", "asynq"
|
|
36
|
-
"language": "python", // "php", "typescript", "go"
|
|
37
|
-
"timestamp": "2026-01-10T12:00:00Z"
|
|
38
|
-
}
|
|
39
|
-
```
|
|
40
|
-
|
|
41
|
-
### 2.2. Event Stream (Logs)
|
|
42
|
-
Workers publish lifecycle events to a shared Pub/Sub channel.
|
|
43
|
-
|
|
44
|
-
* **Command**: `PUBLISH flux_console:logs <payload>`
|
|
45
|
-
* **Payload (JSON)**:
|
|
46
|
-
```json
|
|
47
|
-
{
|
|
48
|
-
"level": "info", // "info" (start), "success", "error"
|
|
49
|
-
"message": "Processing Task: tasks.send_email",
|
|
50
|
-
"workerId": "celery@worker-1",
|
|
51
|
-
"queue": "default",
|
|
52
|
-
"jobId": "uuid-v4",
|
|
53
|
-
"timestamp": "2026-01-10T12:00:01Z",
|
|
54
|
-
"metadata": {
|
|
55
|
-
"attempt": 1,
|
|
56
|
-
"latency": 45 // ms (for success/error events)
|
|
57
|
-
}
|
|
58
|
-
}
|
|
59
|
-
```
|
|
60
|
-
|
|
61
|
-
### 2.3. Metrics (Optional but Recommended)
|
|
62
|
-
Connectors should increment counters for throughput aggregation.
|
|
63
|
-
|
|
64
|
-
* `INCR flux_console:metrics:processed`
|
|
65
|
-
* `INCR flux_console:metrics:failed`
|
|
66
|
-
|
|
67
|
-
---
|
|
68
|
-
|
|
69
|
-
## 3. Implementation Plan: Python (Celery)
|
|
70
|
-
|
|
71
|
-
**Target**: `gravito/zenith-celery` (PyPI Package)
|
|
72
|
-
|
|
73
|
-
### Architecture
|
|
74
|
-
Celery has a rich Signal system. We can hook into `worker_ready`, `task_prerun`, `task_success`, and `task_failure`.
|
|
75
|
-
|
|
76
|
-
### Component Design
|
|
77
|
-
1. **ZenithMonitor**: A Celery Bootstep that starts a background thread for Heartbeats.
|
|
78
|
-
2. **SignalHandlers**:
|
|
79
|
-
* `task_prerun`: Publish `level: info` log.
|
|
80
|
-
* `task_success`: Publish `level: success` log + metrics.
|
|
81
|
-
* `task_failure`: Publish `level: error` log with traceback.
|
|
82
|
-
|
|
83
|
-
### Configuration
|
|
84
|
-
```python
|
|
85
|
-
# celery.py
|
|
86
|
-
app.conf.zenith_redis_url = "redis://localhost:6379/0"
|
|
87
|
-
app.conf.zenith_enabled = True
|
|
88
|
-
```
|
|
89
|
-
|
|
90
|
-
---
|
|
91
|
-
|
|
92
|
-
## 4. Implementation Plan: Node.js (BullMQ)
|
|
93
|
-
|
|
94
|
-
**Target**: `@gravito/zenith-bullmq` (NPM Package)
|
|
95
|
-
|
|
96
|
-
_Note: Gravito Stream is based on BullMQ principles but is an internal implementation. This adapter allows **standard** BullMQ instances (e.g., in a NestJS app) to report to Zenith._
|
|
97
|
-
|
|
98
|
-
### Architecture
|
|
99
|
-
BullMQ uses `QueueEvents` (which listens to Redis streams). A separate "Monitor" process is the best approach to avoid modifying the worker code too much.
|
|
100
|
-
|
|
101
|
-
### Component Design
|
|
102
|
-
1. **ZenithMonitor Class**:
|
|
103
|
-
```typescript
|
|
104
|
-
const monitor = new ZenithMonitor({
|
|
105
|
-
connection: redisOptions,
|
|
106
|
-
queues: ['email', 'reports']
|
|
107
|
-
});
|
|
108
|
-
monitor.start();
|
|
109
|
-
```
|
|
110
|
-
2. It listens to BullMQ global events (completed, failed) and bridges them to UZP.
|
|
111
|
-
3. **Heartbeat**: Since BullMQ workers don't have a central registry, either the Monitor acts as a "Virtual Worker" or users must instantiate a `ZenithWorker` wrapper.
|
|
112
|
-
|
|
113
|
-
---
|
|
114
|
-
|
|
115
|
-
## 5. Implementation Plan: Go (Asynq)
|
|
116
|
-
|
|
117
|
-
**Target**: `github.com/gravito-framework/zenith-asynq`
|
|
118
|
-
|
|
119
|
-
### Architecture
|
|
120
|
-
Asynq provides `Server` middleware.
|
|
121
|
-
|
|
122
|
-
### Component Design
|
|
123
|
-
1. **Middleware**: `zenith.NewMiddleware(redisClient)`.
|
|
124
|
-
2. Wraps handler execution to capture Start/Success/Fail times.
|
|
125
|
-
3. Publishes to Redis asynchronously.
|
|
126
|
-
|
|
127
|
-
---
|
|
128
|
-
|
|
129
|
-
## 6. Future Work: Rust (Faktory?)
|
|
130
|
-
(To be determined based on demand)
|
package/docker-compose.yml
DELETED
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
version: '3.8'
|
|
2
|
-
|
|
3
|
-
services:
|
|
4
|
-
# Main Persistence for Archive
|
|
5
|
-
mysql:
|
|
6
|
-
image: mysql:8.0
|
|
7
|
-
container_name: flux-mysql
|
|
8
|
-
ports:
|
|
9
|
-
- "3306:3306"
|
|
10
|
-
environment:
|
|
11
|
-
MYSQL_ROOT_PASSWORD: root
|
|
12
|
-
MYSQL_DATABASE: flux
|
|
13
|
-
healthcheck:
|
|
14
|
-
test: [ "CMD", "mysqladmin", "ping", "-h", "localhost" ]
|
|
15
|
-
timeout: 20s
|
|
16
|
-
retries: 10
|
|
17
|
-
|
|
18
|
-
# Real-time state store
|
|
19
|
-
redis:
|
|
20
|
-
image: redis:7-alpine
|
|
21
|
-
container_name: flux-redis
|
|
22
|
-
ports:
|
|
23
|
-
- "6379:6379"
|
|
24
|
-
# Flux Console (Optional: run locally via npm dev instead)
|
|
25
|
-
# console:
|
|
26
|
-
# build: .
|
|
27
|
-
# ports:
|
|
28
|
-
# - "3000:3000"
|
|
29
|
-
# environment:
|
|
30
|
-
# - REDIS_URL=redis://redis:6379
|
|
31
|
-
# - DB_DRIVER=mysql
|
|
32
|
-
# - DB_HOST=mysql
|
|
33
|
-
# - DB_USER=root
|
|
34
|
-
# - DB_PASSWORD=root
|
|
35
|
-
# - DB_NAME=flux
|
|
36
|
-
# depends_on:
|
|
37
|
-
# mysql:
|
|
38
|
-
# condition: service_healthy
|
|
39
|
-
# redis:
|
|
40
|
-
# condition: service_started
|
package/docs/ALERTING_GUIDE.md
DELETED
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
# 🔔 Zenith Alerting Guide
|
|
2
|
-
|
|
3
|
-
This guide explains how to configure and manage the alerting system in Zenith to ensure your infrastructure and queues remain healthy.
|
|
4
|
-
|
|
5
|
-
---
|
|
6
|
-
|
|
7
|
-
## 🚀 Overview
|
|
8
|
-
|
|
9
|
-
Zenith's alerting engine is **Redis-Native** and **Stateless**.
|
|
10
|
-
* **Persistence**: Rules are stored in Redis (`gravito:zenith:alerts:rules`).
|
|
11
|
-
* **Evaluation**: The server evaluates all rules every 2 seconds against real-time metrics.
|
|
12
|
-
* **Delivery**: Alerts are dispatched via Slack Webhooks.
|
|
13
|
-
|
|
14
|
-
---
|
|
15
|
-
|
|
16
|
-
## 🛠️ Configuration Fields
|
|
17
|
-
|
|
18
|
-
When adding a new rule in **Settings > Alerting**, you will encounter these fields:
|
|
19
|
-
|
|
20
|
-
### 1. Rule Name
|
|
21
|
-
A descriptive label for the alert (e.g., `Critical Backlog`, `Agent Offline`). This name will appear in the Slack notification.
|
|
22
|
-
|
|
23
|
-
### 2. Type (Metric Category)
|
|
24
|
-
* **Queue Backlog**: Monitors the number of jobs in the `waiting` state.
|
|
25
|
-
* **High Failure Count**: Monitors the number of jobs in the `failed` state.
|
|
26
|
-
* **Worker Loss**: Monitors the total number of active worker nodes.
|
|
27
|
-
* **Node CPU (%)**: Monitors process-level CPU usage reported by Quasar Agents.
|
|
28
|
-
* **Node RAM (%)**: Monitors process-level RAM usage (RSS) relative to system total.
|
|
29
|
-
|
|
30
|
-
### 3. Threshold
|
|
31
|
-
The numeric value that triggers the alert.
|
|
32
|
-
* For **Backlog/Failure**: The number of jobs (e.g., `1000`).
|
|
33
|
-
* For **CPU/RAM**: The percentage (e.g., `90`).
|
|
34
|
-
* For **Worker Loss**: The *minimum* number of workers expected (e.g., alert triggers if count is `< 2`).
|
|
35
|
-
|
|
36
|
-
### 4. Cooldown (Minutes)
|
|
37
|
-
**Crucial Concept**: The period the system "stays silent" after an alert is fired.
|
|
38
|
-
* **Logic**: Once a rule triggers and sends a notification, it enters a "lock" state for the duration of the cooldown.
|
|
39
|
-
* **Purpose**: Prevents "Alert Fatigue" and notification storms.
|
|
40
|
-
* **Example**: If set to `30`, and a backlog spike occurs, you get **one** notification. You won't get another one for the same rule for 30 minutes, even if the backlog remains high.
|
|
41
|
-
|
|
42
|
-
### 5. Queue (Optional)
|
|
43
|
-
Specify a specific queue name (e.g., `orders`, `emails`) to monitor. If left empty, the rule applies to the **total sum** of all queues.
|
|
44
|
-
|
|
45
|
-
---
|
|
46
|
-
|
|
47
|
-
## 🌊 Best Practices
|
|
48
|
-
|
|
49
|
-
### The "Instant Fire" Design
|
|
50
|
-
Zenith alerts are designed for **instant awareness**.
|
|
51
|
-
* If a threshold is met during a 2-second check, the alert fires **immediately**.
|
|
52
|
-
* It does **not** wait for the condition to persist for multiple minutes (Debouncing).
|
|
53
|
-
* **Pro Tip**: If you have frequent "tiny spikes" that resolve themselves in seconds, set your **Threshold** slightly higher than the spikes to avoid noise.
|
|
54
|
-
|
|
55
|
-
### Recommended Settings
|
|
56
|
-
|
|
57
|
-
| Scenario | Type | Threshold | Cooldown |
|
|
58
|
-
| :--- | :--- | :--- | :--- |
|
|
59
|
-
| **Critical Failure** | High Failure Count | 50 | 15m |
|
|
60
|
-
| **System Overload** | Node CPU | 90 | 30m |
|
|
61
|
-
| **Quiet Hours** | Queue Backlog | 5000 | 120m |
|
|
62
|
-
| **Fatal Shutdown** | Worker Loss | 1 | 10m |
|
|
63
|
-
|
|
64
|
-
---
|
|
65
|
-
|
|
66
|
-
## 🔗 Slack Integration
|
|
67
|
-
To receive notifications, ensure the `SLACK_WEBHOOK_URL` environment variable is set before starting the Zenith server.
|
|
68
|
-
|
|
69
|
-
```bash
|
|
70
|
-
export SLACK_WEBHOOK_URL=https://hooks.slack.com/services/Txxx/Bxxx/Xxxx
|
|
71
|
-
```
|
package/docs/DEPLOYMENT.md
DELETED
|
@@ -1,157 +0,0 @@
|
|
|
1
|
-
# Flux Console Deployment Guide
|
|
2
|
-
|
|
3
|
-
This whitepaper outlines the recommended deployment strategies for Gravito Flux Console in various environments, from local development to enterprise-scale production clusters.
|
|
4
|
-
|
|
5
|
-
## 1. Deployment Philosophy: "Zero-Config, Anywhere"
|
|
6
|
-
|
|
7
|
-
Flux Console is designed to be infrastructure-agnostic. It acts as a stateless monitoring interface that connects to your existing infrastructure (Redis). It does not require its own dedicated database for basic operation.
|
|
8
|
-
|
|
9
|
-
### Core Dependencies
|
|
10
|
-
- **Runtime**: Node.js 18+ OR Bun 1.0+ (or use the standalone binary)
|
|
11
|
-
- **Infrastructure**: Redis 6.0+ (Required for state coordination)
|
|
12
|
-
- **Optional**: SQL Database (MySQL/PostgreSQL) for History Persistence (Future Feature)
|
|
13
|
-
|
|
14
|
-
---
|
|
15
|
-
|
|
16
|
-
## 2. Deployment Scenarios
|
|
17
|
-
|
|
18
|
-
### Scenario A: Local Development (The "NPM" Way)
|
|
19
|
-
Best for individual developers debugging workers locally.
|
|
20
|
-
|
|
21
|
-
**Prerequisites:** Node.js or Bun installed.
|
|
22
|
-
|
|
23
|
-
```bash
|
|
24
|
-
# S1. Run directly via npx (Zero Installation)
|
|
25
|
-
npx @gravito/flux-console
|
|
26
|
-
# Automatically detects local Redis at localhost:6379 and opens browser.
|
|
27
|
-
|
|
28
|
-
# S2. Install globally for frequent use
|
|
29
|
-
npm install -g @gravito/flux-console
|
|
30
|
-
flux-console start
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
### Scenario B: Traditional VM / EC2 (The "Process" Way)
|
|
34
|
-
Best for bare-metal servers or performance-critical environments where avoiding Docker overhead is desired.
|
|
35
|
-
|
|
36
|
-
**Option 1: Node.js + PM2 (Recommended)**
|
|
37
|
-
```bash
|
|
38
|
-
# 1. Install globally
|
|
39
|
-
npm install -g @gravito/flux-console pm2
|
|
40
|
-
|
|
41
|
-
# 2. Start with PM2 for auto-restart and log management
|
|
42
|
-
pm2 start flux-console --name flux-monitor -- --port 3000
|
|
43
|
-
|
|
44
|
-
# 3. Configure Env Vars (if Redis is remote)
|
|
45
|
-
pm2 set flux-monitor:env.REDIS_URL redis://prod-redis:6379
|
|
46
|
-
```
|
|
47
|
-
|
|
48
|
-
**Option 2: Standalone Binary (The "Go" Way)**
|
|
49
|
-
*Ideal for restricted environments without Node.js installed.*
|
|
50
|
-
1. Download the binary: `flux-console-linux-x64`
|
|
51
|
-
2. `chmod +x ./flux-console-linux-x64`
|
|
52
|
-
3. `./flux-console-linux-x64`
|
|
53
|
-
|
|
54
|
-
### Scenario C: Docker & Container Platforms (The "Cloud-Native" Way)
|
|
55
|
-
Best for Kubernetes, AWS ECS, Google Cloud Run, or simple Docker Compose setups.
|
|
56
|
-
|
|
57
|
-
**1. Docker Run**
|
|
58
|
-
```bash
|
|
59
|
-
docker run -d \
|
|
60
|
-
-p 3000:3000 \
|
|
61
|
-
-e REDIS_URL=redis://your-redis-host:6379 \
|
|
62
|
-
-e AUTH_SECRET=my-super-secret-password \
|
|
63
|
-
--name flux-console \
|
|
64
|
-
gravito/flux-console:latest
|
|
65
|
-
```
|
|
66
|
-
|
|
67
|
-
**2. Docker Compose (Full Stack Example)**
|
|
68
|
-
```yaml
|
|
69
|
-
version: '3.8'
|
|
70
|
-
services:
|
|
71
|
-
redis:
|
|
72
|
-
image: redis:alpine
|
|
73
|
-
ports:
|
|
74
|
-
- "6379:6379"
|
|
75
|
-
|
|
76
|
-
flux-console:
|
|
77
|
-
image: gravito/flux-console:latest
|
|
78
|
-
ports:
|
|
79
|
-
- "3000:3000"
|
|
80
|
-
environment:
|
|
81
|
-
- REDIS_URL=redis://redis:6379
|
|
82
|
-
- PORT=3000
|
|
83
|
-
depends_on:
|
|
84
|
-
- redis
|
|
85
|
-
|
|
86
|
-
# Your Application Workers
|
|
87
|
-
worker-orders:
|
|
88
|
-
build: .
|
|
89
|
-
command: npm run start:worker
|
|
90
|
-
environment:
|
|
91
|
-
- REDIS_URL=redis://redis:6379
|
|
92
|
-
```
|
|
93
|
-
|
|
94
|
-
**3. Kubernetes (K8s)**
|
|
95
|
-
Deploy as a simple Deployment + Service.
|
|
96
|
-
|
|
97
|
-
```yaml
|
|
98
|
-
apiVersion: apps/v1
|
|
99
|
-
kind: Deployment
|
|
100
|
-
metadata:
|
|
101
|
-
name: flux-console
|
|
102
|
-
spec:
|
|
103
|
-
replicas: 1
|
|
104
|
-
selector:
|
|
105
|
-
matchLabels:
|
|
106
|
-
app: flux-console
|
|
107
|
-
template:
|
|
108
|
-
metadata:
|
|
109
|
-
labels:
|
|
110
|
-
app: flux-console
|
|
111
|
-
spec:
|
|
112
|
-
containers:
|
|
113
|
-
- name: flux-console
|
|
114
|
-
image: gravito/flux-console:latest
|
|
115
|
-
env:
|
|
116
|
-
- name: REDIS_URL
|
|
117
|
-
valueFrom:
|
|
118
|
-
secretKeyRef:
|
|
119
|
-
name: redis-secrets
|
|
120
|
-
key: url
|
|
121
|
-
ports:
|
|
122
|
-
- containerPort: 3000
|
|
123
|
-
```
|
|
124
|
-
|
|
125
|
-
---
|
|
126
|
-
|
|
127
|
-
## 3. Security Best Practices
|
|
128
|
-
|
|
129
|
-
Since Flux Console provides administrative capabilities (Pause Queue, Retry Job, Delete Job), security is paramount in production.
|
|
130
|
-
|
|
131
|
-
1. **Network Isolation (Private VPC)**:
|
|
132
|
-
- **Recommendation**: Do NOT expose Flux Console to the public internet.
|
|
133
|
-
- Deploy it within your VPN / Private Subnet.
|
|
134
|
-
- Access via VPN or SSH Tunnel.
|
|
135
|
-
|
|
136
|
-
2. **Authentication**:
|
|
137
|
-
- Enable built-in simple auth by setting `AUTH_PASSWORD` env var.
|
|
138
|
-
- For enterprise, put it behind an Identity Aware Proxy (e.g., Cloudflare Access, AWS ALB OIDC) to enforce SSO (Google/Okta) login.
|
|
139
|
-
|
|
140
|
-
3. **Read-Only Mode (Future Feature)**:
|
|
141
|
-
- For giving access to support teams, run a separate instance with `READ_ONLY=true` env var (Roadmap item).
|
|
142
|
-
|
|
143
|
-
## 4. Scaling (High Availability)
|
|
144
|
-
|
|
145
|
-
Flux Console is **stateless**. You can run multiple instances behind a Load Balancer for high availability.
|
|
146
|
-
|
|
147
|
-
- **Session Affinity**: Not required (JWT-based auth).
|
|
148
|
-
- **Resource Usage**: Very low (mostly forwarding Redis data). A standard `t3.micro` or `256MB` container is usually sufficient for monitoring even large clusters.
|
|
149
|
-
|
|
150
|
-
---
|
|
151
|
-
|
|
152
|
-
## 5. Troubleshooting
|
|
153
|
-
|
|
154
|
-
**Common Issue: "Cannot connect to Redis"**
|
|
155
|
-
- **Docker**: Ensure you use the service name (e.g., `redis`) not `localhost` if inside the same network. Host networking might be needed for accessing host Redis.
|
|
156
|
-
- **AWS ElastiCache**: Ensure Security Groups allow traffic on port 6379 from the Console's security group.
|
|
157
|
-
- **Encryption**: If Redis uses TLS (rediss://), ensure certificates are trusted or use `REDIS_TLS_REJECT_UNAUTHORIZED=0` (not recommended for prod).
|
package/docs/DOCS_INTERNAL.md
DELETED
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
# Internal Technical Documentation
|
|
2
|
-
|
|
3
|
-
This document records technical implementations for Dead Letter Queues (DLQ) and Worker Metrics within the Flux system.
|
|
4
|
-
|
|
5
|
-
## 1. Dead Letter Queue (DLQ)
|
|
6
|
-
|
|
7
|
-
### Storage (Redis)
|
|
8
|
-
Failed jobs are moved to a specific list with the suffix `:failed`.
|
|
9
|
-
- **Key**: `{queue}:failed`
|
|
10
|
-
- **Cap**: 1,000 items (capped via `LTRIM` in `RedisDriver.fail`).
|
|
11
|
-
|
|
12
|
-
### Life Cycle
|
|
13
|
-
1. `Worker` attempts to process a job.
|
|
14
|
-
2. On failure, `Worker` calculates retry delay using `job.getRetryDelay(attempt)`.
|
|
15
|
-
3. If `attempt >= maxAttempts`, `Consumer` catches the error.
|
|
16
|
-
4. `Consumer` calls `QueueManager.fail(job, error)`.
|
|
17
|
-
5. Driver pushes the job to the `:failed` list with `error` and `failedAt` metadata.
|
|
18
|
-
|
|
19
|
-
---
|
|
20
|
-
|
|
21
|
-
## 2. Worker Metrics
|
|
22
|
-
|
|
23
|
-
Workers report health metrics during their heartbeat cycle (default: every 5s).
|
|
24
|
-
|
|
25
|
-
### Metric Payload Schema
|
|
26
|
-
```json
|
|
27
|
-
{
|
|
28
|
-
"cpu": 0.15, // Load average (normalized by cores)
|
|
29
|
-
"ram": {
|
|
30
|
-
"rss": 120, // Resident Set Size (MB)
|
|
31
|
-
"heapUsed": 45, // V8 Heap Used (MB)
|
|
32
|
-
"heapTotal": 64 // V8 Heap Total (MB)
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
### Storage
|
|
38
|
-
In Redis, metrics are stored as part of the `flux_console:workers:{id}` hash.
|
|
39
|
-
- **Field**: `metrics` (JSON string)
|
|
40
|
-
|
|
41
|
-
---
|
|
42
|
-
|
|
43
|
-
## 3. Bulk Retry Logic (Lua)
|
|
44
|
-
|
|
45
|
-
To ensure atomicity and performance, bulk retries of failed jobs use Lua scripts.
|
|
46
|
-
|
|
47
|
-
### Retry All Script
|
|
48
|
-
Moves all elements from `{queue}:failed` to `{queue}` then deletes the failed list.
|
|
49
|
-
```lua
|
|
50
|
-
local jobs = redis.call('LRANGE', KEYS[1], 0, -1)
|
|
51
|
-
for i, job in ipairs(jobs) do
|
|
52
|
-
redis.call('RPUSH', KEYS[2], job)
|
|
53
|
-
end
|
|
54
|
-
redis.call('DEL', KEYS[1])
|
|
55
|
-
return #jobs
|
|
56
|
-
```
|
|
57
|
-
|
|
58
|
-
---
|
|
59
|
-
|
|
60
|
-
## 4. System Logs & Archiving
|
|
61
|
-
|
|
62
|
-
To maintain a permanent record of system events while keeping Redis memory usage low, Flux Console uses an asynchronous archiving pattern.
|
|
63
|
-
|
|
64
|
-
### Live Logs (Redis)
|
|
65
|
-
* **Key**: `flux_console:logs:system` (List)
|
|
66
|
-
* **Strategy**: FIFO (First-In-First-Out) capped at 100 items.
|
|
67
|
-
* **Update**: Every `publishLog` call pushes to this list and trims it.
|
|
68
|
-
|
|
69
|
-
### Persistent Archiving (SQL)
|
|
70
|
-
* **Trigger**: Every `QueueService.publishLog` call asynchronously sends the log to the configured `PersistenceAdapter`.
|
|
71
|
-
* **Table**: `flux_system_logs` (MySQL or SQLite).
|
|
72
|
-
* **Search**: The `/api/logs/archive` endpoint performs direct SQL queries with filters on `level`, `worker_id`, `queue`, and `message` content.
|
|
73
|
-
* **Retention**: Cleanup is handled via `PersistenceAdapter.cleanup`, removing logs older than the configured threshold (default: 30 days).
|