@gravito/zenith 0.1.0-beta.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -0
- package/dist/bin.js +38846 -27303
- package/dist/client/assets/index-C332gZ-J.css +1 -0
- package/dist/client/assets/index-D4HibwTK.js +436 -0
- package/dist/client/index.html +2 -2
- package/dist/server/index.js +38846 -27303
- package/docs/ALERTING_GUIDE.md +71 -0
- package/docs/LARAVEL_ZENITH_ROADMAP.md +109 -0
- package/docs/QUASAR_MASTER_PLAN.md +140 -0
- package/package.json +52 -48
- package/scripts/debug_redis_keys.ts +24 -0
- package/specs/PULSE_SPEC.md +86 -0
- package/src/client/App.tsx +2 -0
- package/src/client/Layout.tsx +18 -0
- package/src/client/Sidebar.tsx +2 -1
- package/src/client/WorkerStatus.tsx +121 -76
- package/src/client/components/BrandIcons.tsx +138 -0
- package/src/client/components/ConfirmDialog.tsx +0 -1
- package/src/client/components/JobInspector.tsx +18 -6
- package/src/client/components/PageHeader.tsx +38 -0
- package/src/client/pages/OverviewPage.tsx +17 -20
- package/src/client/pages/PulsePage.tsx +478 -0
- package/src/client/pages/QueuesPage.tsx +1 -3
- package/src/client/pages/SettingsPage.tsx +640 -78
- package/src/client/pages/WorkersPage.tsx +71 -3
- package/src/client/pages/index.ts +1 -0
- package/src/server/index.ts +311 -11
- package/src/server/services/AlertService.ts +189 -41
- package/src/server/services/CommandService.ts +137 -0
- package/src/server/services/PulseService.ts +80 -0
- package/src/server/services/QueueService.ts +63 -6
- package/src/shared/types.ts +99 -0
- package/tsconfig.json +2 -2
- package/ARCHITECTURE.md +0 -88
- package/BATCH_OPERATIONS_IMPLEMENTATION.md +0 -159
- package/EVOLUTION_BLUEPRINT.md +0 -112
- package/JOBINSPECTOR_SCROLL_FIX.md +0 -152
- package/PULSE_IMPLEMENTATION_PLAN.md +0 -111
- package/TESTING_BATCH_OPERATIONS.md +0 -252
- package/dist/client/assets/index-DGYEwTDL.css +0 -1
- package/dist/client/assets/index-oyTdySX0.js +0 -421
- /package/{DEPLOYMENT.md → docs/DEPLOYMENT.md} +0 -0
- /package/{DOCS_INTERNAL.md → docs/DOCS_INTERNAL.md} +0 -0
- /package/{QUICK_TEST_GUIDE.md → docs/QUICK_TEST_GUIDE.md} +0 -0
- /package/{ROADMAP.md → docs/ROADMAP.md} +0 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# 🔔 Zenith Alerting Guide
|
|
2
|
+
|
|
3
|
+
This guide explains how to configure and manage the alerting system in Zenith to ensure your infrastructure and queues remain healthy.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## 🚀 Overview
|
|
8
|
+
|
|
9
|
+
Zenith's alerting engine is **Redis-Native** and **Stateless**.
|
|
10
|
+
* **Persistence**: Rules are stored in Redis (`gravito:zenith:alerts:rules`).
|
|
11
|
+
* **Evaluation**: The server evaluates all rules every 2 seconds against real-time metrics.
|
|
12
|
+
* **Delivery**: Alerts are dispatched via Slack Webhooks.
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## 🛠️ Configuration Fields
|
|
17
|
+
|
|
18
|
+
When adding a new rule in **Settings > Alerting**, you will encounter these fields:
|
|
19
|
+
|
|
20
|
+
### 1. Rule Name
|
|
21
|
+
A descriptive label for the alert (e.g., `Critical Backlog`, `Agent Offline`). This name will appear in the Slack notification.
|
|
22
|
+
|
|
23
|
+
### 2. Type (Metric Category)
|
|
24
|
+
* **Queue Backlog**: Monitors the number of jobs in the `waiting` state.
|
|
25
|
+
* **High Failure Count**: Monitors the number of jobs in the `failed` state.
|
|
26
|
+
* **Worker Loss**: Monitors the total number of active worker nodes.
|
|
27
|
+
* **Node CPU (%)**: Monitors process-level CPU usage reported by Quasar Agents.
|
|
28
|
+
* **Node RAM (%)**: Monitors process-level RAM usage (RSS) relative to system total.
|
|
29
|
+
|
|
30
|
+
### 3. Threshold
|
|
31
|
+
The numeric value that triggers the alert.
|
|
32
|
+
* For **Backlog/Failure**: The number of jobs (e.g., `1000`).
|
|
33
|
+
* For **CPU/RAM**: The percentage (e.g., `90`).
|
|
34
|
+
* For **Worker Loss**: The *minimum* number of workers expected (e.g., alert triggers if count is `< 2`).
|
|
35
|
+
|
|
36
|
+
### 4. Cooldown (Minutes)
|
|
37
|
+
**Crucial Concept**: The period the system "stays silent" after an alert is fired.
|
|
38
|
+
* **Logic**: Once a rule triggers and sends a notification, it enters a "lock" state for the duration of the cooldown.
|
|
39
|
+
* **Purpose**: Prevents "Alert Fatigue" and notification storms.
|
|
40
|
+
* **Example**: If set to `30`, and a backlog spike occurs, you get **one** notification. You won't get another one for the same rule for 30 minutes, even if the backlog remains high.
|
|
41
|
+
|
|
42
|
+
### 5. Queue (Optional)
|
|
43
|
+
Enter a queue name (e.g., `orders`, `emails`) to monitor only that queue. If left empty, the rule applies to the **total sum** of all queues.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## 🌊 Best Practices
|
|
48
|
+
|
|
49
|
+
### The "Instant Fire" Design
|
|
50
|
+
Zenith alerts are designed for **instant awareness**.
|
|
51
|
+
* If a threshold is met during a 2-second check, the alert fires **immediately**.
|
|
52
|
+
* It does **not** wait for the condition to persist for multiple minutes (i.e., there is no debouncing).
|
|
53
|
+
* **Pro Tip**: If you have frequent "tiny spikes" that resolve themselves in seconds, set your **Threshold** slightly higher than the spikes to avoid noise.
|
|
54
|
+
|
|
55
|
+
### Recommended Settings
|
|
56
|
+
|
|
57
|
+
| Scenario | Type | Threshold | Cooldown |
|
|
58
|
+
| :--- | :--- | :--- | :--- |
|
|
59
|
+
| **Critical Failure** | High Failure Count | 50 | 15m |
|
|
60
|
+
| **System Overload** | Node CPU | 90 | 30m |
|
|
61
|
+
| **Quiet Hours** | Queue Backlog | 5000 | 120m |
|
|
62
|
+
| **Fatal Shutdown** | Worker Loss | 1 | 10m |
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## 🔗 Slack Integration
|
|
67
|
+
To receive notifications, ensure the `SLACK_WEBHOOK_URL` environment variable is set before starting the Zenith server.
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
export SLACK_WEBHOOK_URL=https://hooks.slack.com/services/Txxx/Bxxx/Xxxx
|
|
71
|
+
```
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# 🚀 Project Zenith: Laravel Integration Roadmap
|
|
2
|
+
|
|
3
|
+
**Repository**: `gravito-framework/laravel-zenith`
|
|
4
|
+
**Target Audience**: Laravel 10/11 Applications
|
|
5
|
+
**Goal**: Provide deep, native introspection into Laravel applications for Gravito Zenith.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## 1. Vision & Architecture
|
|
10
|
+
|
|
11
|
+
Unlike the **Quasar Agent** (which is a sidecar daemon for OS/Infrastructure monitoring), **Laravel Zenith** is a native Composer package that lives *inside* the application.
|
|
12
|
+
|
|
13
|
+
* **Role**: "The Reporter". It sees what the OS cannot see.
|
|
14
|
+
* **Transport**: Direct Redis connection (utilizing `swarrot` or standard `predis`/`phpredis`).
|
|
15
|
+
* **Philosophy**: Zero-blocking. All reporting should be "fire-and-forget" or queued to avoid slowing down the user request lifecycle.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## 2. Core Features (The "Why")
|
|
20
|
+
|
|
21
|
+
### A. Live Operational Logs (`logs`)
|
|
22
|
+
* **Feature**: A custom `Log Channel` driver.
|
|
23
|
+
* **Goal**: Stream logs (Info/Error/Debug) directly to Zenith's Live Log view.
|
|
24
|
+
* **Implementation**:
|
|
25
|
+
* `config/logging.php`: Add a `zenith` channel.
|
|
26
|
+
* Push JSON payloads to `flux_console:logs` Redis channel.
|
|
27
|
+
|
|
28
|
+
### B. Queue Lifecycle Events (`queues`)
|
|
29
|
+
* **Feature**: Listen to Laravel Queue Events (`JobProcessing`, `JobProcessed`, `JobFailed`).
|
|
30
|
+
* **Goal**: Provide granular job insight that `quasar-go` cannot (e.g., "Job X failed with Exception Y", "Job Z took 45s").
|
|
31
|
+
* **Implementation**:
|
|
32
|
+
* Event Subscriber for `Illuminate\Queue\Events\*`.
|
|
33
|
+
* Capture `job->getRawBody()`, `exception->getMessage()`.
|
|
34
|
+
|
|
35
|
+
### C. Request Performance (`http`)
|
|
36
|
+
* **Feature**: Global Middleware (`ZenithMonitorMiddleware`).
|
|
37
|
+
* **Goal**: Track "Slow Requests", 500 Errors, and Throughput.
|
|
38
|
+
* **Metrics**:
|
|
39
|
+
* Status Codes (2xx, 4xx, 5xx).
|
|
40
|
+
* Duration (ms).
|
|
41
|
+
* Route Name / Controller Action.
|
|
42
|
+
|
|
43
|
+
### D. System Health Checks
|
|
44
|
+
* **Feature**: `php artisan zenith:check`
|
|
45
|
+
* **Goal**: Verify Redis connection and permissions.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## 3. Implementation Roadmap
|
|
50
|
+
|
|
51
|
+
### Phase 1: The Foundation (Logs & Config)
|
|
52
|
+
**Goal**: Get the package installed and streaming basic logs.
|
|
53
|
+
- [ ] Initialize Repository `gravito-framework/laravel-zenith`.
|
|
54
|
+
- [ ] Create `ZenithServiceProvider`.
|
|
55
|
+
- [ ] Implement `ZenithLogger` (Monolog Handler).
|
|
56
|
+
- [ ] Publish `config/zenith.php` (Redis connection settings).
|
|
57
|
+
- [ ] **Deliverable**: `Log::info('Hello Zenith')` appears in Zenith UI.
|
|
58
|
+
|
|
59
|
+
### Phase 2: The Worker's Eye (Queues)
|
|
60
|
+
**Goal**: Deep visibility into Queue Jobs.
|
|
61
|
+
- [ ] Create `ZenithQueueSubscriber`.
|
|
62
|
+
- [ ] Handle `JobFailed`: Serialize exception and push to Zenith Alerting.
|
|
63
|
+
- [ ] Handle `JobProcessed`: Record metrics for "Jobs per minute".
|
|
64
|
+
- [ ] **Deliverable**: Real-time "Job Completed" toasts and error details appear in Zenith.
|
|
65
|
+
|
|
66
|
+
### Phase 3: The Watchtower (HTTP & Exceptions)
|
|
67
|
+
**Goal**: Monitoring web requests.
|
|
68
|
+
- [ ] Create `RecordRequestMetrics` Middleware.
|
|
69
|
+
- [ ] Exception Handler integration (optional, for global error catching).
|
|
70
|
+
- [ ] Filter logic (ignore `/nova`, `/telescope`, etc.).
|
|
71
|
+
- [ ] **Deliverable**: HTTP Throughput graphs in Zenith.
|
|
72
|
+
|
|
73
|
+
### Phase 4: The Bridge (Remote Control Hooks)
|
|
74
|
+
**Goal**: Allow Zenith to trigger Laravel actions safely.
|
|
75
|
+
- [ ] Expose internal hooks for `quasar-go` to call?
|
|
76
|
+
* *Note*: `quasar-go` already calls `artisan`. Phase 4 might be about ensuring `artisan zenith:run-job {id}` exists if we need advanced job re-running that `queue:retry` can't handle.
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## 4. Technical Specifications
|
|
81
|
+
|
|
82
|
+
### Redis Protocol
|
|
83
|
+
We will reuse the **Gravito Pulse Protocol (GPP)** used by `quasar-go`:
|
|
84
|
+
* **Logs**: `PUBLISH flux_console:logs`
|
|
85
|
+
* **Metrics**: `INCR flux_console:metrics:...`
|
|
86
|
+
|
|
87
|
+
### Configuration (`zenith.php`)
|
|
88
|
+
```php
|
|
89
|
+
return [
|
|
90
|
+
'enabled' => env('ZENITH_ENABLED', true),
|
|
91
|
+
|
|
92
|
+
'connection' => env('ZENITH_REDIS_CONNECTION', 'default'),
|
|
93
|
+
|
|
94
|
+
'logging' => [
|
|
95
|
+
'enabled' => true,
|
|
96
|
+
'level' => 'debug',
|
|
97
|
+
],
|
|
98
|
+
|
|
99
|
+
'queues' => [
|
|
100
|
+
'monitor_all' => true,
|
|
101
|
+
'ignore_jobs' => [],
|
|
102
|
+
],
|
|
103
|
+
];
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Dependency Strategy
|
|
107
|
+
* **Support**: Laravel 10.x, 11.x
|
|
108
|
+
* **PHP**: 8.1+
|
|
109
|
+
* **Driver**: `phpredis` (preferred) or `predis`.
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# 🌌 Project Quasar: Master Implementation Plan
|
|
2
|
+
|
|
3
|
+
**Version**: 1.0.0 (Unified)
|
|
4
|
+
**Target**: Zenith v1.0
|
|
5
|
+
**Context**: This document supersedes all previous "Pulse" plans. It is the single source of truth for the Quasar monitoring ecosystem.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## 1. Vision & Identity
|
|
10
|
+
|
|
11
|
+
**Quasar** is the comprehensive observability layer for the Gravito ecosystem. It unifies infrastructure monitoring (CPU/RAM), application insights (Queues/Slow Logs), and availability checks into a single stream.
|
|
12
|
+
|
|
13
|
+
> **Slogan**: *"The brightest signal in your infrastructure."*
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## 2. Architecture & Deployment Matrix
|
|
18
|
+
|
|
19
|
+
We employ a "Right Tool for the Job" strategy for deployment:
|
|
20
|
+
|
|
21
|
+
| Ecosystem | Tool | Package | Strategy |
|
|
22
|
+
| :--- | :--- | :--- | :--- |
|
|
23
|
+
| **Node.js / Bun** | **SDK** | `@gravito/quasar` | **In-App Integration**. Directly imports into the app. Captures Event Loop, Heap, and Queues. |
|
|
24
|
+
| **Legacy / Polyglot** | **Agent** | `gravito/quasar-agent` | **Sidecar / Daemon**. Standalone Go binary. Captures OS-level metrics and external Queue states via Redis/API. |
|
|
25
|
+
| **PHP / Laravel** | **Package** | `gravito/laravel-zenith` | **Native Integration**. Laravel Service Provider. Captures Jobs, Logs, and Exceptions. |
|
|
26
|
+
|
|
27
|
+
### 🚀 Deployment Methods (Zero Friction)
|
|
28
|
+
1. **NPM**: `npm install @gravito/quasar` (For Node developers)
|
|
29
|
+
2. **Docker**: `image: gravito/quasar-agent:latest` (For Container/K8s/Laravel Sail)
|
|
30
|
+
3. **Shell**: `curl -sL get.gravito.dev/quasar | bash` (For Bare Metal/VM)
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## 3. Data Protocol (The Quasar Schema)
|
|
35
|
+
|
|
36
|
+
All agents/SDKs report to Redis using this unified schema.
|
|
37
|
+
|
|
38
|
+
**Namespace**: `gravito:quasar:`
|
|
39
|
+
|
|
40
|
+
### A. Heartbeat (Infrastructure)
|
|
41
|
+
* **Key**: `gravito:quasar:node:{service_name}:{node_id}`
|
|
42
|
+
* **TTL**: 30 seconds
|
|
43
|
+
* **Metrics Philosophy**: Report **BOTH** Process and System metrics to isolate resource usage.
|
|
44
|
+
* `process`: metrics for the specific service (RAM usage, CPU time).
|
|
45
|
+
* `system`: metrics for the host OS (Load avg, Total RAM).
|
|
46
|
+
|
|
47
|
+
### B. Queues (Workload)
|
|
48
|
+
* **Key**: `gravito:quasar:queues:{service_name}`
|
|
49
|
+
* **TTL**: 30 seconds
|
|
50
|
+
* **Purpose**: Snapshots of queue depths from various drivers.
|
|
51
|
+
* Gravito Stream (Native)
|
|
52
|
+
* Laravel Horizon (Redis)
|
|
53
|
+
* BullMQ (Redis)
|
|
54
|
+
* AWS SQS (API)
|
|
55
|
+
|
|
56
|
+
### C. Insights (Performance)
|
|
57
|
+
* **Key**: `gravito:quasar:slow:{service_name}` (Stream)
|
|
58
|
+
* **Purpose**: Log requests or jobs that exceed performance thresholds.
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## 4. Execution Roadmap
|
|
63
|
+
|
|
64
|
+
### Phase 1: Foundation & Application Monitoring (Pulse Node)
|
|
65
|
+
**Goal**: Establish the basic dashboard and Node.js SDK for monitoring application health (CPU/RAM).
|
|
66
|
+
|
|
67
|
+
- [x] **Define Schema**: Update `PULSE_SPEC.md` with new Redis key patterns (`gravito:quasar:node:*`) and payload structure.
|
|
68
|
+
- [x] **SDK Update**: Refactor `@gravito/quasar` (formerly pulse-node) to support:
|
|
69
|
+
- [x] Automatic runtime detection (Node, Bun, Deno).
|
|
70
|
+
- [x] System/Process split metrics.
|
|
71
|
+
- [x] Correct Redis namespacing.
|
|
72
|
+
- [x] **Server Update**: Update Zenith's `PulseService` to scan new key patterns.
|
|
73
|
+
- [x] **UI Overhaul**: Redesign `PulsePage` in Zenith:
|
|
74
|
+
- [x] Implement "Card" layout for nodes.
|
|
75
|
+
- [x] Rich metrics visualization (CPU/RAM split bars).
|
|
76
|
+
- [x] Add brand icons for runtimes (Node, Bun, Deno, PHP, Go, Python).
|
|
77
|
+
- [x] **Layout Optimization**: Compact Grid for Service Groups.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
### Phase 2: Architecture Evolution - "The Brain-Hand Model" 🧠 🖐️ - **Completed** ✅
|
|
82
|
+
To support advanced features like **Queue Insights** (Phase 2) and **Remote Control** (Phase 3), we are adopting a bidirectional architecture.
|
|
83
|
+
|
|
84
|
+
* **Metric Transport (The Mouth)**: Agent sends metrics to Zenith (via shared Redis).
|
|
85
|
+
* **Local Insight (The Eyes)**: Agent inspects *its own* environment (Local Redis, Local Queue) to gather data. Zenith doesn't need to connect to the App DB directly.
|
|
86
|
+
* **Command Execution (The Hand)**: Zenith publishes commands (Retry/Delete), and the Agent listens for them and executes them locally.
|
|
87
|
+
|
|
88
|
+
#### Revised Phase 2: Application Insights (Queues) - **Completed** ✅
|
|
89
|
+
**Goal**: Enable Quasar Agent to "see" local queues and report their status.
|
|
90
|
+
|
|
91
|
+
- [x] **SDK Architecture**: Update `QuasarAgent` to handle **Dual Connections**:
|
|
92
|
+
- `transport`: Connection to Zenith (for sending heartbeat).
|
|
93
|
+
- `app`: Connection to Local App (for inspecting queues/bull/laravel).
|
|
94
|
+
- [x] **Probe Implementation**: Create `QueueProbe` interface and implementations:
|
|
95
|
+
- `RedisListProbe`: Simple `LLEN` checks.
|
|
96
|
+
- [x] `BullProbe` (Future): Check `bull:*:waiting`, etc.
|
|
97
|
+
- [x] `LaravelProbe`: Check `queues:default`, `queues:reserved`, `queues:delayed`.
|
|
98
|
+
- [x] **SDK API**: Expose `.monitorQueue(name, type)` method.
|
|
99
|
+
- [x] **UI Update**: Update `NodeCard` to render a "Queues" section if queue data is present in payload.
|
|
100
|
+
|
|
101
|
+
### Phase 3: Remote Control (Command & Control) - **Completed** ✅
|
|
102
|
+
**Goal**: Allow Zenith to instruct Quasar to perform actions (Retry Job, Delete Job).
|
|
103
|
+
|
|
104
|
+
- [x] **Protocol**: Define Command Protocol (Redis Pub/Sub: `gravito:quasar:cmd:{service}:{node_id}`).
|
|
105
|
+
- [x] **Agent**: Implement `CommandListener` in SDK.
|
|
106
|
+
- [x] **Command Executors**: Implement `RetryJobExecutor` and `DeleteJobExecutor`.
|
|
107
|
+
- [x] **Security (Allowlist Strategy)**:
|
|
108
|
+
- [x] Implement **Command Allowlist** inside Agent code (only `RETRY_JOB`, `DELETE_JOB` allowed).
|
|
109
|
+
- [ ] (Future) Use **Redis ACL** (v6+) to restrict Agent's `transport` connection.
|
|
110
|
+
- [x] **Server**: Add `CommandService` and `/api/pulse/command` endpoint.
|
|
111
|
+
- [x] **UI**: Add "Retry/Delete" buttons in Zenith `PulsePage` for failed queue jobs.
|
|
112
|
+
- [x] **Documentation**: Created `ALERTING_GUIDE.md` for configuration best practices.
|
|
113
|
+
|
|
114
|
+
### Phase 4: Polyglot Agent - **Completed** ✅
|
|
115
|
+
* [x] Create `gravito-framework/quasar` repo (`quasar-go`).
|
|
116
|
+
* [x] Develop Go Agent core (utilizing `gopsutil`).
|
|
117
|
+
* [x] System Probe (CPU/RAM)
|
|
118
|
+
* [x] Agent heartbeat loop
|
|
119
|
+
* [x] Config management (env vars)
|
|
120
|
+
* [x] Implement Queue Monitoring in Go Agent:
|
|
121
|
+
* [x] Redis List Probe
|
|
122
|
+
* [x] Laravel Queue Probe
|
|
123
|
+
* [x] Implement Remote Control in Go Agent:
|
|
124
|
+
* [x] Command Listener (Pub/Sub)
|
|
125
|
+
* [x] RETRY_JOB / DELETE_JOB Executors
|
|
126
|
+
* [x] **Laravel Deep Integration**:
|
|
127
|
+
* [x] `LARAVEL_ACTION` Executor (runs `artisan` safely).
|
|
128
|
+
* [x] Auto-discovery of Laravel project root via process inspection.
|
|
129
|
+
* [x] **Advanced Process Introspection**: Captures real-time CPU/RAM usage per Laravel Worker process.
|
|
130
|
+
* [x] **Virtual Node Mapping**: Visualizes individual Laravel Workers as distinct nodes in Zenith UI.
|
|
131
|
+
* [x] Support for `retry-all`, `retry {id}`, and `restart` (graceful worker reload).
|
|
132
|
+
* [x] Docker & Makefile setup.
|
|
133
|
+
* [x] Binary Release pipeline (GitHub Actions).
|
|
134
|
+
* [x] Publish to Docker Hub (`carllee/quasar-go-agent`).
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## 5. Security & Access
|
|
139
|
+
* **Auth**: Agents authenticate via a shared secret (`QUASAR_TOKEN`) if writing to a remote Redis.
|
|
140
|
+
* **Isolation**: Process metrics only report what they have access to. System metrics require readable `/proc` (in Docker).
|
package/package.json
CHANGED
|
@@ -1,50 +1,54 @@
|
|
|
1
1
|
{
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
"
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
"
|
|
48
|
-
|
|
49
|
-
|
|
2
|
+
"name": "@gravito/zenith",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Gravito Zenith: Zero-config control plane for Gravito Flux & Stream",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"zenith": "dist/bin.js",
|
|
8
|
+
"flux-console": "dist/bin.js"
|
|
9
|
+
},
|
|
10
|
+
"main": "./dist/index.js",
|
|
11
|
+
"types": "./dist/index.d.ts",
|
|
12
|
+
"scripts": {
|
|
13
|
+
"dev:server": "bun run --watch src/server/index.ts",
|
|
14
|
+
"dev:client": "vite",
|
|
15
|
+
"build": "vite build && bun build ./src/server/index.ts ./src/bin.ts --outdir ./dist --target bun",
|
|
16
|
+
"start": "bun ./dist/bin.js",
|
|
17
|
+
"test": "bun test",
|
|
18
|
+
"typecheck": "bun tsc -p tsconfig.json --noEmit --skipLibCheck",
|
|
19
|
+
"seed": "bun scripts/seed.ts",
|
|
20
|
+
"worker": "bun scripts/worker.ts"
|
|
21
|
+
},
|
|
22
|
+
"dependencies": {
|
|
23
|
+
"@gravito/atlas": "workspace:*",
|
|
24
|
+
"@gravito/photon": "workspace:*",
|
|
25
|
+
"@gravito/quasar": "workspace:*",
|
|
26
|
+
"@gravito/stream": "workspace:*",
|
|
27
|
+
"@tanstack/react-query": "^5.0.0",
|
|
28
|
+
"clsx": "^2.1.1",
|
|
29
|
+
"date-fns": "^4.1.0",
|
|
30
|
+
"framer-motion": "^12.23.26",
|
|
31
|
+
"ioredis": "^5.0.0",
|
|
32
|
+
"lucide-react": "^0.562.0",
|
|
33
|
+
"nodemailer": "^7.0.12",
|
|
34
|
+
"react": "^19.0.0",
|
|
35
|
+
"react-dom": "^19.0.0",
|
|
36
|
+
"react-router-dom": "^7.11.0",
|
|
37
|
+
"recharts": "^3.6.0",
|
|
38
|
+
"tailwind-merge": "^3.4.0"
|
|
39
|
+
},
|
|
40
|
+
"devDependencies": {
|
|
41
|
+
"@types/nodemailer": "^7.0.4",
|
|
42
|
+
"@types/react": "^19.0.0",
|
|
43
|
+
"@types/react-dom": "^19.0.0",
|
|
44
|
+
"@vitejs/plugin-react": "^5.1.2",
|
|
45
|
+
"autoprefixer": "^10.4.0",
|
|
46
|
+
"postcss": "^8.4.0",
|
|
47
|
+
"tailwindcss": "^3.4.0",
|
|
48
|
+
"typescript": "^5.9.3",
|
|
49
|
+
"vite": "^6.0.0"
|
|
50
|
+
},
|
|
51
|
+
"publishConfig": {
|
|
52
|
+
"access": "public"
|
|
53
|
+
}
|
|
50
54
|
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { Redis } from 'ioredis'
|
|
2
|
+
|
|
3
|
+
const redis = new Redis('redis://localhost:6379')
|
|
4
|
+
|
|
5
|
+
async function check() {
|
|
6
|
+
console.log('Connecting to Redis...')
|
|
7
|
+
try {
|
|
8
|
+
const keys = await redis.keys('gravito:quasar:node:*')
|
|
9
|
+
console.log('Keys found count:', keys.length)
|
|
10
|
+
console.log('Keys:', keys)
|
|
11
|
+
|
|
12
|
+
if (keys.length > 0) {
|
|
13
|
+
const val = await redis.get(keys[0])
|
|
14
|
+
console.log('--- Value of first key ---')
|
|
15
|
+
console.log(val)
|
|
16
|
+
console.log('--- End Value ---')
|
|
17
|
+
}
|
|
18
|
+
} catch (err) {
|
|
19
|
+
console.error('Redis Error:', err)
|
|
20
|
+
}
|
|
21
|
+
process.exit(0)
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
check()
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# Gravito Pulse Implementation Spec
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
Gravito Pulse is a lightweight APM (Application Performance Monitoring) system integrated into Zenith. It follows the philosophy: *"If you can connect to Redis, you are monitored."*
|
|
5
|
+
|
|
6
|
+
## 1. Gravito Pulse Protocol (GPP)
|
|
7
|
+
|
|
8
|
+
### Data Structure
|
|
9
|
+
Pulse uses Redis keys with specific TTLs to represent live services.
|
|
10
|
+
|
|
11
|
+
- **Key Pattern**: `gravito:quasar:node:{service}:{node_id}`
|
|
12
|
+
- **TTL**: 30 seconds (Agents should heartbeat every 10-15s).
|
|
13
|
+
- **Data Type**: String (JSON)
|
|
14
|
+
|
|
15
|
+
### Payload Schema
|
|
16
|
+
```json
|
|
17
|
+
{
|
|
18
|
+
"id": "string", // Unique Instance ID (e.g., UUID or Hostname-PID)
|
|
19
|
+
"service": "string", // Group name (e.g., "worker-billing", "api-gateway")
|
|
20
|
+
"language": "string", // "node" | "bun" | "deno" | "php" | "go" | "python" | "other"
|
|
21
|
+
"version": "string", // Language/Runtime Version
|
|
22
|
+
"pid": "number", // Process ID
|
|
23
|
+
"hostname": "string", // Machine Hostname or Custom Name
|
|
24
|
+
"platform": "string", // OS Platform (linux, darwin, win32)
|
|
25
|
+
"cpu": {
|
|
26
|
+
"system": "number", // System Load % (0-100)
|
|
27
|
+
"process": "number", // Process Usage % (0-100)
|
|
28
|
+
"cores": "number" // Core count
|
|
29
|
+
},
|
|
30
|
+
"memory": {
|
|
31
|
+
"system": {
|
|
32
|
+
"total": "number", // System Total Memory (bytes)
|
|
33
|
+
"free": "number", // System Free Memory (bytes)
|
|
34
|
+
"used": "number" // System Used Memory (bytes)
|
|
35
|
+
},
|
|
36
|
+
"process": {
|
|
37
|
+
"rss": "number", // Resident Set Size (bytes)
|
|
38
|
+
"heapTotal": "number",// Heap Total (bytes)
|
|
39
|
+
"heapUsed": "number" // Heap Used (bytes)
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
"runtime": {
|
|
43
|
+
"uptime": "number", // Process uptime in seconds
|
|
44
|
+
"framework": "string" // Optional framework info
|
|
45
|
+
},
|
|
46
|
+
"timestamp": "number" // Unix Ms Timestamp
|
|
47
|
+
}
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## 2. Implementation Modules
|
|
51
|
+
|
|
52
|
+
### A. Client SDK (`@gravito/quasar`, formerly `@gravito/pulse-node`)
|
|
53
|
+
A lightweight agent to collect metrics and publish to Redis.
|
|
54
|
+
- **Dependencies**: `ioredis`, `pidusage` (optional, or use native `os`/`process`).
|
|
55
|
+
- **Functionality**:
|
|
56
|
+
- `startPulse({ service: string })`: Starts the heartbeat loop.
|
|
57
|
+
- Collects CPU/RAM usage.
|
|
58
|
+
- Publishes to Redis.
|
|
59
|
+
|
|
60
|
+
### B. Server Collector (Zenith Console)
|
|
61
|
+
- **Service**: `PulseService`
|
|
62
|
+
- **Method**: `getNodes()`
|
|
63
|
+
- Performs `SCAN 0 MATCH gravito:quasar:node:* COUNT 100`.
|
|
64
|
+
- Returns grouped nodes by `service`.
|
|
65
|
+
- **API**: `GET /api/pulse/nodes`
|
|
66
|
+
|
|
67
|
+
### C. Frontend Dashboard (Zenith UI)
|
|
68
|
+
- **Route**: `/pulse`
|
|
69
|
+
- **Components**:
|
|
70
|
+
- `ServiceGroup`: A container for nodes of a specific service.
|
|
71
|
+
- `NodeCard`: Displays CPU/RAM sparklines (optional) and current health.
|
|
72
|
+
- `HealthBadge`: Green (Fresh), Yellow (>15s ago), Red (Dead/Gone - though Redis TTL handles removal, frontend can handle stale UI).
|
|
73
|
+
|
|
74
|
+
## 3. Alerts (Phase 2)
|
|
75
|
+
- Server-side checker that monitors values from `PulseService`.
|
|
76
|
+
- Triggers `AlertService` if:
|
|
77
|
+
- CPU > 90% for 2 mins.
|
|
78
|
+
- Memory > 90% for 5 mins.
|
|
79
|
+
- Disk < 10% free.
|
|
80
|
+
|
|
81
|
+
## 4. Work Plan
|
|
82
|
+
1. **Define Types**: Add `PulseNode` interface to `@gravito/custom-types` or `flux-console` shared types.
|
|
83
|
+
2. **Implement Server Collector**: Create `PulseService` in `packages/flux-console/server/services`.
|
|
84
|
+
3. **Implement API**: Add route in `packages/flux-console/server/routes.ts`.
|
|
85
|
+
4. **Implement UI**: Create `PulsePage` and components.
|
|
86
|
+
5. **Implement Node Client**: Add `startPulse` to `packages/stream` (or separate package) to verify "dogfooding" by having the server monitor itself.
|
package/src/client/App.tsx
CHANGED
|
@@ -8,6 +8,7 @@ import {
|
|
|
8
8
|
LoginPage,
|
|
9
9
|
MetricsPage,
|
|
10
10
|
OverviewPage,
|
|
11
|
+
PulsePage,
|
|
11
12
|
QueuesPage,
|
|
12
13
|
SchedulesPage,
|
|
13
14
|
SettingsPage,
|
|
@@ -48,6 +49,7 @@ function AuthenticatedRoutes() {
|
|
|
48
49
|
<Route path="/schedules" element={<SchedulesPage />} />
|
|
49
50
|
<Route path="/workers" element={<WorkersPage />} />
|
|
50
51
|
<Route path="/metrics" element={<MetricsPage />} />
|
|
52
|
+
<Route path="/pulse" element={<PulsePage />} />
|
|
51
53
|
<Route path="/settings" element={<SettingsPage />} />
|
|
52
54
|
</Routes>
|
|
53
55
|
</Layout>
|
package/src/client/Layout.tsx
CHANGED
|
@@ -90,6 +90,15 @@ export function Layout({ children }: LayoutProps) {
|
|
|
90
90
|
}
|
|
91
91
|
})
|
|
92
92
|
|
|
93
|
+
ev.addEventListener('pulse', (e) => {
|
|
94
|
+
try {
|
|
95
|
+
const data = JSON.parse(e.data)
|
|
96
|
+
window.dispatchEvent(new CustomEvent('flux-pulse-update', { detail: data }))
|
|
97
|
+
} catch (err) {
|
|
98
|
+
console.error('SSE Pulse Error', err)
|
|
99
|
+
}
|
|
100
|
+
})
|
|
101
|
+
|
|
93
102
|
ev.onerror = (err) => {
|
|
94
103
|
console.error('[Zenith] SSE Connection Error', err)
|
|
95
104
|
ev.close()
|
|
@@ -339,6 +348,14 @@ export function Layout({ children }: LayoutProps) {
|
|
|
339
348
|
return () => window.removeEventListener('keydown', handleKeyDown)
|
|
340
349
|
}, [])
|
|
341
350
|
|
|
351
|
+
// Auto-scroll to selected item
|
|
352
|
+
useEffect(() => {
|
|
353
|
+
const el = document.getElementById(`command-item-${selectedIndex}`)
|
|
354
|
+
if (el) {
|
|
355
|
+
el.scrollIntoView({ block: 'nearest', behavior: 'smooth' })
|
|
356
|
+
}
|
|
357
|
+
}, [selectedIndex])
|
|
358
|
+
|
|
342
359
|
const handleSelect = (cmd: CommandItem) => {
|
|
343
360
|
cmd.action()
|
|
344
361
|
setIsCommandPaletteOpen(false)
|
|
@@ -559,6 +576,7 @@ export function Layout({ children }: LayoutProps) {
|
|
|
559
576
|
{filteredCommands.map((cmd, i) => (
|
|
560
577
|
<button
|
|
561
578
|
type="button"
|
|
579
|
+
id={`command-item-${i}`}
|
|
562
580
|
key={cmd.id}
|
|
563
581
|
className={cn(
|
|
564
582
|
'w-full flex items-center justify-between p-4 rounded-2xl transition-all cursor-pointer group/cmd outline-none',
|
package/src/client/Sidebar.tsx
CHANGED
|
@@ -23,10 +23,11 @@ export function Sidebar({ className, collapsed, toggleCollapse }: SidebarProps)
|
|
|
23
23
|
|
|
24
24
|
const navItems = [
|
|
25
25
|
{ icon: LayoutDashboard, label: 'Overview', path: '/' },
|
|
26
|
+
{ icon: Activity, label: 'Pulse', path: '/pulse' },
|
|
26
27
|
{ icon: ListTree, label: 'Queues', path: '/queues' },
|
|
27
28
|
{ icon: Clock, label: 'Schedules', path: '/schedules' },
|
|
28
29
|
{ icon: HardDrive, label: 'Workers', path: '/workers' },
|
|
29
|
-
{ icon: Activity, label: 'Metrics', path: '/metrics' },
|
|
30
|
+
// { icon: Activity, label: 'Metrics', path: '/metrics' },
|
|
30
31
|
{ icon: Settings, label: 'Settings', path: '/settings' },
|
|
31
32
|
]
|
|
32
33
|
|