claude-flow-novice 1.6.5 → 1.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,83 @@
1
+ # Docker Production Environment
2
+ # Used by: docker-compose.production.yml
3
+
4
+ # Environment
5
+ NODE_ENV=production
6
+
7
+ # Agent Configuration (Production: 500 agents)
8
+ CFN_MAX_AGENTS=500
9
+ CFN_SHARD_COUNT=32
10
+ CFN_METRICS_ENABLED=true
11
+ CFN_ALERTING_ENABLED=true
12
+ CFN_LOG_LEVEL=warn
13
+ CFN_CONSENSUS_THRESHOLD=0.95
14
+ CFN_ALERT_COORD_TIME_MS=8000
15
+
16
+ # Memory and Storage (tmpfs REQUIRED for production)
17
+ CFN_BASE_DIR=/dev/shm/cfn
18
+ CFN_AGENT_MEMORY_LIMIT_MB=100
19
+ CFN_TOTAL_MEMORY_LIMIT_MB=51200
20
+
21
+ # Performance
22
+ CFN_AGENT_TIMEOUT_MS=30000
23
+ CFN_CONSENSUS_TIMEOUT_MS=60000
24
+ CFN_SWARM_INIT_TIMEOUT_MS=10000
25
+ CFN_ENABLE_CACHING=true
26
+ CFN_CACHE_TTL_SECONDS=600
27
+ CFN_MAX_CONCURRENT_OPERATIONS=100
28
+
29
+ # Resource Cleanup
30
+ CFN_CLEANUP_INTERVAL_MS=60000
31
+ CFN_CLEANUP_STALE_AGENTS_ENABLED=true
32
+
33
+ # Monitoring
34
+ CFN_METRICS_COLLECTION_INTERVAL_MS=5000
35
+ CFN_METRICS_RETENTION_HOURS=168
36
+ CFN_TRACK_COORDINATION_TIME=true
37
+ CFN_TRACK_MEMORY_USAGE=true
38
+ CFN_TRACK_AGENT_LIFECYCLE=true
39
+
40
+ # Alerting Thresholds
41
+ CFN_ALERT_MEMORY_THRESHOLD_PERCENT=80
42
+ CFN_ALERT_AGENT_FAILURE_COUNT=3
43
+ CFN_ALERT_CONSENSUS_FAILURE_THRESHOLD=0.70
44
+
45
+ # Security (Production: REQUIRED)
46
+ CFN_ENABLE_AGENT_AUTH=true
47
+ CFN_AGENT_AUTH_TOKEN=${CFN_AGENT_AUTH_TOKEN}
48
+ CFN_ENABLE_TLS=true
49
+ CFN_TLS_CERT_PATH=/etc/cfn/certs/cert.pem
50
+ CFN_TLS_KEY_PATH=/etc/cfn/certs/key.pem
51
+ CFN_ENABLE_RATE_LIMITING=true
52
+ CFN_RATE_LIMIT_REQUESTS_PER_MINUTE=1000
53
+
54
+ # MCP Server
55
+ CFN_MCP_SERVER_ENABLED=true
56
+ CFN_MCP_SERVER_PORT=3000
57
+
58
+ # Testing and Debugging (Production: DISABLED)
59
+ CFN_TEST_MODE=false
60
+ CFN_DEBUG_AGENT_SPAWN=false
61
+ CFN_DEBUG_CONSENSUS=false
62
+ CFN_DEBUG_MEMORY=false
63
+ CFN_VERBOSE_LOGGING=false
64
+
65
+ # Chaos Engineering (Production: DISABLED)
66
+ CFN_CHAOS_ENABLED=false
67
+
68
+ # Consensus Algorithm
69
+ CFN_CONSENSUS_ALGORITHM=byzantine
70
+
71
+ # Database (Production: PostgreSQL recommended)
72
+ CFN_DB_ENABLED=true
73
+ CFN_DB_TYPE=postgresql
74
+ CFN_DB_HOST=${CFN_DB_HOST}
75
+ CFN_DB_PORT=5432
76
+ CFN_DB_NAME=cfn_metrics
77
+ CFN_DB_USER=${CFN_DB_USER}
78
+ CFN_DB_PASSWORD=${CFN_DB_PASSWORD}
79
+
80
+ # Distributed Tracing
81
+ CFN_TRACING_ENABLED=true
82
+ CFN_TRACING_ENDPOINT=${CFN_TRACING_ENDPOINT}
83
+ CFN_TRACING_SERVICE_NAME=claude-flow-novice
@@ -0,0 +1,70 @@
1
+ # Docker Staging Environment
2
+ # Used by: docker-compose.staging.yml
3
+
4
+ # Environment
5
+ NODE_ENV=staging
6
+
7
+ # Agent Configuration (Staging: 100 agents)
8
+ CFN_MAX_AGENTS=100
9
+ CFN_SHARD_COUNT=16
10
+ CFN_METRICS_ENABLED=true
11
+ CFN_ALERTING_ENABLED=true
12
+ CFN_LOG_LEVEL=info
13
+ CFN_CONSENSUS_THRESHOLD=0.90
14
+ CFN_ALERT_COORD_TIME_MS=5000
15
+
16
+ # Memory and Storage (tmpfs recommended)
17
+ CFN_BASE_DIR=/dev/shm/cfn
18
+ CFN_AGENT_MEMORY_LIMIT_MB=100
19
+ CFN_TOTAL_MEMORY_LIMIT_MB=10240
20
+
21
+ # Performance
22
+ CFN_AGENT_TIMEOUT_MS=30000
23
+ CFN_CONSENSUS_TIMEOUT_MS=60000
24
+ CFN_SWARM_INIT_TIMEOUT_MS=10000
25
+ CFN_ENABLE_CACHING=true
26
+ CFN_CACHE_TTL_SECONDS=300
27
+ CFN_MAX_CONCURRENT_OPERATIONS=50
28
+
29
+ # Monitoring
30
+ CFN_METRICS_COLLECTION_INTERVAL_MS=5000
31
+ CFN_METRICS_RETENTION_HOURS=168
32
+ CFN_TRACK_COORDINATION_TIME=true
33
+ CFN_TRACK_MEMORY_USAGE=true
34
+ CFN_TRACK_AGENT_LIFECYCLE=true
35
+
36
+ # Alerting Thresholds
37
+ CFN_ALERT_MEMORY_THRESHOLD_PERCENT=80
38
+ CFN_ALERT_AGENT_FAILURE_COUNT=3
39
+ CFN_ALERT_CONSENSUS_FAILURE_THRESHOLD=0.70
40
+
41
+ # Security (Staging: Enabled)
42
+ CFN_ENABLE_AGENT_AUTH=true
43
+ CFN_AGENT_AUTH_TOKEN=${CFN_AGENT_AUTH_TOKEN}
44
+ CFN_ENABLE_TLS=false
45
+ CFN_ENABLE_RATE_LIMITING=true
46
+ CFN_RATE_LIMIT_REQUESTS_PER_MINUTE=500
47
+
48
+ # MCP Server
49
+ CFN_MCP_SERVER_ENABLED=true
50
+ CFN_MCP_SERVER_PORT=3000
51
+
52
+ # Testing and Debugging
53
+ CFN_TEST_MODE=false
54
+ CFN_DEBUG_AGENT_SPAWN=false
55
+ CFN_DEBUG_CONSENSUS=false
56
+ CFN_DEBUG_MEMORY=false
57
+ CFN_VERBOSE_LOGGING=false
58
+
59
+ # Chaos Engineering (Staging only)
60
+ CFN_CHAOS_ENABLED=true
61
+ CFN_CHAOS_FAILURE_RATE=0.05
62
+ CFN_CHAOS_LATENCY_MS=100
63
+
64
+ # Consensus Algorithm
65
+ CFN_CONSENSUS_ALGORITHM=byzantine
66
+
67
+ # Database (Optional)
68
+ CFN_DB_ENABLED=true
69
+ CFN_DB_TYPE=sqlite
70
+ CFN_DB_PATH=/var/lib/cfn/metrics.db
@@ -0,0 +1,60 @@
1
+ apiVersion: v1
2
+ kind: ConfigMap
3
+ metadata:
4
+ name: cfn-coordination-config-dev
5
+ namespace: claude-flow-novice
6
+ labels:
7
+ app: claude-flow-novice
8
+ environment: development
9
+ component: coordination
10
+ data:
11
+ # Environment
12
+ NODE_ENV: "development"
13
+
14
+ # Agent Configuration (Development: 10 agents)
15
+ CFN_MAX_AGENTS: "10"
16
+ CFN_SHARD_COUNT: "4"
17
+ CFN_METRICS_ENABLED: "true"
18
+ CFN_ALERTING_ENABLED: "false"
19
+ CFN_LOG_LEVEL: "debug"
20
+ CFN_CONSENSUS_THRESHOLD: "0.90"
21
+ CFN_ALERT_COORD_TIME_MS: "3000"
22
+
23
+ # Memory and Storage
24
+ CFN_BASE_DIR: "/tmp/cfn"
25
+ CFN_AGENT_MEMORY_LIMIT_MB: "100"
26
+ CFN_TOTAL_MEMORY_LIMIT_MB: "2048"
27
+
28
+ # Performance
29
+ CFN_AGENT_TIMEOUT_MS: "30000"
30
+ CFN_CONSENSUS_TIMEOUT_MS: "60000"
31
+ CFN_SWARM_INIT_TIMEOUT_MS: "10000"
32
+ CFN_ENABLE_CACHING: "true"
33
+ CFN_CACHE_TTL_SECONDS: "300"
34
+ CFN_MAX_CONCURRENT_OPERATIONS: "10"
35
+
36
+ # Monitoring
37
+ CFN_METRICS_COLLECTION_INTERVAL_MS: "5000"
38
+ CFN_METRICS_RETENTION_HOURS: "24"
39
+ CFN_TRACK_COORDINATION_TIME: "true"
40
+ CFN_TRACK_MEMORY_USAGE: "true"
41
+ CFN_TRACK_AGENT_LIFECYCLE: "true"
42
+
43
+ # Security (Development: Disabled)
44
+ CFN_ENABLE_AGENT_AUTH: "false"
45
+ CFN_ENABLE_TLS: "false"
46
+ CFN_ENABLE_RATE_LIMITING: "false"
47
+
48
+ # MCP Server
49
+ CFN_MCP_SERVER_ENABLED: "true"
50
+ CFN_MCP_SERVER_PORT: "3000"
51
+
52
+ # Testing and Debugging
53
+ CFN_TEST_MODE: "false"
54
+ CFN_DEBUG_AGENT_SPAWN: "true"
55
+ CFN_DEBUG_CONSENSUS: "true"
56
+ CFN_DEBUG_MEMORY: "true"
57
+ CFN_VERBOSE_LOGGING: "true"
58
+
59
+ # Consensus Algorithm
60
+ CFN_CONSENSUS_ALGORITHM: "byzantine"
@@ -0,0 +1,85 @@
1
+ apiVersion: v1
2
+ kind: ConfigMap
3
+ metadata:
4
+ name: cfn-coordination-config-prod
5
+ namespace: claude-flow-novice
6
+ labels:
7
+ app: claude-flow-novice
8
+ environment: production
9
+ component: coordination
10
+ data:
11
+ # Environment
12
+ NODE_ENV: "production"
13
+
14
+ # Agent Configuration (Production: 500 agents)
15
+ CFN_MAX_AGENTS: "500"
16
+ CFN_SHARD_COUNT: "32"
17
+ CFN_METRICS_ENABLED: "true"
18
+ CFN_ALERTING_ENABLED: "true"
19
+ CFN_LOG_LEVEL: "warn"
20
+ CFN_CONSENSUS_THRESHOLD: "0.95"
21
+ CFN_ALERT_COORD_TIME_MS: "8000"
22
+
23
+ # Memory and Storage (tmpfs REQUIRED)
24
+ CFN_BASE_DIR: "/dev/shm/cfn"
25
+ CFN_AGENT_MEMORY_LIMIT_MB: "100"
26
+ CFN_TOTAL_MEMORY_LIMIT_MB: "51200"
27
+
28
+ # Performance
29
+ CFN_AGENT_TIMEOUT_MS: "30000"
30
+ CFN_CONSENSUS_TIMEOUT_MS: "60000"
31
+ CFN_SWARM_INIT_TIMEOUT_MS: "10000"
32
+ CFN_ENABLE_CACHING: "true"
33
+ CFN_CACHE_TTL_SECONDS: "600"
34
+ CFN_MAX_CONCURRENT_OPERATIONS: "100"
35
+
36
+ # Resource Cleanup
37
+ CFN_CLEANUP_INTERVAL_MS: "60000"
38
+ CFN_CLEANUP_STALE_AGENTS_ENABLED: "true"
39
+
40
+ # Monitoring
41
+ CFN_METRICS_COLLECTION_INTERVAL_MS: "5000"
42
+ CFN_METRICS_RETENTION_HOURS: "168"
43
+ CFN_TRACK_COORDINATION_TIME: "true"
44
+ CFN_TRACK_MEMORY_USAGE: "true"
45
+ CFN_TRACK_AGENT_LIFECYCLE: "true"
46
+
47
+ # Alerting Thresholds
48
+ CFN_ALERT_MEMORY_THRESHOLD_PERCENT: "80"
49
+ CFN_ALERT_AGENT_FAILURE_COUNT: "3"
50
+ CFN_ALERT_CONSENSUS_FAILURE_THRESHOLD: "0.70"
51
+
52
+ # Security (Production: REQUIRED)
53
+ CFN_ENABLE_AGENT_AUTH: "true"
54
+ CFN_ENABLE_TLS: "true"
55
+ CFN_TLS_CERT_PATH: "/etc/cfn/certs/cert.pem"
56
+ CFN_TLS_KEY_PATH: "/etc/cfn/certs/key.pem"
57
+ CFN_ENABLE_RATE_LIMITING: "true"
58
+ CFN_RATE_LIMIT_REQUESTS_PER_MINUTE: "1000"
59
+
60
+ # MCP Server
61
+ CFN_MCP_SERVER_ENABLED: "true"
62
+ CFN_MCP_SERVER_PORT: "3000"
63
+
64
+ # Testing and Debugging (Production: DISABLED)
65
+ CFN_TEST_MODE: "false"
66
+ CFN_DEBUG_AGENT_SPAWN: "false"
67
+ CFN_DEBUG_CONSENSUS: "false"
68
+ CFN_DEBUG_MEMORY: "false"
69
+ CFN_VERBOSE_LOGGING: "false"
70
+
71
+ # Chaos Engineering (Production: DISABLED)
72
+ CFN_CHAOS_ENABLED: "false"
73
+
74
+ # Consensus Algorithm
75
+ CFN_CONSENSUS_ALGORITHM: "byzantine"
76
+
77
+ # Database (Production: PostgreSQL)
78
+ CFN_DB_ENABLED: "true"
79
+ CFN_DB_TYPE: "postgresql"
80
+ CFN_DB_PORT: "5432"
81
+ CFN_DB_NAME: "cfn_metrics"
82
+
83
+ # Distributed Tracing
84
+ CFN_TRACING_ENABLED: "true"
85
+ CFN_TRACING_SERVICE_NAME: "claude-flow-novice"
@@ -0,0 +1,76 @@
1
+ apiVersion: v1
2
+ kind: ConfigMap
3
+ metadata:
4
+ name: cfn-coordination-config-staging
5
+ namespace: claude-flow-novice
6
+ labels:
7
+ app: claude-flow-novice
8
+ environment: staging
9
+ component: coordination
10
+ data:
11
+ # Environment
12
+ NODE_ENV: "staging"
13
+
14
+ # Agent Configuration (Staging: 100 agents)
15
+ CFN_MAX_AGENTS: "100"
16
+ CFN_SHARD_COUNT: "16"
17
+ CFN_METRICS_ENABLED: "true"
18
+ CFN_ALERTING_ENABLED: "true"
19
+ CFN_LOG_LEVEL: "info"
20
+ CFN_CONSENSUS_THRESHOLD: "0.90"
21
+ CFN_ALERT_COORD_TIME_MS: "5000"
22
+
23
+ # Memory and Storage (tmpfs)
24
+ CFN_BASE_DIR: "/dev/shm/cfn"
25
+ CFN_AGENT_MEMORY_LIMIT_MB: "100"
26
+ CFN_TOTAL_MEMORY_LIMIT_MB: "10240"
27
+
28
+ # Performance
29
+ CFN_AGENT_TIMEOUT_MS: "30000"
30
+ CFN_CONSENSUS_TIMEOUT_MS: "60000"
31
+ CFN_SWARM_INIT_TIMEOUT_MS: "10000"
32
+ CFN_ENABLE_CACHING: "true"
33
+ CFN_CACHE_TTL_SECONDS: "300"
34
+ CFN_MAX_CONCURRENT_OPERATIONS: "50"
35
+
36
+ # Monitoring
37
+ CFN_METRICS_COLLECTION_INTERVAL_MS: "5000"
38
+ CFN_METRICS_RETENTION_HOURS: "168"
39
+ CFN_TRACK_COORDINATION_TIME: "true"
40
+ CFN_TRACK_MEMORY_USAGE: "true"
41
+ CFN_TRACK_AGENT_LIFECYCLE: "true"
42
+
43
+ # Alerting Thresholds
44
+ CFN_ALERT_MEMORY_THRESHOLD_PERCENT: "80"
45
+ CFN_ALERT_AGENT_FAILURE_COUNT: "3"
46
+ CFN_ALERT_CONSENSUS_FAILURE_THRESHOLD: "0.70"
47
+
48
+ # Security (Staging: Enabled)
49
+ CFN_ENABLE_AGENT_AUTH: "true"
50
+ CFN_ENABLE_TLS: "false"
51
+ CFN_ENABLE_RATE_LIMITING: "true"
52
+ CFN_RATE_LIMIT_REQUESTS_PER_MINUTE: "500"
53
+
54
+ # MCP Server
55
+ CFN_MCP_SERVER_ENABLED: "true"
56
+ CFN_MCP_SERVER_PORT: "3000"
57
+
58
+ # Testing and Debugging
59
+ CFN_TEST_MODE: "false"
60
+ CFN_DEBUG_AGENT_SPAWN: "false"
61
+ CFN_DEBUG_CONSENSUS: "false"
62
+ CFN_DEBUG_MEMORY: "false"
63
+ CFN_VERBOSE_LOGGING: "false"
64
+
65
+ # Chaos Engineering (Staging only)
66
+ CFN_CHAOS_ENABLED: "true"
67
+ CFN_CHAOS_FAILURE_RATE: "0.05"
68
+ CFN_CHAOS_LATENCY_MS: "100"
69
+
70
+ # Consensus Algorithm
71
+ CFN_CONSENSUS_ALGORITHM: "byzantine"
72
+
73
+ # Database
74
+ CFN_DB_ENABLED: "true"
75
+ CFN_DB_TYPE: "sqlite"
76
+ CFN_DB_PATH: "/var/lib/cfn/metrics.db"
@@ -0,0 +1,62 @@
1
+ apiVersion: v1
2
+ kind: Secret
3
+ metadata:
4
+ name: cfn-coordination-secrets-prod
5
+ namespace: claude-flow-novice
6
+ labels:
7
+ app: claude-flow-novice
8
+ environment: production
9
+ component: coordination
10
+ type: Opaque
11
+ stringData:
12
+ # Agent Authentication Token
13
+ # Generate with: openssl rand -base64 32
14
+ CFN_AGENT_AUTH_TOKEN: "REPLACE_WITH_PRODUCTION_TOKEN"
15
+
16
+ # Database Credentials (PostgreSQL)
17
+ CFN_DB_HOST: "cfn-postgres.database.svc.cluster.local"
18
+ CFN_DB_USER: "cfn_production"
19
+ CFN_DB_PASSWORD: "REPLACE_WITH_DB_PASSWORD"
20
+
21
+ # GitHub Integration (optional)
22
+ CFN_GITHUB_TOKEN: "REPLACE_WITH_GITHUB_TOKEN"
23
+
24
+ # Distributed Tracing Endpoint
25
+ CFN_TRACING_ENDPOINT: "http://jaeger-collector.monitoring.svc.cluster.local:9411"
26
+
27
+ # Neural API Key (experimental features)
28
+ CFN_NEURAL_API_KEY: "REPLACE_WITH_NEURAL_API_KEY"
29
+ ---
30
+ # Production Secret Management Best Practices:
31
+ #
32
+ # 1. DO NOT commit actual secrets to git
33
+ # 2. Use external secret management:
34
+ # - AWS Secrets Manager
35
+ # - Azure Key Vault
36
+ # - HashiCorp Vault
37
+ # - Google Secret Manager
38
+ #
39
+ # 3. Create secrets using kubectl with --dry-run:
40
+ # kubectl create secret generic cfn-coordination-secrets-prod --namespace claude-flow-novice \
41
+ # --from-literal=CFN_AGENT_AUTH_TOKEN="$(openssl rand -base64 32)" \
42
+ # --from-literal=CFN_DB_PASSWORD="your-secure-password" \
43
+ # --dry-run=client -o yaml | kubectl apply -f -
44
+ #
45
+ # 4. Use External Secrets Operator for GitOps:
46
+ # apiVersion: external-secrets.io/v1beta1
47
+ # kind: ExternalSecret
48
+ # metadata:
49
+ # name: cfn-coordination-secrets-prod
50
+ # spec:
51
+ # secretStoreRef:
52
+ # name: aws-secrets-manager
53
+ # target:
54
+ # name: cfn-coordination-secrets-prod
55
+ # data:
56
+ # - secretKey: CFN_AGENT_AUTH_TOKEN
57
+ # remoteRef:
58
+ # key: prod/cfn/auth-token
59
+ #
60
+ # 5. Rotate secrets regularly (at least every 90 days)
61
+ # 6. Use RBAC to restrict secret access
62
+ # 7. Enable audit logging for secret access
@@ -0,0 +1,36 @@
1
+ apiVersion: v1
2
+ kind: Secret
3
+ metadata:
4
+ name: cfn-coordination-secrets-staging
5
+ namespace: claude-flow-novice
6
+ labels:
7
+ app: claude-flow-novice
8
+ environment: staging
9
+ component: coordination
10
+ type: Opaque
11
+ stringData:
12
+ # Agent Authentication Token
13
+ # Generate with: openssl rand -base64 32
14
+ CFN_AGENT_AUTH_TOKEN: "REPLACE_WITH_STAGING_TOKEN"
15
+
16
+ # Database Credentials (if using external DB)
17
+ CFN_DB_USER: "cfn_staging"
18
+ CFN_DB_PASSWORD: "REPLACE_WITH_DB_PASSWORD"
19
+
20
+ # GitHub Integration (optional)
21
+ CFN_GITHUB_TOKEN: "REPLACE_WITH_GITHUB_TOKEN"
22
+
23
+ # Neural API Key (experimental features)
24
+ CFN_NEURAL_API_KEY: "REPLACE_WITH_NEURAL_API_KEY"
25
+ ---
26
+ # Usage Instructions:
27
+ # 1. Replace every REPLACE_WITH_* placeholder above with a real value, then create the secret: kubectl apply -f secret-staging.yaml
28
+ # 2. Verify: kubectl get secret cfn-coordination-secrets-staging -n claude-flow-novice
29
+ # 3. Update deployment to reference secret values:
30
+ #
31
+ # env:
32
+ # - name: CFN_AGENT_AUTH_TOKEN
33
+ # valueFrom:
34
+ # secretKeyRef:
35
+ # name: cfn-coordination-secrets-staging
36
+ # key: CFN_AGENT_AUTH_TOKEN
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-flow-novice",
3
- "version": "1.6.5",
3
+ "version": "1.6.6",
4
4
  "description": "Standalone Claude Flow for beginners - AI agent orchestration made easy with enhanced TDD testing pipeline, memory safety protection, and automated hooks. Enhanced init command creates complete agent system, MCP configuration with 30 essential tools, and automated hooks with single-file testing, real-time coverage analysis, and advanced validation. Features memory leak prevention for WSL/Windows environments and pre-tool safety validation. Fully standalone with zero external dependencies, complete project setup in one command.",
5
5
  "mcpName": "io.github.ruvnet/claude-flow",
6
6
  "main": ".claude-flow-novice/dist/index.js",
@@ -0,0 +1,211 @@
1
+ {
2
+ "dashboard": {
3
+ "title": "CFN Rate Limiting & Backpressure Monitoring",
4
+ "description": "Real-time monitoring of message inbox utilization, backpressure events, and overflow alerts",
5
+ "version": "1.0.0",
6
+ "tags": ["rate-limiting", "backpressure", "inbox", "coordination"],
7
+ "timezone": "UTC",
8
+ "refresh": "10s",
9
+ "panels": [
10
+ {
11
+ "id": 1,
12
+ "title": "Inbox Utilization by Agent",
13
+ "type": "timeseries",
14
+ "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
15
+ "targets": [
16
+ {
17
+ "metric": "inbox.utilization",
18
+ "legend": "{{agent}}",
19
+ "unit": "percent"
20
+ }
21
+ ],
22
+ "thresholds": [
23
+ { "value": 75, "color": "yellow", "label": "Warning" },
24
+ { "value": 90, "color": "red", "label": "Critical" }
25
+ ],
26
+ "description": "Message inbox utilization percentage per agent (max 100 messages)"
27
+ },
28
+ {
29
+ "id": 2,
30
+ "title": "Inbox Message Count",
31
+ "type": "timeseries",
32
+ "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
33
+ "targets": [
34
+ {
35
+ "metric": "inbox.size",
36
+ "legend": "{{agent}}",
37
+ "unit": "count"
38
+ }
39
+ ],
40
+ "thresholds": [
41
+ { "value": 75, "color": "yellow", "label": "Warning (75 msgs)" },
42
+ { "value": 90, "color": "red", "label": "Critical (90 msgs)" }
43
+ ],
44
+ "description": "Absolute message count in agent inboxes"
45
+ },
46
+ {
47
+ "id": 3,
48
+ "title": "Backpressure Events Rate",
49
+ "type": "timeseries",
50
+ "gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 },
51
+ "targets": [
52
+ {
53
+ "metric": "backpressure.events_per_min",
54
+ "legend": "Backpressure events/min",
55
+ "unit": "count"
56
+ }
57
+ ],
58
+ "thresholds": [
59
+ { "value": 100, "color": "yellow", "label": "Warning threshold" }
60
+ ],
61
+ "description": "Rate of backpressure wait events (high rate indicates system load)"
62
+ },
63
+ {
64
+ "id": 4,
65
+ "title": "Message Send Failures",
66
+ "type": "timeseries",
67
+ "gridPos": { "x": 12, "y": 8, "w": 12, "h": 8 },
68
+ "targets": [
69
+ {
70
+ "metric": "coordination.send_failures_per_min",
71
+ "legend": "Send failures/min",
72
+ "unit": "count"
73
+ }
74
+ ],
75
+ "thresholds": [
76
+ { "value": 10, "color": "red", "label": "Critical threshold" }
77
+ ],
78
+ "description": "Message delivery failure rate (critical if >10/min)"
79
+ },
80
+ {
81
+ "id": 5,
82
+ "title": "Inbox Overflow Events",
83
+ "type": "timeseries",
84
+ "gridPos": { "x": 0, "y": 16, "w": 12, "h": 8 },
85
+ "targets": [
86
+ {
87
+ "metric": "inbox.overflow_events_per_min",
88
+ "legend": "Overflow events/min",
89
+ "unit": "count"
90
+ }
91
+ ],
92
+ "thresholds": [
93
+ { "value": 1, "color": "red", "label": "Any overflow is critical" }
94
+ ],
95
+ "description": "Inbox overflow events (messages dropped due to full inbox)"
96
+ },
97
+ {
98
+ "id": 6,
99
+ "title": "Alert Summary",
100
+ "type": "stat",
101
+ "gridPos": { "x": 12, "y": 16, "w": 6, "h": 8 },
102
+ "targets": [
103
+ {
104
+ "query": "count_alerts_by_severity",
105
+ "fields": ["critical", "warning", "info"]
106
+ }
107
+ ],
108
+ "description": "Alert counts by severity level (last hour)"
109
+ },
110
+ {
111
+ "id": 7,
112
+ "title": "Top Utilized Agents",
113
+ "type": "table",
114
+ "gridPos": { "x": 18, "y": 16, "w": 6, "h": 8 },
115
+ "targets": [
116
+ {
117
+ "query": "top_inbox_utilization",
118
+ "fields": ["agent", "utilization", "message_count"],
119
+ "limit": 10,
120
+ "order": "desc"
121
+ }
122
+ ],
123
+ "description": "Agents with highest inbox utilization"
124
+ }
125
+ ],
126
+ "annotations": [
127
+ {
128
+ "name": "Rate Limiting Alerts",
129
+ "datasource": "cfn-alerts",
130
+ "filter": {
131
+ "tags": ["inbox_high_utilization", "backpressure_high_rate", "inbox_overflow_detected"]
132
+ },
133
+ "color": "red"
134
+ }
135
+ ],
136
+ "variables": [
137
+ {
138
+ "name": "agent",
139
+ "type": "query",
140
+ "query": "SELECT DISTINCT agent FROM inbox_metrics",
141
+ "description": "Filter by specific agent",
142
+ "multi": true,
143
+ "includeAll": true
144
+ },
145
+ {
146
+ "name": "timeRange",
147
+ "type": "interval",
148
+ "options": ["5m", "15m", "1h", "6h", "24h"],
149
+ "default": "1h",
150
+ "description": "Time range for metrics"
151
+ }
152
+ ]
153
+ },
154
+ "queries": {
155
+ "count_alerts_by_severity": {
156
+ "description": "Count alerts grouped by severity",
157
+ "source": "/dev/shm/cfn-alerts.jsonl",
158
+ "aggregation": "GROUP BY severity, COUNT(*)",
159
+ "timeWindow": "1h"
160
+ },
161
+ "top_inbox_utilization": {
162
+ "description": "Rank agents by inbox utilization",
163
+ "source": "/dev/shm/cfn-metrics.jsonl",
164
+ "query": "SELECT agent, MAX(value) as utilization FROM metrics WHERE metric='inbox.utilization' GROUP BY agent ORDER BY utilization DESC LIMIT 10",
165
+ "timeWindow": "5m"
166
+ }
167
+ },
168
+ "alertRules": [
169
+ {
170
+ "name": "Inbox Critical Utilization",
171
+ "condition": "inbox.utilization >= 90",
172
+ "severity": "critical",
173
+ "message": "Agent {{agent}} inbox at {{value}}% utilization (critical threshold: 90%)",
174
+ "actions": ["emit_alert", "notify_oncall"]
175
+ },
176
+ {
177
+ "name": "Inbox Warning Utilization",
178
+ "condition": "inbox.utilization >= 75 AND inbox.utilization < 90",
179
+ "severity": "warning",
180
+ "message": "Agent {{agent}} inbox at {{value}}% utilization (warning threshold: 75%)",
181
+ "actions": ["emit_alert"]
182
+ },
183
+ {
184
+ "name": "Backpressure High Rate",
185
+ "condition": "backpressure.events_per_min > 100",
186
+ "severity": "warning",
187
+ "message": "Backpressure events exceeding threshold: {{value}} events/min (threshold: 100/min)",
188
+ "actions": ["emit_alert"]
189
+ },
190
+ {
191
+ "name": "Message Send Failures Critical",
192
+ "condition": "coordination.send_failures_per_min > 10",
193
+ "severity": "critical",
194
+ "message": "Message send failures critical: {{value}} failures/min (threshold: 10/min)",
195
+ "actions": ["emit_alert", "notify_oncall", "trigger_incident"]
196
+ },
197
+ {
198
+ "name": "Inbox Overflow Detected",
199
+ "condition": "inbox.overflow_events_per_min > 0",
200
+ "severity": "critical",
201
+ "message": "Inbox overflow detected: {{value}} overflow events in last minute",
202
+ "actions": ["emit_alert", "notify_oncall", "trigger_incident"]
203
+ }
204
+ ],
205
+ "metadata": {
206
+ "createdBy": "devops-engineer",
207
+ "phase": "1",
208
+ "sprint": "1.5",
209
+ "lastUpdated": "2025-10-06T19:35:00Z"
210
+ }
211
+ }