@dipseth/opensearch-logs 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +14 -0
- package/alerts/langfuse-usage.yaml +142 -0
- package/alerts/production-incidents.yaml +280 -0
- package/alerts/service-health.yaml +98 -0
- package/dashboards/langfuse-usage.yaml +57 -0
- package/dist/create-dashboards.d.ts +10 -0
- package/dist/create-dashboards.js +38 -0
- package/dist/create-dashboards.js.map +1 -0
- package/dist/interfaces/alert.interfaces.d.ts +323 -0
- package/dist/interfaces/alert.interfaces.js +6 -0
- package/dist/interfaces/alert.interfaces.js.map +1 -0
- package/dist/interfaces/dashboard-gen.interfaces.d.ts +33 -0
- package/dist/interfaces/dashboard-gen.interfaces.js +3 -0
- package/dist/interfaces/dashboard-gen.interfaces.js.map +1 -0
- package/dist/interfaces/interfaces.d.ts +312 -0
- package/dist/interfaces/interfaces.js +3 -0
- package/dist/interfaces/interfaces.js.map +1 -0
- package/dist/interfaces/playbook.interfaces.d.ts +140 -0
- package/dist/interfaces/playbook.interfaces.js +3 -0
- package/dist/interfaces/playbook.interfaces.js.map +1 -0
- package/dist/os-alert.d.ts +17 -0
- package/dist/os-alert.js +245 -0
- package/dist/os-alert.js.map +1 -0
- package/dist/os-dash.d.ts +9 -0
- package/dist/os-dash.js +53 -0
- package/dist/os-dash.js.map +1 -0
- package/dist/os-monitor.d.ts +12 -0
- package/dist/os-monitor.js +59 -0
- package/dist/os-monitor.js.map +1 -0
- package/dist/os-playbook.d.ts +9 -0
- package/dist/os-playbook.js +71 -0
- package/dist/os-playbook.js.map +1 -0
- package/dist/os-search.d.ts +11 -0
- package/dist/os-search.js +84 -0
- package/dist/os-search.js.map +1 -0
- package/dist/repositories/index.d.ts +1 -0
- package/dist/repositories/index.js +2 -0
- package/dist/repositories/index.js.map +1 -0
- package/dist/repositories/opensearch.repository.d.ts +51 -0
- package/dist/repositories/opensearch.repository.js +167 -0
- package/dist/repositories/opensearch.repository.js.map +1 -0
- package/dist/services/alert.service.d.ts +73 -0
- package/dist/services/alert.service.js +503 -0
- package/dist/services/alert.service.js.map +1 -0
- package/dist/services/dashboard-gen.service.d.ts +36 -0
- package/dist/services/dashboard-gen.service.js +162 -0
- package/dist/services/dashboard-gen.service.js.map +1 -0
- package/dist/services/dashboard.service.d.ts +33 -0
- package/dist/services/dashboard.service.js +428 -0
- package/dist/services/dashboard.service.js.map +1 -0
- package/dist/services/gchat.service.d.ts +45 -0
- package/dist/services/gchat.service.js +228 -0
- package/dist/services/gchat.service.js.map +1 -0
- package/dist/services/index.d.ts +8 -0
- package/dist/services/index.js +9 -0
- package/dist/services/index.js.map +1 -0
- package/dist/services/monitor.service.d.ts +18 -0
- package/dist/services/monitor.service.js +342 -0
- package/dist/services/monitor.service.js.map +1 -0
- package/dist/services/panel-layout.d.ts +21 -0
- package/dist/services/panel-layout.js +33 -0
- package/dist/services/panel-layout.js.map +1 -0
- package/dist/services/playbook-dashboard.service.d.ts +19 -0
- package/dist/services/playbook-dashboard.service.js +434 -0
- package/dist/services/playbook-dashboard.service.js.map +1 -0
- package/dist/services/playbook.service.d.ts +13 -0
- package/dist/services/playbook.service.js +621 -0
- package/dist/services/playbook.service.js.map +1 -0
- package/dist/services/search.service.d.ts +30 -0
- package/dist/services/search.service.js +885 -0
- package/dist/services/search.service.js.map +1 -0
- package/dist/utils/cli.d.ts +14 -0
- package/dist/utils/cli.js +90 -0
- package/dist/utils/cli.js.map +1 -0
- package/dist/utils/config.d.ts +20 -0
- package/dist/utils/config.js +104 -0
- package/dist/utils/config.js.map +1 -0
- package/dist/utils/index.d.ts +5 -0
- package/dist/utils/index.js +5 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/service-registry.d.ts +15 -0
- package/dist/utils/service-registry.js +56 -0
- package/dist/utils/service-registry.js.map +1 -0
- package/dist/utils/template.d.ts +18 -0
- package/dist/utils/template.js +66 -0
- package/dist/utils/template.js.map +1 -0
- package/package.json +76 -0
- package/playbooks/error-investigation.yaml +45 -0
- package/playbooks/incident-triage.yaml +32 -0
- package/playbooks/post-deploy-validation.yaml +24 -0
- package/playbooks/service-deep-dive.yaml +42 -0
package/.env.example
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# OpenSearch credentials — copy to .env and fill in
|
|
2
|
+
OPENSEARCH_HOST=your-cluster-host.example.com
|
|
3
|
+
OPENSEARCH_PORT=25060
|
|
4
|
+
OPENSEARCH_USERNAME=admin
|
|
5
|
+
OPENSEARCH_PASSWORD=
|
|
6
|
+
|
|
7
|
+
# Optional: Google Chat webhook for alerts/reports
|
|
8
|
+
# GCHAT_WEBHOOK_URL=https://chat.googleapis.com/v1/spaces/.../messages?key=...&token=...
|
|
9
|
+
|
|
10
|
+
# Optional: non-default cluster config (e.g., AWS ConveyorCloud)
|
|
11
|
+
# OPENSEARCH_DATA_PORT=443
|
|
12
|
+
# OPENSEARCH_DASHBOARDS_PORT=443
|
|
13
|
+
# OPENSEARCH_INDEX_PREFIX=python-services
|
|
14
|
+
# OPENSEARCH_TENANT=global
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
name: langfuse-usage
|
|
2
|
+
description: >
|
|
3
|
+
LangFuse LLM tracing integration monitors — track validation throughput,
|
|
4
|
+
API timeouts, config issues, and API key problems.
|
|
5
|
+
|
|
6
|
+
destination:
|
|
7
|
+
name: gchat-production-alerts
|
|
8
|
+
type: gchat
|
|
9
|
+
|
|
10
|
+
card_templates:
|
|
11
|
+
langfuse_alert: |
|
|
12
|
+
{ "cardsV2": [{ "cardId": "alert-{{ctx.monitor.name}}", "card": {
|
|
13
|
+
"header": { "title": "{{ctx.trigger.name}}", "subtitle": "LangFuse Integration" },
|
|
14
|
+
"sections": [
|
|
15
|
+
{ "widgets": [
|
|
16
|
+
{ "decoratedText": { "topLabel": "Monitor", "text": "{{ctx.monitor.name}}" } },
|
|
17
|
+
{ "decoratedText": { "topLabel": "Component", "text": "LangFuse LLM Tracing" } },
|
|
18
|
+
{ "decoratedText": { "topLabel": "Environment", "text": "{{defaults.env}}" } },
|
|
19
|
+
{ "decoratedText": { "topLabel": "Period", "text": "{{ctx.periodStart}} — {{ctx.periodEnd}}" } },
|
|
20
|
+
{ "decoratedText": { "topLabel": "Hits", "text": "{{ctx.results.0.hits.total.value}}" } }
|
|
21
|
+
]},
|
|
22
|
+
{ "widgets": [{ "buttonList": { "buttons": [
|
|
23
|
+
{ "text": "View LangFuse Dashboard", "onClick": { "openLink": { "url": "{{defaults.dashboard_url}}" }}}
|
|
24
|
+
]}}]}
|
|
25
|
+
]
|
|
26
|
+
}}]}
|
|
27
|
+
|
|
28
|
+
langfuse_health: |
|
|
29
|
+
{ "cardsV2": [{ "cardId": "health-{{ctx.monitor.name}}", "card": {
|
|
30
|
+
"header": { "title": "LangFuse Health Check", "subtitle": "{{defaults.env}}", "imageUrl": "https://fonts.gstatic.com/s/i/short-term/release/googlesymbols/monitoring/default/48px.svg", "imageType": "CIRCLE" },
|
|
31
|
+
"sections": [
|
|
32
|
+
{ "header": "LangFuse Summary (1h)", "widgets": [
|
|
33
|
+
{ "decoratedText": { "topLabel": "Total LangFuse Events", "text": "{{ctx.results.0.hits.total.value}}" } },
|
|
34
|
+
{ "decoratedText": { "topLabel": "Validations", "text": "{{ctx.results.0.aggregations.validations.doc_count}}" } },
|
|
35
|
+
{ "decoratedText": { "topLabel": "ReadTimeouts (llm-langfuse)", "text": "{{ctx.results.0.aggregations.timeouts.doc_count}}" } },
|
|
36
|
+
{ "decoratedText": { "topLabel": "API Key Missing", "text": "{{ctx.results.0.aggregations.apikey_missing.doc_count}}" } },
|
|
37
|
+
{ "decoratedText": { "topLabel": "Config Loads", "text": "{{ctx.results.0.aggregations.config_loads.doc_count}}" } }
|
|
38
|
+
]},
|
|
39
|
+
{ "widgets": [{ "buttonList": { "buttons": [
|
|
40
|
+
{ "text": "View LangFuse Dashboard", "onClick": { "openLink": { "url": "{{defaults.dashboard_url}}" }}}
|
|
41
|
+
]}}]}
|
|
42
|
+
]
|
|
43
|
+
}}]}
|
|
44
|
+
|
|
45
|
+
monitors:
|
|
46
|
+
# ── Periodic Health Check ────────────────────────────────────────────
|
|
47
|
+
langfuse_health_check:
|
|
48
|
+
description: "Hourly LangFuse health summary with key metrics"
|
|
49
|
+
schedule:
|
|
50
|
+
interval: 1
|
|
51
|
+
unit: HOURS
|
|
52
|
+
query:
|
|
53
|
+
bool:
|
|
54
|
+
must:
|
|
55
|
+
- query_string:
|
|
56
|
+
query: '"langfuse" OR "Langfuse"'
|
|
57
|
+
aggs:
|
|
58
|
+
validations:
|
|
59
|
+
filter:
|
|
60
|
+
query_string:
|
|
61
|
+
query: '"Validating" AND "matches" AND "Langfuse"'
|
|
62
|
+
timeouts:
|
|
63
|
+
filter:
|
|
64
|
+
query_string:
|
|
65
|
+
query: '"llm-langfuse" AND "ReadTimeout"'
|
|
66
|
+
apikey_missing:
|
|
67
|
+
filter:
|
|
68
|
+
query_string:
|
|
69
|
+
query: '"ENCORE_G_API_KEY: None" AND "langfuse"'
|
|
70
|
+
config_loads:
|
|
71
|
+
filter:
|
|
72
|
+
query_string:
|
|
73
|
+
query: '"langfuse-config"'
|
|
74
|
+
trigger:
|
|
75
|
+
name: langfuse-hourly-health
|
|
76
|
+
severity: 5
|
|
77
|
+
condition: "ctx.results[0].hits.total.value >= 0"
|
|
78
|
+
card_template: langfuse_health
|
|
79
|
+
throttle:
|
|
80
|
+
value: 55
|
|
81
|
+
unit: MINUTES
|
|
82
|
+
|
|
83
|
+
# ── Error Monitors ──────────────────────────────────────────────────
|
|
84
|
+
langfuse_timeout_spike:
|
|
85
|
+
description: "ReadTimeout spike on llm-langfuse endpoint — AI Gateway may be degraded"
|
|
86
|
+
schedule:
|
|
87
|
+
interval: 5
|
|
88
|
+
unit: MINUTES
|
|
89
|
+
query:
|
|
90
|
+
query_string:
|
|
91
|
+
query: '"llm-langfuse" AND "ReadTimeout"'
|
|
92
|
+
trigger:
|
|
93
|
+
name: langfuse-timeout-spike
|
|
94
|
+
severity: 2
|
|
95
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.timeouts}}"
|
|
96
|
+
card_template: langfuse_alert
|
|
97
|
+
throttle:
|
|
98
|
+
value: 15
|
|
99
|
+
unit: MINUTES
|
|
100
|
+
|
|
101
|
+
langfuse_apikey_missing:
|
|
102
|
+
description: "LangFuse detecting missing ENCORE_G_API_KEY — tracing will fail"
|
|
103
|
+
schedule:
|
|
104
|
+
interval: 10
|
|
105
|
+
unit: MINUTES
|
|
106
|
+
query:
|
|
107
|
+
query_string:
|
|
108
|
+
query: '"ENCORE_G_API_KEY: None" AND "langfuse"'
|
|
109
|
+
trigger:
|
|
110
|
+
name: langfuse-apikey-missing
|
|
111
|
+
severity: 1
|
|
112
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.apikey_missing}}"
|
|
113
|
+
card_template: langfuse_alert
|
|
114
|
+
throttle:
|
|
115
|
+
value: 30
|
|
116
|
+
unit: MINUTES
|
|
117
|
+
|
|
118
|
+
langfuse_no_activity:
|
|
119
|
+
description: "No LangFuse activity detected — tracing may be down"
|
|
120
|
+
schedule:
|
|
121
|
+
interval: 30
|
|
122
|
+
unit: MINUTES
|
|
123
|
+
query:
|
|
124
|
+
query_string:
|
|
125
|
+
query: '"langfuse" OR "Langfuse"'
|
|
126
|
+
trigger:
|
|
127
|
+
name: langfuse-no-activity
|
|
128
|
+
severity: 2
|
|
129
|
+
condition: "ctx.results[0].hits.total.value < {{thresholds.min_activity}}"
|
|
130
|
+
card_template: langfuse_alert
|
|
131
|
+
throttle:
|
|
132
|
+
value: 30
|
|
133
|
+
unit: MINUTES
|
|
134
|
+
|
|
135
|
+
thresholds:
|
|
136
|
+
timeouts: 10 # >10 ReadTimeouts in 5min is a spike
|
|
137
|
+
apikey_missing: 5 # any significant API key issue burst in 10min
|
|
138
|
+
min_activity: 5 # fewer than 5 LangFuse events in 30min = dead
|
|
139
|
+
|
|
140
|
+
defaults:
|
|
141
|
+
env: production
|
|
142
|
+
dashboard_url: "https://microservices-python-do-user-18030911-0.e.db.ondigitalocean.com/app/dashboards?security_tenant=global#/view/langfuse-usage-dashboard"
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
name: production-incidents
|
|
2
|
+
description: >
|
|
3
|
+
Core production incident monitors tuned to actual error baselines.
|
|
4
|
+
Thresholds set above normal noise floors to avoid alert fatigue.
|
|
5
|
+
|
|
6
|
+
destination:
|
|
7
|
+
name: gchat-production-alerts
|
|
8
|
+
type: gchat
|
|
9
|
+
|
|
10
|
+
card_templates:
|
|
11
|
+
default: |
|
|
12
|
+
{ "cardsV2": [{ "cardId": "alert-{{ctx.monitor.name}}", "card": {
|
|
13
|
+
"header": { "title": "{{ctx.trigger.name}}", "subtitle": "Severity {{ctx.trigger.severity}}" },
|
|
14
|
+
"sections": [
|
|
15
|
+
{ "widgets": [
|
|
16
|
+
{ "decoratedText": { "topLabel": "Monitor", "text": "{{ctx.monitor.name}}" } },
|
|
17
|
+
{ "decoratedText": { "topLabel": "Environment", "text": "{{defaults.env}}" } },
|
|
18
|
+
{ "decoratedText": { "topLabel": "Period", "text": "{{ctx.periodStart}} — {{ctx.periodEnd}}" } },
|
|
19
|
+
{ "decoratedText": { "topLabel": "Hits", "text": "{{ctx.results.0.hits.total.value}}" } }
|
|
20
|
+
]},
|
|
21
|
+
{ "widgets": [{ "buttonList": { "buttons": [
|
|
22
|
+
{ "text": "View Dashboard", "onClick": { "openLink": { "url": "{{defaults.dashboard_url}}" }}}
|
|
23
|
+
]}}]}
|
|
24
|
+
]
|
|
25
|
+
}}]}
|
|
26
|
+
|
|
27
|
+
service_alert: |
|
|
28
|
+
{ "cardsV2": [{ "cardId": "alert-{{ctx.monitor.name}}", "card": {
|
|
29
|
+
"header": { "title": "{{ctx.trigger.name}}", "subtitle": "{{defaults.service_label}}" },
|
|
30
|
+
"sections": [
|
|
31
|
+
{ "widgets": [
|
|
32
|
+
{ "decoratedText": { "topLabel": "Monitor", "text": "{{ctx.monitor.name}}" } },
|
|
33
|
+
{ "decoratedText": { "topLabel": "Service", "text": "{{defaults.service_label}}" } },
|
|
34
|
+
{ "decoratedText": { "topLabel": "Environment", "text": "{{defaults.env}}" } },
|
|
35
|
+
{ "decoratedText": { "topLabel": "Period", "text": "{{ctx.periodStart}} — {{ctx.periodEnd}}" } },
|
|
36
|
+
{ "decoratedText": { "topLabel": "Hits", "text": "{{ctx.results.0.hits.total.value}}" } }
|
|
37
|
+
]},
|
|
38
|
+
{ "widgets": [{ "buttonList": { "buttons": [
|
|
39
|
+
{ "text": "View Dashboard", "onClick": { "openLink": { "url": "{{defaults.dashboard_url}}" }}}
|
|
40
|
+
]}}]}
|
|
41
|
+
]
|
|
42
|
+
}}]}
|
|
43
|
+
|
|
44
|
+
health_check: |
|
|
45
|
+
{ "cardsV2": [{ "cardId": "health-{{ctx.monitor.name}}", "card": {
|
|
46
|
+
"header": { "title": "Hourly Health Check", "subtitle": "{{defaults.env}}", "imageUrl": "https://fonts.gstatic.com/s/i/short-term/release/googlesymbols/monitoring/default/48px.svg", "imageType": "CIRCLE" },
|
|
47
|
+
"sections": [
|
|
48
|
+
{ "header": "Summary", "widgets": [
|
|
49
|
+
{ "decoratedText": { "topLabel": "Total Logs (1h)", "text": "{{ctx.results.0.hits.total.value}}" } },
|
|
50
|
+
{ "decoratedText": { "topLabel": "Errors (level=ERROR)", "text": "{{ctx.results.0.aggregations.error_count.doc_count}}" } },
|
|
51
|
+
{ "decoratedText": { "topLabel": "503s", "text": "{{ctx.results.0.aggregations.status_503.doc_count}}" } },
|
|
52
|
+
{ "decoratedText": { "topLabel": "Browser Pool Exhausted", "text": "{{ctx.results.0.aggregations.browser_pool.doc_count}}" } },
|
|
53
|
+
{ "decoratedText": { "topLabel": "Worker Shutdowns", "text": "{{ctx.results.0.aggregations.worker_shutdowns.doc_count}}" } }
|
|
54
|
+
]},
|
|
55
|
+
{ "widgets": [{ "buttonList": { "buttons": [
|
|
56
|
+
{ "text": "View Dashboard", "onClick": { "openLink": { "url": "{{defaults.dashboard_url}}" }}},
|
|
57
|
+
{ "text": "View in Discover", "onClick": { "openLink": { "url": "{{defaults.discover_url}}" }}}
|
|
58
|
+
]}}]}
|
|
59
|
+
]
|
|
60
|
+
}}]}
|
|
61
|
+
|
|
62
|
+
monitors:
|
|
63
|
+
# ── Periodic Health Check ────────────────────────────────────────────
|
|
64
|
+
hourly_health_check:
|
|
65
|
+
description: "Hourly health status report — always fires, sends a summary card with key metrics"
|
|
66
|
+
schedule:
|
|
67
|
+
interval: 1
|
|
68
|
+
unit: HOURS
|
|
69
|
+
query:
|
|
70
|
+
bool:
|
|
71
|
+
must:
|
|
72
|
+
- match_all: {}
|
|
73
|
+
aggs:
|
|
74
|
+
error_count:
|
|
75
|
+
filter:
|
|
76
|
+
term:
|
|
77
|
+
level.keyword: ERROR
|
|
78
|
+
status_503:
|
|
79
|
+
filter:
|
|
80
|
+
query_string:
|
|
81
|
+
query: '"status=503" OR "503 Service Unavailable"'
|
|
82
|
+
browser_pool:
|
|
83
|
+
filter:
|
|
84
|
+
query_string:
|
|
85
|
+
query: '"Browser pool exhausted"'
|
|
86
|
+
worker_shutdowns:
|
|
87
|
+
filter:
|
|
88
|
+
query_string:
|
|
89
|
+
query: '"Shutting down" OR "Worker exiting" OR "SIGTERM"'
|
|
90
|
+
trigger:
|
|
91
|
+
name: hourly-health-report
|
|
92
|
+
severity: 5
|
|
93
|
+
condition: "ctx.results[0].hits.total.value >= 0"
|
|
94
|
+
card_template: health_check
|
|
95
|
+
throttle:
|
|
96
|
+
value: 55
|
|
97
|
+
unit: MINUTES
|
|
98
|
+
|
|
99
|
+
# ── Browser Pool / Playwright (inferpds_v5 dominant failure mode) ─────
|
|
100
|
+
browser_pool_exhausted:
|
|
101
|
+
description: "Browser pool exhausted — inferpds_v5 can't serve requests (baseline ~570/hr during load)"
|
|
102
|
+
schedule:
|
|
103
|
+
interval: 5
|
|
104
|
+
unit: MINUTES
|
|
105
|
+
query:
|
|
106
|
+
query_string:
|
|
107
|
+
query: '"Browser pool exhausted"'
|
|
108
|
+
trigger:
|
|
109
|
+
name: browser-pool-exhausted
|
|
110
|
+
severity: 1
|
|
111
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.browser_pool}}"
|
|
112
|
+
card_template: service_alert
|
|
113
|
+
throttle:
|
|
114
|
+
value: 15
|
|
115
|
+
unit: MINUTES
|
|
116
|
+
|
|
117
|
+
playwright_failures:
|
|
118
|
+
description: "Playwright can't launch browsers — resource exhaustion precursor"
|
|
119
|
+
schedule:
|
|
120
|
+
interval: 5
|
|
121
|
+
unit: MINUTES
|
|
122
|
+
query:
|
|
123
|
+
query_string:
|
|
124
|
+
query: '"Failed to launch browser"'
|
|
125
|
+
trigger:
|
|
126
|
+
name: playwright-launch-failures
|
|
127
|
+
severity: 1
|
|
128
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.playwright_failures}}"
|
|
129
|
+
card_template: service_alert
|
|
130
|
+
throttle:
|
|
131
|
+
value: 15
|
|
132
|
+
unit: MINUTES
|
|
133
|
+
|
|
134
|
+
# ── HTTP Status Errors ───────────────────────────────────────────────
|
|
135
|
+
http_503_spike:
|
|
136
|
+
description: "503 Service Unavailable spike (baseline ~150/5min during load, alert on 3x)"
|
|
137
|
+
schedule:
|
|
138
|
+
interval: 5
|
|
139
|
+
unit: MINUTES
|
|
140
|
+
query:
|
|
141
|
+
query_string:
|
|
142
|
+
query: '"status=503" OR "503 Service Unavailable" OR "503 Service Temporarily Unavailable"'
|
|
143
|
+
trigger:
|
|
144
|
+
name: http-503-spike
|
|
145
|
+
severity: 1
|
|
146
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.http_503s}}"
|
|
147
|
+
card_template: default
|
|
148
|
+
throttle:
|
|
149
|
+
value: 10
|
|
150
|
+
unit: MINUTES
|
|
151
|
+
|
|
152
|
+
structured_5xx_errors:
|
|
153
|
+
description: "Structured ERROR-level log entries across all services (catches errors the text patterns miss)"
|
|
154
|
+
schedule:
|
|
155
|
+
interval: 5
|
|
156
|
+
unit: MINUTES
|
|
157
|
+
query:
|
|
158
|
+
bool:
|
|
159
|
+
must:
|
|
160
|
+
- term:
|
|
161
|
+
level.keyword: ERROR
|
|
162
|
+
trigger:
|
|
163
|
+
name: high-error-rate
|
|
164
|
+
severity: 2
|
|
165
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.structured_errors}}"
|
|
166
|
+
card_template: default
|
|
167
|
+
throttle:
|
|
168
|
+
value: 10
|
|
169
|
+
unit: MINUTES
|
|
170
|
+
|
|
171
|
+
# ── Service-Specific: deal_structure ─────────────────────────────────
|
|
172
|
+
unsupported_pds_value:
|
|
173
|
+
description: "deal_structure receiving unsupported PDS input types"
|
|
174
|
+
schedule:
|
|
175
|
+
interval: 10
|
|
176
|
+
unit: MINUTES
|
|
177
|
+
query:
|
|
178
|
+
query_string:
|
|
179
|
+
query: '"Unsupported PDS value"'
|
|
180
|
+
trigger:
|
|
181
|
+
name: unsupported-pds-value
|
|
182
|
+
severity: 2
|
|
183
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.unsupported_pds}}"
|
|
184
|
+
card_template: service_alert
|
|
185
|
+
throttle:
|
|
186
|
+
value: 30
|
|
187
|
+
unit: MINUTES
|
|
188
|
+
|
|
189
|
+
# ── Infrastructure ───────────────────────────────────────────────────
|
|
190
|
+
connection_refused:
|
|
191
|
+
description: "Connection refused errors — upstream service unreachable"
|
|
192
|
+
schedule:
|
|
193
|
+
interval: 5
|
|
194
|
+
unit: MINUTES
|
|
195
|
+
query:
|
|
196
|
+
query_string:
|
|
197
|
+
query: '"Connection refused" OR "connect ECONNREFUSED"'
|
|
198
|
+
trigger:
|
|
199
|
+
name: connection-refused
|
|
200
|
+
severity: 1
|
|
201
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.conn_refused}}"
|
|
202
|
+
card_template: default
|
|
203
|
+
throttle:
|
|
204
|
+
value: 15
|
|
205
|
+
unit: MINUTES
|
|
206
|
+
|
|
207
|
+
worker_shutdown_spike:
|
|
208
|
+
description: "Worker shutdown spike (baseline ~10/hr normal recycling, alert on burst)"
|
|
209
|
+
schedule:
|
|
210
|
+
interval: 5
|
|
211
|
+
unit: MINUTES
|
|
212
|
+
query:
|
|
213
|
+
query_string:
|
|
214
|
+
query: '"Shutting down" OR "Worker exiting" OR "worker timeout" OR "SIGTERM"'
|
|
215
|
+
trigger:
|
|
216
|
+
name: worker-shutdown-spike
|
|
217
|
+
severity: 2
|
|
218
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.worker_shutdowns}}"
|
|
219
|
+
card_template: default
|
|
220
|
+
throttle:
|
|
221
|
+
value: 15
|
|
222
|
+
unit: MINUTES
|
|
223
|
+
|
|
224
|
+
oom_killed:
|
|
225
|
+
description: "Out-of-memory kills — always critical"
|
|
226
|
+
schedule:
|
|
227
|
+
interval: 5
|
|
228
|
+
unit: MINUTES
|
|
229
|
+
query:
|
|
230
|
+
query_string:
|
|
231
|
+
query: '"OOMKilled" OR "Out of memory" OR "oom-kill" OR "Cannot allocate memory"'
|
|
232
|
+
trigger:
|
|
233
|
+
name: oom-kills
|
|
234
|
+
severity: 1
|
|
235
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.oom_kills}}"
|
|
236
|
+
card_template: default
|
|
237
|
+
throttle:
|
|
238
|
+
value: 10
|
|
239
|
+
unit: MINUTES
|
|
240
|
+
|
|
241
|
+
read_timeout_spike:
|
|
242
|
+
description: "ReadTimeout errors — upstream services not responding"
|
|
243
|
+
schedule:
|
|
244
|
+
interval: 5
|
|
245
|
+
unit: MINUTES
|
|
246
|
+
query:
|
|
247
|
+
query_string:
|
|
248
|
+
query: '"ReadTimeout" OR "ReadError" OR "ConnectTimeout"'
|
|
249
|
+
trigger:
|
|
250
|
+
name: read-timeout-spike
|
|
251
|
+
severity: 2
|
|
252
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.read_timeouts}}"
|
|
253
|
+
card_template: default
|
|
254
|
+
throttle:
|
|
255
|
+
value: 15
|
|
256
|
+
unit: MINUTES
|
|
257
|
+
|
|
258
|
+
# ── Thresholds (per 5-minute window unless noted) ──────────────────────
|
|
259
|
+
# Tuned based on 24h error analysis on 2026-02-25/26.
|
|
260
|
+
# Normal noise floors observed:
|
|
261
|
+
# browser_pool: ~50/5min steady, 500+/5min during spikes
|
|
262
|
+
# 503s: ~150/5min during load, near-0 off-peak
|
|
263
|
+
# structured errors: ~200/5min at peak
|
|
264
|
+
# worker shutdowns: ~1/5min normal recycling
|
|
265
|
+
thresholds:
|
|
266
|
+
browser_pool: 100 # 2x normal steady-state for browser pool exhaustion
|
|
267
|
+
playwright_failures: 50 # any significant Playwright failure burst
|
|
268
|
+
http_503s: 500 # 3x normal load baseline; avoids alerting on normal traffic
|
|
269
|
+
structured_errors: 300 # catches broad error spikes across all services
|
|
270
|
+
unsupported_pds: 20 # 10-min window; baseline ~7/hr so alert on 3x
|
|
271
|
+
conn_refused: 5           # fires on more than 5 connection-refused errors per 5-min window
|
|
272
|
+
worker_shutdowns: 10 # 10x normal recycling rate in a 5-min window
|
|
273
|
+
oom_kills: 0             # any OOM is always critical — trigger uses a strict ">", so 0 fires on the first hit
|
|
274
|
+
read_timeouts: 30 # baseline ~23/hr so alert on burst in 5min
|
|
275
|
+
|
|
276
|
+
defaults:
|
|
277
|
+
env: production
|
|
278
|
+
service_label: "aiaas-inferpds-v5"
|
|
279
|
+
dashboard_url: "https://microservices-python-do-user-18030911-0.e.db.ondigitalocean.com/app/dashboards?security_tenant=global#/view/python-services-production-health-v2"
|
|
280
|
+
discover_url: "https://microservices-python-do-user-18030911-0.e.db.ondigitalocean.com/app/discover?security_tenant=global#/?_g=(time:(from:now-1h,to:now))&_a=(query:(language:kuery,query:'level:ERROR'))"
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
name: service-health
|
|
2
|
+
description: Per-service error rate and latency monitors — use with --service to scope to a single service
|
|
3
|
+
|
|
4
|
+
destination:
|
|
5
|
+
name: gchat-service-alerts
|
|
6
|
+
type: gchat
|
|
7
|
+
|
|
8
|
+
card_templates:
|
|
9
|
+
service_alert: |
|
|
10
|
+
{ "cardsV2": [{ "cardId": "alert-{{ctx.monitor.name}}", "card": {
|
|
11
|
+
"header": { "title": "{{ctx.trigger.name}}", "subtitle": "Service: {{defaults.service}}" },
|
|
12
|
+
"sections": [
|
|
13
|
+
{ "widgets": [
|
|
14
|
+
{ "decoratedText": { "topLabel": "Monitor", "text": "{{ctx.monitor.name}}" } },
|
|
15
|
+
{ "decoratedText": { "topLabel": "Service", "text": "{{defaults.service}}" } },
|
|
16
|
+
{ "decoratedText": { "topLabel": "Environment", "text": "{{defaults.env}}" } },
|
|
17
|
+
{ "decoratedText": { "topLabel": "Period", "text": "{{ctx.periodStart}} — {{ctx.periodEnd}}" } },
|
|
18
|
+
{ "decoratedText": { "topLabel": "Hits", "text": "{{ctx.results.0.hits.total.value}}" } }
|
|
19
|
+
]},
|
|
20
|
+
{ "widgets": [{ "buttonList": { "buttons": [
|
|
21
|
+
{ "text": "View Dashboard", "onClick": { "openLink": { "url": "{{defaults.dashboard_url}}" }}}
|
|
22
|
+
]}}]}
|
|
23
|
+
]
|
|
24
|
+
}}]}
|
|
25
|
+
|
|
26
|
+
monitors:
|
|
27
|
+
service_errors:
|
|
28
|
+
description: "Service-level 5xx error rate"
|
|
29
|
+
schedule:
|
|
30
|
+
interval: 5
|
|
31
|
+
unit: MINUTES
|
|
32
|
+
query:
|
|
33
|
+
bool:
|
|
34
|
+
must:
|
|
35
|
+
- query_string:
|
|
36
|
+
query: '"500 Internal Server Error" OR "502 Bad Gateway" OR "503 Service" OR "504 Gateway"'
|
|
37
|
+
- match_phrase:
|
|
38
|
+
log: "{{defaults.service}}"
|
|
39
|
+
trigger:
|
|
40
|
+
name: service-5xx-errors
|
|
41
|
+
severity: 2
|
|
42
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.service_errors}}"
|
|
43
|
+
card_template: service_alert
|
|
44
|
+
throttle:
|
|
45
|
+
value: 10
|
|
46
|
+
unit: MINUTES
|
|
47
|
+
|
|
48
|
+
service_latency:
|
|
49
|
+
description: "High latency detected for service (slow response times)"
|
|
50
|
+
schedule:
|
|
51
|
+
interval: 5
|
|
52
|
+
unit: MINUTES
|
|
53
|
+
query:
|
|
54
|
+
bool:
|
|
55
|
+
must:
|
|
56
|
+
- match_phrase:
|
|
57
|
+
log: "{{defaults.service}}"
|
|
58
|
+
- query_string:
|
|
59
|
+
query: '"correlation middleware" AND "ms"'
|
|
60
|
+
trigger:
|
|
61
|
+
name: service-high-latency
|
|
62
|
+
severity: 2
|
|
63
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.latency_requests}}"
|
|
64
|
+
card_template: service_alert
|
|
65
|
+
throttle:
|
|
66
|
+
value: 15
|
|
67
|
+
unit: MINUTES
|
|
68
|
+
|
|
69
|
+
service_4xx_spike:
|
|
70
|
+
description: "Unusual spike in 4xx client errors for service"
|
|
71
|
+
schedule:
|
|
72
|
+
interval: 10
|
|
73
|
+
unit: MINUTES
|
|
74
|
+
query:
|
|
75
|
+
bool:
|
|
76
|
+
must:
|
|
77
|
+
- query_string:
|
|
78
|
+
query: '"400 Bad Request" OR "401 Unauthorized" OR "403 Forbidden" OR "404 Not Found" OR "422 Unprocessable"'
|
|
79
|
+
- match_phrase:
|
|
80
|
+
log: "{{defaults.service}}"
|
|
81
|
+
trigger:
|
|
82
|
+
name: service-4xx-spike
|
|
83
|
+
severity: 3
|
|
84
|
+
condition: "ctx.results[0].hits.total.value > {{thresholds.service_4xx}}"
|
|
85
|
+
card_template: service_alert
|
|
86
|
+
throttle:
|
|
87
|
+
value: 15
|
|
88
|
+
unit: MINUTES
|
|
89
|
+
|
|
90
|
+
thresholds:
|
|
91
|
+
service_errors: 5
|
|
92
|
+
latency_requests: 20
|
|
93
|
+
service_4xx: 100
|
|
94
|
+
|
|
95
|
+
defaults:
|
|
96
|
+
env: production
|
|
97
|
+
service: "unknown-service"
|
|
98
|
+
dashboard_url: "https://opensearch.example.com/app/dashboards?security_tenant=global#/view/service-health"
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
name: langfuse-usage
|
|
2
|
+
title: LangFuse Usage Dashboard
|
|
3
|
+
description: LangFuse LLM tracing integration monitoring across Python microservices
|
|
4
|
+
time_from: now-24h
|
|
5
|
+
refresh_interval: 60000
|
|
6
|
+
|
|
7
|
+
header: |
|
|
8
|
+
## LangFuse Usage Dashboard
|
|
9
|
+
Tracking LangFuse LLM tracing integration across Python microservices.
|
|
10
|
+
Endpoint: `https://api.enc.groupon.com/ai-gateway/llm-langfuse`
|
|
11
|
+
|
|
12
|
+
metrics:
|
|
13
|
+
- title: Total Activity
|
|
14
|
+
query: '"langfuse" OR "Langfuse"'
|
|
15
|
+
label: Total Events
|
|
16
|
+
|
|
17
|
+
- title: Validations
|
|
18
|
+
query: '"Validating" AND "matches" AND "Langfuse"'
|
|
19
|
+
label: Validations
|
|
20
|
+
|
|
21
|
+
- title: Tracing Inits
|
|
22
|
+
query: '"langfuse" AND ("tracing enabled" OR "integration enabled")'
|
|
23
|
+
label: Inits
|
|
24
|
+
|
|
25
|
+
- title: Config Loads
|
|
26
|
+
query: '"langfuse-config"'
|
|
27
|
+
label: Config Loads
|
|
28
|
+
|
|
29
|
+
- title: Timeouts
|
|
30
|
+
query: '"llm-langfuse" AND "ReadTimeout"'
|
|
31
|
+
label: Timeouts
|
|
32
|
+
|
|
33
|
+
- title: API Key Missing
|
|
34
|
+
query: '"ENCORE_G_API_KEY: None" AND "langfuse"'
|
|
35
|
+
label: API Key None
|
|
36
|
+
|
|
37
|
+
charts:
|
|
38
|
+
- title: LangFuse Activity Over Time
|
|
39
|
+
width: half
|
|
40
|
+
series:
|
|
41
|
+
All LangFuse: '"langfuse" OR "Langfuse"'
|
|
42
|
+
Validations: '"Validating" AND "matches" AND "Langfuse"'
|
|
43
|
+
Config Loads: '"langfuse-config"'
|
|
44
|
+
|
|
45
|
+
- title: LangFuse Errors & Issues
|
|
46
|
+
width: half
|
|
47
|
+
series:
|
|
48
|
+
ReadTimeout (llm-langfuse): '"llm-langfuse" AND "ReadTimeout"'
|
|
49
|
+
API Key Missing: '"ENCORE_G_API_KEY: None" AND "langfuse"'
|
|
50
|
+
Failed/Error: '"langfuse" AND ("failed" OR "error" OR "exception")'
|
|
51
|
+
|
|
52
|
+
- title: LangFuse by Service
|
|
53
|
+
width: full
|
|
54
|
+
series:
|
|
55
|
+
inferpds_v5: '"langfuse" AND service.keyword:"aiaas-inferpds-v5"'
|
|
56
|
+
deal_structure: '"langfuse" AND service.keyword:"aiaas-deal-structure"'
|
|
57
|
+
deal_content: '"langfuse" AND service.keyword:"aiaas-deal-content"'
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CLI entry point — create and validate OpenSearch Dashboards.
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* node dist/create-dashboards.js # create/update default (v2)
|
|
6
|
+
* node dist/create-dashboards.js --name v3 # create a new "v3" dashboard
|
|
7
|
+
* node dist/create-dashboards.js --name v3 --validate # validate v3
|
|
8
|
+
* node dist/create-dashboards.js --name v3 --delete # delete v3
|
|
9
|
+
*/
|
|
10
|
+
export {};
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CLI entry point — create and validate OpenSearch Dashboards.
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* node dist/create-dashboards.js # create/update default (v2)
|
|
6
|
+
* node dist/create-dashboards.js --name v3 # create a new "v3" dashboard
|
|
7
|
+
* node dist/create-dashboards.js --name v3 --validate # validate v3
|
|
8
|
+
* node dist/create-dashboards.js --name v3 --delete # delete v3
|
|
9
|
+
*/
|
|
10
|
+
import { pathToFileURL } from "node:url";
import { loadConfig } from "./utils/index.js";
import { createOpenSearchRepository } from "./repositories/index.js";
import { DashboardService } from "./services/index.js";
import { parseArgs } from "./utils/index.js";
|
|
14
|
+
/**
 * CLI driver: parses the --validate / --delete / --name flags, builds a
 * DashboardService for the named dashboard, runs exactly one action and
 * terminates the process with the exit code that action resolves to.
 *
 * Precedence when several flags are passed: --validate, then --delete,
 * otherwise the dashboard is created.
 */
async function main() {
    const cliOptions = parseArgs({
        validate: { type: "boolean" },
        delete: { type: "boolean" },
        name: { type: "string" },
    });
    const dashboards = new DashboardService(
        createOpenSearchRepository(loadConfig()),
        cliOptions.name,
    );
    // Pick the single action to run; each one resolves to a process exit code.
    const runSelectedAction = () => {
        if (cliOptions.validate) {
            return dashboards.validate();
        }
        if (cliOptions.delete) {
            return dashboards.delete();
        }
        return dashboards.create();
    };
    process.exit(await runSelectedAction());
}
|
|
35
|
+
// Run main() only when this module is the process entry point (not when it is
// imported). The script path is converted with pathToFileURL so the comparison
// also holds on Windows and for paths containing characters that must be
// percent-encoded (spaces, "#", "?", ...), where a hand-built "file://" string
// never equals import.meta.url.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
    // Surface unexpected failures and exit non-zero instead of leaving a
    // floating promise with no rejection handler.
    main().catch((error) => {
        console.error(error);
        process.exit(1);
    });
}
|
|
38
|
+
//# sourceMappingURL=create-dashboards.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"create-dashboards.js","sourceRoot":"","sources":["../src/create-dashboards.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,0BAA0B,EAAE,MAAM,yBAAyB,CAAC;AACrE,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAEvD,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAE7C,KAAK,UAAU,IAAI;IACjB,MAAM,IAAI,GAAG,SAAS,CAAgB;QACpC,QAAQ,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;QAC7B,MAAM,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;QAC3B,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;KACzB,CAAC,CAAC;IAEH,MAAM,MAAM,GAAG,UAAU,EAAE,CAAC;IAC5B,MAAM,IAAI,GAAG,0BAA0B,CAAC,MAAM,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,IAAI,gBAAgB,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;IAEtD,IAAI,QAAgB,CAAC;IACrB,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;QAClB,QAAQ,GAAG,MAAM,OAAO,CAAC,QAAQ,EAAE,CAAC;IACtC,CAAC;SAAM,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;QACvB,QAAQ,GAAG,MAAM,OAAO,CAAC,MAAM,EAAE,CAAC;IACpC,CAAC;SAAM,CAAC;QACN,QAAQ,GAAG,MAAM,OAAO,CAAC,MAAM,EAAE,CAAC;IACpC,CAAC;IACD,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;AACzB,CAAC;AAED,IAAI,MAAM,CAAC,IAAI,CAAC,GAAG,KAAK,UAAU,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;IACpD,IAAI,EAAE,CAAC;AACT,CAAC"}
|