@dipseth/opensearch-logs 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/.env.example +14 -0
  2. package/alerts/langfuse-usage.yaml +142 -0
  3. package/alerts/production-incidents.yaml +280 -0
  4. package/alerts/service-health.yaml +98 -0
  5. package/dashboards/langfuse-usage.yaml +57 -0
  6. package/dist/create-dashboards.d.ts +10 -0
  7. package/dist/create-dashboards.js +38 -0
  8. package/dist/create-dashboards.js.map +1 -0
  9. package/dist/interfaces/alert.interfaces.d.ts +323 -0
  10. package/dist/interfaces/alert.interfaces.js +6 -0
  11. package/dist/interfaces/alert.interfaces.js.map +1 -0
  12. package/dist/interfaces/dashboard-gen.interfaces.d.ts +33 -0
  13. package/dist/interfaces/dashboard-gen.interfaces.js +3 -0
  14. package/dist/interfaces/dashboard-gen.interfaces.js.map +1 -0
  15. package/dist/interfaces/interfaces.d.ts +312 -0
  16. package/dist/interfaces/interfaces.js +3 -0
  17. package/dist/interfaces/interfaces.js.map +1 -0
  18. package/dist/interfaces/playbook.interfaces.d.ts +140 -0
  19. package/dist/interfaces/playbook.interfaces.js +3 -0
  20. package/dist/interfaces/playbook.interfaces.js.map +1 -0
  21. package/dist/os-alert.d.ts +17 -0
  22. package/dist/os-alert.js +245 -0
  23. package/dist/os-alert.js.map +1 -0
  24. package/dist/os-dash.d.ts +9 -0
  25. package/dist/os-dash.js +53 -0
  26. package/dist/os-dash.js.map +1 -0
  27. package/dist/os-monitor.d.ts +12 -0
  28. package/dist/os-monitor.js +59 -0
  29. package/dist/os-monitor.js.map +1 -0
  30. package/dist/os-playbook.d.ts +9 -0
  31. package/dist/os-playbook.js +71 -0
  32. package/dist/os-playbook.js.map +1 -0
  33. package/dist/os-search.d.ts +11 -0
  34. package/dist/os-search.js +84 -0
  35. package/dist/os-search.js.map +1 -0
  36. package/dist/repositories/index.d.ts +1 -0
  37. package/dist/repositories/index.js +2 -0
  38. package/dist/repositories/index.js.map +1 -0
  39. package/dist/repositories/opensearch.repository.d.ts +51 -0
  40. package/dist/repositories/opensearch.repository.js +167 -0
  41. package/dist/repositories/opensearch.repository.js.map +1 -0
  42. package/dist/services/alert.service.d.ts +73 -0
  43. package/dist/services/alert.service.js +503 -0
  44. package/dist/services/alert.service.js.map +1 -0
  45. package/dist/services/dashboard-gen.service.d.ts +36 -0
  46. package/dist/services/dashboard-gen.service.js +162 -0
  47. package/dist/services/dashboard-gen.service.js.map +1 -0
  48. package/dist/services/dashboard.service.d.ts +33 -0
  49. package/dist/services/dashboard.service.js +428 -0
  50. package/dist/services/dashboard.service.js.map +1 -0
  51. package/dist/services/gchat.service.d.ts +45 -0
  52. package/dist/services/gchat.service.js +228 -0
  53. package/dist/services/gchat.service.js.map +1 -0
  54. package/dist/services/index.d.ts +8 -0
  55. package/dist/services/index.js +9 -0
  56. package/dist/services/index.js.map +1 -0
  57. package/dist/services/monitor.service.d.ts +18 -0
  58. package/dist/services/monitor.service.js +342 -0
  59. package/dist/services/monitor.service.js.map +1 -0
  60. package/dist/services/panel-layout.d.ts +21 -0
  61. package/dist/services/panel-layout.js +33 -0
  62. package/dist/services/panel-layout.js.map +1 -0
  63. package/dist/services/playbook-dashboard.service.d.ts +19 -0
  64. package/dist/services/playbook-dashboard.service.js +434 -0
  65. package/dist/services/playbook-dashboard.service.js.map +1 -0
  66. package/dist/services/playbook.service.d.ts +13 -0
  67. package/dist/services/playbook.service.js +621 -0
  68. package/dist/services/playbook.service.js.map +1 -0
  69. package/dist/services/search.service.d.ts +30 -0
  70. package/dist/services/search.service.js +885 -0
  71. package/dist/services/search.service.js.map +1 -0
  72. package/dist/utils/cli.d.ts +14 -0
  73. package/dist/utils/cli.js +90 -0
  74. package/dist/utils/cli.js.map +1 -0
  75. package/dist/utils/config.d.ts +20 -0
  76. package/dist/utils/config.js +104 -0
  77. package/dist/utils/config.js.map +1 -0
  78. package/dist/utils/index.d.ts +5 -0
  79. package/dist/utils/index.js +5 -0
  80. package/dist/utils/index.js.map +1 -0
  81. package/dist/utils/service-registry.d.ts +15 -0
  82. package/dist/utils/service-registry.js +56 -0
  83. package/dist/utils/service-registry.js.map +1 -0
  84. package/dist/utils/template.d.ts +18 -0
  85. package/dist/utils/template.js +66 -0
  86. package/dist/utils/template.js.map +1 -0
  87. package/package.json +76 -0
  88. package/playbooks/error-investigation.yaml +45 -0
  89. package/playbooks/incident-triage.yaml +32 -0
  90. package/playbooks/post-deploy-validation.yaml +24 -0
  91. package/playbooks/service-deep-dive.yaml +42 -0
package/.env.example ADDED
@@ -0,0 +1,14 @@
1
+ # OpenSearch credentials — copy to .env and fill in
2
+ OPENSEARCH_HOST=your-cluster-host.example.com
3
+ OPENSEARCH_PORT=25060
4
+ OPENSEARCH_USERNAME=admin
5
+ OPENSEARCH_PASSWORD=
6
+
7
+ # Optional: Google Chat webhook for alerts/reports
8
+ # GCHAT_WEBHOOK_URL=https://chat.googleapis.com/v1/spaces/.../messages?key=...&token=...
9
+
10
+ # Optional: non-default cluster config (e.g., AWS ConveyorCloud)
11
+ # OPENSEARCH_DATA_PORT=443
12
+ # OPENSEARCH_DASHBOARDS_PORT=443
13
+ # OPENSEARCH_INDEX_PREFIX=python-services
14
+ # OPENSEARCH_TENANT=global
package/alerts/langfuse-usage.yaml ADDED
@@ -0,0 +1,142 @@
1
+ name: langfuse-usage
2
+ description: >
3
+ LangFuse LLM tracing integration monitors — track validation throughput,
4
+ API timeouts, config issues, and API key problems.
5
+
6
+ destination:
7
+ name: gchat-production-alerts
8
+ type: gchat
9
+
10
+ card_templates:
11
+ langfuse_alert: |
12
+ { "cardsV2": [{ "cardId": "alert-{{ctx.monitor.name}}", "card": {
13
+ "header": { "title": "{{ctx.trigger.name}}", "subtitle": "LangFuse Integration" },
14
+ "sections": [
15
+ { "widgets": [
16
+ { "decoratedText": { "topLabel": "Monitor", "text": "{{ctx.monitor.name}}" } },
17
+ { "decoratedText": { "topLabel": "Component", "text": "LangFuse LLM Tracing" } },
18
+ { "decoratedText": { "topLabel": "Environment", "text": "{{defaults.env}}" } },
19
+ { "decoratedText": { "topLabel": "Period", "text": "{{ctx.periodStart}} — {{ctx.periodEnd}}" } },
20
+ { "decoratedText": { "topLabel": "Hits", "text": "{{ctx.results.0.hits.total.value}}" } }
21
+ ]},
22
+ { "widgets": [{ "buttonList": { "buttons": [
23
+ { "text": "View LangFuse Dashboard", "onClick": { "openLink": { "url": "{{defaults.dashboard_url}}" }}}
24
+ ]}}]}
25
+ ]
26
+ }}]}
27
+
28
+ langfuse_health: |
29
+ { "cardsV2": [{ "cardId": "health-{{ctx.monitor.name}}", "card": {
30
+ "header": { "title": "LangFuse Health Check", "subtitle": "{{defaults.env}}", "imageUrl": "https://fonts.gstatic.com/s/i/short-term/release/googlesymbols/monitoring/default/48px.svg", "imageType": "CIRCLE" },
31
+ "sections": [
32
+ { "header": "LangFuse Summary (1h)", "widgets": [
33
+ { "decoratedText": { "topLabel": "Total LangFuse Events", "text": "{{ctx.results.0.hits.total.value}}" } },
34
+ { "decoratedText": { "topLabel": "Validations", "text": "{{ctx.results.0.aggregations.validations.doc_count}}" } },
35
+ { "decoratedText": { "topLabel": "ReadTimeouts (llm-langfuse)", "text": "{{ctx.results.0.aggregations.timeouts.doc_count}}" } },
36
+ { "decoratedText": { "topLabel": "API Key Missing", "text": "{{ctx.results.0.aggregations.apikey_missing.doc_count}}" } },
37
+ { "decoratedText": { "topLabel": "Config Loads", "text": "{{ctx.results.0.aggregations.config_loads.doc_count}}" } }
38
+ ]},
39
+ { "widgets": [{ "buttonList": { "buttons": [
40
+ { "text": "View LangFuse Dashboard", "onClick": { "openLink": { "url": "{{defaults.dashboard_url}}" }}}
41
+ ]}}]}
42
+ ]
43
+ }}]}
44
+
45
+ monitors:
46
+ # ── Periodic Health Check ────────────────────────────────────────────
47
+ langfuse_health_check:
48
+ description: "Hourly LangFuse health summary with key metrics"
49
+ schedule:
50
+ interval: 1
51
+ unit: HOURS
52
+ query:
53
+ bool:
54
+ must:
55
+ - query_string:
56
+ query: '"langfuse" OR "Langfuse"'
57
+ aggs:
58
+ validations:
59
+ filter:
60
+ query_string:
61
+ query: '"Validating" AND "matches" AND "Langfuse"'
62
+ timeouts:
63
+ filter:
64
+ query_string:
65
+ query: '"llm-langfuse" AND "ReadTimeout"'
66
+ apikey_missing:
67
+ filter:
68
+ query_string:
69
+ query: '"ENCORE_G_API_KEY: None" AND "langfuse"'
70
+ config_loads:
71
+ filter:
72
+ query_string:
73
+ query: '"langfuse-config"'
74
+ trigger:
75
+ name: langfuse-hourly-health
76
+ severity: 5
77
+ condition: "ctx.results[0].hits.total.value >= 0"
78
+ card_template: langfuse_health
79
+ throttle:
80
+ value: 55
81
+ unit: MINUTES
82
+
83
+ # ── Error Monitors ──────────────────────────────────────────────────
84
+ langfuse_timeout_spike:
85
+ description: "ReadTimeout spike on llm-langfuse endpoint — AI Gateway may be degraded"
86
+ schedule:
87
+ interval: 5
88
+ unit: MINUTES
89
+ query:
90
+ query_string:
91
+ query: '"llm-langfuse" AND "ReadTimeout"'
92
+ trigger:
93
+ name: langfuse-timeout-spike
94
+ severity: 2
95
+ condition: "ctx.results[0].hits.total.value > {{thresholds.timeouts}}"
96
+ card_template: langfuse_alert
97
+ throttle:
98
+ value: 15
99
+ unit: MINUTES
100
+
101
+ langfuse_apikey_missing:
102
+ description: "LangFuse detecting missing ENCORE_G_API_KEY — tracing will fail"
103
+ schedule:
104
+ interval: 10
105
+ unit: MINUTES
106
+ query:
107
+ query_string:
108
+ query: '"ENCORE_G_API_KEY: None" AND "langfuse"'
109
+ trigger:
110
+ name: langfuse-apikey-missing
111
+ severity: 1
112
+ condition: "ctx.results[0].hits.total.value > {{thresholds.apikey_missing}}"
113
+ card_template: langfuse_alert
114
+ throttle:
115
+ value: 30
116
+ unit: MINUTES
117
+
118
+ langfuse_no_activity:
119
+ description: "No LangFuse activity detected — tracing may be down"
120
+ schedule:
121
+ interval: 30
122
+ unit: MINUTES
123
+ query:
124
+ query_string:
125
+ query: '"langfuse" OR "Langfuse"'
126
+ trigger:
127
+ name: langfuse-no-activity
128
+ severity: 2
129
+ condition: "ctx.results[0].hits.total.value < {{thresholds.min_activity}}"
130
+ card_template: langfuse_alert
131
+ throttle:
132
+ value: 30
133
+ unit: MINUTES
134
+
135
+ thresholds:
136
+ timeouts: 10 # >10 ReadTimeouts in 5min is a spike
137
+ apikey_missing: 5 # any significant API key issue burst in 10min
138
+ min_activity: 5 # fewer than 5 LangFuse events in 30min = dead
139
+
140
+ defaults:
141
+ env: production
142
+ dashboard_url: "https://microservices-python-do-user-18030911-0.e.db.ondigitalocean.com/app/dashboards?security_tenant=global#/view/langfuse-usage-dashboard"
package/alerts/production-incidents.yaml ADDED
@@ -0,0 +1,280 @@
1
+ name: production-incidents
2
+ description: >
3
+ Core production incident monitors tuned to actual error baselines.
4
+ Thresholds set above normal noise floors to avoid alert fatigue.
5
+
6
+ destination:
7
+ name: gchat-production-alerts
8
+ type: gchat
9
+
10
+ card_templates:
11
+ default: |
12
+ { "cardsV2": [{ "cardId": "alert-{{ctx.monitor.name}}", "card": {
13
+ "header": { "title": "{{ctx.trigger.name}}", "subtitle": "Severity {{ctx.trigger.severity}}" },
14
+ "sections": [
15
+ { "widgets": [
16
+ { "decoratedText": { "topLabel": "Monitor", "text": "{{ctx.monitor.name}}" } },
17
+ { "decoratedText": { "topLabel": "Environment", "text": "{{defaults.env}}" } },
18
+ { "decoratedText": { "topLabel": "Period", "text": "{{ctx.periodStart}} — {{ctx.periodEnd}}" } },
19
+ { "decoratedText": { "topLabel": "Hits", "text": "{{ctx.results.0.hits.total.value}}" } }
20
+ ]},
21
+ { "widgets": [{ "buttonList": { "buttons": [
22
+ { "text": "View Dashboard", "onClick": { "openLink": { "url": "{{defaults.dashboard_url}}" }}}
23
+ ]}}]}
24
+ ]
25
+ }}]}
26
+
27
+ service_alert: |
28
+ { "cardsV2": [{ "cardId": "alert-{{ctx.monitor.name}}", "card": {
29
+ "header": { "title": "{{ctx.trigger.name}}", "subtitle": "{{defaults.service_label}}" },
30
+ "sections": [
31
+ { "widgets": [
32
+ { "decoratedText": { "topLabel": "Monitor", "text": "{{ctx.monitor.name}}" } },
33
+ { "decoratedText": { "topLabel": "Service", "text": "{{defaults.service_label}}" } },
34
+ { "decoratedText": { "topLabel": "Environment", "text": "{{defaults.env}}" } },
35
+ { "decoratedText": { "topLabel": "Period", "text": "{{ctx.periodStart}} — {{ctx.periodEnd}}" } },
36
+ { "decoratedText": { "topLabel": "Hits", "text": "{{ctx.results.0.hits.total.value}}" } }
37
+ ]},
38
+ { "widgets": [{ "buttonList": { "buttons": [
39
+ { "text": "View Dashboard", "onClick": { "openLink": { "url": "{{defaults.dashboard_url}}" }}}
40
+ ]}}]}
41
+ ]
42
+ }}]}
43
+
44
+ health_check: |
45
+ { "cardsV2": [{ "cardId": "health-{{ctx.monitor.name}}", "card": {
46
+ "header": { "title": "Hourly Health Check", "subtitle": "{{defaults.env}}", "imageUrl": "https://fonts.gstatic.com/s/i/short-term/release/googlesymbols/monitoring/default/48px.svg", "imageType": "CIRCLE" },
47
+ "sections": [
48
+ { "header": "Summary", "widgets": [
49
+ { "decoratedText": { "topLabel": "Total Logs (1h)", "text": "{{ctx.results.0.hits.total.value}}" } },
50
+ { "decoratedText": { "topLabel": "Errors (level=ERROR)", "text": "{{ctx.results.0.aggregations.error_count.doc_count}}" } },
51
+ { "decoratedText": { "topLabel": "503s", "text": "{{ctx.results.0.aggregations.status_503.doc_count}}" } },
52
+ { "decoratedText": { "topLabel": "Browser Pool Exhausted", "text": "{{ctx.results.0.aggregations.browser_pool.doc_count}}" } },
53
+ { "decoratedText": { "topLabel": "Worker Shutdowns", "text": "{{ctx.results.0.aggregations.worker_shutdowns.doc_count}}" } }
54
+ ]},
55
+ { "widgets": [{ "buttonList": { "buttons": [
56
+ { "text": "View Dashboard", "onClick": { "openLink": { "url": "{{defaults.dashboard_url}}" }}},
57
+ { "text": "View in Discover", "onClick": { "openLink": { "url": "{{defaults.discover_url}}" }}}
58
+ ]}}]}
59
+ ]
60
+ }}]}
61
+
62
+ monitors:
63
+ # ── Periodic Health Check ────────────────────────────────────────────
64
+ hourly_health_check:
65
+ description: "Hourly health status report — always fires, sends a summary card with key metrics"
66
+ schedule:
67
+ interval: 1
68
+ unit: HOURS
69
+ query:
70
+ bool:
71
+ must:
72
+ - match_all: {}
73
+ aggs:
74
+ error_count:
75
+ filter:
76
+ term:
77
+ level.keyword: ERROR
78
+ status_503:
79
+ filter:
80
+ query_string:
81
+ query: '"status=503" OR "503 Service Unavailable"'
82
+ browser_pool:
83
+ filter:
84
+ query_string:
85
+ query: '"Browser pool exhausted"'
86
+ worker_shutdowns:
87
+ filter:
88
+ query_string:
89
+ query: '"Shutting down" OR "Worker exiting" OR "SIGTERM"'
90
+ trigger:
91
+ name: hourly-health-report
92
+ severity: 5
93
+ condition: "ctx.results[0].hits.total.value >= 0"
94
+ card_template: health_check
95
+ throttle:
96
+ value: 55
97
+ unit: MINUTES
98
+
99
+ # ── Browser Pool / Playwright (inferpds_v5 dominant failure mode) ─────
100
+ browser_pool_exhausted:
101
+ description: "Browser pool exhausted — inferpds_v5 can't serve requests (baseline ~570/hr during load)"
102
+ schedule:
103
+ interval: 5
104
+ unit: MINUTES
105
+ query:
106
+ query_string:
107
+ query: '"Browser pool exhausted"'
108
+ trigger:
109
+ name: browser-pool-exhausted
110
+ severity: 1
111
+ condition: "ctx.results[0].hits.total.value > {{thresholds.browser_pool}}"
112
+ card_template: service_alert
113
+ throttle:
114
+ value: 15
115
+ unit: MINUTES
116
+
117
+ playwright_failures:
118
+ description: "Playwright can't launch browsers — resource exhaustion precursor"
119
+ schedule:
120
+ interval: 5
121
+ unit: MINUTES
122
+ query:
123
+ query_string:
124
+ query: '"Failed to launch browser"'
125
+ trigger:
126
+ name: playwright-launch-failures
127
+ severity: 1
128
+ condition: "ctx.results[0].hits.total.value > {{thresholds.playwright_failures}}"
129
+ card_template: service_alert
130
+ throttle:
131
+ value: 15
132
+ unit: MINUTES
133
+
134
+ # ── HTTP Status Errors ───────────────────────────────────────────────
135
+ http_503_spike:
136
+ description: "503 Service Unavailable spike (baseline ~150/5min during load, alert on 3x)"
137
+ schedule:
138
+ interval: 5
139
+ unit: MINUTES
140
+ query:
141
+ query_string:
142
+ query: '"status=503" OR "503 Service Unavailable" OR "503 Service Temporarily Unavailable"'
143
+ trigger:
144
+ name: http-503-spike
145
+ severity: 1
146
+ condition: "ctx.results[0].hits.total.value > {{thresholds.http_503s}}"
147
+ card_template: default
148
+ throttle:
149
+ value: 10
150
+ unit: MINUTES
151
+
152
+ structured_5xx_errors:
153
+ description: "Structured ERROR-level log entries across all services (catches errors the text patterns miss)"
154
+ schedule:
155
+ interval: 5
156
+ unit: MINUTES
157
+ query:
158
+ bool:
159
+ must:
160
+ - term:
161
+ level.keyword: ERROR
162
+ trigger:
163
+ name: high-error-rate
164
+ severity: 2
165
+ condition: "ctx.results[0].hits.total.value > {{thresholds.structured_errors}}"
166
+ card_template: default
167
+ throttle:
168
+ value: 10
169
+ unit: MINUTES
170
+
171
+ # ── Service-Specific: deal_structure ─────────────────────────────────
172
+ unsupported_pds_value:
173
+ description: "deal_structure receiving unsupported PDS input types"
174
+ schedule:
175
+ interval: 10
176
+ unit: MINUTES
177
+ query:
178
+ query_string:
179
+ query: '"Unsupported PDS value"'
180
+ trigger:
181
+ name: unsupported-pds-value
182
+ severity: 2
183
+ condition: "ctx.results[0].hits.total.value > {{thresholds.unsupported_pds}}"
184
+ card_template: service_alert
185
+ throttle:
186
+ value: 30
187
+ unit: MINUTES
188
+
189
+ # ── Infrastructure ───────────────────────────────────────────────────
190
+ connection_refused:
191
+ description: "Connection refused errors — upstream service unreachable"
192
+ schedule:
193
+ interval: 5
194
+ unit: MINUTES
195
+ query:
196
+ query_string:
197
+ query: '"Connection refused" OR "connect ECONNREFUSED"'
198
+ trigger:
199
+ name: connection-refused
200
+ severity: 1
201
+ condition: "ctx.results[0].hits.total.value > {{thresholds.conn_refused}}"
202
+ card_template: default
203
+ throttle:
204
+ value: 15
205
+ unit: MINUTES
206
+
207
+ worker_shutdown_spike:
208
+ description: "Worker shutdown spike (baseline ~10/hr normal recycling, alert on burst)"
209
+ schedule:
210
+ interval: 5
211
+ unit: MINUTES
212
+ query:
213
+ query_string:
214
+ query: '"Shutting down" OR "Worker exiting" OR "worker timeout" OR "SIGTERM"'
215
+ trigger:
216
+ name: worker-shutdown-spike
217
+ severity: 2
218
+ condition: "ctx.results[0].hits.total.value > {{thresholds.worker_shutdowns}}"
219
+ card_template: default
220
+ throttle:
221
+ value: 15
222
+ unit: MINUTES
223
+
224
+ oom_killed:
225
+ description: "Out-of-memory kills — always critical"
226
+ schedule:
227
+ interval: 5
228
+ unit: MINUTES
229
+ query:
230
+ query_string:
231
+ query: '"OOMKilled" OR "Out of memory" OR "oom-kill" OR "Cannot allocate memory"'
232
+ trigger:
233
+ name: oom-kills
234
+ severity: 1
235
+ condition: "ctx.results[0].hits.total.value > {{thresholds.oom_kills}}"
236
+ card_template: default
237
+ throttle:
238
+ value: 10
239
+ unit: MINUTES
240
+
241
+ read_timeout_spike:
242
+ description: "ReadTimeout errors — upstream services not responding"
243
+ schedule:
244
+ interval: 5
245
+ unit: MINUTES
246
+ query:
247
+ query_string:
248
+ query: '"ReadTimeout" OR "ReadError" OR "ConnectTimeout"'
249
+ trigger:
250
+ name: read-timeout-spike
251
+ severity: 2
252
+ condition: "ctx.results[0].hits.total.value > {{thresholds.read_timeouts}}"
253
+ card_template: default
254
+ throttle:
255
+ value: 15
256
+ unit: MINUTES
257
+
258
+ # ── Thresholds (per 5-minute window unless noted) ──────────────────────
259
+ # Tuned based on 24h error analysis on 2026-02-25/26.
260
+ # Normal noise floors observed:
261
+ # browser_pool: ~50/5min steady, 500+/5min during spikes
262
+ # 503s: ~150/5min during load, near-0 off-peak
263
+ # structured errors: ~200/5min at peak
264
+ # worker shutdowns: ~1/5min normal recycling
265
+ thresholds:
266
+ browser_pool: 100 # 2x normal steady-state for browser pool exhaustion
267
+ playwright_failures: 50 # any significant Playwright failure burst
268
+ http_503s: 500 # 3x normal load baseline; avoids alerting on normal traffic
269
+ structured_errors: 300 # catches broad error spikes across all services
270
+ unsupported_pds: 20 # 10-min window; ~3x the hourly baseline (~7/hr), i.e. well above the ~1 per 10-min norm
271
+ conn_refused: 5 # fires on more than 5 connection-refused errors per 5-min window
272
+ worker_shutdowns: 10 # 10x normal recycling rate in a 5-min window
273
+ oom_kills: 1 # NOTE: trigger condition is ">", so this fires on 2+ OOM events per window; set to 0 to alert on any single OOM
274
+ read_timeouts: 30 # baseline ~23/hr so alert on burst in 5min
275
+
276
+ defaults:
277
+ env: production
278
+ service_label: "aiaas-inferpds-v5"
279
+ dashboard_url: "https://microservices-python-do-user-18030911-0.e.db.ondigitalocean.com/app/dashboards?security_tenant=global#/view/python-services-production-health-v2"
280
+ discover_url: "https://microservices-python-do-user-18030911-0.e.db.ondigitalocean.com/app/discover?security_tenant=global#/?_g=(time:(from:now-1h,to:now))&_a=(query:(language:kuery,query:'level:ERROR'))"
package/alerts/service-health.yaml ADDED
@@ -0,0 +1,98 @@
1
+ name: service-health
2
+ description: Per-service error rate and latency monitors — use with --service to scope to a single service
3
+
4
+ destination:
5
+ name: gchat-service-alerts
6
+ type: gchat
7
+
8
+ card_templates:
9
+ service_alert: |
10
+ { "cardsV2": [{ "cardId": "alert-{{ctx.monitor.name}}", "card": {
11
+ "header": { "title": "{{ctx.trigger.name}}", "subtitle": "Service: {{defaults.service}}" },
12
+ "sections": [
13
+ { "widgets": [
14
+ { "decoratedText": { "topLabel": "Monitor", "text": "{{ctx.monitor.name}}" } },
15
+ { "decoratedText": { "topLabel": "Service", "text": "{{defaults.service}}" } },
16
+ { "decoratedText": { "topLabel": "Environment", "text": "{{defaults.env}}" } },
17
+ { "decoratedText": { "topLabel": "Period", "text": "{{ctx.periodStart}} — {{ctx.periodEnd}}" } },
18
+ { "decoratedText": { "topLabel": "Hits", "text": "{{ctx.results.0.hits.total.value}}" } }
19
+ ]},
20
+ { "widgets": [{ "buttonList": { "buttons": [
21
+ { "text": "View Dashboard", "onClick": { "openLink": { "url": "{{defaults.dashboard_url}}" }}}
22
+ ]}}]}
23
+ ]
24
+ }}]}
25
+
26
+ monitors:
27
+ service_errors:
28
+ description: "Service-level 5xx error rate"
29
+ schedule:
30
+ interval: 5
31
+ unit: MINUTES
32
+ query:
33
+ bool:
34
+ must:
35
+ - query_string:
36
+ query: '"500 Internal Server Error" OR "502 Bad Gateway" OR "503 Service" OR "504 Gateway"'
37
+ - match_phrase:
38
+ log: "{{defaults.service}}"
39
+ trigger:
40
+ name: service-5xx-errors
41
+ severity: 2
42
+ condition: "ctx.results[0].hits.total.value > {{thresholds.service_errors}}"
43
+ card_template: service_alert
44
+ throttle:
45
+ value: 10
46
+ unit: MINUTES
47
+
48
+ service_latency:
49
+ description: "High latency detected for service (slow response times)"
50
+ schedule:
51
+ interval: 5
52
+ unit: MINUTES
53
+ query:
54
+ bool:
55
+ must:
56
+ - match_phrase:
57
+ log: "{{defaults.service}}"
58
+ - query_string:
59
+ query: '"correlation middleware" AND "ms"'
60
+ trigger:
61
+ name: service-high-latency
62
+ severity: 2
63
+ condition: "ctx.results[0].hits.total.value > {{thresholds.latency_requests}}"
64
+ card_template: service_alert
65
+ throttle:
66
+ value: 15
67
+ unit: MINUTES
68
+
69
+ service_4xx_spike:
70
+ description: "Unusual spike in 4xx client errors for service"
71
+ schedule:
72
+ interval: 10
73
+ unit: MINUTES
74
+ query:
75
+ bool:
76
+ must:
77
+ - query_string:
78
+ query: '"400 Bad Request" OR "401 Unauthorized" OR "403 Forbidden" OR "404 Not Found" OR "422 Unprocessable"'
79
+ - match_phrase:
80
+ log: "{{defaults.service}}"
81
+ trigger:
82
+ name: service-4xx-spike
83
+ severity: 3
84
+ condition: "ctx.results[0].hits.total.value > {{thresholds.service_4xx}}"
85
+ card_template: service_alert
86
+ throttle:
87
+ value: 15
88
+ unit: MINUTES
89
+
90
+ thresholds:
91
+ service_errors: 5
92
+ latency_requests: 20
93
+ service_4xx: 100
94
+
95
+ defaults:
96
+ env: production
97
+ service: "unknown-service"
98
+ dashboard_url: "https://opensearch.example.com/app/dashboards?security_tenant=global#/view/service-health"
package/dashboards/langfuse-usage.yaml ADDED
@@ -0,0 +1,57 @@
1
+ name: langfuse-usage
2
+ title: LangFuse Usage Dashboard
3
+ description: LangFuse LLM tracing integration monitoring across Python microservices
4
+ time_from: now-24h
5
+ refresh_interval: 60000
6
+
7
+ header: |
8
+ ## LangFuse Usage Dashboard
9
+ Tracking LangFuse LLM tracing integration across Python microservices.
10
+ Endpoint: `https://api.enc.groupon.com/ai-gateway/llm-langfuse`
11
+
12
+ metrics:
13
+ - title: Total Activity
14
+ query: '"langfuse" OR "Langfuse"'
15
+ label: Total Events
16
+
17
+ - title: Validations
18
+ query: '"Validating" AND "matches" AND "Langfuse"'
19
+ label: Validations
20
+
21
+ - title: Tracing Inits
22
+ query: '"langfuse" AND ("tracing enabled" OR "integration enabled")'
23
+ label: Inits
24
+
25
+ - title: Config Loads
26
+ query: '"langfuse-config"'
27
+ label: Config Loads
28
+
29
+ - title: Timeouts
30
+ query: '"llm-langfuse" AND "ReadTimeout"'
31
+ label: Timeouts
32
+
33
+ - title: API Key Missing
34
+ query: '"ENCORE_G_API_KEY: None" AND "langfuse"'
35
+ label: API Key None
36
+
37
+ charts:
38
+ - title: LangFuse Activity Over Time
39
+ width: half
40
+ series:
41
+ All LangFuse: '"langfuse" OR "Langfuse"'
42
+ Validations: '"Validating" AND "matches" AND "Langfuse"'
43
+ Config Loads: '"langfuse-config"'
44
+
45
+ - title: LangFuse Errors & Issues
46
+ width: half
47
+ series:
48
+ ReadTimeout (llm-langfuse): '"llm-langfuse" AND "ReadTimeout"'
49
+ API Key Missing: '"ENCORE_G_API_KEY: None" AND "langfuse"'
50
+ Failed/Error: '"langfuse" AND ("failed" OR "error" OR "exception")'
51
+
52
+ - title: LangFuse by Service
53
+ width: full
54
+ series:
55
+ inferpds_v5: '"langfuse" AND service.keyword:"aiaas-inferpds-v5"'
56
+ deal_structure: '"langfuse" AND service.keyword:"aiaas-deal-structure"'
57
+ deal_content: '"langfuse" AND service.keyword:"aiaas-deal-content"'
package/dist/create-dashboards.d.ts ADDED
@@ -0,0 +1,10 @@
1
+ /**
2
+ * CLI entry point — create and validate OpenSearch Dashboards.
3
+ *
4
+ * Usage:
5
+ * node dist/create-dashboards.js # create/update default (v2)
6
+ * node dist/create-dashboards.js --name v3 # create a new "v3" dashboard
7
+ * node dist/create-dashboards.js --name v3 --validate # validate v3
8
+ * node dist/create-dashboards.js --name v3 --delete # delete v3
9
+ */
10
+ export {};
package/dist/create-dashboards.js ADDED
@@ -0,0 +1,38 @@
1
+ /**
2
+ * CLI entry point — create and validate OpenSearch Dashboards.
3
+ *
4
+ * Usage:
5
+ * node dist/create-dashboards.js # create/update default (v2)
6
+ * node dist/create-dashboards.js --name v3 # create a new "v3" dashboard
7
+ * node dist/create-dashboards.js --name v3 --validate # validate v3
8
+ * node dist/create-dashboards.js --name v3 --delete # delete v3
9
+ */
10
import { pathToFileURL } from "node:url";
import { loadConfig } from "./utils/index.js";
import { createOpenSearchRepository } from "./repositories/index.js";
import { DashboardService } from "./services/index.js";
import { parseArgs } from "./utils/index.js";
14
/**
 * CLI driver for dashboard management.
 *
 * Reads the `--validate`, `--delete` and `--name` options, builds a
 * DashboardService on top of a repository created from the loaded config,
 * and runs exactly one action: validate when `--validate` is set, otherwise
 * delete when `--delete` is set, otherwise create. The process then exits
 * with the value that action resolved to.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const optionSpec = {
        validate: { type: "boolean" },
        delete: { type: "boolean" },
        name: { type: "string" },
    };
    const cliArgs = parseArgs(optionSpec);
    const dashboards = new DashboardService(
        createOpenSearchRepository(loadConfig()),
        cliArgs.name,
    );
    // --validate takes precedence over --delete; create is the fallback.
    const runSelectedAction = () => {
        if (cliArgs.validate) {
            return dashboards.validate();
        }
        if (cliArgs.delete) {
            return dashboards.delete();
        }
        return dashboards.create();
    };
    process.exit(await runSelectedAction());
}
35
// Run the CLI only when this module is the script Node was started with.
// `import.meta.url` is a percent-encoded file URL, so the script path must be
// converted with pathToFileURL before comparing; a hand-built `file://` string
// never matches for paths that need encoding (e.g. spaces) or for Windows
// drive-letter paths, which would make the CLI silently do nothing.
if (typeof process.argv[1] === "string" &&
    import.meta.url === pathToFileURL(process.argv[1]).href) {
    // Surface failures explicitly instead of leaving the promise floating.
    main().catch((err) => {
        console.error(err);
        process.exit(1);
    });
}
38
+ //# sourceMappingURL=create-dashboards.js.map
package/dist/create-dashboards.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"create-dashboards.js","sourceRoot":"","sources":["../src/create-dashboards.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,0BAA0B,EAAE,MAAM,yBAAyB,CAAC;AACrE,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAEvD,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAE7C,KAAK,UAAU,IAAI;IACjB,MAAM,IAAI,GAAG,SAAS,CAAgB;QACpC,QAAQ,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;QAC7B,MAAM,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE;QAC3B,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;KACzB,CAAC,CAAC;IAEH,MAAM,MAAM,GAAG,UAAU,EAAE,CAAC;IAC5B,MAAM,IAAI,GAAG,0BAA0B,CAAC,MAAM,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,IAAI,gBAAgB,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;IAEtD,IAAI,QAAgB,CAAC;IACrB,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;QAClB,QAAQ,GAAG,MAAM,OAAO,CAAC,QAAQ,EAAE,CAAC;IACtC,CAAC;SAAM,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;QACvB,QAAQ,GAAG,MAAM,OAAO,CAAC,MAAM,EAAE,CAAC;IACpC,CAAC;SAAM,CAAC;QACN,QAAQ,GAAG,MAAM,OAAO,CAAC,MAAM,EAAE,CAAC;IACpC,CAAC;IACD,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;AACzB,CAAC;AAED,IAAI,MAAM,CAAC,IAAI,CAAC,GAAG,KAAK,UAAU,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;IACpD,IAAI,EAAE,CAAC;AACT,CAAC"}