tracekit 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +92 -1
- data/lib/tracekit/security/detector.rb +35 -47
- data/lib/tracekit/security/patterns.rb +29 -7
- data/lib/tracekit/snapshots/client.rb +353 -6
- data/lib/tracekit/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 337c03bb944151dd73e06b09bb8576cab74a557747fee9a19ca8db62ea70be00
|
|
4
|
+
data.tar.gz: 1561ed0d41ead50afe77cf45a2690c654bc7103d8ace66d377e28e92d89829ce
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 46a4977761f03b79ddeb24be2505ce8fdb2104f874efaa499aa61c1b077666e36f49ad37941af5b49b97ba9db59202c4e09248fe5ec899c332f2731ec7fb3ec1
|
|
7
|
+
data.tar.gz: 423417972e701e2eff1a6aa6466421525b81d2cadd09bc02523a61e14591243d78caa2516da17c860750e5aa7e3f8a795608fa998f843a1e5907e59f049639a6
|
data/README.md
CHANGED
|
@@ -286,6 +286,97 @@ sdk.capture_snapshot("user-login", {
|
|
|
286
286
|
})
|
|
287
287
|
```
|
|
288
288
|
|
|
289
|
+
## Kill Switch
|
|
290
|
+
|
|
291
|
+
TraceKit provides a server-side toggle to disable code monitoring per service without deploying code changes.
|
|
292
|
+
|
|
293
|
+
### How It Works
|
|
294
|
+
|
|
295
|
+
When the kill switch is enabled for your service, the SDK sets `@kill_switch_active = true` and suppresses all snapshot captures. The SDK detects kill switch state through two channels:
|
|
296
|
+
|
|
297
|
+
1. **Polling** — The SDK checks for kill switch state on every poll cycle (default 30s)
|
|
298
|
+
2. **SSE** — Real-time kill switch events are received instantly via Server-Sent Events
|
|
299
|
+
|
|
300
|
+
```ruby
|
|
301
|
+
sdk = Tracekit.sdk
|
|
302
|
+
|
|
303
|
+
# No code changes needed — captures are automatically suppressed
|
|
304
|
+
sdk.capture_snapshot("checkout-start", {
|
|
305
|
+
userId: 123,
|
|
306
|
+
amount: 99.99
|
|
307
|
+
})
|
|
308
|
+
# When kill switch is active, this is a no-op
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
### Behavior When Active
|
|
312
|
+
|
|
313
|
+
- All `capture_snapshot` calls become no-ops (zero overhead)
|
|
314
|
+
- Polling frequency reduces from 30s to **60s** to minimize server load
|
|
315
|
+
- Distributed tracing and metrics continue to function normally
|
|
316
|
+
- When the kill switch is disabled, captures resume automatically on the next poll cycle
|
|
317
|
+
|
|
318
|
+
### Controlling the Kill Switch
|
|
319
|
+
|
|
320
|
+
- **Dashboard**: Toggle code monitoring on/off per service in the TraceKit dashboard
|
|
321
|
+
- **API**: `POST /api/services/:name/kill-switch` with `{"enabled": true}` or `{"enabled": false}`
|
|
322
|
+
|
|
323
|
+
## SSE Real-time Updates
|
|
324
|
+
|
|
325
|
+
The SDK supports Server-Sent Events (SSE) for receiving breakpoint changes and kill switch events in real time, without waiting for the next poll cycle.
|
|
326
|
+
|
|
327
|
+
### How It Works
|
|
328
|
+
|
|
329
|
+
1. The SDK auto-discovers the SSE endpoint from the poll response
|
|
330
|
+
2. A background thread opens a persistent SSE connection
|
|
331
|
+
3. Breakpoint activations/deactivations and kill switch events are applied instantly
|
|
332
|
+
4. If SSE fails, the SDK falls back to polling seamlessly
|
|
333
|
+
|
|
334
|
+
```ruby
|
|
335
|
+
# SSE is enabled automatically when code monitoring is active.
|
|
336
|
+
# No additional configuration needed.
|
|
337
|
+
|
|
338
|
+
Tracekit.configure do |config|
|
|
339
|
+
config.enable_code_monitoring = true
|
|
340
|
+
config.code_monitoring_poll_interval = 30 # Polling still runs as fallback
|
|
341
|
+
end
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
### Events Received via SSE
|
|
345
|
+
|
|
346
|
+
| Event | Description |
|
|
347
|
+
|-------|-------------|
|
|
348
|
+
| `breakpoint_created`, `breakpoint_updated` | A breakpoint was added or changed — its cached definition is replaced |
|
|
349
|
+
| `breakpoint_deleted` | A breakpoint was removed — it is dropped from the cache and no longer captured |
|
|
350
|
+
| `kill_switch` with `"enabled": true` | Code monitoring disabled for this service |
|
|
351
|
+
| `kill_switch` with `"enabled": false` | Code monitoring re-enabled for this service |
|
|
352
|
+
|
|
353
|
+
## Circuit Breaker
|
|
354
|
+
|
|
355
|
+
The circuit breaker protects your application if the TraceKit backend becomes unreachable.
|
|
356
|
+
|
|
357
|
+
### How It Works
|
|
358
|
+
|
|
359
|
+
1. The SDK tracks snapshot submission failures (network errors, timeouts, and 5xx responses) in a sliding time window
|
|
360
|
+
2. After **3 failures within 60 seconds**, code monitoring is automatically paused
|
|
361
|
+
3. After a **5-minute cooldown**, the circuit breaker resets and captures resume
|
|
362
|
+
|
|
363
|
+
```ruby
|
|
364
|
+
# No configuration needed — circuit breaker is built into the SDK.
|
|
365
|
+
# Thread-safe implementation via Mutex.
|
|
366
|
+
|
|
367
|
+
sdk = Tracekit.sdk
|
|
368
|
+
sdk.capture_snapshot("process-data", { batch_size: 100 })
|
|
369
|
+
# If backend is down, circuit breaker trips after 3 failures
|
|
370
|
+
# Captures resume automatically after 5-minute cooldown
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
### Behavior When Tripped
|
|
374
|
+
|
|
375
|
+
- Snapshots are no longer submitted to the backend; `capture_snapshot` still returns normally and never raises
|
|
376
|
+
- Distributed tracing and metrics continue to function normally
|
|
377
|
+
- The SDK automatically retries after the cooldown period
|
|
378
|
+
- Thread-safe via `Mutex` — safe for multi-threaded Ruby applications (Puma, Sidekiq)
|
|
379
|
+
|
|
289
380
|
## Distributed Tracing
|
|
290
381
|
|
|
291
382
|
The SDK automatically:
|
|
@@ -506,4 +597,4 @@ Built on [OpenTelemetry](https://opentelemetry.io/) - the industry standard for
|
|
|
506
597
|
---
|
|
507
598
|
|
|
508
599
|
**Repository**: git@github.com:Tracekit-Dev/ruby-sdk.git
|
|
509
|
-
**Version**: v0.
|
|
600
|
+
**Version**: v0.2.0
|
|
@@ -2,15 +2,30 @@
|
|
|
2
2
|
|
|
3
3
|
module Tracekit
|
|
4
4
|
module Security
|
|
5
|
-
# Detects and redacts sensitive data (PII, credentials) from variable snapshots
|
|
5
|
+
# Detects and redacts sensitive data (PII, credentials) from variable snapshots.
|
|
6
|
+
# Uses typed [REDACTED:type] markers. PII scrubbing is enabled by default.
|
|
6
7
|
class Detector
|
|
7
8
|
SecurityFlag = Struct.new(:type, :category, :severity, :variable, :redacted, keyword_init: true)
|
|
8
9
|
ScanResult = Struct.new(:sanitized_variables, :security_flags, keyword_init: true)
|
|
9
10
|
|
|
11
|
+
attr_accessor :pii_scrubbing
|
|
12
|
+
|
|
13
|
+
# @param pii_scrubbing [Boolean] whether PII scrubbing is enabled (default: true)
|
|
14
|
+
# @param custom_patterns [Array<Hash>] custom patterns, each with :pattern (Regexp) and :marker (String)
|
|
15
|
+
def initialize(pii_scrubbing: true, custom_patterns: [])
|
|
16
|
+
@pii_scrubbing = pii_scrubbing
|
|
17
|
+
@custom_patterns = custom_patterns.map { |p| [p[:pattern], p[:marker]] }
|
|
18
|
+
end
|
|
19
|
+
|
|
10
20
|
def scan(variables)
|
|
11
21
|
sanitized = {}
|
|
12
22
|
flags = []
|
|
13
23
|
|
|
24
|
+
# If PII scrubbing is disabled, return as-is
|
|
25
|
+
unless @pii_scrubbing
|
|
26
|
+
return ScanResult.new(sanitized_variables: variables.dup, security_flags: [])
|
|
27
|
+
end
|
|
28
|
+
|
|
14
29
|
variables.each do |key, value|
|
|
15
30
|
sanitized_value, detected_flags = scan_value(key, value)
|
|
16
31
|
sanitized[key] = sanitized_value
|
|
@@ -26,58 +41,31 @@ module Tracekit
|
|
|
26
41
|
return ["[NULL]", []] if value.nil?
|
|
27
42
|
|
|
28
43
|
flags = []
|
|
29
|
-
value_str = value.to_s
|
|
30
|
-
|
|
31
|
-
# Check PII
|
|
32
|
-
if Patterns::EMAIL.match?(value_str)
|
|
33
|
-
flags << SecurityFlag.new(type: "pii", category: "email", severity: "medium", variable: key, redacted: true)
|
|
34
|
-
return ["[REDACTED]", flags]
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
if Patterns::SSN.match?(value_str)
|
|
38
|
-
flags << SecurityFlag.new(type: "pii", category: "ssn", severity: "critical", variable: key, redacted: true)
|
|
39
|
-
return ["[REDACTED]", flags]
|
|
40
|
-
end
|
|
41
44
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
+
# Check variable name for sensitive keywords (word-boundary matching)
|
|
46
|
+
if Patterns::SENSITIVE_NAME.match?(key.to_s)
|
|
47
|
+
flags << SecurityFlag.new(type: "sensitive_name", category: "name", severity: "medium", variable: key, redacted: true)
|
|
48
|
+
return ["[REDACTED:sensitive_name]", flags]
|
|
45
49
|
end
|
|
46
50
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
return ["[REDACTED]", flags]
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
# Check Credentials
|
|
53
|
-
if Patterns::API_KEY.match?(value_str)
|
|
54
|
-
flags << SecurityFlag.new(type: "credential", category: "api_key", severity: "critical", variable: key, redacted: true)
|
|
55
|
-
return ["[REDACTED]", flags]
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
if Patterns::AWS_KEY.match?(value_str)
|
|
59
|
-
flags << SecurityFlag.new(type: "credential", category: "aws_key", severity: "critical", variable: key, redacted: true)
|
|
60
|
-
return ["[REDACTED]", flags]
|
|
61
|
-
end
|
|
62
|
-
|
|
63
|
-
if Patterns::STRIPE_KEY.match?(value_str)
|
|
64
|
-
flags << SecurityFlag.new(type: "credential", category: "stripe_key", severity: "critical", variable: key, redacted: true)
|
|
65
|
-
return ["[REDACTED]", flags]
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
if Patterns::PASSWORD.match?(value_str)
|
|
69
|
-
flags << SecurityFlag.new(type: "credential", category: "password", severity: "critical", variable: key, redacted: true)
|
|
70
|
-
return ["[REDACTED]", flags]
|
|
71
|
-
end
|
|
51
|
+
# Serialize value to string for deep scanning
|
|
52
|
+
value_str = value.to_s
|
|
72
53
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
54
|
+
# Check built-in patterns with typed markers
|
|
55
|
+
Patterns::PATTERN_MARKERS.each do |pattern, marker|
|
|
56
|
+
if pattern.match?(value_str)
|
|
57
|
+
category = marker.match(/REDACTED:(\w+)/)[1]
|
|
58
|
+
flags << SecurityFlag.new(type: "sensitive_data", category: category, severity: "high", variable: key, redacted: true)
|
|
59
|
+
return [marker, flags]
|
|
60
|
+
end
|
|
76
61
|
end
|
|
77
62
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
63
|
+
# Check custom patterns
|
|
64
|
+
@custom_patterns.each do |pattern, marker|
|
|
65
|
+
if pattern.match?(value_str)
|
|
66
|
+
flags << SecurityFlag.new(type: "custom", category: "custom", severity: "high", variable: key, redacted: true)
|
|
67
|
+
return [marker, flags]
|
|
68
|
+
end
|
|
81
69
|
end
|
|
82
70
|
|
|
83
71
|
[value, flags]
|
|
@@ -2,21 +2,43 @@
|
|
|
2
2
|
|
|
3
3
|
module Tracekit
|
|
4
4
|
module Security
|
|
5
|
-
# Regex patterns for detecting sensitive data in snapshots
|
|
5
|
+
# Regex patterns for detecting sensitive data in snapshots.
|
|
6
|
+
# 13 standard patterns with typed [REDACTED:type] markers.
|
|
6
7
|
module Patterns
|
|
7
8
|
# PII Patterns
|
|
8
|
-
EMAIL = /\b[A-Za-z0-9._
|
|
9
|
+
EMAIL = /\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b/
|
|
9
10
|
SSN = /\b\d{3}-\d{2}-\d{4}\b/
|
|
10
11
|
CREDIT_CARD = /\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b/
|
|
11
12
|
PHONE = /\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/
|
|
12
13
|
|
|
13
14
|
# Credential Patterns
|
|
14
|
-
API_KEY = /(api[_
|
|
15
|
+
API_KEY = /(?:api[_\-]?key|apikey)\s*[:=]\s*['"]?[A-Za-z0-9_\-]{20,}/i
|
|
15
16
|
AWS_KEY = /AKIA[0-9A-Z]{16}/
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
17
|
+
AWS_SECRET = /aws.{0,20}secret.{0,20}[A-Za-z0-9\/+=]{40}/i
|
|
18
|
+
OAUTH_TOKEN = /(?:bearer\s+)[A-Za-z0-9._~+\/=\-]{20,}/i
|
|
19
|
+
STRIPE_KEY = /sk_live_[0-9a-zA-Z]{10,}/
|
|
20
|
+
PASSWORD = /(?:password|passwd|pwd)\s*[=:]\s*['"]?[^\s'"]{6,}/i
|
|
21
|
+
JWT = /eyJ[a-zA-Z0-9_\-]+\.eyJ[a-zA-Z0-9_\-]+\.[a-zA-Z0-9_\-]+/
|
|
22
|
+
PRIVATE_KEY = /-----BEGIN (?:RSA |EC )?PRIVATE KEY-----/
|
|
23
|
+
|
|
24
|
+
# Letter-boundary pattern: \b treats _ as a word character, so a \b-anchored pattern would not match names such as api_key or user_token
|
|
25
|
+
SENSITIVE_NAME = /(?:^|[^a-zA-Z])(?:password|passwd|pwd|secret|token|key|credential|api_key|apikey)(?:[^a-zA-Z]|$)/i
|
|
26
|
+
|
|
27
|
+
# Mapping of pattern -> typed redaction marker
|
|
28
|
+
PATTERN_MARKERS = {
|
|
29
|
+
EMAIL => "[REDACTED:email]",
|
|
30
|
+
SSN => "[REDACTED:ssn]",
|
|
31
|
+
CREDIT_CARD => "[REDACTED:credit_card]",
|
|
32
|
+
PHONE => "[REDACTED:phone]",
|
|
33
|
+
AWS_KEY => "[REDACTED:aws_key]",
|
|
34
|
+
AWS_SECRET => "[REDACTED:aws_secret]",
|
|
35
|
+
OAUTH_TOKEN => "[REDACTED:oauth_token]",
|
|
36
|
+
STRIPE_KEY => "[REDACTED:stripe_key]",
|
|
37
|
+
PASSWORD => "[REDACTED:password]",
|
|
38
|
+
JWT => "[REDACTED:jwt]",
|
|
39
|
+
PRIVATE_KEY => "[REDACTED:private_key]",
|
|
40
|
+
API_KEY => "[REDACTED:api_key]"
|
|
41
|
+
}.freeze
|
|
20
42
|
end
|
|
21
43
|
end
|
|
22
44
|
end
|
|
@@ -9,7 +9,12 @@ module Tracekit
|
|
|
9
9
|
module Snapshots
|
|
10
10
|
# Client for code monitoring - polls breakpoints and captures snapshots
|
|
11
11
|
class Client
|
|
12
|
-
|
|
12
|
+
# Opt-in capture limits (all disabled by default: nil = unlimited)
|
|
13
|
+
attr_accessor :capture_depth # nil = unlimited depth (default)
|
|
14
|
+
attr_accessor :max_payload # nil = unlimited payload bytes (default)
|
|
15
|
+
attr_accessor :capture_timeout # nil = no timeout seconds (default)
|
|
16
|
+
|
|
17
|
+
def initialize(api_key, base_url, service_name, poll_interval_seconds = 30, **opts)
|
|
13
18
|
@api_key = api_key
|
|
14
19
|
@base_url = base_url
|
|
15
20
|
@service_name = service_name
|
|
@@ -17,6 +22,31 @@ module Tracekit
|
|
|
17
22
|
@breakpoints_cache = Concurrent::Hash.new
|
|
18
23
|
@registration_cache = Concurrent::Hash.new
|
|
19
24
|
|
|
25
|
+
# Opt-in capture limits
|
|
26
|
+
@capture_depth = opts[:capture_depth]
|
|
27
|
+
@max_payload = opts[:max_payload]
|
|
28
|
+
@capture_timeout = opts[:capture_timeout]
|
|
29
|
+
|
|
30
|
+
# Kill switch: server-initiated monitoring disable
|
|
31
|
+
@kill_switch_active = false
|
|
32
|
+
@normal_poll_interval = poll_interval_seconds
|
|
33
|
+
|
|
34
|
+
# SSE (Server-Sent Events) real-time updates
|
|
35
|
+
@sse_endpoint = nil
|
|
36
|
+
@sse_active = false
|
|
37
|
+
@sse_thread = nil
|
|
38
|
+
|
|
39
|
+
# Circuit breaker state (Mutex-protected for thread safety)
|
|
40
|
+
cb_config = opts[:circuit_breaker] || {}
|
|
41
|
+
@cb_mutex = Mutex.new
|
|
42
|
+
@cb_failure_timestamps = []
|
|
43
|
+
@cb_state = "closed"
|
|
44
|
+
@cb_opened_at = nil
|
|
45
|
+
@cb_max_failures = cb_config[:max_failures] || 3
|
|
46
|
+
@cb_window_seconds = cb_config[:window_seconds] || 60
|
|
47
|
+
@cb_cooldown_seconds = cb_config[:cooldown_seconds] || 300
|
|
48
|
+
@pending_events = []
|
|
49
|
+
|
|
20
50
|
# Start polling timer
|
|
21
51
|
@poll_task = Concurrent::TimerTask.new(execution_interval: poll_interval_seconds) do
|
|
22
52
|
fetch_active_breakpoints
|
|
@@ -27,8 +57,22 @@ module Tracekit
|
|
|
27
57
|
fetch_active_breakpoints
|
|
28
58
|
end
|
|
29
59
|
|
|
30
|
-
# Captures a snapshot at the caller's location
|
|
60
|
+
# Captures a snapshot at the caller's location.
|
|
61
|
+
# Crash isolation: rescues all exceptions so TraceKit never crashes the host app.
|
|
31
62
|
def capture_snapshot(label, variables, caller_location = nil)
|
|
63
|
+
begin
|
|
64
|
+
do_capture_snapshot(label, variables, caller_location)
|
|
65
|
+
rescue => e
|
|
66
|
+
warn "TraceKit: error in capture_snapshot: #{e.message}" if ENV["DEBUG"]
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
private
|
|
71
|
+
|
|
72
|
+
def do_capture_snapshot(label, variables, caller_location)
|
|
73
|
+
# Kill switch: skip all capture when server has disabled monitoring
|
|
74
|
+
return if @kill_switch_active
|
|
75
|
+
|
|
32
76
|
# Extract caller information
|
|
33
77
|
caller_location ||= caller_locations(1, 1).first
|
|
34
78
|
file_path = caller_location.path
|
|
@@ -47,6 +91,11 @@ module Tracekit
|
|
|
47
91
|
return if breakpoint.expire_at && Time.now > breakpoint.expire_at
|
|
48
92
|
return if breakpoint.max_captures > 0 && breakpoint.capture_count >= breakpoint.max_captures
|
|
49
93
|
|
|
94
|
+
# Apply opt-in capture depth limit
|
|
95
|
+
if @capture_depth && @capture_depth > 0
|
|
96
|
+
variables = limit_depth(variables, 0)
|
|
97
|
+
end
|
|
98
|
+
|
|
50
99
|
# Scan for security issues
|
|
51
100
|
scan_result = @security_detector.scan(variables)
|
|
52
101
|
|
|
@@ -79,17 +128,65 @@ module Tracekit
|
|
|
79
128
|
captured_at: Time.now.utc.iso8601
|
|
80
129
|
)
|
|
81
130
|
|
|
82
|
-
#
|
|
83
|
-
|
|
131
|
+
# Apply opt-in max payload limit
|
|
132
|
+
serialized = JSON.generate(snapshot.to_h)
|
|
133
|
+
if @max_payload && @max_payload > 0 && serialized.bytesize > @max_payload
|
|
134
|
+
snapshot = Snapshot.new(
|
|
135
|
+
breakpoint_id: breakpoint.id,
|
|
136
|
+
service_name: @service_name,
|
|
137
|
+
file_path: file_path,
|
|
138
|
+
function_name: function_name,
|
|
139
|
+
label: label,
|
|
140
|
+
line_number: line_number,
|
|
141
|
+
variables: { "_truncated" => true, "_payload_size" => serialized.bytesize, "_max_payload" => @max_payload },
|
|
142
|
+
security_flags: [],
|
|
143
|
+
stack_trace: stack_trace,
|
|
144
|
+
trace_id: trace_id,
|
|
145
|
+
span_id: span_id,
|
|
146
|
+
captured_at: Time.now.utc.iso8601
|
|
147
|
+
)
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
# Submit on a background thread; when capture_timeout is set, wait up to that many seconds and then abandon the request
|
|
151
|
+
if @capture_timeout && @capture_timeout > 0
|
|
152
|
+
thread = Thread.new { submit_snapshot(snapshot) }
|
|
153
|
+
unless thread.join(@capture_timeout)
|
|
154
|
+
warn "TraceKit: capture timeout exceeded (#{@capture_timeout}s)" if ENV["DEBUG"]
|
|
155
|
+
thread.kill
|
|
156
|
+
end
|
|
157
|
+
else
|
|
158
|
+
Thread.new { submit_snapshot(snapshot) }
|
|
159
|
+
end
|
|
84
160
|
end
|
|
85
161
|
|
|
162
|
+
public
|
|
163
|
+
|
|
86
164
|
# Shuts down the client
|
|
87
165
|
def shutdown
|
|
88
166
|
@poll_task&.shutdown
|
|
167
|
+
close_sse
|
|
89
168
|
end
|
|
90
169
|
|
|
91
170
|
private
|
|
92
171
|
|
|
172
|
+
# Limit variable nesting depth (opt-in)
|
|
173
|
+
# Returns a copy of +data+ whose container nesting is capped at @capture_depth.
#
# Hashes and arrays are copied level by level. A Hash or Array found at or
# beyond the limit is replaced by a { "_truncated" => true, "_depth" => n }
# marker. Scalar leaves add no nesting and are always kept as-is; previously
# they were replaced by the marker too, so capture_depth = 1 discarded every
# top-level value, including plain integers and strings.
#
# @param data [Object] value to limit (Hash, Array, or any leaf value)
# @param current_depth [Integer] nesting level of +data+; the caller starts at 0
# @return [Object] depth-limited copy for containers, +data+ itself for leaves
def limit_depth(data, current_depth)
  # Leaves carry no nesting of their own, so the limit never applies to them.
  return data unless data.is_a?(Hash) || data.is_a?(Array)
  return { "_truncated" => true, "_depth" => current_depth } if current_depth >= @capture_depth

  if data.is_a?(Hash)
    data.each_with_object({}) do |(key, value), limited|
      limited[key] = limit_depth(value, current_depth + 1)
    end
  else
    data.map { |item| limit_depth(item, current_depth + 1) }
  end
end
|
|
189
|
+
|
|
93
190
|
def fetch_active_breakpoints
|
|
94
191
|
url = "#{@base_url}/sdk/snapshots/active/#{@service_name}"
|
|
95
192
|
uri = URI(url)
|
|
@@ -106,11 +203,36 @@ module Tracekit
|
|
|
106
203
|
|
|
107
204
|
data = JSON.parse(response.body, symbolize_names: true)
|
|
108
205
|
update_breakpoint_cache(data[:breakpoints]) if data[:breakpoints]
|
|
206
|
+
|
|
207
|
+
# SSE auto-discovery: if polling response includes sse_endpoint, start SSE connection
|
|
208
|
+
if data[:sse_endpoint] && !@sse_active
|
|
209
|
+
@sse_endpoint = data[:sse_endpoint]
|
|
210
|
+
start_sse_thread(@sse_endpoint)
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
# Handle kill switch state (missing field = false for backward compat)
|
|
214
|
+
new_kill_state = data[:kill_switch] == true
|
|
215
|
+
if new_kill_state && !@kill_switch_active
|
|
216
|
+
warn "TraceKit: Code monitoring disabled by server kill switch. Polling at reduced frequency."
|
|
217
|
+
reschedule_polling(60)
|
|
218
|
+
elsif !new_kill_state && @kill_switch_active
|
|
219
|
+
warn "TraceKit: Code monitoring re-enabled by server."
|
|
220
|
+
reschedule_polling(@normal_poll_interval)
|
|
221
|
+
end
|
|
222
|
+
@kill_switch_active = new_kill_state
|
|
109
223
|
rescue => e
|
|
110
224
|
# Silently ignore errors fetching breakpoints
|
|
111
225
|
warn "Error fetching breakpoints: #{e.message}" if ENV["DEBUG"]
|
|
112
226
|
end
|
|
113
227
|
|
|
228
|
+
def reschedule_polling(interval_seconds)
|
|
229
|
+
@poll_task&.shutdown
|
|
230
|
+
@poll_task = Concurrent::TimerTask.new(execution_interval: interval_seconds) do
|
|
231
|
+
fetch_active_breakpoints
|
|
232
|
+
end
|
|
233
|
+
@poll_task.execute
|
|
234
|
+
end
|
|
235
|
+
|
|
114
236
|
def update_breakpoint_cache(breakpoints)
|
|
115
237
|
@breakpoints_cache.clear
|
|
116
238
|
|
|
@@ -181,6 +303,9 @@ module Tracekit
|
|
|
181
303
|
end
|
|
182
304
|
|
|
183
305
|
def submit_snapshot(snapshot)
|
|
306
|
+
# Circuit breaker check
|
|
307
|
+
return unless circuit_breaker_should_allow?
|
|
308
|
+
|
|
184
309
|
uri = URI("#{@base_url}/sdk/snapshots/capture")
|
|
185
310
|
http = Net::HTTP.new(uri.host, uri.port)
|
|
186
311
|
http.use_ssl = uri.scheme == "https"
|
|
@@ -192,11 +317,233 @@ module Tracekit
|
|
|
192
317
|
})
|
|
193
318
|
request.body = JSON.generate(snapshot.to_h)
|
|
194
319
|
|
|
195
|
-
http.request(request)
|
|
320
|
+
response = http.request(request)
|
|
321
|
+
|
|
322
|
+
# Server error (5xx) -- count as circuit breaker failure
|
|
323
|
+
if response.is_a?(Net::HTTPServerError)
|
|
324
|
+
queue_circuit_breaker_event if circuit_breaker_record_failure
|
|
325
|
+
end
|
|
326
|
+
rescue SocketError, Errno::ECONNREFUSED, Errno::EHOSTUNREACH,
|
|
327
|
+
Errno::ETIMEDOUT, Net::OpenTimeout, Net::ReadTimeout => e
|
|
328
|
+
# Network/timeout error -- count as circuit breaker failure
|
|
329
|
+
warn "Error submitting snapshot: #{e.message}" if ENV["DEBUG"]
|
|
330
|
+
queue_circuit_breaker_event if circuit_breaker_record_failure
|
|
196
331
|
rescue => e
|
|
197
|
-
#
|
|
332
|
+
# Other errors -- do NOT count as circuit breaker failure
|
|
198
333
|
warn "Error submitting snapshot: #{e.message}" if ENV["DEBUG"]
|
|
199
334
|
end
|
|
335
|
+
|
|
336
|
+
# Start SSE connection in a daemon thread
|
|
337
|
+
def start_sse_thread(endpoint)
|
|
338
|
+
close_sse # Close any existing SSE connection
|
|
339
|
+
|
|
340
|
+
@sse_thread = Thread.new do
|
|
341
|
+
begin
|
|
342
|
+
connect_sse(endpoint)
|
|
343
|
+
rescue => e
|
|
344
|
+
warn "TraceKit: SSE thread error: #{e.message}" if ENV["DEBUG"]
|
|
345
|
+
@sse_active = false
|
|
346
|
+
end
|
|
347
|
+
end
|
|
348
|
+
@sse_thread.abort_on_exception = false
|
|
349
|
+
end
|
|
350
|
+
|
|
351
|
+
# Connect to the SSE endpoint for real-time breakpoint updates.
|
|
352
|
+
# Falls back to polling if SSE connection fails or disconnects.
|
|
353
|
+
# Crash isolation: all exceptions are rescued so TraceKit never crashes the host app.
|
|
354
|
+
# Opens the SSE stream at +endpoint+ and dispatches every complete event to
# handle_sse_event until the stream ends or fails.
#
# Sets @sse_active to true once the server answers with a 2xx response, and
# back to false whenever the request is refused, the stream ends, or an error
# is raised. Every StandardError is rescued here, so nothing propagates to the
# caller.
#
# @param endpoint [String] path (optionally with a query string) appended to @base_url
# @return [void]
def connect_sse(endpoint)
  uri = URI("#{@base_url}#{endpoint}")

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = uri.scheme == "https"
  # nil disables the read timeout for this long-lived stream. A value of 0
  # does not: it makes every read that has no data ready raise Net::ReadTimeout.
  http.read_timeout = nil
  http.open_timeout = 10

  # request_uri keeps the query string (and yields "/" for an empty path);
  # uri.path alone would silently drop any query parameters.
  request = Net::HTTP::Get.new(uri.request_uri)
  request["X-API-Key"] = @api_key
  request["Accept"] = "text/event-stream"
  request["Cache-Control"] = "no-cache"

  http.request(request) do |response|
    unless response.is_a?(Net::HTTPSuccess)
      warn "TraceKit: SSE connection failed with HTTP #{response.code}, falling back to polling" if ENV["DEBUG"]
      @sse_active = false
      return
    end

    @sse_active = true
    warn "TraceKit: SSE connected to #{endpoint}" if ENV["DEBUG"]

    event_type = nil
    event_data = ""
    # Bytes received but not yet terminated by a newline. Network chunks do
    # not respect line boundaries, so a line may arrive split across chunks.
    pending = String.new

    response.read_body do |chunk|
      pending << chunk

      while (newline_at = pending.index("\n"))
        line = pending.slice!(0, newline_at + 1).chomp

        if line.start_with?("event:")
          event_type = line.sub(/^event:\s*/, "").strip
        elsif line.start_with?("data:")
          event_data += line.sub(/^data:\s*/, "")
        elsif line.empty? && event_type
          # Empty line signals end of event -- process it
          handle_sse_event(event_type, event_data)
          event_type = nil
          event_data = ""
        end
      end
    end
  end

  # Connection closed cleanly
  @sse_active = false
  warn "TraceKit: SSE connection closed, falling back to polling" if ENV["DEBUG"]
rescue SocketError, Errno::ECONNREFUSED, Errno::EHOSTUNREACH,
       Errno::ETIMEDOUT, Net::OpenTimeout, Net::ReadTimeout,
       IOError, EOFError => e
  warn "TraceKit: SSE connection error: #{e.message}, falling back to polling" if ENV["DEBUG"]
  @sse_active = false
rescue => e
  warn "TraceKit: SSE unexpected error: #{e.message}" if ENV["DEBUG"]
  @sse_active = false
end
|
|
411
|
+
|
|
412
|
+
# Handle a parsed SSE event
|
|
413
|
+
# Applies one parsed SSE event to the client's state.
#
# Handled event types:
# - "init": replaces the breakpoint cache when the payload has :breakpoints
#   and updates the kill switch when the payload has :kill_switch
# - "breakpoint_created" / "breakpoint_updated": upserts the breakpoint
# - "breakpoint_deleted": removes the breakpoint with the payload's :id
# - "kill_switch": sets the kill switch from the payload's :enabled flag
# - "heartbeat": ignored
# Any other type is only logged when ENV["DEBUG"] is set. JSON and other
# StandardErrors are rescued and logged under the same condition.
#
# @param event_type [String] value of the SSE "event:" field
# @param data_str [String] accumulated SSE "data:" payload (JSON text)
# @return [void]
def handle_sse_event(event_type, data_str)
  case event_type
  when "init"
    payload = JSON.parse(data_str, symbolize_names: true)
    update_breakpoint_cache(payload[:breakpoints]) if payload[:breakpoints]

    # Update kill switch from init event
    if payload.key?(:kill_switch)
      new_kill_state = payload[:kill_switch] == true
      newly_disabled = new_kill_state && !@kill_switch_active

      # Record the state BEFORE closing the stream: close_sse kills the SSE
      # thread, and this handler runs on that thread, so nothing after the
      # close_sse call is guaranteed to execute.
      @kill_switch_active = new_kill_state

      if newly_disabled
        warn "TraceKit: Code monitoring disabled by server kill switch."
        close_sse
      end
    end

  when "breakpoint_created", "breakpoint_updated"
    bp_data = JSON.parse(data_str, symbolize_names: true)
    upsert_breakpoint(bp_data)

  when "breakpoint_deleted"
    bp_data = JSON.parse(data_str, symbolize_names: true)
    remove_breakpoint(bp_data[:id])

  when "kill_switch"
    payload = JSON.parse(data_str, symbolize_names: true)
    @kill_switch_active = payload[:enabled] == true
    if @kill_switch_active
      warn "TraceKit: Code monitoring disabled by server kill switch via SSE."
      close_sse
    end

  when "heartbeat"
    # No action needed -- keeps connection alive

  else
    warn "TraceKit: Unknown SSE event type: #{event_type}" if ENV["DEBUG"]
  end
rescue JSON::ParserError => e
  warn "TraceKit: SSE JSON parse error for '#{event_type}': #{e.message}" if ENV["DEBUG"]
rescue => e
  warn "TraceKit: SSE event handling error: #{e.message}" if ENV["DEBUG"]
end
|
|
456
|
+
|
|
457
|
+
# Upsert a single breakpoint into the cache
|
|
458
|
+
def upsert_breakpoint(bp_data)
|
|
459
|
+
bp = BreakpointConfig.new(
|
|
460
|
+
id: bp_data[:id],
|
|
461
|
+
file_path: bp_data[:file_path],
|
|
462
|
+
line_number: bp_data[:line_number],
|
|
463
|
+
function_name: bp_data[:function_name],
|
|
464
|
+
label: bp_data[:label],
|
|
465
|
+
enabled: bp_data[:enabled],
|
|
466
|
+
max_captures: bp_data[:max_captures] || 0,
|
|
467
|
+
capture_count: bp_data[:capture_count] || 0,
|
|
468
|
+
expire_at: bp_data[:expire_at] ? Time.parse(bp_data[:expire_at]) : nil
|
|
469
|
+
)
|
|
470
|
+
|
|
471
|
+
# Key by function + label
|
|
472
|
+
if bp.label && bp.function_name
|
|
473
|
+
label_key = "#{bp.function_name}:#{bp.label}"
|
|
474
|
+
@breakpoints_cache[label_key] = bp
|
|
475
|
+
end
|
|
476
|
+
|
|
477
|
+
# Key by file + line
|
|
478
|
+
line_key = "#{bp.file_path}:#{bp.line_number}"
|
|
479
|
+
@breakpoints_cache[line_key] = bp
|
|
480
|
+
end
|
|
481
|
+
|
|
482
|
+
# Remove a breakpoint from the cache by ID
|
|
483
|
+
def remove_breakpoint(breakpoint_id)
|
|
484
|
+
return unless breakpoint_id
|
|
485
|
+
|
|
486
|
+
@breakpoints_cache.delete_if { |_key, bp| bp.id == breakpoint_id }
|
|
487
|
+
end
|
|
488
|
+
|
|
489
|
+
# Close the active SSE connection
|
|
490
|
+
def close_sse
|
|
491
|
+
@sse_active = false
|
|
492
|
+
if @sse_thread&.alive?
|
|
493
|
+
@sse_thread.kill
|
|
494
|
+
@sse_thread = nil
|
|
495
|
+
end
|
|
496
|
+
end
|
|
497
|
+
|
|
498
|
+
def circuit_breaker_should_allow?
|
|
499
|
+
@cb_mutex.synchronize do
|
|
500
|
+
return true if @cb_state == "closed"
|
|
501
|
+
|
|
502
|
+
# Check cooldown
|
|
503
|
+
if @cb_opened_at && (Time.now.to_f - @cb_opened_at) >= @cb_cooldown_seconds
|
|
504
|
+
@cb_state = "closed"
|
|
505
|
+
@cb_failure_timestamps.clear
|
|
506
|
+
@cb_opened_at = nil
|
|
507
|
+
warn "TraceKit: Code monitoring resumed"
|
|
508
|
+
return true
|
|
509
|
+
end
|
|
510
|
+
|
|
511
|
+
false
|
|
512
|
+
end
|
|
513
|
+
end
|
|
514
|
+
|
|
515
|
+
# Records one snapshot-submission failure and opens the breaker once the
# failure window fills up.
#
# Runs under @cb_mutex. Appends the current time to @cb_failure_timestamps,
# discards entries that are @cb_window_seconds old or older, and, if at least
# @cb_max_failures remain while the breaker is "closed", switches @cb_state to
# "open", stamps @cb_opened_at and emits a warning.
#
# @return [Boolean] true only for the call that trips the breaker
def circuit_breaker_record_failure
  @cb_mutex.synchronize do
    now = Time.now.to_f
    @cb_failure_timestamps << now

    # Keep only the failures that fall inside the sliding window.
    cutoff = now - @cb_window_seconds
    @cb_failure_timestamps.reject! { |ts| ts <= cutoff }

    next false unless @cb_failure_timestamps.size >= @cb_max_failures && @cb_state == "closed"

    @cb_state = "open"
    @cb_opened_at = now

    # Whole minutes read best, but plain integer division reported a 90s
    # cooldown as "1 min" and a 30s one as "0 min"; use seconds when inexact.
    cooldown =
      if (@cb_cooldown_seconds % 60).zero?
        "#{@cb_cooldown_seconds / 60} min"
      else
        "#{@cb_cooldown_seconds}s"
      end
    warn "TraceKit: Code monitoring paused (#{@cb_max_failures} capture failures in #{@cb_window_seconds}s). Auto-resumes in #{cooldown}."
    true
  end
end
|
|
534
|
+
|
|
535
|
+
def queue_circuit_breaker_event
|
|
536
|
+
@cb_mutex.synchronize do
|
|
537
|
+
@pending_events << {
|
|
538
|
+
type: "circuit_breaker_tripped",
|
|
539
|
+
service_name: @service_name,
|
|
540
|
+
failure_count: @cb_max_failures,
|
|
541
|
+
window_seconds: @cb_window_seconds,
|
|
542
|
+
cooldown_seconds: @cb_cooldown_seconds,
|
|
543
|
+
timestamp: Time.now.utc.iso8601
|
|
544
|
+
}
|
|
545
|
+
end
|
|
546
|
+
end
|
|
200
547
|
end
|
|
201
548
|
end
|
|
202
549
|
end
|
data/lib/tracekit/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tracekit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- TraceKit
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-03-07 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: opentelemetry-sdk
|
|
@@ -173,7 +173,7 @@ metadata:
|
|
|
173
173
|
homepage_uri: https://github.com/Tracekit-Dev/ruby-sdk
|
|
174
174
|
source_code_uri: https://github.com/Tracekit-Dev/ruby-sdk
|
|
175
175
|
changelog_uri: https://github.com/Tracekit-Dev/ruby-sdk/blob/main/CHANGELOG.md
|
|
176
|
-
post_install_message:
|
|
176
|
+
post_install_message:
|
|
177
177
|
rdoc_options: []
|
|
178
178
|
require_paths:
|
|
179
179
|
- lib
|
|
@@ -188,8 +188,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
188
188
|
- !ruby/object:Gem::Version
|
|
189
189
|
version: '0'
|
|
190
190
|
requirements: []
|
|
191
|
-
rubygems_version: 3.
|
|
192
|
-
signing_key:
|
|
191
|
+
rubygems_version: 3.5.3
|
|
192
|
+
signing_key:
|
|
193
193
|
specification_version: 4
|
|
194
194
|
summary: TraceKit Ruby SDK - OpenTelemetry-based APM for Ruby applications
|
|
195
195
|
test_files: []
|