@sentienguard/apm 1.0.10 → 1.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sentienguard/apm",
3
- "version": "1.0.10",
3
+ "version": "1.0.14",
4
4
  "description": "SentienGuard APM SDK - Minimal, production-safe application performance monitoring",
5
5
  "main": "src/index.js",
6
6
  "types": "src/index.d.ts",
package/src/config.js CHANGED
@@ -14,6 +14,7 @@ const config = {
14
14
  service: '',
15
15
  environment: 'production',
16
16
  endpoint: 'https://sentienguard-dev.the-algo.com/api/v1/apm/ingest',
17
+ tracesEndpoint: '',
17
18
  flushInterval: 10,
18
19
  maxRoutes: 100,
19
20
  maxPayloadSize: 1024 * 1024,
@@ -43,7 +44,12 @@ const config = {
43
44
  /** When true, outgoing HTTP to localhost is traced (for multi-service dev). Default false. */
44
45
  traceLocalHttp: false,
45
46
  /** Port -> display name for local peers, from SENTIENGUARD_PEER_SERVICE_MAP */
46
- peerServiceMap: {}
47
+ peerServiceMap: {},
48
+ /** Sampling rate for exporting raw spans (0..1). Parent-based. */
49
+ sampleRate: 0.05,
50
+ /** Span export queue + batch sizes (drop-on-pressure). */
51
+ maxQueueSize: 2048,
52
+ maxBatchSize: 256
47
53
  }
48
54
  };
49
55
 
@@ -60,6 +66,10 @@ export function loadConfig({ force = false } = {}) {
60
66
  config.service = process.env.SENTIENGUARD_SERVICE || '';
61
67
  config.environment = process.env.SENTIENGUARD_ENV || 'production';
62
68
  config.endpoint = process.env.SENTIENGUARD_ENDPOINT || 'https://sentienguard-dev.the-algo.com/api/v1/apm/ingest';
69
+ config.tracesEndpoint =
70
+ process.env.SENTIENGUARD_TRACES_ENDPOINT ||
71
+ deriveTracesEndpoint(config.endpoint) ||
72
+ 'https://sentienguard-dev.the-algo.com/api/v1/apm/traces';
63
73
  config.flushInterval = parseInt(process.env.SENTIENGUARD_FLUSH_INTERVAL, 10) || 10;
64
74
  config.maxRoutes = parseInt(process.env.SENTIENGUARD_MAX_ROUTES, 10) || 100;
65
75
  config.maxPayloadSize = parseInt(process.env.SENTIENGUARD_MAX_PAYLOAD_SIZE, 10) || 1024 * 1024;
@@ -82,6 +92,13 @@ export function loadConfig({ force = false } = {}) {
82
92
  config.openai.slowCallMs = parseInt(process.env.SENTIENGUARD_OPENAI_SLOW_CALL_MS, 10) || 5000;
83
93
 
84
94
  config.tracing.enabled = process.env.SENTIENGUARD_TRACING !== 'false';
95
+ const sampleRaw = process.env.SENTIENGUARD_TRACE_SAMPLE_RATE;
96
+ const sample = sampleRaw != null ? Number(sampleRaw) : NaN;
97
+ if (!Number.isNaN(sample) && sample >= 0 && sample <= 1) {
98
+ config.tracing.sampleRate = sample;
99
+ }
100
+ config.tracing.maxQueueSize = parseInt(process.env.SENTIENGUARD_TRACE_MAX_QUEUE_SIZE, 10) || config.tracing.maxQueueSize;
101
+ config.tracing.maxBatchSize = parseInt(process.env.SENTIENGUARD_TRACE_MAX_BATCH_SIZE, 10) || config.tracing.maxBatchSize;
85
102
  // Default behavior:
86
103
  // - production: do NOT record localhost dependency edges (noise + self-calls)
87
104
  // - non-production: DO record localhost edges (local multi-service dev "just works")
@@ -114,6 +131,17 @@ function parsePeerServiceMap(raw) {
114
131
  return map;
115
132
  }
116
133
 
134
/**
 * Derive the raw-span ingest URL from the metrics ingest URL.
 *
 * A path ending in `/apm/ingest` (case-insensitive, optional trailing slash)
 * is rewritten to end in `/apm/traces`; the rest of the URL is preserved.
 *
 * NOTE(review): when the path does not end in `/apm/ingest`, the (normalized)
 * input URL is returned unchanged, so traces would be posted to the metrics
 * endpoint — confirm this is the intended behavior for custom endpoints.
 *
 * @param {*} ingestEndpoint - Metrics ingest URL; coerced with String().
 * @returns {string} The derived URL, or '' when the input cannot be parsed.
 */
function deriveTracesEndpoint(ingestEndpoint) {
  const ingestSuffix = /\/apm\/ingest\/?$/i;
  try {
    const target = new URL(String(ingestEndpoint));
    // Common default: /api/v1/apm/ingest -> /api/v1/apm/traces
    const rewrittenPath = target.pathname.replace(ingestSuffix, '/apm/traces');
    target.pathname = rewrittenPath;
    return target.href;
  } catch {
    // Unparseable endpoint: let the caller fall back to its own default.
    return '';
  }
}
144
+
117
145
  /**
118
146
  * Check if SDK is properly configured and should be active.
119
147
  * Triggers lazy config load if not yet loaded.
package/src/index.d.ts CHANGED
@@ -8,6 +8,7 @@ export interface ApmConfig {
8
8
  service?: string;
9
9
  environment?: string;
10
10
  endpoint?: string;
11
+ tracesEndpoint?: string;
11
12
  flushInterval?: number;
12
13
  maxRoutes?: number;
13
14
  maxPayloadSize?: number;
@@ -17,6 +18,12 @@ export interface ApmConfig {
17
18
  enabled?: boolean;
18
19
  traceLocalHttp?: boolean;
19
20
  peerServiceMap?: Record<string, string>;
21
+ /** Export sampling for raw traces only (0..1). Metrics are not sampled. */
22
+ sampleRate?: number;
23
+ /** Drop-on-pressure queue size for raw span export */
24
+ maxQueueSize?: number;
25
+ /** Batch size for raw span export */
26
+ maxBatchSize?: number;
20
27
  };
21
28
  }
22
29
 
package/src/index.js CHANGED
@@ -14,8 +14,12 @@
14
14
  * SENTIENGUARD_SERVICE=my-api (required)
15
15
  * SENTIENGUARD_ENV=production (optional, default: production)
16
16
  * SENTIENGUARD_ENDPOINT=https://... (optional)
17
+ * SENTIENGUARD_TRACES_ENDPOINT=https://... (optional, raw span ingest; default derived from SENTIENGUARD_ENDPOINT)
17
18
  * SENTIENGUARD_FLUSH_INTERVAL=10 (optional, seconds)
18
19
  * SENTIENGUARD_TRACING=false (optional, disable OpenTelemetry / W3C propagation; use legacy HTTP patches)
20
+ * SENTIENGUARD_TRACE_SAMPLE_RATE=0.05 (optional, export sampling for raw traces only; metrics are not sampled)
21
+ * SENTIENGUARD_TRACE_MAX_QUEUE_SIZE=2048 (optional, drop-on-pressure queue size for raw spans)
22
+ * SENTIENGUARD_TRACE_MAX_BATCH_SIZE=256 (optional, batch size for raw span export)
19
23
  * SENTIENGUARD_TRACE_LOCAL_HTTP=true (optional, record outgoing HTTP to localhost as dependencies; use with SENTIENGUARD_PEER_SERVICE_MAP)
20
24
  * SENTIENGUARD_PEER_SERVICE_MAP=3001:service-b,3002:other (optional, port -> callee name for local peers)
21
25
  *
@@ -33,6 +37,7 @@ import { instrumentMongoDB, autoInstrumentMongoDB, stopMongoDBInstrumentation }
33
37
  import { instrumentOpenAI, stopOpenAIInstrumentation } from './openai.js';
34
38
  import { createBreaker, wrapMongoOperation, getBreakerStats, shutdownBreakers } from './circuitBreaker.js';
35
39
  import { startTracing, shutdownTracing, getActiveTraceId, isTracingActive } from './tracing.js';
40
+ import { flushTraceQueue } from './traceTransport.js';
36
41
 
37
42
  let isInitialized = false;
38
43
 
@@ -100,6 +105,8 @@ function setupGracefulShutdown() {
100
105
 
101
106
  // Final flush
102
107
  await finalFlush();
108
+ // Best-effort flush of queued raw spans
109
+ await flushTraceQueue();
103
110
 
104
111
  debug('Shutdown complete');
105
112
  };
@@ -122,6 +129,8 @@ async function shutdown() {
122
129
  debug('Shutting down SDK');
123
130
 
124
131
  await shutdownTracing();
132
+ // After OTel shutdown, best-effort drain any serialized spans still queued in transport.
133
+ await flushTraceQueue();
125
134
 
126
135
  // Stop MongoDB instrumentation
127
136
  stopMongoDBInstrumentation();
@@ -0,0 +1,123 @@
1
+ /**
2
+ * SpanExporter that ships sampled raw spans to SentienGuard trace ingest.
3
+ *
4
+ * This is intentionally "lossy": it enqueues serialized spans to an async transport
5
+ * and returns SUCCESS quickly to avoid blocking the app.
6
+ */
7
+
8
+ import { ExportResultCode, hrTimeToMilliseconds } from '@opentelemetry/core';
9
+ import { SpanStatusCode } from '@opentelemetry/api';
10
+ import { enqueueSpans } from './traceTransport.js';
11
+ import { getConfig } from './config.js';
12
+
13
/**
 * Convert an hrTime pair into a decimal string of nanoseconds.
 *
 * BigInt arithmetic is used so the result is not limited by Number precision.
 *
 * @param {Array} hrTime - `[seconds, nanoseconds]`; falsy parts count as 0.
 * @returns {string} Total nanoseconds as a string, or '' when `hrTime` is not
 *   a two-element array.
 */
function hrTimeToUnixNanoString(hrTime) {
  const isPair = Array.isArray(hrTime) && hrTime.length === 2;
  if (!isPair) {
    return '';
  }

  const [seconds, nanos] = hrTime;
  const nanosPerSecond = 1000000000n;
  const total = BigInt(seconds || 0) * nanosPerSecond + BigInt(nanos || 0);
  return total.toString();
}
20
+
21
/**
 * Map a span's status onto the string-coded shape used in the export payload.
 *
 * @param {object} span - Span-like object; `span.status` may be missing.
 * @returns {{code: string, message: string}} `code` is 'ERROR', 'OK' or
 *   'UNSET' (anything that is neither ERROR nor OK); `message` defaults to ''.
 */
function statusForSpan(span) {
  const status = span?.status;
  const message = status?.message || '';

  let label = 'UNSET';
  if (status?.code === SpanStatusCode.ERROR) {
    label = 'ERROR';
  } else if (status?.code === SpanStatusCode.OK) {
    label = 'OK';
  }

  return { code: label, message };
}
27
+
28
/**
 * Normalize span attributes into a plain object for serialization.
 *
 * Map-like containers are flattened into a new object; plain objects are
 * returned as-is; anything else yields an empty object.
 *
 * @param {*} attrs - Attribute container (Map-like, plain object, or other).
 * @returns {object} Plain key/value object (never null).
 */
function safeAttrs(attrs) {
  if (!attrs) return {};

  // Feature-check the method that is actually invoked. Testing only `get`
  // would throw a TypeError for a plain object that merely has a
  // function-valued `get` property but no `entries()`.
  const isMapLike = typeof attrs.get === 'function' && typeof attrs.entries === 'function';
  if (isMapLike) {
    const out = {};
    for (const [key, value] of attrs.entries()) out[key] = value;
    return out;
  }

  if (typeof attrs === 'object') return attrs;
  return {};
}
38
+
39
/**
 * Decide whether spans of a trace are exported, given a sampling rate.
 *
 * The decision is derived from the trace id so that every caller evaluating
 * the same id with the same rate reaches the same answer.
 *
 * @param {*} traceId - Trace id; its first 8 hex characters are used.
 * @param {*} sampleRate - Expected in [0, 1]. Missing or non-finite values
 *   keep everything; values <= 0 drop everything; values >= 1 keep everything.
 * @returns {boolean} True when the trace should be exported.
 */
function shouldSampleTraceId(traceId, sampleRate) {
  if (sampleRate == null) return true;

  const rate = Number(sampleRate);
  if (!Number.isFinite(rate) || rate >= 1) return true;
  if (rate <= 0) return false;

  // Fallback when the id cannot be bucketed deterministically.
  const coinFlip = () => Math.random() < rate;

  try {
    // First 8 hex chars = 32 bits, mapped onto [0, 1).
    const head = String(traceId).slice(0, 8);
    if (!/^[0-9a-f]{8}$/i.test(head)) return coinFlip();

    const bucket = (parseInt(head, 16) >>> 0) / 2 ** 32;
    return bucket < rate;
  } catch {
    return coinFlip();
  }
}
58
+
59
/**
 * Turn a finished span into the plain record shipped to trace ingest.
 *
 * @param {object} span - Span-like object exposing `spanContext()`,
 *   `startTime` / `endTime` hrTime pairs and the usual span fields.
 * @returns {object|null} The serialized record, or null when the span has no
 *   trace/span id or its timestamps cannot be converted.
 */
function serializeSpan(span) {
  const ctx = span?.spanContext?.();
  const hasIds = Boolean(ctx?.traceId) && Boolean(ctx?.spanId);
  if (!hasIds) return null;

  const { startTime, endTime } = span;
  const startNano = hrTimeToUnixNanoString(startTime);
  const endNano = hrTimeToUnixNanoString(endTime);
  if (!startNano || !endNano) return null;

  // Parent id is read from either the `parentSpanId` field or a
  // `parentSpanContext` object, whichever the span provides.
  const parentSpanId = span?.parentSpanId || span?.parentSpanContext?.spanId || null;
  const status = statusForSpan(span);

  // Both timestamps are valid two-element arrays here (guarded above);
  // clamp so a skewed clock never yields a negative duration.
  const elapsedMs = Math.max(0, hrTimeToMilliseconds(endTime) - hrTimeToMilliseconds(startTime));

  const kind = span.kind == null ? undefined : String(span.kind);
  const events = Array.isArray(span.events) ? span.events : [];
  const links = Array.isArray(span.links) ? span.links : [];

  return {
    trace_id: ctx.traceId,
    span_id: ctx.spanId,
    parent_span_id: parentSpanId,
    name: span.name || '',
    kind,
    start_time_unix_nano: startNano,
    end_time_unix_nano: endNano,
    status,
    attributes: safeAttrs(span.attributes),
    events,
    links,
    duration_ms: Math.round(elapsedMs)
  };
}
90
+
91
/**
 * Span exporter that serializes spans, applies trace-id based sampling and
 * hands the survivors to the async trace transport queue.
 */
export class SentienGuardTraceSpanExporter {
  /**
   * Serialize, sample and enqueue a batch of spans.
   *
   * Reports SUCCESS as soon as spans are enqueued (delivery happens later in
   * the transport); reports FAILED only if this method itself throws.
   *
   * @param {Iterable<object>} spans - Finished spans to export.
   * @param {Function} resultCallback - Receives `{ code, error? }`.
   */
  export(spans, resultCallback) {
    try {
      const sampleRate = getConfig()?.tracing?.sampleRate;

      // A single malformed span must not fail the rest of the batch.
      const toSampledRecord = (span) => {
        try {
          const record = serializeSpan(span);
          const keep = record && shouldSampleTraceId(record.trace_id, sampleRate);
          return keep ? record : null;
        } catch {
          return null;
        }
      };

      const kept = [];
      for (const span of spans) {
        const record = toSampledRecord(span);
        if (record) kept.push(record);
      }

      if (kept.length > 0) {
        enqueueSpans(kept);
      }

      resultCallback({ code: ExportResultCode.SUCCESS });
    } catch (err) {
      resultCallback({ code: ExportResultCode.FAILED, error: err });
    }
  }

  /**
   * Nothing to release here.
   *
   * @returns {Promise<void>} Already-resolved promise.
   */
  shutdown() {
    return Promise.resolve();
  }
}
121
+
122
+ export default SentienGuardTraceSpanExporter;
123
+
@@ -0,0 +1,210 @@
1
+ /**
2
+ * Trace Transport (raw spans)
3
+ *
4
+ * Rules:
5
+ * - Never block app requests.
6
+ * - Best-effort delivery; data loss is acceptable.
7
+ * - Drop under sustained failure or memory pressure.
8
+ */
9
+
10
+ import https from 'https';
11
+ import http from 'http';
12
+ import { debug, warn, getConfig, isEnabled } from './config.js';
13
+
14
+ let queue = [];
15
+ let scheduled = false;
16
+ let consecutiveFailures = 0;
17
+ let lastFailureAtMs = 0;
18
+ let recoveryTimer = null;
19
+
20
+ const MAX_CONSECUTIVE_FAILURES = 5;
21
+ const RECOVERY_BACKOFF_MS = 30_000;
22
+
23
/**
 * POST a JSON payload to the configured traces endpoint.
 *
 * @param {object} payload - JSON-serializable body.
 * @returns {Promise<{statusCode: number, data: string}>} Resolves on a 2xx
 *   response. Rejects with an Error when the payload exceeds
 *   `maxPayloadSize`, the endpoint URL is invalid, the request errors or
 *   times out, the response stream fails or closes early, or the status is
 *   not 2xx.
 */
function sendToBackend(payload) {
  return new Promise((resolve, reject) => {
    const cfg = getConfig();
    const data = JSON.stringify(payload);
    const byteLength = Buffer.byteLength(data);

    // Reuse the same payload size protection as metrics
    const maxBytes = cfg.maxPayloadSize || 1024 * 1024;
    if (byteLength > maxBytes) {
      reject(new Error('Payload too large'));
      return;
    }

    let url;
    try {
      url = new URL(cfg.tracesEndpoint);
    } catch {
      reject(new Error('Invalid traces endpoint URL'));
      return;
    }

    const isHttps = url.protocol === 'https:';
    const transport = isHttps ? https : http;

    const options = {
      hostname: url.hostname,
      port: url.port || (isHttps ? 443 : 80),
      path: url.pathname + url.search,
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Content-Length': byteLength,
        'X-APM-Key': cfg.apiKey,
        'X-Service': cfg.service,
        // NOTE(review): version string is hard-coded and does not track the
        // package version — confirm whether the backend relies on it.
        'User-Agent': '@sentienguard/apm/1.0.0'
      },
      timeout: 5000
    };

    const toError = (error) => (error instanceof Error ? error : new Error(String(error)));

    const req = transport.request(options, (res) => {
      let responseData = '';
      res.on('data', (chunk) => {
        responseData += chunk;
      });
      res.on('end', () => {
        if (res.statusCode >= 200 && res.statusCode < 300) {
          resolve({ statusCode: res.statusCode, data: responseData });
        } else {
          reject(new Error(`HTTP ${res.statusCode}: ${responseData}`));
        }
      });
      // If the connection drops after headers arrive, 'end' never fires.
      // Without these handlers the promise would stay pending forever and
      // any caller awaiting it (e.g. a shutdown flush) would hang.
      res.on('error', (error) => {
        reject(toError(error));
      });
      res.on('close', () => {
        if (!res.complete) {
          reject(new Error('Response closed before completion'));
        }
      });
    });

    req.on('error', (error) => {
      reject(toError(error));
    });

    req.on('timeout', () => {
      // 'timeout' only signals socket idleness; the request must be torn
      // down explicitly.
      req.destroy();
      reject(new Error('Request timeout'));
    });

    req.write(data);
    req.end();
  });
}
87
+
88
/**
 * Send one batch of serialized spans and update the failure bookkeeping.
 *
 * Success resets the failure counters. Each failure increments
 * `consecutiveFailures`; once it reaches MAX_CONSECUTIVE_FAILURES a single
 * unref'd recovery timer is armed which, after RECOVERY_BACKOFF_MS, clears
 * the counters and reschedules a drain if spans are queued.
 *
 * Never rejects because of a send failure: errors are logged via `warn`.
 *
 * @param {Array<object>} batch - Serialized spans to send.
 * @returns {Promise<void>}
 */
async function flushOnce(batch) {
  if (!isEnabled() || !batch.length) return;

  const { service, environment } = getConfig();
  const payload = { service, environment, spans: batch };

  try {
    await sendToBackend(payload);
    consecutiveFailures = 0;
    lastFailureAtMs = 0;
    debug(`Trace flush ok: spans=${batch.length}`);
    return;
  } catch (err) {
    consecutiveFailures += 1;
    lastFailureAtMs = Date.now();
    warn(`Trace flush failed (attempt ${consecutiveFailures}): ${err.message}`);
  }

  if (consecutiveFailures < MAX_CONSECUTIVE_FAILURES) return;

  // Stop retrying aggressively; drop future spans until backend recovers.
  warn('Trace flush: max failures reached; dropping spans under backpressure');
  if (recoveryTimer) return;

  const onRecoveryWindowElapsed = () => {
    recoveryTimer = null;
    consecutiveFailures = 0;
    lastFailureAtMs = 0;
    debug('Trace flush: recovery window elapsed; retrying delivery');
    if (queue.length && !scheduled) {
      scheduled = true;
      setImmediate(drainQueue);
    }
  };
  recoveryTimer = setTimeout(onRecoveryWindowElapsed, RECOVERY_BACKOFF_MS);

  // Allow process to exit naturally.
  if (typeof recoveryTimer.unref === 'function') recoveryTimer.unref();
}
128
+
129
/**
 * Scheduled worker: take one batch off the queue and start sending it.
 *
 * While the failure threshold is reached the whole queue is discarded
 * instead. `flushOnce` is not awaited, so this returns immediately; if spans
 * remain, another run is scheduled with `setImmediate`.
 */
function drainQueue() {
  scheduled = false;
  const batchLimit = getConfig().tracing?.maxBatchSize || 256;

  // Backend considered unhealthy: drop queued spans to protect app memory.
  // The recovery timer resets the failure count, after which normal
  // flushing resumes.
  const backendUnhealthy = consecutiveFailures >= MAX_CONSECUTIVE_FAILURES;
  if (backendUnhealthy) {
    queue = [];
    return;
  }

  // Start at most one batch per tick so each run stays cheap.
  const batch = queue.slice(0, batchLimit);
  queue = queue.slice(batch.length);
  void flushOnce(batch);

  if (queue.length === 0) return;
  scheduled = true;
  setImmediate(drainQueue);
}
152
+
153
/**
 * Add serialized spans to the in-memory export queue and schedule a drain.
 *
 * Drop-on-pressure: only as many spans as fit under `tracing.maxQueueSize`
 * (default 2048) are accepted; the remainder is silently discarded.
 *
 * @param {Array<object>} serializedSpans - Spans to queue. Non-arrays and
 *   empty arrays are ignored, as is every call while the SDK is disabled.
 */
export function enqueueSpans(serializedSpans) {
  const capacity = getConfig().tracing?.maxQueueSize || 2048;

  const hasSpans = Array.isArray(serializedSpans) && serializedSpans.length > 0;
  if (!hasSpans || !isEnabled()) return;

  // Drop-on-pressure.
  const room = capacity - queue.length;
  if (room <= 0) return;
  queue.push(...serializedSpans.slice(0, room));

  if (scheduled) return;
  scheduled = true;
  setImmediate(drainQueue);
}
174
+
175
/**
 * Test helper: restore this module's queue and failure state to its initial
 * values and cancel a pending recovery timer, if any.
 */
export function resetTraceQueueForTests() {
  if (recoveryTimer) {
    clearTimeout(recoveryTimer);
    recoveryTimer = null;
  }

  lastFailureAtMs = 0;
  consecutiveFailures = 0;
  scheduled = false;
  queue = [];
}
185
+
186
/**
 * Best-effort flush of queued spans (e.g. during shutdown).
 * Never throws; safe to call even if tracing is disabled.
 *
 * Batches are sent one after another until the queue is empty or the time
 * budget is used up. The budget also bounds the batch currently in flight:
 * waiting on a batch is abandoned once the deadline passes, so a slow or
 * unresponsive backend cannot hold up shutdown beyond `timeoutMs`. The
 * abandoned request itself is not cancelled.
 *
 * @param {object} [options]
 * @param {number} [options.timeoutMs=1500] - Overall time budget in ms;
 *   non-numeric or negative values are treated as 0.
 * @returns {Promise<void>}
 */
export async function flushTraceQueue({ timeoutMs = 1500 } = {}) {
  try {
    if (!isEnabled()) return;
    const deadline = Date.now() + Math.max(0, Number(timeoutMs) || 0);
    const cfg = getConfig();
    const maxBatch = cfg.tracing?.maxBatchSize || 256;

    while (queue.length) {
      const remainingMs = deadline - Date.now();
      if (remainingMs <= 0) break;

      const batch = queue.slice(0, maxBatch);
      queue = queue.slice(batch.length);
      // Attempt flush even if failures happened earlier; this is a single best-effort drain.
      consecutiveFailures = 0;
      lastFailureAtMs = 0;

      // Checking the deadline only between batches would let one slow send
      // overrun the budget, so race the send against the remaining time.
      let timer;
      const budgetElapsed = new Promise((resolve) => {
        timer = setTimeout(resolve, remainingMs);
      });
      try {
        // eslint-disable-next-line no-await-in-loop
        await Promise.race([flushOnce(batch), budgetElapsed]);
      } finally {
        clearTimeout(timer);
      }
    }
  } catch {
    // Deliberately swallowed: shutdown flushing is best-effort and must never throw.
  }
}
210
+
package/src/tracing.js CHANGED
@@ -10,8 +10,10 @@ import { SEMRESATTRS_SERVICE_NAME, SEMRESATTRS_DEPLOYMENT_ENVIRONMENT } from '@o
10
10
  import { W3CTraceContextPropagator } from '@opentelemetry/core';
11
11
  import { HttpInstrumentation } from '@opentelemetry/instrumentation-http';
12
12
  import { ExpressInstrumentation } from '@opentelemetry/instrumentation-express';
13
+ import { AlwaysOnSampler, BatchSpanProcessor } from '@opentelemetry/sdk-trace-base';
13
14
  import { getConfig, debug } from './config.js';
14
15
  import { SentienGuardSpanExporter } from './spanExporter.js';
16
+ import { SentienGuardTraceSpanExporter } from './traceSpanExporter.js';
15
17
 
16
18
  let sdk = null;
17
19
  let tracingActive = false;
@@ -55,7 +57,8 @@ export function startTracing() {
55
57
  [SEMRESATTRS_DEPLOYMENT_ENVIRONMENT]: cfg.environment
56
58
  });
57
59
 
58
- const traceExporter = new SentienGuardSpanExporter();
60
+ const metricsExporter = new SentienGuardSpanExporter();
61
+ const traceExporter = new SentienGuardTraceSpanExporter();
59
62
 
60
63
  const httpInstrumentation = new HttpInstrumentation({
61
64
  ignoreOutgoingRequestHook: (requestOptions) => {
@@ -68,9 +71,15 @@ export function startTracing() {
68
71
 
69
72
  sdk = new NodeSDK({
70
73
  resource,
71
- traceExporter,
74
+ // Important: keep span recording ON so APM metrics derived from spans are not undersampled.
75
+ // Raw trace export sampling is handled inside SentienGuardTraceSpanExporter instead.
76
+ sampler: new AlwaysOnSampler(),
72
77
  textMapPropagator: new W3CTraceContextPropagator(),
73
78
  instrumentations: [httpInstrumentation, expressInstrumentation],
79
+ spanProcessors: [
80
+ new BatchSpanProcessor(metricsExporter),
81
+ new BatchSpanProcessor(traceExporter)
82
+ ],
74
83
  autoDetectResources: false
75
84
  });
76
85