@omen.foundation/node-microservice-runtime 0.1.65 → 0.1.67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1267 +0,0 @@
1
- import { spawn, ChildProcess } from 'child_process';
2
- import { createWriteStream, existsSync, chmodSync, mkdirSync } from 'fs';
3
- import { join, dirname } from 'path';
4
- import { pipeline } from 'stream/promises';
5
- import { createGunzip } from 'zlib';
6
- import { createHash } from 'node:crypto';
7
- import { createServer } from 'net';
8
- import type { Logger } from 'pino';
9
- import pino from 'pino';
10
- import dgram from 'dgram';
11
- import deasync from 'deasync';
12
- import type { EnvironmentConfig } from './types.js';
13
- import { hostToHttpUrl } from './utils/urls.js';
14
-
15
- // Protocol is httpprotobuf
16
-
17
/**
 * Shape of a collector announcement after its UDP payload has been JSON-parsed.
 */
interface CollectorDiscoveryEntry {
  /** Collector version; discovery only accepts entries equal to COLLECTOR_VERSION. */
  version: string;
  /** State reported by the collector; discovery looks for the value 'READY'. */
  status: string;
  /** Process id reported by the collector. */
  pid: number;
  /** OTLP endpoint advertised by the collector. */
  otlpEndpoint: string;
}
23
-
24
/**
 * Snapshot of the collector's state as reported by isCollectorRunning().
 */
export interface CollectorStatus {
  /** True when a collector was found (owned child process or UDP discovery). */
  isRunning: boolean;
  /** True when the collector is considered ready to accept data. */
  isReady: boolean;
  /** Collector process id; 0 when no collector was found. */
  pid: number;
  /** OTLP endpoint of the collector, when one is known. */
  otlpEndpoint?: string;
  /** Collector version, when one is known. */
  version?: string;
}
31
-
32
/** Collector release this runtime targets; matches the C# collector version (from collector-version.json). */
const COLLECTOR_VERSION = '1.0.1';

/** Base URL under which the artifacts for COLLECTOR_VERSION are published. */
const COLLECTOR_DOWNLOAD_BASE = `https://collectors.beamable.com/version/${COLLECTOR_VERSION}`;

/** Discovery port used when BEAM_COLLECTOR_DISCOVERY_PORT is unset or unusable. */
const DEFAULT_DISCOVERY_PORT = 8688;

/**
 * Parses a discovery-port override.
 *
 * A bare parseInt() would turn a non-numeric override into NaN, so anything that is not
 * an integer in the range 1-65535 falls back to DEFAULT_DISCOVERY_PORT.
 *
 * @param raw raw environment value, possibly undefined or empty
 * @returns a usable port number
 */
function parseDiscoveryPort(raw: string | undefined): number {
  const parsed = Number.parseInt(raw ?? '', 10);
  const usable = Number.isInteger(parsed) && parsed > 0 && parsed <= 65535;
  return usable ? parsed : DEFAULT_DISCOVERY_PORT;
}

/** Collector discovery port, overridable through BEAM_COLLECTOR_DISCOVERY_PORT. */
const DISCOVERY_PORT = parseDiscoveryPort(process.env.BEAM_COLLECTOR_DISCOVERY_PORT);
35
-
36
/**
 * Asks the operating system for an unused TCP port (mirrors C# PortUtil.FreeTcpPort()).
 *
 * A throwaway server listens on port 0, the port it was given is read back, and the
 * server is closed before the promise resolves.
 *
 * @returns the assigned port, or 9090 when the bound address could not be read;
 *          rejects with the server's error if it emits one
 */
function findFreePort(): Promise<number> {
  return new Promise<number>((resolve, reject) => {
    const probe = createServer();

    probe.on('error', (err) => {
      reject(err);
    });

    probe.listen(0, () => {
      const bound = probe.address();
      let port = 0;
      if (bound !== null && typeof bound === 'object') {
        port = bound.port;
      }
      // 9090 is the default used when no port could be read from the bound address.
      probe.close(() => resolve(port > 0 ? port : 9090));
    });
  });
}
59
-
60
// Module-level collector state. Keeping the child process here holds a reference to it
// (preventing GC) and lets later status checks inspect it.
let globalCollectorProcess: ChildProcess | null = null;
// Error recorded for a failed collector start, if any.
let globalCollectorStartError: string | null = null;
// Exit code recorded for the collector process, if any.
let globalCollectorExitCode: number | null = null;
// Collected collector stderr lines; isCollectorRunning() scans them for readiness markers.
let globalCollectorStderr: string[] = [];
// Non-null while a collector startup is in progress, so duplicate starts can be avoided.
let globalCollectorStartupPromise: Promise<string | null> | null = null;
// Tracks errors from discoverOrStartCollector.
let globalCollectorInitError: string | null = null;
68
-
69
/**
 * Credentials payload returned by the Beamable API for writing to ClickHouse.
 * endpoint, username and password are required by fetchClickHouseCredentials().
 */
interface ClickHouseCredentials {
  /** ClickHouse endpoint. */
  endpoint: string;
  /** Optional expiry of the credentials, as sent by the API. */
  expiresAt?: string;
  /** ClickHouse password. */
  password: string;
  /** ClickHouse username. */
  username: string;
}
78
-
79
/**
 * Computes the Beamable signature for a signed request.
 *
 * The signature is Base64(MD5(secret + pid + version + uriPathAndQuery + body)),
 * matching C# SignedRequesterHelper.CalculateSignature. The body is only appended
 * when it is a non-empty string.
 *
 * @param pid project id included in the signed data
 * @param secret shared secret included in the signed data
 * @param uriPathAndQuery request path plus query string
 * @param body request body, or null when there is none
 * @param version signature version, '1' by default
 * @returns the Base64-encoded MD5 digest
 */
function calculateSignature(
  pid: string,
  secret: string,
  uriPathAndQuery: string,
  body: string | null = null,
  version: string = '1'
): string {
  const signedParts = [secret, pid, version, uriPathAndQuery];
  if (body) {
    signedParts.push(body);
  }

  return createHash('md5').update(signedParts.join(''), 'utf8').digest('base64');
}
100
-
101
/**
 * Returns the path and query string portion of a URL.
 *
 * @param url absolute URL, or any other string
 * @returns pathname + search for a parseable absolute URL; otherwise the input unchanged
 */
function getPathAndQuery(url: string): string {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    // Not an absolute URL (e.g. already a bare path): hand it back untouched.
    return url;
  }
  return `${parsed.pathname}${parsed.search}`;
}
113
-
114
/**
 * Fetches ClickHouse credentials from the Beamable API.
 *
 * Issues GET /api/beamo/otel/auth/writer/config as a signed request (X-BEAM-SIGNATURE
 * header) using the SECRET environment variable. The whole exchange (request plus body
 * read) is bounded by a 30 second timeout so a stalled API cannot hang the caller forever.
 *
 * @param env environment configuration; host, cid and pid are read from it
 * @returns the credentials returned by the API
 * @throws Error when SECRET is missing, the request times out or fails, the response
 *         status is not OK, or the payload lacks endpoint/username/password
 */
export async function fetchClickHouseCredentials(
  env: EnvironmentConfig
): Promise<ClickHouseCredentials> {
  const requestTimeoutMs = 30000;
  const apiUrl = hostToHttpUrl(env.host);
  const uriPath = '/api/beamo/otel/auth/writer/config';
  const configUrl = new URL(uriPath, apiUrl).toString();

  // Create a minimal logger for verbose output
  const verboseLogger = pino({ name: 'fetch-credentials', level: 'info' }, process.stdout);
  verboseLogger.info(`[Credentials] Starting credential fetch from ${configUrl}...`);

  // Get secret from environment (required for signed requests)
  const secret = process.env.SECRET;
  if (!secret) {
    verboseLogger.error('[Credentials] SECRET environment variable is missing');
    throw new Error('SECRET environment variable is required to fetch ClickHouse credentials');
  }
  verboseLogger.info('[Credentials] SECRET found, calculating signature...');

  // Calculate signature for signed request (matching C# HttpSignedRequester)
  const pathAndQuery = getPathAndQuery(uriPath);
  const signature = calculateSignature(env.pid, secret, pathAndQuery, null, '1');
  verboseLogger.info('[Credentials] Signature calculated, making API request...');

  // Build headers with signed authentication (matching C# HttpSignedRequester)
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
    Accept: 'application/json',
    'X-BEAM-SCOPE': `${env.cid}.${env.pid}`,
    'X-BEAM-SIGNATURE': signature,
  };

  // The abort timer stays armed until the body has been read, not just the headers.
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), requestTimeoutMs);

  try {
    const fetchStartTime = Date.now();
    const response = await fetch(configUrl, {
      method: 'GET',
      headers,
      signal: controller.signal,
    });
    const fetchElapsed = Date.now() - fetchStartTime;
    verboseLogger.info(`[Credentials] API request completed in ${fetchElapsed}ms, status: ${response.status}`);

    if (!response.ok) {
      const errorText = await response.text().catch(() => 'Unknown error');
      verboseLogger.error(`[Credentials] API request failed: ${response.status} ${response.statusText}`);
      throw new Error(`Failed to fetch ClickHouse credentials from ${configUrl}: ${response.status} ${response.statusText} - ${errorText.substring(0, 200)}`);
    }

    verboseLogger.info('[Credentials] Parsing response JSON...');
    const credentials = (await response.json()) as ClickHouseCredentials | null;

    // A literal `null` body is valid JSON, so guard before touching its fields.
    if (!credentials || !credentials.endpoint || !credentials.username || !credentials.password) {
      verboseLogger.error('[Credentials] Invalid response: missing required fields');
      throw new Error('Invalid ClickHouse credentials response: missing required fields');
    }

    verboseLogger.info(`[Credentials] Successfully fetched credentials (endpoint: ${credentials.endpoint}, username: ${credentials.username})`);
    return credentials;
  } catch (error) {
    // The signal is only ever aborted by our own timer, so this identifies a timeout.
    if (controller.signal.aborted) {
      verboseLogger.error(`[Credentials] API request timed out after ${requestTimeoutMs}ms`);
      throw new Error(`Timeout fetching ClickHouse credentials from ${configUrl} (${requestTimeoutMs}ms limit exceeded)`);
    }
    throw error;
  } finally {
    clearTimeout(timeoutId);
  }
}
176
-
177
/**
 * Resolves the directory that holds the collector files for COLLECTOR_VERSION
 * (similar to C# LocalApplicationData/beam/collectors/version).
 *
 * When /opt/beam/collectors exists (collector pre-installed in the Docker image, which
 * avoids a runtime download), the version directory beneath it is returned. Note that
 * only the root is checked for existence, not the version directory itself. Otherwise
 * the path falls under TMPDIR, TMP or /tmp, for local development without Docker.
 *
 * @returns the version-specific collector directory
 */
function getCollectorStoragePath(): string {
  const productionRoot = '/opt/beam/collectors';
  if (existsSync(productionRoot)) {
    return join(productionRoot, COLLECTOR_VERSION);
  }

  const tempRoot = process.env.TMPDIR || process.env.TMP || '/tmp';
  return join(tempRoot, 'beam', 'collectors', COLLECTOR_VERSION);
}
196
-
197
/**
 * Maps the current platform/architecture to the collector binary file name.
 *
 * Names follow the C# convention collector-{osArchSuffix}, e.g. "collector-linux-amd64"
 * (not "beamable-collector-linux-amd64").
 *
 * @returns the binary file name for this process's platform and architecture
 * @throws Error when the platform/architecture pair has no collector build
 */
function getCollectorBinaryName(): string {
  const { platform, arch } = process;

  const binaryByTarget = new Map<string, string>([
    ['linux/x64', 'collector-linux-amd64'],
    ['linux/arm64', 'collector-linux-arm64'],
    ['darwin/x64', 'collector-darwin-amd64'],
    ['darwin/arm64', 'collector-darwin-arm64'],
    ['win32/x64', 'collector-windows-amd64.exe'],
  ]);

  const binaryName = binaryByTarget.get(`${platform}/${arch}`);
  if (binaryName === undefined) {
    throw new Error(`Unsupported platform: ${platform} ${arch}`);
  }
  return binaryName;
}
220
-
221
/**
 * Converts a rejected fetch() into one descriptive Error for the given URL.
 * Classification order: abort (timeout), system error code, keywords in the message,
 * then a generic network error. The result is never wrapped a second time.
 */
function describeDownloadFailure(url: string, fetchError: unknown): Error {
  if (!(fetchError instanceof Error)) {
    return new Error(`Network error downloading ${url}: ${String(fetchError)}`);
  }

  // Reads an optional `code` property without resorting to `any`.
  const codeOf = (value: unknown): unknown =>
    typeof value === 'object' && value !== null ? (value as { code?: unknown }).code : undefined;

  // fetch() usually hides the real reason in `cause`; surface it in the message.
  const cause: unknown = (fetchError as { cause?: unknown }).cause;
  let underlyingError = '';
  if (cause instanceof Error) {
    const causeCode = codeOf(cause);
    underlyingError = ` (cause: ${cause.name} - ${cause.message}${causeCode ? ` [code: ${causeCode}]` : ''})`;
  } else if (codeOf(cause)) {
    underlyingError = ` (cause code: ${codeOf(cause)})`;
  }

  if (fetchError.name === 'AbortError') {
    return new Error(`Timeout downloading ${url} (30s limit exceeded)${underlyingError}`);
  }

  // Prefer Node.js system error codes when available.
  const errorCode = codeOf(fetchError) || codeOf(cause);
  if (errorCode === 'ENOTFOUND' || errorCode === 'EAI_AGAIN') {
    return new Error(`DNS resolution failed for ${url} (code: ${errorCode}). Check network connectivity and DNS settings.`);
  }
  if (errorCode === 'ECONNREFUSED') {
    return new Error(`Connection refused to ${url} (code: ${errorCode}). Server may be down or firewall blocking.`);
  }
  if (errorCode === 'ETIMEDOUT' || errorCode === 'ECONNRESET') {
    return new Error(`Connection timeout/reset to ${url} (code: ${errorCode}). Network may be slow or unreachable.`);
  }
  if (errorCode === 'CERT_HAS_EXPIRED' || errorCode === 'UNABLE_TO_VERIFY_LEAF_SIGNATURE') {
    return new Error(`SSL/TLS certificate error connecting to ${url} (code: ${errorCode}): ${fetchError.message}`);
  }

  // Fall back to keywords in the error message.
  const errorMsg = fetchError.message.toLowerCase();
  if (errorMsg.includes('enotfound') || errorMsg.includes('dns')) {
    return new Error(`DNS resolution failed for ${url}. Check network connectivity and DNS settings.${underlyingError}`);
  }
  if (errorMsg.includes('econnrefused') || errorMsg.includes('connection refused')) {
    return new Error(`Connection refused to ${url}. Server may be down or firewall blocking.${underlyingError}`);
  }
  if (errorMsg.includes('etimedout') || errorMsg.includes('timeout')) {
    return new Error(`Connection timeout to ${url}. Network may be slow or unreachable.${underlyingError}`);
  }
  if (errorMsg.includes('certificate') || errorMsg.includes('ssl') || errorMsg.includes('tls')) {
    return new Error(`SSL/TLS error connecting to ${url}: ${fetchError.message}${underlyingError}`);
  }
  return new Error(`Network error downloading ${url}: ${fetchError.name} - ${fetchError.message}${underlyingError}`);
}

/**
 * Downloads a gzip-compressed file and writes the decompressed content to outputPath.
 *
 * The data is streamed into `<outputPath>.partial` and renamed onto outputPath only after
 * the stream finished, so an interrupted download never leaves a truncated file that a
 * later existence check would mistake for a complete one. The partial file is removed on
 * failure.
 *
 * @param url location of the .gz resource
 * @param outputPath destination of the decompressed file; its directory is created if missing
 * @param makeExecutable when true (and not on Windows) the file is chmod'ed to 0755;
 *        a chmod failure is logged but does not fail the download
 * @throws Error describing the network failure, a non-OK HTTP status, a missing response
 *         body, or a stream/file-system failure
 */
async function downloadAndDecompressGzip(url: string, outputPath: string, makeExecutable: boolean = false): Promise<void> {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 30000); // 30 second timeout

  let response: Response;
  try {
    response = await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': 'Beamable-Node-Microservice-Runtime/1.0',
      },
    });
  } catch (fetchError) {
    throw describeDownloadFailure(url, fetchError);
  } finally {
    clearTimeout(timeoutId);
  }

  if (!response.ok) {
    const errorText = await response.text().catch(() => '');
    throw new Error(`HTTP ${response.status} downloading ${url}: ${response.statusText}${errorText ? ` - ${errorText.substring(0, 200)}` : ''}`);
  }
  if (!response.body) {
    throw new Error(`Empty response body downloading ${url}`);
  }

  const dir = dirname(outputPath);
  if (!existsSync(dir)) {
    mkdirSync(dir, { recursive: true });
  }

  // Local import: rename/rm are only needed by this function.
  const { rename, rm } = await import('node:fs/promises');
  const partialPath = `${outputPath}.partial`;

  try {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- web ReadableStream vs Node stream typing
    await pipeline(response.body as any, createGunzip(), createWriteStream(partialPath));

    // Set the mode before the rename so the final path is never visible non-executable.
    if (makeExecutable && process.platform !== 'win32') {
      try {
        chmodSync(partialPath, 0o755);
      } catch (error) {
        console.error(`Failed to make ${outputPath} executable:`, error);
      }
    }

    await rename(partialPath, outputPath);
  } catch (error) {
    // Best-effort cleanup; the original failure is what the caller needs to see.
    await rm(partialPath, { force: true }).catch(() => undefined);
    throw error;
  }
}
328
-
329
/**
 * Locates the collector binary and its ClickHouse config, downloading whichever is missing.
 *
 * @param allowDownload when false, missing files are not downloaded
 * @param logger optional logger for progress messages
 * @returns the paths of the binary and config, each null if the file does not exist
 *          once resolution has finished
 * @throws Error when a download fails (both downloads run in parallel)
 */
async function resolveCollector(allowDownload: boolean = true, logger?: Logger): Promise<{ binaryPath: string | null; configPath: string | null }> {
  const storageDir = getCollectorStoragePath();
  const binaryName = getCollectorBinaryName();
  const configName = 'clickhouse-config.yaml';

  const binaryPath = join(storageDir, binaryName);
  const configPath = join(storageDir, configName);

  const pending: Array<{ url: string; path: string; executable: boolean }> = [];

  // Queues a download for a missing file, or reports where an existing one was found.
  const plan = (kind: 'binary' | 'config', fileName: string, path: string): void => {
    const title = kind === 'binary' ? 'Binary' : 'Config';
    if (existsSync(path)) {
      logger?.info(`[Collector] ${title} found at: ${path}`);
      return;
    }
    if (!allowDownload) {
      return;
    }
    const url = `${COLLECTOR_DOWNLOAD_BASE}/${fileName}.gz`;
    pending.push({ url, path, executable: kind === 'binary' });
    logger?.info(`[Collector] Will download ${kind} from: ${url}`);
  };

  plan('binary', binaryName, binaryPath);
  plan('config', configName, configPath);

  // Binary and config are fetched simultaneously for faster startup.
  if (pending.length > 0) {
    logger?.info(`[Collector] Downloading ${pending.length} item(s) in parallel...`);

    const fetchItem = async (item: { url: string; path: string; executable: boolean }): Promise<void> => {
      try {
        logger?.info(`[Collector] Downloading ${item.url}...`);
        await downloadAndDecompressGzip(item.url, item.path, item.executable);
        logger?.info(`[Collector] Downloaded to ${item.path}`);
      } catch (error) {
        const errorMsg = error instanceof Error ? error.message : String(error);
        logger?.error(`[Collector] Failed to download ${item.url}: ${errorMsg}`);
        // An error that already names the URL carries enough context; rethrow it untouched.
        if (error instanceof Error && errorMsg.includes(item.url)) {
          throw error;
        }
        throw new Error(`Failed to download collector ${item.executable ? 'binary' : 'config'} from ${item.url}: ${errorMsg}`);
      }
    };

    await Promise.all(pending.map(fetchItem));
  }

  return {
    binaryPath: existsSync(binaryPath) ? binaryPath : null,
    configPath: existsSync(configPath) ? configPath : null,
  };
}
387
-
388
/**
 * Listens for a collector announcement over UDP.
 *
 * The socket is bound to DISCOVERY_PORT with address reuse enabled. Binding without a
 * port would get a random ephemeral port, on which datagrams sent to the discovery
 * port can never arrive. NOTE(review): this assumes the collector broadcasts to
 * DISCOVERY_PORT, as the constant's name suggests - confirm against the collector.
 *
 * The promise resolves with the first announcement whose version equals
 * COLLECTOR_VERSION and whose status is 'READY', or with null on timeout or socket
 * error. It never rejects.
 *
 * @param timeoutMs how long to listen after the socket is bound; a safety timer fires
 *        100ms later in case binding never completes
 * @returns the matching discovery entry, or null
 */
function discoverCollectorViaUDP(timeoutMs: number = 1000): Promise<CollectorDiscoveryEntry | null> {
  return new Promise((resolve) => {
    // reuseAddr lets several processes on one host listen on the discovery port at once.
    const socket = dgram.createSocket({ type: 'udp4', reuseAddr: true });
    const discovered: CollectorDiscoveryEntry[] = [];
    let timeout: NodeJS.Timeout | undefined; // armed only once the socket is bound
    let resolved = false; // Guard to prevent double resolution
    const startTime = Date.now();

    const firstDiscovered = (): CollectorDiscoveryEntry | null =>
      discovered.length > 0 ? discovered[0] : null;

    // Single exit path: stops timers, tears the socket down, resolves exactly once.
    const finish = (result: CollectorDiscoveryEntry | null, logResult: boolean = true): void => {
      if (resolved) {
        return;
      }
      resolved = true;
      clearTimeout(timeout);
      clearTimeout(safetyTimeout);
      try {
        socket.removeAllListeners();
        socket.close();
      } catch {
        // Ignore: the socket may never have been opened or is already closed.
      }
      if (logResult) {
        console.log(`[UDP Discovery] Resolving promise with result: ${result ? 'found' : 'null'}`);
      }
      resolve(result);
    };

    // Safety timeout - force resolve after timeout + 100ms buffer
    const safetyTimeout = setTimeout(() => {
      if (!resolved) {
        console.log(`[UDP Discovery] Safety timeout triggered - forcing resolution`);
        finish(firstDiscovered(), false);
      }
    }, timeoutMs + 100);

    socket.on('message', (msg) => {
      try {
        const message = JSON.parse(msg.toString()) as CollectorDiscoveryEntry;
        // Check if version matches
        if (message.version === COLLECTOR_VERSION && message.status === 'READY') {
          discovered.push(message);
          const elapsed = Date.now() - startTime;
          console.log(`[UDP Discovery] Found collector after ${elapsed}ms: ${message.otlpEndpoint}`);
          // Resolve immediately when collector found
          finish(discovered[0]);
        }
      } catch {
        // Ignore datagrams that are not valid JSON objects.
      }
    });

    socket.on('error', (err) => {
      const elapsed = Date.now() - startTime;
      console.log(`[UDP Discovery] Socket error after ${elapsed}ms: ${err.message}`);
      finish(null);
    });

    socket.bind(DISCOVERY_PORT, () => {
      // The safety timer may already have closed the socket; do not touch it then.
      if (resolved) {
        return;
      }
      socket.setBroadcast(true);
      console.log(`[UDP Discovery] Listening for collector broadcasts (timeout: ${timeoutMs}ms)...`);

      timeout = setTimeout(() => {
        const elapsed = Date.now() - startTime;
        if (discovered.length > 0) {
          console.log(`[UDP Discovery] Timeout after ${elapsed}ms, found ${discovered.length} collector(s)`);
        } else {
          console.log(`[UDP Discovery] Timeout after ${elapsed}ms, no collector found`);
        }
        finish(firstDiscovered());
      }, timeoutMs);
    });
  });
}
469
-
470
/**
 * Reports which ClickHouse credential variables are currently present in process.env.
 *
 * @returns presence flags for BEAM_CLICKHOUSE_ENDPOINT / _USERNAME / _PASSWORD, and
 *          source 'environment' when all three are non-empty, otherwise 'missing'
 */
export function getClickHouseCredentialsStatus(): {
  hasEndpoint: boolean;
  hasUsername: boolean;
  hasPassword: boolean;
  source: 'environment' | 'api' | 'missing';
} {
  const {
    BEAM_CLICKHOUSE_ENDPOINT: endpoint,
    BEAM_CLICKHOUSE_USERNAME: username,
    BEAM_CLICKHOUSE_PASSWORD: password,
  } = process.env;

  const hasEndpoint = Boolean(endpoint);
  const hasUsername = Boolean(username);
  const hasPassword = Boolean(password);
  const allPresent = hasEndpoint && hasUsername && hasPassword;

  return {
    hasEndpoint,
    hasUsername,
    hasPassword,
    source: allPresent ? 'environment' : 'missing',
  };
}
499
-
500
/**
 * Determines whether a collector is running and ready.
 *
 * First consults the collector process started by this module: if it is alive and its
 * stderr contains a readiness marker, it is reported as ready, using the endpoint from
 * UDP discovery when available and BEAM_OTLP_HTTP_ENDPOINT (default '0.0.0.0:4318')
 * otherwise. If that yields nothing, falls back to plain UDP discovery.
 *
 * @returns the collector status; all-false with pid 0 when no collector was found
 */
export async function isCollectorRunning(): Promise<CollectorStatus> {
  if (globalCollectorProcess) {
    try {
      // Alive means: no exit code recorded and not killed.
      const alive =
        globalCollectorProcess.exitCode === null && globalCollectorProcess.killed === false;
      const stderrLog = alive ? globalCollectorStderr.join('\n') : '';
      const sawReadyMarker = ['Everything is ready', 'Begin running and processing data'].some(
        (marker) => stderrLog.includes(marker)
      );

      if (alive && sawReadyMarker) {
        // Prefer the endpoint the collector announces; short timeout since it is known to run.
        try {
          const announced = await discoverCollectorViaUDP(1000);
          if (announced && announced.status === 'READY') {
            return {
              isRunning: true,
              isReady: true,
              pid: announced.pid || globalCollectorProcess.pid || 0,
              otlpEndpoint: announced.otlpEndpoint,
              version: announced.version,
            };
          }
        } catch {
          // Discovery failed; the readiness marker is still enough, use the fallback below.
        }

        // Alive with a readiness marker: report ready with the configured endpoint.
        return {
          isRunning: true,
          isReady: true,
          pid: globalCollectorProcess.pid || 0,
          otlpEndpoint: process.env.BEAM_OTLP_HTTP_ENDPOINT || '0.0.0.0:4318',
          version: COLLECTOR_VERSION,
        };
      }
    } catch {
      // The process reference may be gone or invalid; continue with plain discovery.
    }
  }

  // No usable owned process: rely on UDP discovery alone.
  try {
    const discoveryStartedAt = Date.now();
    console.log('[UDP Discovery] Starting UDP discovery in isCollectorRunning()...');
    console.log('[UDP Discovery] Calling discoverCollectorViaUDP(1000)...');
    const discovered = await discoverCollectorViaUDP(1000); // Match C#: 1 second max (10 attempts × 100ms)
    const udpElapsed = Date.now() - discoveryStartedAt;
    console.log(`[UDP Discovery] UDP discovery promise resolved after ${udpElapsed}ms, result: ${discovered ? 'found' : 'null'}`);

    if (discovered) {
      console.log(`[UDP Discovery] Found collector in ${udpElapsed}ms: ${discovered.otlpEndpoint}`);
      return {
        isRunning: true,
        isReady: discovered.status === 'READY',
        pid: discovered.pid,
        otlpEndpoint: discovered.otlpEndpoint,
        version: discovered.version,
      };
    }

    console.log(`[UDP Discovery] No collector found after ${udpElapsed}ms, returning not running status`);
  } catch (error) {
    // Discovery failed, collector probably not running
    const errorMsg = error instanceof Error ? error.message : String(error);
    console.log(`[UDP Discovery] Error in isCollectorRunning(): ${errorMsg}`);
    if (error instanceof Error && error.stack) {
      console.log(`[UDP Discovery] Error stack: ${error.stack}`);
    }
  }

  console.log('[UDP Discovery] Returning default not-running status from isCollectorRunning()');
  return { isRunning: false, isReady: false, pid: 0 };
}
591
-
592
/**
 * Fills in default collector tuning variables in process.env.
 * Matches C# CollectorManager.AddCollectorConfigurationToEnvironment().
 *
 * A variable is only written when it is currently unset or empty, so explicit
 * configuration always wins.
 */
function addCollectorConfigurationToEnvironment(): void {
  const fallbackSettings: ReadonlyArray<readonly [string, string]> = [
    ['BEAM_CLICKHOUSE_PROCESSOR_TIMEOUT', '5s'],
    ['BEAM_CLICKHOUSE_PROCESSOR_BATCH_SIZE', '5000'],
    ['BEAM_CLICKHOUSE_EXPORTER_TIMEOUT', '5s'],
    ['BEAM_CLICKHOUSE_EXPORTER_QUEUE_SIZE', '1000'],
    ['BEAM_CLICKHOUSE_EXPORTER_RETRY_ENABLED', 'true'],
    ['BEAM_CLICKHOUSE_EXPORTER_RETRY_INITIAL_INTERVAL', '5s'],
    ['BEAM_CLICKHOUSE_EXPORTER_RETRY_MAX_INTERVAL', '30s'],
    ['BEAM_CLICKHOUSE_EXPORTER_RETRY_MAX_ELAPSED_TIME', '300s'],
  ];

  for (const [name, fallback] of fallbackSettings) {
    const alreadyConfigured = Boolean(process.env[name]);
    if (alreadyConfigured) {
      continue;
    }
    process.env[name] = fallback;
  }
}
615
-
616
/**
 * Publishes ClickHouse credentials through process.env
 * (matching C# CollectorManager.AddAuthEnvironmentVars).
 *
 * Existing values are always overwritten, so credentials fetched from the API replace
 * any earlier (possibly empty) ones for the rest of the process lifetime.
 *
 * @param endpoint value for BEAM_CLICKHOUSE_ENDPOINT
 * @param username value for BEAM_CLICKHOUSE_USERNAME
 * @param password value for BEAM_CLICKHOUSE_PASSWORD
 * @throws Error when any of the three variables is empty after the assignment
 */
export function addAuthEnvironmentVars(endpoint: string, username: string, password: string): void {
  Object.assign(process.env, {
    BEAM_CLICKHOUSE_ENDPOINT: endpoint,
    BEAM_CLICKHOUSE_USERNAME: username,
    BEAM_CLICKHOUSE_PASSWORD: password,
  });

  // Read the values back as a debugging aid; an empty value counts as not set.
  const storedValues = [
    process.env.BEAM_CLICKHOUSE_ENDPOINT,
    process.env.BEAM_CLICKHOUSE_USERNAME,
    process.env.BEAM_CLICKHOUSE_PASSWORD,
  ];
  if (!storedValues.every(Boolean)) {
    throw new Error(`Failed to set ClickHouse credentials in process.env - this should never happen in Node.js`);
  }
}
632
-
633
/**
 * Discovers or starts the collector and resolves only once that attempt has finished,
 * so Portal logs (structured logs via OTLP) are enabled only after the collector is ready.
 *
 * Nothing is started unless standard OTLP is enabled - (not IS_LOCAL, or BEAM_LOCAL_OTEL
 * set) and BEAM_DISABLE_STANDARD_OTEL unset, matching the C# logic - or
 * BEAM_OTEL_EXPORTER_OTLP_ENDPOINT is set.
 *
 * @param env environment configuration passed through to discoverOrStartCollector
 * @param _timeoutMs unused; kept for API compatibility
 * @returns the OTLP endpoint when the collector is ready; null when OTLP is not needed
 *          or setup failed (failures are logged, never thrown)
 */
export async function startCollectorAndWaitForReady(
  env: EnvironmentConfig,
  _timeoutMs?: number // Not used - kept for API compatibility
): Promise<string | null> {
  const isLocalFlag = process.env.IS_LOCAL;
  const isInDocker = !(isLocalFlag === '1' || isLocalFlag === 'true');
  const useLocalOtel = Boolean(process.env.BEAM_LOCAL_OTEL);
  const standardOtelEnabled = (isInDocker || useLocalOtel) && !process.env.BEAM_DISABLE_STANDARD_OTEL;
  const hasExplicitEndpoint = Boolean(process.env.BEAM_OTEL_EXPORTER_OTLP_ENDPOINT);

  // OTLP is neither enabled by default nor explicitly configured: nothing to do.
  if (!(standardOtelEnabled || hasExplicitEndpoint)) {
    return null;
  }

  // Console-only logger for the setup phase (OTLP is not available yet).
  const initLogger = pino({ name: 'beamable-otlp-init', level: 'info' }, process.stdout);

  initLogger.info('[OTLP] Setting up collector (waiting for readiness before enabling Portal logs)...');
  const setupStartTime = Date.now();
  const elapsedMs = (): number => Date.now() - setupStartTime;

  try {
    initLogger.info('[OTLP] Step 1: Discovering or starting collector...');
    const endpoint = await discoverOrStartCollector(initLogger, standardOtelEnabled, env);
    const elapsed = elapsedMs();

    if (!endpoint) {
      initLogger.warn(`[OTLP] Collector setup failed, continuing without Portal logs. (took ${elapsed}ms)`);
      return null;
    }

    initLogger.info(`[OTLP] Collector ready at ${endpoint}. Portal logs now enabled. (took ${elapsed}ms)`);
    return endpoint;
  } catch (error) {
    const elapsed = elapsedMs();
    const errorMsg = error instanceof Error ? error.message : String(error);
    initLogger.error(`[OTLP] Collector setup failed after ${elapsed}ms: ${errorMsg}`);
    return null;
  }
}
682
-
683
/**
 * Kicks off collector setup in the background and returns immediately, letting the
 * service start while the collector downloads/starts. The outcome is only logged.
 *
 * Uses the same enablement rule as the blocking variants: standard OTLP enabled
 * ((not IS_LOCAL or BEAM_LOCAL_OTEL) and BEAM_DISABLE_STANDARD_OTEL unset) or
 * BEAM_OTEL_EXPORTER_OTLP_ENDPOINT set.
 *
 * @param env environment configuration passed through to discoverOrStartCollector
 * @deprecated Use startCollectorAndWaitForReady() to ensure Portal logs only appear after collector is ready
 */
export function startCollectorAsync(env: EnvironmentConfig): void {
  const isLocalFlag = process.env.IS_LOCAL;
  const isInDocker = !(isLocalFlag === '1' || isLocalFlag === 'true');
  const useLocalOtel = Boolean(process.env.BEAM_LOCAL_OTEL);
  const standardOtelEnabled = (isInDocker || useLocalOtel) && !process.env.BEAM_DISABLE_STANDARD_OTEL;
  const hasExplicitEndpoint = Boolean(process.env.BEAM_OTEL_EXPORTER_OTLP_ENDPOINT);

  // OTLP is neither enabled by default nor explicitly configured: nothing to do.
  if (!(standardOtelEnabled || hasExplicitEndpoint)) {
    return;
  }

  // Console-only logger for the setup phase (OTLP is not available yet).
  const initLogger = pino({ name: 'beamable-otlp-init', level: 'info' }, process.stdout);

  initLogger.info('[OTLP] Starting collector setup in background (non-blocking)...');

  const reportOutcome = (endpoint: string | null): void => {
    if (!endpoint) {
      initLogger.warn('[OTLP] Collector setup returned null, OTLP logging will be disabled');
      return;
    }
    initLogger.info(`[OTLP] Collector setup complete in background, endpoint: ${endpoint}`);
  };

  const reportFailure = (error: unknown): void => {
    const errorMsg = error instanceof Error ? error.message : String(error);
    initLogger.error(`[OTLP] Failed to setup collector in background: ${errorMsg}`);
  };

  // Fire and forget: the promise is deliberately not awaited.
  void discoverOrStartCollector(initLogger, standardOtelEnabled, env)
    .then(reportOutcome)
    .catch(reportFailure);
}
728
-
729
/**
 * Sets up the collector BEFORE logging is initialized (DEPRECATED - blocking approach).
 *
 * Kicks off `discoverOrStartCollector()` and then blocks the caller with
 * `deasync.loopWhile` until that promise settles or `timeoutMs` elapses,
 * whichever happens first. If the deadline is reached first, the setup promise
 * is NOT cancelled; it keeps running in the background and its handlers may
 * still log after this function has returned.
 *
 * Kept for backwards compatibility only.
 *
 * @param env - Environment configuration forwarded to `discoverOrStartCollector()`.
 * @param timeoutMs - Maximum time to block, in milliseconds. Defaults to 60000.
 * @returns The collector endpoint on success. `null` when OTLP is not needed,
 *   when setup failed, when setup finished without an endpoint, or when the
 *   deadline was reached before setup finished.
 * @deprecated Use startCollectorAsync() for non-blocking startup
 */
export function setupCollectorBeforeLogging(
  env: EnvironmentConfig,
  timeoutMs: number = 60000
): string | null {
  // Match C# logic: (this.InDocker() || UseLocalOtel) && !BEAM_DISABLE_STANDARD_OTEL
  const isInDocker = process.env.IS_LOCAL !== '1' && process.env.IS_LOCAL !== 'true';
  const useLocalOtel = !!process.env.BEAM_LOCAL_OTEL;
  const standardOtelEnabled = (isInDocker || useLocalOtel) && !process.env.BEAM_DISABLE_STANDARD_OTEL;
  const hasExplicitEndpoint = !!process.env.BEAM_OTEL_EXPORTER_OTLP_ENDPOINT;

  // If OTLP is not needed, return immediately
  if (!standardOtelEnabled && !hasExplicitEndpoint) {
    return null;
  }

  // Minimal console logger for initialization messages (no OTLP yet)
  const initLogger = pino(
    {
      name: 'beamable-otlp-init',
      level: 'info',
    },
    process.stdout
  );

  initLogger.info('[OTLP] Setting up collector before logging initialization...');

  // Outcome of the async setup. Kept in a single holder object so that
  // "settled" is only ever set by the promise handlers; the deadline is
  // tracked separately and never masquerades as a completed setup.
  const outcome: { settled: boolean; endpoint: string | null; error: string | null } = {
    settled: false,
    endpoint: null,
    error: null,
  };

  discoverOrStartCollector(initLogger, standardOtelEnabled, env)
    .then((result) => {
      outcome.endpoint = result;
      outcome.settled = true;
      if (result) {
        initLogger.info(`[OTLP] Collector setup complete, endpoint: ${result}`);
      } else {
        initLogger.warn('[OTLP] Collector setup returned null, OTLP logging will be disabled');
      }
    })
    .catch((error: unknown) => {
      outcome.error = error instanceof Error ? error.message : String(error);
      outcome.endpoint = null;
      outcome.settled = true;
      initLogger.error(`[OTLP] Failed to setup collector: ${outcome.error}`);
    });

  const startTime = Date.now();

  // The timer does no work of its own. It keeps an event scheduled for the
  // deadline so the wait below has something to wake up on.
  // NOTE(review): assumes deasync only re-evaluates the predicate when the
  // event loop turns - confirm against the deasync version in use.
  const wakeUpTimer = setTimeout(() => {
    /* wake-up only */
  }, timeoutMs);

  try {
    // Block here while still letting the event loop run the async setup.
    deasync.loopWhile(() => !outcome.settled && Date.now() - startTime < timeoutMs);
  } catch (error) {
    initLogger.error(`[OTLP] Error during collector setup wait: ${error instanceof Error ? error.message : String(error)}`);
  } finally {
    clearTimeout(wakeUpTimer);
  }

  if (outcome.endpoint) {
    initLogger.info('[OTLP] Collector is ready, proceeding with logger creation');
    return outcome.endpoint;
  }

  if (outcome.error !== null) {
    initLogger.error(`[OTLP] Collector setup failed: ${outcome.error}, continuing without OTLP`);
    return null;
  }

  if (!outcome.settled) {
    // Genuine timeout: the setup promise is still pending.
    initLogger.warn(`[OTLP] Collector setup did not complete within ${timeoutMs}ms, continuing without OTLP`);
    return null;
  }

  // Setup finished but produced no endpoint; the handler above already warned.
  return null;
}
817
-
818
/**
 * Starts the OpenTelemetry collector process.
 *
 * Steps, in order:
 *  1. Make sure ClickHouse credentials are available. They are read from
 *     `BEAM_CLICKHOUSE_ENDPOINT` / `BEAM_CLICKHOUSE_USERNAME` /
 *     `BEAM_CLICKHOUSE_PASSWORD`; if any is missing and `env` is given, they are
 *     fetched via `fetchClickHouseCredentials(env)` and written to `process.env`
 *     through `addAuthEnvironmentVars()`.
 *  2. Resolve the collector binary and config via `resolveCollector()`.
 *  3. On non-Windows platforms, make sure the binary has an execute bit.
 *  4. Spawn the collector with `process.env` plus the OTLP endpoint, discovery
 *     port and a freshly allocated Prometheus port.
 *  5. Wait 500ms and fail fast if the process could not be spawned or has
 *     already exited with a non-zero code.
 *
 * Side effects: resets and updates the module-level `globalCollector*` state
 * and may set `process.env.BEAM_COLLECTOR_DISCOVERY_PORT`.
 *
 * @param logger - Logger that receives progress messages and collector stdout/stderr.
 * @param otlpEndpoint - Optional `host:port` (an `http://` or `https://` prefix is
 *   stripped). Defaults to `0.0.0.0:4318`.
 * @param env - Optional environment configuration, used only to fetch
 *   ClickHouse credentials when they are not already in `process.env`.
 * @returns The spawned child process and the endpoint as `http://<host:port>`.
 * @throws Error when credentials cannot be obtained, the binary or config cannot
 *   be resolved, the binary is missing on disk, the process fails to spawn, or
 *   it exits with a non-zero code within the 500ms grace period.
 */
export async function startCollector(
  logger: Logger,
  otlpEndpoint?: string,
  env?: EnvironmentConfig
): Promise<{ process: ChildProcess; endpoint: string }> {
  logger.info('[Collector] === startCollector() ENTERED ===');
  // Clear any previous errors
  globalCollectorStartError = null;
  globalCollectorInitError = null;

  // CRITICAL: Fetch credentials FIRST, before resolving collector binary/config.
  // This ensures credentials are in process.env even if collector download fails;
  // they then persist in process.env for the lifetime of the process.
  let clickhouseEndpoint = process.env.BEAM_CLICKHOUSE_ENDPOINT;
  let clickhouseUsername = process.env.BEAM_CLICKHOUSE_USERNAME;
  let clickhousePassword = process.env.BEAM_CLICKHOUSE_PASSWORD;

  if ((!clickhouseEndpoint || !clickhouseUsername || !clickhousePassword) && env) {
    try {
      logger.info('[Collector] Fetching ClickHouse credentials from Beamable API...');
      const credStartTime = Date.now();
      const credentials = await fetchClickHouseCredentials(env);
      const credElapsed = Date.now() - credStartTime;
      logger.info(`[Collector] ClickHouse credentials fetch completed in ${credElapsed}ms`);
      clickhouseEndpoint = credentials.endpoint;
      clickhouseUsername = credentials.username;
      clickhousePassword = credentials.password;

      // Publish the credentials to process.env right away (matching C#
      // CollectorManager.AddAuthEnvironmentVars) so both this process and the
      // collector child process can see them.
      addAuthEnvironmentVars(clickhouseEndpoint, clickhouseUsername, clickhousePassword);

      // Defensive check that the values really landed in process.env
      const verifyEndpoint = process.env.BEAM_CLICKHOUSE_ENDPOINT;
      const verifyUsername = process.env.BEAM_CLICKHOUSE_USERNAME;
      const verifyPassword = process.env.BEAM_CLICKHOUSE_PASSWORD;

      if (!verifyEndpoint || !verifyUsername || !verifyPassword) {
        logger.error(`[Collector] CRITICAL: Credentials were set but are missing from process.env! This should never happen.`);
        throw new Error('Failed to persist ClickHouse credentials in process.env');
      }

      logger.info('[Collector] ClickHouse credentials fetched from API and verified in process.env');
    } catch (error) {
      const errorMsg = `[Collector] Failed to fetch ClickHouse credentials from API: ${error instanceof Error ? error.message : String(error)}`;
      logger.error(errorMsg);
      throw new Error(errorMsg);
    }
  }

  // Validate required environment variables (matching C# behavior).
  // These must be set before starting the collector, otherwise it will fail.
  if (!clickhouseEndpoint || !clickhouseUsername || !clickhousePassword) {
    const errorMsg = `[Collector] Required ClickHouse credentials are missing. Set BEAM_CLICKHOUSE_ENDPOINT, BEAM_CLICKHOUSE_USERNAME, and BEAM_CLICKHOUSE_PASSWORD, or ensure the API endpoint is accessible.`;
    logger.error(errorMsg);
    throw new Error(errorMsg);
  }

  // Now resolve collector binary and config (after credentials are fetched and set)
  logger.info('[Collector] Step 2: Resolving collector binary and config...');
  const resolveStartTime = Date.now();
  const collectorInfo = await resolveCollector(true, logger);
  const resolveElapsed = Date.now() - resolveStartTime;
  logger.info(`[Collector] Collector binary/config resolution completed in ${resolveElapsed}ms`);

  if (!collectorInfo.binaryPath) {
    logger.error('[Collector] Binary not found and download failed');
    throw new Error('Collector binary not found and download failed');
  }

  if (!collectorInfo.configPath) {
    logger.error('[Collector] Config not found and download failed');
    throw new Error('Collector config not found and download failed');
  }

  logger.info(`[Collector] Using binary: ${collectorInfo.binaryPath}`);
  logger.info(`[Collector] Using config: ${collectorInfo.configPath}`);

  // Verify the resolved binary is really on disk
  if (!existsSync(collectorInfo.binaryPath)) {
    throw new Error(`Collector binary not found at ${collectorInfo.binaryPath}`);
  }

  // Check if binary is executable (Unix only - Windows doesn't have execute permission)
  if (process.platform !== 'win32') {
    try {
      const fsPromises = await import('fs/promises');
      const stats = await fsPromises.stat(collectorInfo.binaryPath);
      const mode = Number(stats.mode);
      const isExecutable = (mode & 0o111) !== 0; // any of the user/group/other execute bits
      if (!isExecutable) {
        logger.warn(`[Collector] Binary may not be executable. Mode: ${mode.toString(8)}`);
        // Best effort: try to make it executable again
        try {
          chmodSync(collectorInfo.binaryPath, 0o755);
          logger.info(`[Collector] Made binary executable`);
        } catch (chmodError) {
          logger.error(`[Collector] Failed to make binary executable: ${chmodError instanceof Error ? chmodError.message : String(chmodError)}`);
        }
      }
    } catch (statError) {
      logger.warn(`[Collector] Could not check binary permissions: ${statError instanceof Error ? statError.message : String(statError)}`);
    }
  }

  // Set default collector configuration environment variables in process.env
  // (matches C# CollectorManager.AddCollectorConfigurationToEnvironment()).
  addCollectorConfigurationToEnvironment();

  // CRITICAL: Set BEAM_COLLECTOR_DISCOVERY_PORT if not already set (matching C#
  // AddDefaultCollectorHostAndPortFallback). Must happen before the collector starts.
  if (!process.env.BEAM_COLLECTOR_DISCOVERY_PORT) {
    process.env.BEAM_COLLECTOR_DISCOVERY_PORT = String(DISCOVERY_PORT);
    logger.info(`[Collector] Set BEAM_COLLECTOR_DISCOVERY_PORT=${DISCOVERY_PORT} (default)`);
  }

  // Determine OTLP endpoint: caller-supplied value, else the default OTLP HTTP
  // port. Any scheme prefix is stripped so the value is a bare host:port.
  const localEndpoint = (otlpEndpoint || '0.0.0.0:4318')
    .replace(/^http:\/\//, '')
    .replace(/^https:\/\//, '');

  // Find a free port for Prometheus (matching C# PortUtil.FreeTcpPort())
  const prometheusPort = await findFreePort();

  // Environment for the collector process.
  // Note: BEAM_CLICKHOUSE_ENDPOINT is for collector -> ClickHouse communication,
  // which is different from the OTLP endpoint (microservice -> collector).
  const collectorEnv: NodeJS.ProcessEnv = {
    ...process.env, // Inherit everything, including the credentials and config set above
    BEAM_OTLP_HTTP_ENDPOINT: localEndpoint,
    BEAM_COLLECTOR_DISCOVERY_PORT: String(DISCOVERY_PORT),
    BEAM_COLLECTOR_PROMETHEUS_PORT: String(prometheusPort),
  };

  // Start collector process
  const collectorProcess = spawn(collectorInfo.binaryPath, ['--config', collectorInfo.configPath], {
    env: collectorEnv,
    stdio: ['ignore', 'pipe', 'pipe'],
    detached: false,
  });

  // Store globally to prevent GC and track status
  globalCollectorProcess = collectorProcess;
  globalCollectorStartError = null;
  globalCollectorExitCode = null;
  globalCollectorStderr = [];

  // Local record of a spawn failure, checked after the grace period below.
  const spawnFailure: { message: string | null } = { message: null };

  // Set up handlers BEFORE we check if process started, so we capture all output
  collectorProcess.stdout?.on('data', (data) => {
    const outputLine = data.toString().trim();
    // Log stdout at info level to see collector output
    logger.info(`[Collector OUT] ${outputLine}`);
  });

  collectorProcess.stderr?.on('data', (data) => {
    const errorLine = data.toString().trim();
    globalCollectorStderr.push(errorLine);
    // Keep only the last 50 entries to prevent memory issues
    if (globalCollectorStderr.length > 50) {
      globalCollectorStderr.shift();
    }
    // Log stderr at info level so we can see why collector crashes
    logger.info(`[Collector ERR] ${errorLine}`);
  });

  collectorProcess.on('error', (err) => {
    spawnFailure.message = err.message;
    globalCollectorStartError = err.message;
    logger.error(`[Collector] Failed to start: ${err.message}`);
  });

  collectorProcess.on('exit', (code, signal) => {
    globalCollectorExitCode = code;
    logger.warn(`[Collector] Process exited with code ${code}${signal ? `, signal ${signal}` : ''}`);

    // If process exited with an error code, log accumulated stderr
    if (code !== 0 && globalCollectorStderr.length > 0) {
      logger.error(`[Collector] Exit error - Last ${Math.min(10, globalCollectorStderr.length)} stderr lines:`);
      globalCollectorStderr.slice(-10).forEach((line) => {
        logger.error(`[Collector ERR] ${line}`);
      });
    }

    // The process reference is intentionally kept (not nulled) so that
    // getCollectorProcessStatus() can still report on it.
  });

  logger.info(`[Collector] Started with PID ${collectorProcess.pid}, endpoint: ${localEndpoint}`);

  // Give the process a moment to start and potentially write to stderr,
  // so immediate startup failures are caught here.
  await new Promise(resolve => setTimeout(resolve, 500));

  // The process could not be launched at all (the 'error' event fired).
  // Without this check the caller would be handed a "started" collector that
  // never existed and would then wait for readiness in vain.
  if (spawnFailure.message !== null) {
    throw new Error(`Collector process failed to start: ${spawnFailure.message}`);
  }

  // Check if process already exited (immediate crash)
  if (globalCollectorExitCode !== null && globalCollectorExitCode !== 0) {
    const stderrMsg = globalCollectorStderr.length > 0
      ? ` Stderr: ${globalCollectorStderr.join('; ')}`
      : ' No stderr output captured.';
    throw new Error(`Collector process exited immediately after startup with code ${globalCollectorExitCode}.${stderrMsg}`);
  }

  return {
    process: collectorProcess,
    endpoint: `http://${localEndpoint}`,
  };
}
1041
-
1042
/**
 * Reports a snapshot of the collector's process state and recorded errors.
 *
 * The snapshot combines the module-level collector state (process handle, exit
 * code, start/init errors, captured stderr) with a filesystem check for a
 * collector binary and config in the collector storage directory.
 *
 * `initError` is reported as `null` whenever the tracked process is alive and
 * its captured stderr contains one of the readiness markers, even if an earlier
 * init error was recorded (for example from a failed download attempt before a
 * pre-installed binary was used).
 *
 * @returns Process, error and installation details. `binaryPath` and
 *   `configPath` are `null` unless both files exist on disk. `stderr` is a copy
 *   of the captured buffer.
 */
export function getCollectorProcessStatus(): {
  hasProcess: boolean;
  pid: number | null;
  exitCode: number | null;
  startError: string | null;
  initError: string | null;
  stderr: string[];
  isPreInstalled?: boolean;
  binaryPath?: string | null;
  configPath?: string | null;
} {
  const collector = globalCollectorProcess;

  // Alive = we hold a handle that has neither exited nor been killed.
  const isAlive = collector !== null && collector.exitCode === null && !collector.killed;

  // Readiness is inferred from marker phrases in the captured stderr.
  const capturedStderr = globalCollectorStderr.join('\n');
  const readinessMarkers = ['Everything is ready', 'Begin running and processing data'];
  const looksReady = readinessMarkers.some((marker) => capturedStderr.includes(marker));

  // A healthy, ready collector makes any previously recorded init error stale.
  const reportedInitError = isAlive && looksReady ? null : globalCollectorInitError;

  // Look for a pre-installed binary + config pair in the storage directory.
  const storageDir = getCollectorStoragePath();
  const installedBinary = join(storageDir, getCollectorBinaryName());
  const installedConfig = join(storageDir, 'clickhouse-config.yaml');
  const isPreInstalled = existsSync(installedBinary) && existsSync(installedConfig);

  return {
    hasProcess: collector !== null,
    pid: collector?.pid ?? null,
    exitCode: globalCollectorExitCode,
    startError: globalCollectorStartError,
    initError: reportedInitError,
    stderr: globalCollectorStderr.slice(), // hand out a copy, not the live buffer
    isPreInstalled,
    binaryPath: isPreInstalled ? installedBinary : null,
    configPath: isPreInstalled ? installedConfig : null,
  };
}
1089
-
1090
/**
 * Discovers or starts the collector and returns the OTLP endpoint.
 *
 * Flow:
 *  1. Returns `null` immediately when standard OTLP is disabled.
 *  2. If a startup is already in flight, waits for it and returns its result.
 *  3. Otherwise, if this module already tracks a live collector process, waits
 *     for that process to report ready; if not, probes for an already running
 *     collector via `isCollectorRunning()` and, failing that, spawns a new one
 *     with `startCollector()`.
 *  4. Polls `isCollectorRunning()` every 500ms for up to 60s until the collector
 *     reports ready.
 *
 * The whole of step 3-4 (including the discovery probe) runs inside a single
 * promise that is stored in `globalCollectorStartupPromise` before any `await`,
 * so overlapping calls share one startup instead of each spawning a collector.
 *
 * @param logger - Logger for progress and error messages.
 * @param standardOtelEnabled - When `false`, nothing is done and `null` is returned.
 * @param env - Optional environment configuration forwarded to `startCollector()`.
 * @returns The collector endpoint as `http://<host:port>`, or `null` when OTLP
 *   is disabled, startup failed, the process exited with a non-zero code, or the
 *   collector did not become ready in time. Startup failures are also recorded
 *   in `globalCollectorInitError`.
 */
export async function discoverOrStartCollector(
  logger: Logger,
  standardOtelEnabled: boolean,
  env?: EnvironmentConfig
): Promise<string | null> {
  if (!standardOtelEnabled) {
    return null;
  }

  // CRITICAL: If a startup is already in progress, join it instead of starting
  // another (e.g. setupCollectorBeforeLogging timed out but its promise is
  // still running). Re-check after a failure in case a newer startup has been
  // registered in the meantime.
  while (globalCollectorStartupPromise) {
    const inFlight = globalCollectorStartupPromise;
    logger.info('[Collector] Collector startup already in progress, waiting for existing startup to complete...');
    try {
      return await inFlight;
    } catch (error) {
      logger.error(`[Collector] Existing startup promise failed: ${error instanceof Error ? error.message : String(error)}`);
      // Fall through: either join a newer startup or begin a fresh one below.
    } finally {
      // Only clear the slot if it still holds the promise we waited on, so a
      // newer in-flight startup is never wiped out.
      if (globalCollectorStartupPromise === inFlight) {
        globalCollectorStartupPromise = null;
      }
    }
  }

  // CRITICAL: Check if we already have a collector process starting/running
  let existingEndpoint: string | undefined;
  if (globalCollectorProcess) {
    const processAlive = globalCollectorProcess.exitCode === null &&
      globalCollectorProcess.killed === false;

    if (processAlive) {
      logger.info(`[Collector] Collector process already exists (PID ${globalCollectorProcess.pid}), waiting for it to be ready...`);
      // Use the configured endpoint from environment, else the default
      existingEndpoint = process.env.BEAM_OTLP_HTTP_ENDPOINT ?
        `http://${process.env.BEAM_OTLP_HTTP_ENDPOINT}` :
        'http://0.0.0.0:4318';
    } else {
      // Process is dead, clear it and start fresh
      logger.warn(`[Collector] Previous collector process (PID ${globalCollectorProcess.pid}) is dead, starting new one...`);
      globalCollectorProcess = null;
      globalCollectorExitCode = null;
      globalCollectorStderr = [];
    }
  }
  const reusedEndpoint = existingEndpoint;

  // Builds the failure message for a collector that exited with a non-zero
  // code, or returns null when no such exit has been recorded.
  const collectorExitFailure = (phase: string): string | null => {
    if (globalCollectorExitCode === null || globalCollectorExitCode === 0) {
      return null;
    }
    const stderrSummary = globalCollectorStderr.length > 0
      ? `Stderr: ${globalCollectorStderr.join('; ')}`
      : 'No stderr output.';
    return `Collector process ${phase} with code ${globalCollectorExitCode}. ${stderrSummary}`;
  };

  // Everything from here on - including the discovery probe - runs inside the
  // tracked promise. The async function executes synchronously up to its first
  // await, and the promise is stored right after, so no other caller can slip
  // past the in-flight check above while discovery is still pending.
  logger.info('[Collector] Creating startup promise...');
  const startupPromise = (async (): Promise<string | null> => {
    logger.info('[Collector] Startup promise executing...');

    if (reusedEndpoint === undefined) {
      // Quick check if a collector is already running (via UDP discovery).
      // Match C#: 10 attempts x 100ms = 1 second max.
      // Deliberately outside the try/catch below: a rejection from discovery
      // propagates to the caller, as it did before.
      logger.info('[Collector] Quick check for existing collector via UDP discovery (timeout: 1s, matching C#)...');
      const udpStartTime = Date.now();
      logger.info('[Collector] Calling isCollectorRunning()...');
      const status = await isCollectorRunning();
      const udpElapsed = Date.now() - udpStartTime;
      logger.info(`[Collector] isCollectorRunning() returned after ${udpElapsed}ms: isRunning=${status.isRunning}, isReady=${status.isReady}, endpoint=${status.otlpEndpoint || 'none'}`);
      if (status.isRunning && status.isReady && status.otlpEndpoint) {
        logger.info(`[Collector] Found running collector at ${status.otlpEndpoint}, will reuse it`);
        return `http://${status.otlpEndpoint}`;
      }
      logger.info('[Collector] No existing collector found, will start new one...');
    } else {
      logger.info('[Collector] Collector process already exists, skipping UDP discovery...');
    }

    try {
      // Clear any previous init error
      globalCollectorInitError = null;
      logger.info(`[Collector] Checking existingEndpoint: ${reusedEndpoint || 'none'}`);

      let endpoint: string;
      if (reusedEndpoint !== undefined) {
        // Collector already starting, just wait for it to be ready
        endpoint = reusedEndpoint;
        logger.info(`[Collector] Waiting for existing collector to become ready at ${endpoint}...`);
      } else {
        // Start a new collector
        logger.info('[Collector] Starting new OpenTelemetry collector...');
        const startCollectorTime = Date.now();
        const startResult = await startCollector(logger, undefined, env);
        const startCollectorElapsed = Date.now() - startCollectorTime;
        logger.info(`[Collector] Collector process started in ${startCollectorElapsed}ms, endpoint: ${startResult.endpoint}`);
        endpoint = startResult.endpoint;

        // Check if collector process exited immediately (crashed)
        logger.info('[Collector] Checking if collector process is still running...');
        await new Promise(resolve => setTimeout(resolve, 200));

        const immediateFailure = collectorExitFailure('exited immediately');
        if (immediateFailure !== null) {
          globalCollectorInitError = immediateFailure;
          logger.error(`[Collector] ${immediateFailure}`);
          return null;
        }
        logger.info('[Collector] Collector process is still running, proceeding to readiness check...');
      }

      // CRITICAL: Wait for the collector to be fully ready before returning, so
      // it can actually receive logs. Up to 60 seconds, checking every 500ms.
      const maxWaitTime = 60000; // 60 seconds
      const checkInterval = 500; // Check every 500ms
      const maxChecks = Math.floor(maxWaitTime / checkInterval);

      logger.info(`[Collector] Waiting for collector to become ready (checking every ${checkInterval}ms, max ${maxWaitTime / 1000}s)...`);
      const readinessStartTime = Date.now();

      for (let i = 0; i < maxChecks; i++) {
        await new Promise(resolve => setTimeout(resolve, checkInterval));
        const elapsed = Date.now() - readinessStartTime;

        // Check if process exited during wait
        const startupFailure = collectorExitFailure('exited during startup');
        if (startupFailure !== null) {
          globalCollectorInitError = startupFailure;
          logger.error(`[Collector] ${startupFailure} (after ${elapsed}ms)`);
          return null;
        }

        logger.info(`[Collector] Checking collector readiness... (${(elapsed / 1000).toFixed(1)}s elapsed, attempt ${i + 1}/${maxChecks})`);
        const newStatus = await isCollectorRunning();
        logger.info(`[Collector] Collector status: isRunning=${newStatus.isRunning}, isReady=${newStatus.isReady}, pid=${newStatus.pid}, endpoint=${newStatus.otlpEndpoint || 'none'}`);

        if (newStatus.isRunning && newStatus.isReady) {
          logger.info(`[Collector] Collector is ready at ${newStatus.otlpEndpoint || endpoint} (ready after ${elapsed}ms)`);
          return newStatus.otlpEndpoint ? `http://${newStatus.otlpEndpoint}` : endpoint;
        }

        // Log progress every second (every 2 checks)
        if (i > 0 && i % 2 === 0) {
          logger.info(`[Collector] Still waiting for collector to become ready... (${(elapsed / 1000).toFixed(1)}s elapsed)`);
        }
      }

      // Check one more time if process exited
      const finalFailure = collectorExitFailure('exited');
      if (finalFailure !== null) {
        globalCollectorInitError = finalFailure;
        logger.error(`[Collector] ${finalFailure}`);
        return null;
      }

      // Collector did not become ready within timeout
      logger.error(`[Collector] Collector did not become ready within ${maxWaitTime / 1000} seconds`);
      return null;
    } catch (err) {
      const errorMsg = err instanceof Error ? err.message : String(err);
      globalCollectorInitError = errorMsg;
      logger.error(`[Collector] Failed to start collector: ${errorMsg}`);
      return null;
    }
  })();

  // Store the promise globally so other calls to this function can wait for it
  globalCollectorStartupPromise = startupPromise;

  // Release the slot once this startup settles, but only if it still holds this
  // promise. Registered for both fulfilment and rejection so a rejection from
  // the discovery probe does not become an unhandled rejection on this chain.
  const releaseSlot = (): void => {
    if (globalCollectorStartupPromise === startupPromise) {
      globalCollectorStartupPromise = null;
    }
  };
  void startupPromise.then(releaseSlot, releaseSlot);

  return await startupPromise;
}
1267
-