@service-bridge/node 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/README.md +854 -0
  2. package/biome.json +28 -0
  3. package/bun.lock +249 -0
  4. package/dist/express.d.ts +51 -0
  5. package/dist/express.js +129 -0
  6. package/dist/fastify.d.ts +43 -0
  7. package/dist/fastify.js +122 -0
  8. package/dist/index.js +34410 -0
  9. package/dist/trace.d.ts +19 -0
  10. package/http/dist/express.d.ts +51 -0
  11. package/http/dist/express.d.ts.map +1 -0
  12. package/http/dist/express.test.d.ts +2 -0
  13. package/http/dist/express.test.d.ts.map +1 -0
  14. package/http/dist/fastify.d.ts +43 -0
  15. package/http/dist/fastify.d.ts.map +1 -0
  16. package/http/dist/fastify.test.d.ts +2 -0
  17. package/http/dist/fastify.test.d.ts.map +1 -0
  18. package/http/dist/index.d.ts +7 -0
  19. package/http/dist/index.d.ts.map +1 -0
  20. package/http/dist/trace.d.ts +19 -0
  21. package/http/dist/trace.d.ts.map +1 -0
  22. package/http/dist/trace.test.d.ts +2 -0
  23. package/http/dist/trace.test.d.ts.map +1 -0
  24. package/http/package.json +48 -0
  25. package/http/src/express.test.ts +125 -0
  26. package/http/src/express.ts +209 -0
  27. package/http/src/fastify.test.ts +142 -0
  28. package/http/src/fastify.ts +159 -0
  29. package/http/src/index.ts +10 -0
  30. package/http/src/sdk-augment.d.ts +11 -0
  31. package/http/src/servicebridge.d.ts +23 -0
  32. package/http/src/trace.test.ts +97 -0
  33. package/http/src/trace.ts +56 -0
  34. package/http/tsconfig.json +17 -0
  35. package/http/tsconfig.test.json +6 -0
  36. package/package.json +65 -0
  37. package/sdk/dist/generated/servicebridge-package-definition.d.ts +4709 -0
  38. package/sdk/dist/grpc-client.d.ts +304 -0
  39. package/sdk/dist/grpc-client.test.d.ts +1 -0
  40. package/sdk/dist/index.d.ts +2 -0
  41. package/sdk/package.json +30 -0
  42. package/sdk/scripts/generate-proto.ts +65 -0
  43. package/sdk/src/generated/servicebridge-package-definition.ts +5198 -0
  44. package/sdk/src/grpc-client.d.ts +305 -0
  45. package/sdk/src/grpc-client.d.ts.map +1 -0
  46. package/sdk/src/grpc-client.test.ts +422 -0
  47. package/sdk/src/grpc-client.ts +2924 -0
  48. package/sdk/src/index.d.ts +3 -0
  49. package/sdk/src/index.d.ts.map +1 -0
  50. package/sdk/src/index.ts +29 -0
  51. package/sdk/tsconfig.json +13 -0
@@ -0,0 +1,2924 @@
1
+ import { AsyncLocalStorage } from "node:async_hooks";
2
+ import * as crypto from "node:crypto";
3
+ import { dirname, join } from "node:path";
4
+ import { fileURLToPath } from "node:url";
5
+ import * as grpc from "@grpc/grpc-js";
6
+ import * as protoLoader from "@grpc/proto-loader";
7
+ import * as protobufjs from "protobufjs/light";
8
+
9
/** Trace and span identifiers carried along an async call chain. */
export interface TraceCtx {
  traceId: string;
  spanId: string;
}

// Async-local slot that holds the TraceCtx for the current execution.
const traceStorage = new AsyncLocalStorage<TraceCtx>();

/**
 * Returns the trace context bound to the current async execution, or
 * `undefined` when none has been installed.
 */
export function getTraceContext(): TraceCtx | undefined {
  const active = traceStorage.getStore();
  return active;
}

/**
 * Runs `fn` with `ctx` installed as the active trace context.
 *
 * @param ctx - context visible to `getTraceContext()` for the duration of `fn`
 * @param fn - callback to execute
 * @returns whatever `fn` returns
 */
export function runWithTraceContext<T>(ctx: TraceCtx, fn: () => T): T {
  const result = traceStorage.run(ctx, fn);
  return result;
}
23
+
24
// Per-call options handed to the unary gRPC client methods declared below.
type DeadlineOptions = { deadline: Date; waitForReady?: boolean };

// Shape of the `endpoints` member of a LookupFunction response.
// Field names are snake_case as they appear on the wire.
type RegistryFunctionWire = {
  endpoints?: unknown;
  input_schema_json?: string;
  output_schema_json?: string;
  allowed_callers?: unknown;
};

// Request message accepted by WorkerClient.Handle.
type WorkerHandleRequest = {
  fn: string;
  payload?: Buffer;
  trace_id?: string;
  span_id?: string;
};

// Response message produced by WorkerClient.Handle.
type WorkerHandleResponse = {
  output?: Buffer;
  success: boolean;
  error?: string;
};

// NOTE(review): presumably the event-delivery request sent to a worker —
// its usage is outside this view; confirm against the worker server code.
type WorkerDeliverRequest = {
  group_name?: string;
  payload?: Buffer;
  trace_id?: string;
  parent_span_id?: string;
  topic?: string;
  message_id?: string;
  attempt?: unknown;
  headers?: Record<string, unknown>;
};

// NOTE(review): presumably the reply to WorkerDeliverRequest — confirm.
type WorkerDeliverResponse = {
  ack: boolean;
  error?: string;
  reject_reason?: string;
  retry_after_ms?: number;
};

// One message read from a WatchRun stream (see RunWatchStream).
type RunChunkWire = {
  type?: string;
  key?: string;
  sequence?: unknown;
  data?: Buffer | Uint8Array | string;
  run_status?: string;
};

// Server-streaming handle returned by ControlPlaneClient.WatchRun.
type RunWatchStream = grpc.ClientReadableStream<RunChunkWire>;
73
+
74
// Typed view of the gRPC client used to talk to a worker.
// Instances are created through WorkerServiceConstructor.
interface WorkerClient {
  // Callback-style unary call: sends `req` and reports either `err` or `res`.
  Handle(
    req: WorkerHandleRequest,
    md: grpc.Metadata,
    opts: DeadlineOptions,
    cb: (err: Error | null, res?: WorkerHandleResponse) => void,
  ): void;
  // Optional because it is only called when the underlying client provides it.
  close?(): void;
}
83
+
84
// Typed view of the control-plane gRPC client (the `ServiceBridge` constructor
// in ServiceBridgeProtoPackage). Every method except WatchRun is a
// callback-style unary call taking (request, metadata, deadline options,
// callback); WatchRun returns a readable stream instead.
interface ControlPlaneClient {
  LookupFunction(
    req: { fn_name: string },
    md: grpc.Metadata,
    opts: DeadlineOptions,
    cb: (
      err: Error | null,
      res?: {
        found?: boolean;
        canonical_name?: string;
        endpoints?: RegistryFunctionWire;
      },
    ) => void,
  ): void;
  Publish(
    req: unknown,
    md: grpc.Metadata,
    opts: DeadlineOptions,
    cb: (err: Error | null, res?: { message_id?: string }) => void,
  ): void;
  RegisterJob(
    req: unknown,
    md: grpc.Metadata,
    opts: DeadlineOptions,
    cb: (err: Error | null, res?: { id?: string }) => void,
  ): void;
  RegisterWorkflow(
    req: unknown,
    md: grpc.Metadata,
    opts: DeadlineOptions,
    cb: (err: Error | null, res?: { id?: string }) => void,
  ): void;
  // The remaining unary calls report only success or failure (no response body is typed).
  ReportCallStart(
    req: unknown,
    md: grpc.Metadata,
    opts: DeadlineOptions,
    cb: (err: Error | null) => void,
  ): void;
  ReportCall(
    req: unknown,
    md: grpc.Metadata,
    opts: DeadlineOptions,
    cb: (err: Error | null) => void,
  ): void;
  ReportLog(
    req: unknown,
    md: grpc.Metadata,
    opts: DeadlineOptions,
    cb: (err: Error | null) => void,
  ): void;
  Heartbeat(
    req: unknown,
    md: grpc.Metadata,
    opts: DeadlineOptions,
    cb: (err: Error | null) => void,
  ): void;
  RegisterFunction(
    req: unknown,
    md: grpc.Metadata,
    opts: DeadlineOptions,
    cb: (err: Error | null) => void,
  ): void;
  RegisterConsumerGroup(
    req: unknown,
    md: grpc.Metadata,
    opts: DeadlineOptions,
    cb: (err: Error | null) => void,
  ): void;
  RegisterGroupMember(
    req: unknown,
    md: grpc.Metadata,
    opts: DeadlineOptions,
    cb: (err: Error | null) => void,
  ): void;
  AppendStream(
    req: unknown,
    md: grpc.Metadata,
    opts: DeadlineOptions,
    cb: (err: Error | null) => void,
  ): void;
  // Server-streaming call: no deadline options and no callback; chunks are read from the returned stream.
  WatchRun(
    req: { run_id: string; key: string; from_sequence: number },
    md: grpc.Metadata,
  ): RunWatchStream;
  close?(): void;
}
170
+
171
// Constructor for worker clients that also exposes the gRPC service
// definition through its static `service` member.
type WorkerServiceConstructor = {
  new (
    target: string,
    credentials: grpc.ChannelCredentials,
    options: Record<string, unknown>,
  ): WorkerClient;
  service: grpc.ServiceDefinition<grpc.UntypedServiceImplementation>;
};

// Shape of the `servicebridge` package object obtained from the loaded proto:
// one constructor for the control-plane client and one for the worker service.
type ServiceBridgeProtoPackage = {
  ServiceBridge: new (
    target: string,
    credentials: grpc.ChannelCredentials,
    options: Record<string, string | number>,
  ) => ControlPlaneClient;
  ServiceBridgeWorker: WorkerServiceConstructor;
};
188
+
189
// Directory three levels above the directory that contains this module.
const _sdkRoot = join(
  dirname(fileURLToPath(import.meta.url)),
  "..",
  "..",
  "..",
);
// Location of the service definition, resolved relative to _sdkRoot.
const protoPath = join(_sdkRoot, "proto", "servicebridge.proto");
// Parsed synchronously when this module is first imported.
// keepCase preserves the proto's field names (the wire types in this file use
// snake_case accordingly); longs and enums are surfaced as strings.
const servicebridgePackageDefinition = protoLoader.loadSync(protoPath, {
  keepCase: true,
  longs: String,
  enums: String,
  defaults: true,
  oneofs: true,
});
// The double cast narrows the untyped gRPC object to the shape this file relies on.
const proto = grpc.loadPackageDefinition(
  servicebridgePackageDefinition as Parameters<
    typeof grpc.loadPackageDefinition
  >[0],
) as unknown as { servicebridge: ServiceBridgeProtoPackage };
const servicebridgeProto = proto.servicebridge;
209
+
210
// Channel options shared by the gRPC clients created in this module.
const channelOpts = {
  // -1 removes the per-message size limit in both directions.
  "grpc.max_receive_message_length": -1,
  "grpc.max_send_message_length": -1,
  // 64 MiB HTTP/2 flow-control window.
  "grpc.http2.initial_window_size": 67108864,
  // HTTP/2 caps SETTINGS_MAX_FRAME_SIZE at 2^24 - 1 (16,777,215); the previous
  // value of 16,777,216 was one above the largest legal frame size.
  "grpc.http2.max_frame_size": 16777215,
};
216
+
217
+ // ── RPC Schema ────────────────────────────────────────────────────────────────
218
+
219
/** Type of a Protobuf message field. */
export type RpcFieldType =
  | "string"
  | "int32"
  | "int64"
  | "uint32"
  | "uint64"
  | "float"
  | "double"
  | "bool"
  | "bytes";

/** Definition of a single schema field. */
export interface RpcFieldDef {
  /** Field type. */
  type: RpcFieldType;
  /** Unique field number (the field number in proto). */
  id: number;
  /** Repeated field (array). */
  repeated?: boolean;
}

/** Message schema: field name → definition. */
export type RpcSchema = Record<string, RpcFieldDef>;

/** Input and output schemas for an RPC function. */
export interface RpcSchemaOpts {
  input?: RpcSchema;
  output?: RpcSchema;
}
249
+
250
// Cache of compiled protobuf types (key = JSON of the schema).
const protoTypeCache = new Map<string, protobufjs.Type>();

/**
 * Builds (or fetches from the cache) a protobufjs message type named "Msg"
 * whose fields mirror `schema`.
 *
 * The cache key is `JSON.stringify(schema)`, so two schemas with the same
 * fields listed in a different order are cached separately.
 *
 * @param schema - field name → field definition
 * @returns the compiled message type
 */
function buildProtoType(schema: RpcSchema): protobufjs.Type {
  const cacheKey = JSON.stringify(schema);
  const hit = protoTypeCache.get(cacheKey);
  if (hit) return hit;

  const msgType = new protobufjs.Type("Msg");
  Object.entries(schema).forEach(([fieldName, def]) => {
    const rule = def.repeated ? "repeated" : "optional";
    msgType.add(new protobufjs.Field(fieldName, def.id, def.type, rule));
  });

  // The type is attached to a fresh Root, as in the original construction.
  const root = new protobufjs.Root();
  root.add(msgType);

  protoTypeCache.set(cacheKey, msgType);
  return msgType;
}
274
+
275
/**
 * Serializes `payload` to Protobuf bytes using the message type derived from `schema`.
 *
 * @param schema - schema describing the message fields
 * @param payload - plain object to encode; `null`/`undefined` is treated as `{}`
 * @returns the encoded message as a Buffer
 */
function encodeWithSchema(schema: RpcSchema, payload: unknown): Buffer {
  const msgType = buildProtoType(schema);
  const fields = (payload ?? {}) as Record<string, unknown>;
  const encoded = msgType.encode(msgType.create(fields)).finish();
  return Buffer.from(encoded);
}
280
+
281
/**
 * Decodes Protobuf bytes into a plain JSON-style object using the message
 * type derived from `schema`.
 *
 * @param schema - schema describing the message fields
 * @param buf - encoded message; a missing or empty buffer decodes to `{}`
 * @returns the decoded message converted with `toJSON()`
 */
function decodeWithSchema(schema: RpcSchema, buf: Buffer): unknown {
  if (!buf?.length) {
    return {};
  }
  const decoded = buildProtoType(schema).decode(buf);
  return decoded.toJSON();
}
286
+
287
+ // ── Helpers ───────────────────────────────────────────────────────────────────
288
+
289
/**
 * Retry settings shared by the client-wide options and per-call RPC options.
 * NOTE(review): units and defaults are applied outside this view — presumably
 * `retryDelay` is in milliseconds; confirm.
 */
export interface RetryPolicy {
  retries?: number;
  retryDelay?: number;
}

/** Transport used for worker connections; "tls" is the only value the type allows. */
export type WorkerTransport = "tls";

/** TLS material for worker connections, given as PEM text or Buffers. */
export interface WorkerTLSOpts {
  /** CA certificate(s); passed as the root certificates when building client credentials. */
  caCert?: string | Buffer;
  /** Certificate chain presented by this side. */
  cert?: string | Buffer;
  /** Private key matching `cert`. */
  key?: string | Buffer;
  /** When set, used as both the TLS target-name override and the default authority of worker channels. */
  serverName?: string;
}
302
+
303
/** Client-wide options accepted by `servicebridge()`. */
export interface ServiceBridgeOpts extends RetryPolicy {
  /** Timeout in ms for unary control-plane calls; 30 000 ms is used when omitted. */
  timeout?: number;
  /** @deprecated No longer used — discovery is now lazy via LookupFunction. */
  discoveryTimeout?: number;
  /**
   * How often (ms) each SbResolver re-polls LookupFunction to discover new replicas.
   * Dead replicas are detected instantly by gRPC subchannel health monitoring.
   * Default: 10 000 ms.
   */
  discoveryRefreshMs?: number;
  // NOTE(review): the queue and heartbeat settings below are consumed outside
  // this view; their defaults and exact semantics should be confirmed there.
  queueMaxSize?: number;
  queueOverflow?: "drop-oldest" | "drop-newest" | "error";
  heartbeatIntervalMs?: number;
  workerTransport?: WorkerTransport;
  /**
   * Explicit mTLS cert materials for the worker gRPC server.
   * When omitted, they are provisioned automatically from the server on `serve()`:
   * the SDK generates a key pair locally and posts the public key to
   * `POST /api/tls/provision` — the private key never leaves the process.
   */
  workerTLS?: WorkerTLSOpts;
  /**
   * Base URL for the HTTP admin API used for auto-provisioning and management.
   * Defaults to the gRPC host on port 14444 (e.g. "http://127.0.0.1:14444").
   */
  adminUrl?: string;
  /**
   * When `true` (default), automatically patches `console.log / .info / .warn / .error / .debug`
   * so that all console output is also shipped to ServiceBridge as structured log entries.
   * Original console output is preserved (pass-through).
   * Set to `false` to opt out.
   */
  captureLogs?: boolean;
}
337
+
338
/** Per-call options for `rpc()`; extends the retry settings with a timeout and trace linkage. */
export interface RpcOpts extends RetryPolicy {
  timeout?: number;
  traceId?: string;
  parentSpanId?: string;
}

/** Per-call options for `event()`. */
export interface EventOpts {
  traceId?: string;
  parentSpanId?: string;
  idempotencyKey?: string;
  headers?: Record<string, string>;
}

/** Options for `handleEvent()`. */
export interface HandleEventOpts {
  concurrency?: number;
  prefetch?: number;
  groupName?: string;
  retryPolicyJson?: string;
  /** Server-side filter expression. Syntax: comma-separated AND conditions.
   * Examples: "status=paid", "amount>100", "status=paid,amount>100", "region!=us-east" */
  filterExpr?: string;
}

/** Options for `handleRpc()`. */
export interface HandleRpcOpts {
  timeout?: number;
  retryable?: boolean;
  concurrency?: number;
  /** Schema for automatic validation and binary encoding of the payload. */
  schema?: RpcSchemaOpts;
  /**
   * Whitelist of service names allowed to call this function.
   * When set, the worker rejects Handle requests from services not in this list.
   * Corresponds to the allowed_callers field on the service key.
   * When mTLS is used, the x-caller-service header is cryptographically backed by the cert CN.
   */
  allowedCallers?: string[];
}

/** Options for `serve()`. */
export interface ServeOpts {
  host?: string;
  instanceId?: string;
  weight?: number;
  transport?: WorkerTransport;
  tls?: WorkerTLSOpts;
}

/** Options for `job()`. */
export interface ScheduleOpts {
  cron?: string;
  delay?: number;
  timezone?: string;
  misfire?: "fire_now" | "skip";
  via?: "event" | "rpc" | "workflow";
  retryPolicyJson?: string;
}
392
+
393
/** StreamWriter allows handlers to push incremental chunks during execution. */
export interface StreamWriter {
  /**
   * Append a chunk to the run's named stream.
   * @param data - any JSON-serializable value
   * @param key - stream key (default: "default"). Use named keys for multiple streams
   * (e.g. "output", "log", "progress").
   */
  write(data: unknown, key?: string): Promise<void>;
  /**
   * Signal completion of a named stream (optional — server closes automatically when the run ends).
   */
  end(key?: string): Promise<void>;
}

/** Context passed as the second argument to event handlers. */
export interface EventContext {
  traceId: string;
  spanId: string;
  refs: Record<string, string>;
  retry(delayMs?: number): void;
  reject(reason: string): void;
  /** Real-time stream writer. Use ctx.stream.write(data) to push chunks to subscribers. */
  stream: StreamWriter;
}

/** Context passed as optional second argument to RPC handlers. */
export interface RpcContext {
  traceId: string;
  spanId: string;
  /** Real-time stream writer. Use ctx.stream.write(data) to push chunks to subscribers. */
  stream: StreamWriter;
}

// Handler signature accepted by handleEvent(); may be sync or async.
type EventHandler = (
  payload: unknown,
  ctx: EventContext,
) => void | Promise<void>;
// Handler signature accepted by handleRpc(); the context argument is optional.
type FnHandler = (
  payload: unknown,
  ctx?: RpcContext,
) => unknown | Promise<unknown>;

/** Handle returned by `startHttpSpan()`; call `end()` to close the span. */
export interface HttpSpan {
  traceId: string;
  spanId: string;
  end(opts: { statusCode?: number; success?: boolean; error?: string }): void;
}

/** One item yielded by `watchRun()`. */
export interface RunStreamEvent {
  type: "chunk" | "run_complete";
  runId: string;
  key: string;
  sequence: number;
  data: unknown;
  runStatus?: string;
}

/** Options for `watchRun()`. */
export interface WatchRunOpts {
  /** Filter by stream key. Empty = all keys. Default: "default". */
  key?: string;
  /** Replay chunks with sequence strictly greater than this value. 0 = full replay. */
  fromSequence?: number;
}

/** Classification attached to every ServiceBridgeError. */
export type ServiceBridgeErrorSeverity = "fatal" | "retriable" | "ignorable";
458
+
459
/**
 * Error type raised by the SDK. It carries the failing component and
 * operation, an optional numeric status code, and a severity classification.
 */
export class ServiceBridgeError extends Error {
  /** Numeric status code, when one was available. */
  code?: number;
  /** Subsystem that produced the error. */
  component: string;
  /** Operation that was being performed. */
  operation: string;
  /** Severity classification supplied by the creator. */
  severity: ServiceBridgeErrorSeverity;
  /** Derived flag: true exactly when `severity` is "retriable". */
  retryable: boolean;
  /** Underlying error or value that triggered this one, if any. */
  override cause?: unknown;

  constructor(opts: {
    message: string;
    code?: number;
    component: string;
    operation: string;
    severity: ServiceBridgeErrorSeverity;
    cause?: unknown;
  }) {
    super(opts.message);
    const { code, component, operation, severity, cause } = opts;
    this.name = "ServiceBridgeError";
    this.code = code;
    this.component = component;
    this.operation = operation;
    this.severity = severity;
    this.retryable = severity === "retriable";
    this.cause = cause;
  }
}
485
+
486
/**
 * A single node in a workflow DAG.
 *
 * - `id` — unique step identifier; used in `deps` of other steps.
 * - `deps` — IDs of steps that must succeed before this step runs.
 * Empty array (default) = root step; receives the workflow input directly.
 * - `if` — optional filter expression (same syntax as event filters).
 * If it evaluates to false the step is skipped.
 *
 * The union is discriminated by `type`: every variant except "sleep" carries
 * a `ref`, "sleep" carries `durationMs`, and only "rpc" and "event_wait"
 * accept `timeoutMs`.
 */
export type WorkflowStep =
  | {
      id: string;
      type: "rpc";
      ref: string;
      deps?: string[];
      if?: string;
      timeoutMs?: number;
    }
  | { id: string; type: "event"; ref: string; deps?: string[]; if?: string }
  | {
      id: string;
      type: "event_wait";
      ref: string;
      deps?: string[];
      if?: string;
      timeoutMs?: number;
    }
  | {
      id: string;
      type: "sleep";
      durationMs: number;
      deps?: string[];
      if?: string;
    }
  | { id: string; type: "workflow"; ref: string; deps?: string[]; if?: string };
521
+
522
/** Public surface of the client object returned by `servicebridge()`. */
export interface ServiceBridgeService {
  rpc<T = unknown>(fn: string, payload?: unknown, opts?: RpcOpts): Promise<T>;
  event(topic: string, payload?: unknown, opts?: EventOpts): Promise<string>;
  job(target: string, opts: ScheduleOpts): Promise<string>;
  workflow(name: string, steps: WorkflowStep[]): Promise<string>;
  cancelWorkflowRun(runId: string): Promise<void>;
  /**
   * Registers a handler for an RPC function.
   * If `opts.schema` is provided, the payload is automatically validated and
   * encoded/decoded in binary Protobuf format instead of JSON.
   * The schema is also published to the server registry and used by the calling side.
   */
  handleRpc(
    fn: string,
    handler: FnHandler,
    opts?: HandleRpcOpts,
  ): ServiceBridgeService;
  handleEvent(
    pattern: string,
    handler: EventHandler,
    opts?: HandleEventOpts,
  ): ServiceBridgeService;
  serve(opts?: ServeOpts): Promise<void>;
  stop(): void;
  startHttpSpan(opts: {
    method: string;
    path: string;
    traceId?: string;
    parentSpanId?: string;
  }): HttpSpan;
  /**
   * Register an HTTP endpoint in the ServiceBridge catalog.
   * Call this once per route after your HTTP server is configured.
   * The Express and Fastify adapters call this automatically — you only need
   * this when integrating a different HTTP framework.
   *
   * @example
   * await sb.registerHttpEndpoint({ method: 'GET', route: '/users/:id' });
   */
  registerHttpEndpoint(opts: {
    /** HTTP method: GET, POST, PUT, PATCH, DELETE, HEAD, OPTIONS */
    method: string;
    /** Route pattern including parameter placeholders, e.g. "/users/:id" */
    route: string;
    /** Stable identifier for this process instance. Defaults to a per-SDK UUID. */
    instanceId?: string;
    /** Address where this service can be reached, e.g. "http://10.0.0.1:3000" */
    endpoint?: string;
    /** Service names allowed to call this endpoint (RBAC). Default: [] (all allowed) */
    allowedCallers?: string[];
  }): Promise<void>;
  /**
   * Subscribe to a run's real-time stream.
   * Replays existing chunks then forwards new ones until the run completes or the caller breaks.
   *
   * @example
   * const stream = sb.watchRun(runId, { key: "output" });
   * for await (const chunk of stream) {
   *   process.stdout.write(chunk.data.token);
   * }
   */
  watchRun(runId: string, opts?: WatchRunOpts): AsyncIterable<RunStreamEvent>;
  /** @internal Used by captureConsole to forward intercepted log lines. */
  readonly _log: (
    level: string,
    msg: string,
    attrs?: Record<string, string>,
  ) => void;
}
591
+
592
/** Returns true when `value` is one of the entries of `values`. */
function containsValue(values: string[], value: string): boolean {
  const found = values.includes(value);
  return found;
}
595
+
596
/**
 * Allow-list check in which an empty list means "everything is allowed".
 *
 * @returns true when `values` is empty or contains `value`
 */
function containsOrAll(values: string[], value: string): boolean {
  return values.length === 0 || values.includes(value);
}
600
+
601
// extractPeerCN returns the subject CN of the peer's TLS certificate for a
// grpc-js server call, or "" when the call exposes no getAuthContext(), no
// peer certificate, or a non-string CN.
// NOTE(review): the previous comment stated getAuthContext() is available
// since grpc-js 1.9 — confirm against the grpc-js version in use.
function extractPeerCN(call: grpc.ServerUnaryCall<unknown, unknown>): string {
  type CallWithAuthContext = grpc.ServerUnaryCall<unknown, unknown> & {
    getAuthContext?(): { sslPeerCertificate?: { subject?: { CN?: string } } };
  };
  const authContext = (call as CallWithAuthContext).getAuthContext?.();
  const commonName = authContext?.sslPeerCertificate?.subject?.CN;
  if (typeof commonName !== "string") {
    return "";
  }
  return commonName;
}
612
+
613
/**
 * Milliseconds elapsed since `startedAt` (a `Date.now()` timestamp), clamped
 * so the result is never below 1.
 */
function computeDurationMs(startedAt: number): number {
  const elapsed = Date.now() - startedAt;
  return elapsed < 1 ? 1 : elapsed;
}
616
+
617
/**
 * Serializes `payload` as JSON into a Buffer; `null` and `undefined` are
 * encoded as an empty object.
 */
function toJsonBuffer(payload: unknown): Buffer {
  const json = JSON.stringify(payload ?? {});
  return Buffer.from(json);
}
620
+
621
/**
 * Normalizes PEM material to a Buffer.
 *
 * @param v - PEM text or an existing Buffer
 * @returns the Buffer form, or `undefined` when `v` is missing or an empty string
 */
function toPemBuffer(v?: string | Buffer): Buffer | undefined {
  if (Buffer.isBuffer(v)) {
    return v;
  }
  return v ? Buffer.from(v) : undefined;
}
625
+
626
/**
 * Maps a gRPC status code to an SDK severity.
 *
 * UNAVAILABLE, UNKNOWN, DEADLINE_EXCEEDED and RESOURCE_EXHAUSTED are
 * "retriable"; CANCELLED is "ignorable"; every other code, including a
 * missing one, is "fatal".
 */
function severityFromStatus(code?: number): ServiceBridgeErrorSeverity {
  const status = grpc.status;
  if (code === status.CANCELLED) {
    return "ignorable";
  }
  const transient =
    code === status.UNAVAILABLE ||
    code === status.UNKNOWN ||
    code === status.DEADLINE_EXCEEDED ||
    code === status.RESOURCE_EXHAUSTED;
  return transient ? "retriable" : "fatal";
}
639
+
640
/**
 * Converts an arbitrary thrown value into a ServiceBridgeError.
 *
 * An existing ServiceBridgeError is returned untouched. Otherwise the numeric
 * `code` (if any) is kept, and the message is taken from a non-empty string
 * `message` property, then from `Error.message`, then from `String(err)`.
 *
 * @param err - the value that was thrown or passed to a callback
 * @param operation - name of the operation that failed
 * @param component - subsystem label; defaults to "control-plane"
 */
function normalizeServiceError(
  err: unknown,
  operation: string,
  component = "control-plane",
): ServiceBridgeError {
  if (err instanceof ServiceBridgeError) return err;

  const candidate = err as Partial<grpc.ServiceError> | undefined;
  const code = typeof candidate?.code === "number" ? candidate.code : undefined;

  let message: string;
  if (typeof candidate?.message === "string" && candidate.message.length > 0) {
    message = candidate.message;
  } else if (err instanceof Error) {
    message = err.message;
  } else {
    message = String(err);
  }

  const severity = severityFromStatus(code);
  return new ServiceBridgeError({
    message,
    code,
    component,
    operation,
    severity,
    cause: err,
  });
}
665
+
666
/**
 * Normalizes `err` and writes it to the console: `console.error` for fatal
 * errors, `console.warn` for everything else. The original `err` is passed
 * as the second console argument.
 *
 * @param operation - name of the operation that failed
 * @param err - the value that was thrown
 * @param component - subsystem label; defaults to "sdk"
 * @returns the normalized error, so the caller can rethrow or inspect it
 */
function reportSDKError(
  operation: string,
  err: unknown,
  component = "sdk",
): ServiceBridgeError {
  const normalized = normalizeServiceError(err, operation, component);
  const line = `[servicebridge] ${normalized.component}.${normalized.operation}: ${normalized.message}`;
  const isFatal = normalized.severity === "fatal";
  if (isFatal) console.error(line, err);
  else console.warn(line, err);
  return normalized;
}
680
+
681
/**
 * Compares two secrets in constant time.
 *
 * Both inputs are hashed with SHA-256 first, so the buffers handed to
 * `timingSafeEqual` always have the same length regardless of input size.
 */
function _secretEquals(a: string, b: string): boolean {
  const sha256 = (text: string): Buffer =>
    crypto.createHash("sha256").update(text).digest();
  return crypto.timingSafeEqual(sha256(a), sha256(b));
}
686
+
687
/**
 * Splits a canonical function name of the form `service/function` at its
 * first slash.
 *
 * `canonicalName` is the trimmed input as given; `serviceName` and `fnName`
 * are additionally trimmed on their own, so whitespace around the slash is
 * dropped from the parts but not from `canonicalName`.
 *
 * @returns the parsed parts, or `null` when the input is blank, has no slash,
 *          or has nothing before or after the first slash
 */
function parseCanonicalFunctionName(target: string): {
  canonicalName: string;
  serviceName: string;
  fnName: string;
} | null {
  const canonicalName = target.trim();
  const slashAt = canonicalName.indexOf("/");
  // Blank input yields -1 here, so this single guard also rejects it.
  const hasBothSides = slashAt > 0 && slashAt < canonicalName.length - 1;
  if (!hasBothSides) return null;

  const serviceName = canonicalName.slice(0, slashAt).trim();
  const fnName = canonicalName.slice(slashAt + 1).trim();
  if (serviceName.length === 0 || fnName.length === 0) return null;

  return { canonicalName, serviceName, fnName };
}
701
+
702
/**
 * Validates a worker endpoint string and returns it trimmed.
 *
 * @throws Error when the endpoint is blank or contains a URL scheme (`://`)
 */
function _resolveWorkerEndpoint(endpoint: string): string {
  const trimmed = endpoint.trim();
  if (trimmed.length === 0) {
    throw new Error("worker endpoint is empty");
  }
  const hasScheme = trimmed.indexOf("://") !== -1;
  if (hasScheme) {
    throw new Error("worker endpoint must be host:port without scheme");
  }
  return trimmed;
}
710
+
711
/**
 * Channel options for a worker connection.
 *
 * Without a `serverName` the shared `channelOpts` object itself is returned.
 * With one, a copy is returned that also sets the TLS target-name override
 * and the default authority to that name.
 */
function workerChannelOptions(
  tlsOpts?: WorkerTLSOpts,
): Record<string, string | number> {
  const serverName = tlsOpts?.serverName;
  if (serverName) {
    return {
      ...channelOpts,
      "grpc.ssl_target_name_override": serverName,
      "grpc.default_authority": serverName,
    };
  }
  return channelOpts;
}
723
+
724
/**
 * Builds TLS channel credentials for connecting to a worker.
 *
 * `caCert` is supplied as the root certificates, `key` as the private key
 * and `cert` as the certificate chain; any of them may be absent.
 */
function workerClientCredentials(
  tlsOpts?: WorkerTLSOpts,
): grpc.ChannelCredentials {
  const rootCerts = toPemBuffer(tlsOpts?.caCert);
  const privateKey = toPemBuffer(tlsOpts?.key);
  const certChain = toPemBuffer(tlsOpts?.cert);
  return grpc.credentials.createSsl(rootCerts, privateKey, certChain);
}
733
+
734
+ // ── TLS auto-provisioning (Variant B.1) ───────────────────────────────────────
735
+ // SDK generates an ECDSA key pair locally. Only the public key is sent to the
736
+ // server. The server signs it and returns cert + CA PEM. Private key never leaves
737
+ // this process.
738
+
739
/**
 * Obtains mTLS material for the worker from the admin API.
 *
 * An EC P-256 key pair is generated locally; only the public key is posted
 * to `<adminBase>/api/tls/provision`, authenticated with the `x-service-key`
 * header. The response must contain `cert_pem` and `ca_pem`.
 *
 * @param adminBase - base URL of the HTTP admin API; trailing slashes are ignored
 * @param serviceKey - service key sent in the `x-service-key` header
 * @param _serviceName - unused; kept for signature compatibility
 * @returns the signed certificate, the locally generated private key and the CA certificate
 * @throws Error when the request fails or times out, the server answers with a
 *         non-2xx status, or the response body is not JSON or is incomplete
 */
async function provisionWorkerTLS(
  adminBase: string,
  serviceKey: string,
  _serviceName: string,
): Promise<{ cert: Buffer; key: Buffer; caCert: Buffer }> {
  // Upper bound for the provisioning round-trip so a hung admin endpoint
  // cannot block the caller forever.
  const requestTimeoutMs = 30_000;

  const { privateKey, publicKey } = crypto.generateKeyPairSync("ec", {
    namedCurve: "P-256",
    privateKeyEncoding: { type: "pkcs8", format: "pem" },
    publicKeyEncoding: { type: "spki", format: "pem" },
  });

  // Strip trailing slashes so "http://host:14444/" does not yield "//api/...".
  const endpoint = `${adminBase.replace(/\/+$/, "")}/api/tls/provision`;

  let res: Awaited<ReturnType<typeof fetch>>;
  try {
    res = await fetch(endpoint, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-service-key": serviceKey,
      },
      body: JSON.stringify({ public_key_pem: publicKey }),
      signal: AbortSignal.timeout(requestTimeoutMs),
    });
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`TLS provisioning request to ${endpoint} failed: ${reason}`);
  }

  if (!res.ok) {
    const body = await res.text().catch(() => "");
    // NOTE(review): the message echoes the first 12 characters of the service
    // key; confirm that prefix is not considered secret before it reaches logs.
    throw new Error(
      `TLS provisioning failed (${res.status}): ${body}. ` +
        `Make sure the service key "${serviceKey.slice(0, 12)}…" has a service_name set in the UI.`,
    );
  }

  let data: { cert_pem?: string; ca_pem?: string } | null;
  try {
    data = (await res.json()) as { cert_pem?: string; ca_pem?: string } | null;
  } catch {
    throw new Error("TLS provisioning: server returned a non-JSON response");
  }
  if (!data?.cert_pem || !data?.ca_pem) {
    throw new Error(
      "TLS provisioning: server returned incomplete response (missing cert_pem or ca_pem)",
    );
  }

  return {
    cert: Buffer.from(data.cert_pem),
    key: Buffer.from(privateKey as string),
    caCert: Buffer.from(data.ca_pem),
  };
}
780
+
781
+ // ── gRPC Custom Name Resolver for sb:// scheme ────────────────────────────────
782
+ // Each servicebridge() call registers its context keyed by a unique clientId.
783
+ // gRPC channels use target `sb://<clientId>/<canonical-fn-name>`, which routes
784
+ // through this resolver. The resolver calls LookupFunction on first resolution
785
+ // and re-resolves on a background timer (Stale-While-Revalidate pattern).
786
+ // gRPC's native subchannel health monitoring handles dead-worker detection instantly.
787
+
788
// A single resolved worker address, in the host/port form handed to gRPC.
type SbEndpointInfo = { host: string; port: number };

// Per-client data the sb:// resolver needs: how to look up the endpoints of a
// canonical function name and how often (ms) to repeat that lookup.
interface SbResolverContext {
  lookupFn: (target: string) => Promise<SbEndpointInfo[]>;
  refreshIntervalMs: number;
}
794
+
795
// Resolver contexts keyed by clientId (the authority part of an sb:// target).
const _sbContexts = new Map<string, SbResolverContext>();
// True once the sb:// resolver has been successfully registered with grpc-js.
let _sbResolverRegistered = false;

/**
 * Registers the `sb://` name resolver with grpc-js, once per process.
 *
 * The "registered" flag is only set after registration succeeds, so a failed
 * attempt (for example the resolver module failing to load) is retried on the
 * next call instead of being silently skipped forever.
 */
function ensureSbResolverRegistered(): void {
  if (_sbResolverRegistered) return;

  // eslint-disable-next-line @typescript-eslint/no-require-imports
  const { registerResolver } = require("@grpc/grpc-js/build/src/resolver") as {
    registerResolver: (scheme: string, cls: unknown) => void;
  };

  class SbResolver {
    private readonly clientId: string;
    private readonly canonicalName: string;
    private readonly listener: (
      endpoints: unknown,
      attrs: Record<string, unknown>,
      serviceConfig: null,
      note: string,
    ) => boolean;
    private refreshTimer: ReturnType<typeof setInterval> | null = null;

    constructor(
      target: { authority?: string; path: string },
      listener: SbResolver["listener"],
    ) {
      this.clientId = target.authority ?? "";
      // path may be "/canonical-name" (with leading slash) or "canonical-name"
      this.canonicalName = (target.path ?? "").replace(/^\/+/, "");
      this.listener = listener;
      this.init().catch(() => {});
    }

    /** Starts the periodic refresh and performs the first resolution. */
    private async init(): Promise<void> {
      const ctx = _sbContexts.get(this.clientId);
      if (!ctx) {
        this.reportUnavailable(`No SB context for clientId=${this.clientId}`);
        return;
      }
      // Background timer: re-resolve on every tick so new replicas are discovered
      // without a streaming connection. Dead replicas are caught immediately by
      // gRPC's native subchannel health probing.
      this.refreshTimer = setInterval(
        () => this.resolve().catch(() => {}),
        ctx.refreshIntervalMs,
      );
      await this.resolve();
    }

    /** Called by gRPC to request a fresh resolution. */
    updateResolution(): void {
      this.resolve().catch(() => {});
    }

    /** Looks up the current endpoints and reports them to the listener. */
    private async resolve(): Promise<void> {
      const ctx = _sbContexts.get(this.clientId);
      if (!ctx) return;
      try {
        const eps = await ctx.lookupFn(this.canonicalName);
        this.reportEndpoints(eps);
      } catch {
        // Lookup failures are deliberately not reported: the next timer tick
        // or updateResolution() call tries again.
      }
    }

    /** Reports an UNAVAILABLE resolution result with a real Metadata object. */
    private reportUnavailable(details: string): void {
      this.listener(
        {
          ok: false,
          error: {
            code: grpc.status.UNAVAILABLE,
            details,
            // A Metadata instance rather than `{}`, so code that later reads
            // status.metadata can call its methods.
            metadata: new grpc.Metadata(),
          },
        },
        {},
        null,
        "sb-resolver",
      );
    }

    private reportEndpoints(eps: SbEndpointInfo[]): void {
      if (eps.length === 0) {
        this.reportUnavailable(`No live endpoints for ${this.canonicalName}`);
        return;
      }
      this.listener(
        { ok: true, value: eps.map((ep) => ({ addresses: [ep] })) },
        {},
        null,
        "sb-resolver",
      );
    }

    /** Stops the background refresh timer. */
    destroy(): void {
      if (this.refreshTimer) {
        clearInterval(this.refreshTimer);
        this.refreshTimer = null;
      }
    }

    static getDefaultAuthority(target: {
      authority?: string;
      path: string;
    }): string {
      return target.authority ?? target.path ?? "";
    }
  }

  registerResolver("sb", SbResolver);
  // Latch only after registration actually succeeded.
  _sbResolverRegistered = true;
}
914
+
915
+ export function servicebridge(
916
+ url: string,
917
+ serviceKey: string,
918
+ service = "",
919
+ globalOpts: ServiceBridgeOpts = {},
920
+ ): ServiceBridgeService {
921
+ const meta = new grpc.Metadata();
922
+ meta.add("x-service-key", serviceKey);
923
+
924
+ const rawUrl = url.trim();
925
+ const target = rawUrl.replace(/^grpcs?:\/\//, "") || "127.0.0.1:14445";
926
+
927
// Derive the HTTP admin base URL (used for TLS provisioning and management).
// An explicit `adminUrl` always wins. Default: same host as gRPC but on port
// 14444. The default is applied even when the gRPC URL carries no explicit
// port or no scheme; previously such URLs produced an admin URL without the
// 14444 port or without the `http://` prefix.
const adminBase =
  globalOpts.adminUrl ??
  ((): string => {
    if (!rawUrl) return "http://127.0.0.1:14444";
    // grpc:// and grpcs:// map to http://; a bare host[:port] gets http://
    // prepended; any other explicit scheme is kept as given.
    const withScheme = /^grpcs?:\/\//.test(rawUrl)
      ? rawUrl.replace(/^grpcs?:\/\//, "http://")
      : /^[a-z][a-z\d+.-]*:\/\//i.test(rawUrl)
        ? rawUrl
        : `http://${rawUrl}`;
    // Trailing explicit port: swap it for the admin port.
    if (/:\d+$/.test(withScheme)) {
      return withScheme.replace(/:\d+$/, ":14444");
    }
    // No port: append the admin port only to a plain scheme://host value;
    // anything with a path is left untouched.
    return /^[^/]+:\/\/[^/]+$/.test(withScheme)
      ? `${withScheme}:14444`
      : withScheme;
  })();
934
+
935
+ const configuredCACert = toPemBuffer(globalOpts.workerTLS?.caCert);
936
+ const configuredClientKey = toPemBuffer(globalOpts.workerTLS?.key);
937
+ const configuredClientCert = toPemBuffer(globalOpts.workerTLS?.cert);
938
+
939
+ if (
940
+ (configuredClientCert && !configuredClientKey) ||
941
+ (!configuredClientCert && configuredClientKey)
942
+ ) {
943
+ throw new Error(
944
+ "workerTLS.cert and workerTLS.key must be provided together",
945
+ );
946
+ }
947
+
948
/**
 * Builds the channel credentials for the control-plane stub.
 *
 * The control plane uses one-way TLS: the client authenticates through the
 * x-service-key metadata entry, so no client certificate is passed here.
 * With a configured CA cert the server is verified against it; without one
 * the server identity check is replaced by a no-op (intended for local dev
 * with self-signed CAs).
 *
 * NOTE(review): `checkServerIdentity` only overrides the identity check;
 * whether certificate-chain validation is also bypassed depends on grpc-js —
 * confirm.
 */
function makeControlPlaneCreds(): grpc.ChannelCredentials {
  return configuredCACert
    ? grpc.credentials.createSsl(configuredCACert, null, null)
    : grpc.credentials.createSsl(null, null, null, {
        checkServerIdentity: () => undefined,
      });
}
962
+
963
/**
 * Returns call options with an absolute deadline `timeoutMs` from now.
 * Defaults to `globalOpts.timeout`, or 30 seconds when that is unset.
 */
function unaryDeadlineOptions(
  timeoutMs = globalOpts.timeout ?? 30_000,
): DeadlineOptions {
  const deadline = new Date(Date.now() + timeoutMs);
  return { deadline };
}
968
+
969
+ // Unique client ID — used as the authority in `sb://<clientId>/<fn>` targets
970
+ // so the module-level SbResolver can find this client's context.
971
+ const clientId = crypto.randomUUID();
972
+
973
+ // Effective mTLS materials — set from explicit workerTLS or auto-provisioned in serve().
974
+ // Declared here so _controlReady closure can reference it before the normal var declaration.
975
+ let effectiveWorkerTLS: WorkerTLSOpts | null = globalOpts.workerTLS ?? null;
976
+
977
+ // The stub is created lazily after TLS provisioning.
978
+ // Until _controlReady resolves, all SDK calls are held in the offline queue.
979
+ let stub = new servicebridgeProto.ServiceBridge(
980
+ target,
981
+ makeControlPlaneCreds(),
982
+ workerChannelOptions(globalOpts.workerTLS),
983
+ );
984
+
985
// Resolves once the control-plane stub has its final credentials.
// With an explicit CA cert the initial stub is already correct. Otherwise
// worker TLS material is provisioned in the background and the stub is
// recreated with the provisioned CA cert. Provisioning errors are swallowed,
// so this promise never rejects.
const _controlReady: Promise<void> = (async () => {
  if (!configuredCACert) {
    try {
      const provisioned = await provisionWorkerTLS(
        adminBase,
        serviceKey,
        service,
      );
      effectiveWorkerTLS = provisioned;
      // Best-effort close of the provisional stub before replacing it.
      try {
        (stub as { close?: () => void }).close?.();
      } catch {
        /* ignore */
      }
      // One-way TLS against the provisioned CA; no client cert on this channel.
      stub = new servicebridgeProto.ServiceBridge(
        target,
        grpc.credentials.createSsl(provisioned.caCert, null, null),
        workerChannelOptions(provisioned),
      );
    } catch (_err) {
      // Provisioning failure is non-fatal for client-only mode (no serve()).
      // Errors will surface when actual gRPC calls are attempted.
    }
  }
})();
1013
+
1014
+ type EventHandlerEntry = {
1015
+ groupName: string;
1016
+ pattern: string;
1017
+ handler: EventHandler;
1018
+ opts: HandleEventOpts;
1019
+ };
1020
+
1021
+ const eventHandlers = new Map<string, EventHandlerEntry>();
1022
+ const fnHandlers = new Map<
1023
+ string,
1024
+ { handler: FnHandler; opts: HandleRpcOpts }
1025
+ >();
1026
+ const registeredGroups = new Set<string>();
1027
+
1028
+ // ── Lightweight registry: metadata only, no connection state ──────────────
1029
+ // gRPC channels (via SbResolver) own connection management.
1030
+ interface FunctionMeta {
1031
+ canonicalName: string;
1032
+ serviceName: string;
1033
+ fnName: string;
1034
+ endpoints: SbEndpointInfo[];
1035
+ inputSchema?: RpcSchema;
1036
+ outputSchema?: RpcSchema;
1037
+ allowedCallers: string[];
1038
+ }
1039
+ const functionMeta = new Map<string, FunctionMeta>();
1040
+ // shortName → canonical (null = ambiguous: multiple services export same fn name)
1041
+ const fnAliasMap = new Map<string, string | null>();
1042
+ // One grpc.Channel per canonical function name; owned by gRPC via SbResolver
1043
+ const functionChannels = new Map<string, grpc.Channel>();
1044
+
1045
+ let isOnline = false;
1046
+ let stopped = false;
1047
+ // Timer to restore isOnline after transient control-plane connection errors.
1048
+ let onlineRestoreTimer: ReturnType<typeof setTimeout> | null = null;
1049
+
1050
+ type QueuedOp =
1051
+ | { type: "event"; topic: string; payload: unknown; opts?: EventOpts }
1052
+ | { type: "job"; target: string; opts: ScheduleOpts }
1053
+ | { type: "workflow"; name: string; steps: WorkflowStep[] }
1054
+ | {
1055
+ type: "reportCallStart";
1056
+ traceId: string;
1057
+ spanId: string;
1058
+ parentSpanId: string;
1059
+ fn: string;
1060
+ startedAt: number;
1061
+ inputBuf: Buffer;
1062
+ attempt: number;
1063
+ }
1064
+ | {
1065
+ type: "reportCall";
1066
+ traceId: string;
1067
+ spanId: string;
1068
+ fn: string;
1069
+ startedAt: number;
1070
+ inputBuf: Buffer;
1071
+ success: boolean;
1072
+ attempt: number;
1073
+ outputBuf?: Buffer;
1074
+ error?: string;
1075
+ };
1076
+
1077
+ const offlineQueue: QueuedOp[] = [];
1078
+ let workerServer: grpc.Server | null = null;
1079
+ let serveState: {
1080
+ endpoint: string;
1081
+ transport: WorkerTransport;
1082
+ opts: ServeOpts;
1083
+ instanceId: string;
1084
+ } | null = null;
1085
+ let heartbeatTimer: ReturnType<typeof setTimeout> | null = null;
1086
+ let registrationSyncPromise: Promise<void> | null = null;
1087
+ let lastCpuSample: { user: number; system: number } | null = null;
1088
+ let lastCpuTime = 0;
1089
+
1090
/**
 * Samples process memory and CPU usage.
 *
 * `ramMb` is the resident set size rounded to whole MiB. `cpuPercent` is
 * derived from the CPU-time delta since the previous call, clamped to
 * [0, 100]; it is left undefined on the first call and whenever less than
 * 100 ms have passed since the previous sample. Each call records a new
 * baseline sample.
 */
function getProcessMetrics(): { cpuPercent?: number; ramMb?: number } {
  const sampledAt = Date.now();
  const usage = process.cpuUsage();
  const previous = lastCpuSample;
  const previousAt = lastCpuTime;

  lastCpuSample = { user: usage.user, system: usage.system };
  lastCpuTime = sampledAt;

  let cpuPercent: number | undefined;
  if (previous && previousAt > 0) {
    const elapsedMs = sampledAt - previousAt;
    if (elapsedMs >= 100) {
      // cpuUsage() reports microseconds: µs / (ms * 10) is a percentage.
      const userDelta = usage.user - previous.user;
      const systemDelta = usage.system - previous.system;
      const raw = (userDelta + systemDelta) / (elapsedMs * 10);
      cpuPercent = Math.min(100, Math.max(0, raw));
    }
  }

  const ramMb = Math.round(process.memoryUsage().rss / (1024 * 1024));
  return { cpuPercent, ramMb };
}
1110
+
1111
/**
 * Appends an operation to the offline queue, applying the overflow policy
 * when the queue already holds `queueMaxSize` (default 1000) entries:
 * "error" throws, "drop-oldest" (the default) evicts the head, and any other
 * policy silently discards the new operation.
 */
function enqueueOffline(op: QueuedOp): void {
  const capacity = globalOpts.queueMaxSize ?? 1000;
  if (offlineQueue.length >= capacity) {
    switch (globalOpts.queueOverflow ?? "drop-oldest") {
      case "error":
        throw new Error("ServiceBridge offline queue is full");
      case "drop-oldest":
        offlineQueue.shift();
        break;
      default:
        return;
    }
  }
  offlineQueue.push(op);
}
1125
+
1126
/** Creates call metadata carrying this service's name as `x-caller-service`. */
function makeWorkerMeta(): grpc.Metadata {
  const metadata = new grpc.Metadata();
  metadata.add("x-caller-service", service);
  return metadata;
}
1131
+
1132
/**
 * Sends a ReportCallStart to the control plane and resolves when the server
 * acknowledges it; rejects with the gRPC error otherwise.
 */
async function sendReportCallStart(op: {
  traceId: string;
  spanId: string;
  parentSpanId: string;
  fn: string;
  startedAt: number;
  inputBuf: Buffer;
  attempt: number;
}): Promise<void> {
  const request = {
    trace_id: op.traceId,
    span_id: op.spanId,
    parent_span_id: op.parentSpanId,
    fn: op.fn,
    service_name: service,
    started_at: String(op.startedAt),
    input: op.inputBuf,
    attempt: op.attempt,
    instance_id: serveState?.instanceId ?? "",
  };
  await new Promise<void>((resolve, reject) => {
    stub.ReportCallStart(
      request,
      meta,
      unaryDeadlineOptions(),
      (err: Error | null) => {
        if (err) reject(err);
        else resolve();
      },
    );
  });
}
1160
+
1161
/**
 * Sends a ReportCall (call completion) to the control plane and resolves
 * when the server acknowledges it; rejects with the gRPC error otherwise.
 *
 * `duration_ms` is computed from `op.startedAt` at send time.
 * NOTE(review): for reports replayed from the offline queue this presumably
 * includes the time spent queued — confirm against computeDurationMs.
 */
async function sendReportCall(op: {
  traceId: string;
  spanId: string;
  fn: string;
  startedAt: number;
  inputBuf: Buffer;
  success: boolean;
  attempt: number;
  outputBuf?: Buffer;
  error?: string;
}): Promise<void> {
  const request = {
    trace_id: op.traceId,
    span_id: op.spanId,
    fn: op.fn,
    service_name: service,
    started_at: String(op.startedAt),
    duration_ms: String(computeDurationMs(op.startedAt)),
    success: op.success,
    error: op.error ?? "",
    input: op.inputBuf,
    output: op.outputBuf ?? Buffer.alloc(0),
    attempt: op.attempt,
    instance_id: serveState?.instanceId ?? "",
  };
  await new Promise<void>((resolve, reject) => {
    stub.ReportCall(
      request,
      meta,
      unaryDeadlineOptions(),
      (err: Error | null) => {
        if (err) reject(err);
        else resolve();
      },
    );
  });
}
1194
+
1195
+ // ── Logger (batched log shipping to ServiceBridge) ──────────────────────────
1196
+ type LogEntry = {
1197
+ trace_id?: string;
1198
+ span_id?: string;
1199
+ service_name: string;
1200
+ level: string;
1201
+ message: string;
1202
+ timestamp_ns: string;
1203
+ attributes: Record<string, string>;
1204
+ instance_id?: string;
1205
+ };
1206
+
1207
+ let logBatch: LogEntry[] = [];
1208
+ let logFlushTimer: ReturnType<typeof setTimeout> | null = null;
1209
+
1210
/**
 * Ships the current log batch to the control plane and starts a new, empty
 * batch. Cancels any pending flush timer. The RPC result is ignored, so a
 * failed send drops the batch.
 */
function flushLogs(): void {
  const entries = logBatch;
  if (entries.length === 0) return;
  logBatch = [];
  if (logFlushTimer !== null) {
    clearTimeout(logFlushTimer);
    logFlushTimer = null;
  }
  stub.ReportLog({ entries }, meta, unaryDeadlineOptions(), () => {});
}
1220
+
1221
/**
 * Adds one log entry to the pending batch, tagging it with the current trace
 * context and instance id when available. The batch is flushed immediately
 * once it holds 100 entries; otherwise a flush is scheduled 500 ms out unless
 * one is already pending.
 */
function pushLog(
  level: string,
  msg: string,
  attrs?: Record<string, string>,
): void {
  const trace = traceStorage.getStore();
  const entry: LogEntry = {
    service_name: service,
    level,
    message: msg,
    timestamp_ns: String(Date.now() * 1_000_000),
    attributes: attrs ?? {},
  };
  if (trace?.traceId) entry.trace_id = trace.traceId;
  if (trace?.spanId) entry.span_id = trace.spanId;
  if (serveState?.instanceId) entry.instance_id = serveState.instanceId;
  logBatch.push(entry);

  if (logBatch.length >= 100) {
    flushLogs();
    return;
  }
  if (logFlushTimer === null) {
    logFlushTimer = setTimeout(flushLogs, 500);
  }
}
1244
+
1245
+ // ── Registry context registration ──────────────────────────────────────────
1246
+ ensureSbResolverRegistered();
1247
+ _sbContexts.set(clientId, {
1248
+ lookupFn: async (target: string) => {
1249
+ const canonical = resolveCanonical(target);
1250
+ if (canonical) {
1251
+ const m = functionMeta.get(canonical);
1252
+ if (m && m.endpoints.length > 0) return m.endpoints;
1253
+ }
1254
+ return doLookupFunction(target);
1255
+ },
1256
+ refreshIntervalMs: globalOpts.discoveryRefreshMs ?? 10_000,
1257
+ });
1258
+
1259
+ // ── Helper: parse raw wire endpoints into SbEndpointInfo[] ─────────────────
1260
/**
 * Converts raw wire endpoints (objects with an `endpoint: "host:port"`
 * string) into `SbEndpointInfo` values.
 *
 * Entries are skipped when `endpoint` is missing or not a string, has no
 * host, or has a port that is not a plain decimal integer in 1..65535.
 * The host is everything before the last colon, so bracketed IPv6 hosts
 * such as "[::1]:9000" are kept intact.
 *
 * @param raw Untrusted value from the wire; anything but an array yields [].
 * @returns The valid endpoints, in input order.
 */
function parseEndpointsFromWire(raw: unknown): SbEndpointInfo[] {
  if (!Array.isArray(raw)) return [];
  const results: SbEndpointInfo[] = [];
  for (const ep of raw as unknown[]) {
    const value = (ep as { endpoint?: unknown } | null | undefined)?.endpoint;
    if (typeof value !== "string") continue;
    const str = value.trim();
    const colon = str.lastIndexOf(":");
    // colon <= 0 covers both "no colon" and "empty host".
    if (colon <= 0) continue;
    const portText = str.slice(colon + 1);
    // Digits only: parseInt would silently accept "80abc" or "80.5".
    if (!/^\d{1,5}$/.test(portText)) continue;
    const port = Number(portText);
    if (port < 1 || port > 65535) continue;
    results.push({ host: str.slice(0, colon), port });
  }
  return results;
}
1275
+
1276
/**
 * Parses a JSON-encoded schema, returning it only when the text decodes to a
 * non-array object. Non-strings, empty strings, invalid JSON and any other
 * JSON value yield undefined.
 */
function parseSchemaJsonSafe(raw: unknown): RpcSchema | undefined {
  if (typeof raw === "string" && raw.length > 0) {
    try {
      const value: unknown = JSON.parse(raw);
      const isPlainObject =
        typeof value === "object" && value !== null && !Array.isArray(value);
      if (isPlainObject) return value as RpcSchema;
    } catch {
      /* ignored */
    }
  }
  return undefined;
}
1287
+
1288
+ // ── Resolve a call target to a canonical function name ─────────────────────
1289
/**
 * Maps a call target to a canonical function name using local metadata only.
 *
 * A target containing "/" is treated as canonical and returned when known.
 * A short name is resolved through the alias map.
 *
 * @returns The canonical name, or null when the target is not known locally.
 * @throws Error when the target is blank, or when a short name is exported
 *         by more than one service.
 */
function resolveCanonical(target: string): string | null {
  const name = target.trim();
  if (name === "") throw new Error("RPC target is required");
  if (name.includes("/")) return functionMeta.has(name) ? name : null;

  const canonical = fnAliasMap.get(name);
  if (canonical === undefined) return null;
  if (canonical === null) {
    throw new Error(
      `RPC target "${name}" is ambiguous; use canonical service/fn`,
    );
  }
  return canonical;
}
1300
+
1301
+ // ── Server-side lazy lookup (called by resolver on cache miss) ───────────────
1302
/**
 * Asks the control plane for a function's endpoints (5 s deadline) and
 * caches the answer in the local registry.
 *
 * Never rejects: an RPC error or a "not found" reply resolves to []. For a
 * function not yet in `functionMeta`, a full metadata record is inserted and
 * the alias map rebuilt; for a known function only its endpoints are
 * refreshed.
 */
async function doLookupFunction(target: string): Promise<SbEndpointInfo[]> {
  return new Promise((resolve) => {
    stub.LookupFunction(
      { fn_name: target },
      meta,
      { deadline: new Date(Date.now() + 5_000) },
      (err, res) => {
        if (err || !res?.found) {
          resolve([]);
          return;
        }
        const canonicalName = res.canonical_name ?? target;
        const wire = res.endpoints as RegistryFunctionWire | undefined;
        const endpoints = parseEndpointsFromWire(wire?.endpoints);

        if (functionMeta.has(canonicalName)) {
          // Known function: only the endpoint list is refreshed.
          const known = functionMeta.get(canonicalName);
          if (known) known.endpoints = endpoints;
        } else {
          // New function: store it so the next resolveCanonical() finds it.
          const nameParts = parseCanonicalFunctionName(canonicalName);
          if (nameParts) {
            functionMeta.set(canonicalName, {
              canonicalName,
              fnName: nameParts.fnName,
              serviceName: nameParts.serviceName,
              inputSchema: parseSchemaJsonSafe(wire?.input_schema_json),
              outputSchema: parseSchemaJsonSafe(wire?.output_schema_json),
              allowedCallers: Array.isArray(wire?.allowed_callers)
                ? (wire.allowed_callers as string[])
                : [],
              endpoints,
            });
            rebuildAliasMap();
          }
        }
        resolve(endpoints);
      },
    );
  });
}
1343
+
1344
+ // ── Registry rebuild helpers ────────────────────────────────────────────────
1345
/**
 * Rebuilds the short-name alias map from `functionMeta`: a short name maps to
 * its canonical name when exactly one function carries it, and to null when
 * several do.
 */
function rebuildAliasMap(): void {
  fnAliasMap.clear();
  for (const { fnName, canonicalName } of functionMeta.values()) {
    // A second sighting of the same short name marks it ambiguous.
    fnAliasMap.set(fnName, fnAliasMap.has(fnName) ? null : canonicalName);
  }
}
1356
+
1357
+ // ── gRPC channel per function (resolver manages endpoint selection + LB) ────
1358
/**
 * Returns the cached channel for a canonical function name, creating it on
 * first use. New channels target `sb://<clientId>/<canonicalName>` with
 * round_robin load balancing and use the worker TLS material in effect at
 * creation time, or credentials with a no-op identity check when none is set.
 */
function getOrCreateFunctionChannel(canonicalName: string): grpc.Channel {
  const cached = functionChannels.get(canonicalName);
  if (cached) return cached;

  const tlsOpts = effectiveWorkerTLS;
  const creds = tlsOpts
    ? workerClientCredentials(tlsOpts)
    : grpc.credentials.createSsl(null, null, null, {
        checkServerIdentity: () => undefined,
      });
  // `sb://<clientId>/<canonical>` — SbResolver looks up context by clientId
  const channel = new grpc.Channel(
    `sb://${clientId}/${canonicalName}`,
    creds,
    {
      ...workerChannelOptions(tlsOpts ?? undefined),
      "grpc.lb_policy_name": "round_robin",
    },
  );
  functionChannels.set(canonicalName, channel);
  return channel;
}
1376
+
1377
/** Closes every cached function channel, ignoring close errors, then empties the cache. */
function closeAllFunctionChannels(): void {
  for (const channel of functionChannels.values()) {
    try {
      channel.close();
    } catch {
      /* ignored */
    }
  }
  functionChannels.clear();
}
1387
+
1388
+ // Set online once the control plane stub has valid creds and flush any queued ops.
1389
+ _controlReady
1390
+ .then(() => {
1391
+ isOnline = true;
1392
+ flushQueue().catch((err) => {
1393
+ if (isConnectionError(err)) scheduleOnlineRestore();
1394
+ else reportSDKError("flush-on-ready", err);
1395
+ });
1396
+ })
1397
+ .catch(() => {
1398
+ // _controlReady catches provisioning errors internally and never rejects,
1399
+ // so isOnline is still set to true above; this handler is only a safety net.
1400
+ });
1401
+
1402
+ // Restore isOnline after transient control-plane errors (gRPC reconnects automatically).
1403
/**
 * Schedules a single attempt, 2 seconds out, to mark the client online again
 * and flush the offline queue. Does nothing when the client is stopped, is
 * already online, or already has a restore pending. A flush that fails with
 * a connection error schedules another restore.
 */
function scheduleOnlineRestore(): void {
  if (stopped || isOnline || onlineRestoreTimer) return;

  const restore = (): void => {
    onlineRestoreTimer = null;
    if (stopped) return;
    isOnline = true;
    flushQueue().catch((err) => {
      if (isConnectionError(err)) scheduleOnlineRestore();
      else reportSDKError("flush-on-restore", err);
    });
  };

  onlineRestoreTimer = setTimeout(restore, 2_000);
}
1416
+
1417
/** True when the value carries a gRPC status code of UNAVAILABLE or UNKNOWN. */
function isConnectionError(e: unknown): boolean {
  const code = (e as Partial<grpc.ServiceError> | undefined)?.code;
  switch (code) {
    case grpc.status.UNAVAILABLE:
    case grpc.status.UNKNOWN:
      return true;
    default:
      return false;
  }
}
1421
+
1422
/** Returns an Error's message, or the string form of any other thrown value. */
function normalizeUnknownErrorMessage(error: unknown): string {
  if (error instanceof Error) return error.message;
  return String(error);
}
1425
+
1426
/**
 * Fire-and-forget report that a call has started.
 *
 * While offline the report is queued. While online it is sent immediately;
 * a connection error marks the client offline, schedules an online restore
 * and queues the report, while any other error is passed to reportSDKError.
 */
function reportCallStartAsync(
  traceId: string,
  spanId: string,
  parentSpanId: string,
  fn: string,
  startedAt: number,
  inputBuf: Buffer,
  attempt: number,
): void {
  const op = {
    type: "reportCallStart" as const,
    traceId,
    spanId,
    parentSpanId,
    fn,
    startedAt,
    inputBuf,
    attempt,
  };
  if (!isOnline) {
    enqueueOffline(op);
    return;
  }
  sendReportCallStart(op).catch((err) => {
    if (!isConnectionError(err)) {
      reportSDKError("report-call-start", err);
      return;
    }
    isOnline = false;
    scheduleOnlineRestore();
    enqueueOffline(op);
  });
}
1475
+
1476
/**
 * Fire-and-forget report that a call has finished.
 *
 * While offline the report is queued. While online it is sent immediately;
 * a connection error marks the client offline, schedules an online restore
 * and queues the report, while any other error is passed to reportSDKError.
 */
function reportCallAsync(
  traceId: string,
  spanId: string,
  fn: string,
  startedAt: number,
  inputBuf: Buffer,
  success: boolean,
  attempt: number,
  outputBuf?: Buffer,
  error = "",
): void {
  const op = {
    type: "reportCall" as const,
    traceId,
    spanId,
    fn,
    startedAt,
    inputBuf,
    success,
    attempt,
    outputBuf,
    error,
  };
  if (!isOnline) {
    enqueueOffline(op);
    return;
  }
  sendReportCall(op).catch((err) => {
    if (!isConnectionError(err)) {
      reportSDKError("report-call", err);
      return;
    }
    isOnline = false;
    scheduleOnlineRestore();
    enqueueOffline(op);
  });
}
1533
+
1534
/**
 * Replays queued offline operations to the control plane, oldest first,
 * for as long as the client stays online.
 *
 * Each operation is removed only after the server acknowledges it. A
 * connection error marks the client offline and schedules an online restore;
 * any other error is passed to reportSDKError. In both cases the loop stops
 * and the failed operation stays at the head of the queue.
 *
 * The head is removed only if it is still the operation that was just sent.
 * The queue can change while a send is awaited (drop-oldest eviction in
 * enqueueOffline, or an overlapping flush), and an unconditional shift would
 * then discard an operation that was never sent.
 */
async function flushQueue(): Promise<void> {
  // Adapts one callback-style unary stub call to a promise.
  const unary = (
    invoke: (done: (err: Error | null) => void) => void,
  ): Promise<void> =>
    new Promise<void>((res, rej) => {
      invoke((err) => (err ? rej(err) : res()));
    });

  // Sends a single queued operation using the RPC that matches its type.
  const sendOp = (op: QueuedOp): Promise<void> => {
    switch (op.type) {
      case "event":
        return unary((done) =>
          stub.Publish(
            {
              topic: op.topic,
              payload: toJsonBuffer(op.payload),
              headers: op.opts?.headers ?? {},
              trace_id: op.opts?.traceId ?? "",
              parent_span_id: op.opts?.parentSpanId ?? "",
              producer_service: service,
              idempotency_key: op.opts?.idempotencyKey ?? "",
            },
            meta,
            unaryDeadlineOptions(),
            done,
          ),
        );
      case "job":
        return unary((done) =>
          stub.RegisterJob(
            {
              cron_expr: op.opts.cron ?? "",
              timezone: op.opts.timezone ?? "UTC",
              misfire_policy: op.opts.misfire ?? "fire_now",
              target_type: op.opts.via ?? "rpc",
              target_ref: op.target,
              delay_ms: op.opts.delay ?? 0,
              service_name: service,
              retry_policy_json: op.opts.retryPolicyJson ?? "{}",
            },
            meta,
            unaryDeadlineOptions(),
            done,
          ),
        );
      case "workflow":
        return unary((done) =>
          stub.RegisterWorkflow(
            {
              name: op.name,
              definition: JSON.stringify(op.steps),
              opts: "{}",
              service_name: service,
            },
            meta,
            unaryDeadlineOptions(),
            done,
          ),
        );
      case "reportCallStart":
        return sendReportCallStart({ ...op });
      case "reportCall":
        return sendReportCall(op);
      default:
        // Unknown operation types are treated as already delivered.
        return Promise.resolve();
    }
  };

  while (offlineQueue.length > 0 && isOnline) {
    const op = offlineQueue[0];
    try {
      await sendOp(op);
    } catch (err) {
      if (isConnectionError(err)) {
        isOnline = false;
        scheduleOnlineRestore();
      } else {
        reportSDKError("flush-offline-queue", err);
      }
      // Keep queue head for retry on next reconnect.
      break;
    }
    // Only drop the head if it is still the op we just delivered.
    if (offlineQueue[0] === op) offlineQueue.shift();
  }
}
1605
+
1606
/**
 * Checks the peer identity reported by extractPeerCN against the caller
 * policy.
 *
 * - No peer CN: rejected.
 * - Peer CN "ServiceBridge Server": always accepted.
 * - Deliver calls (`isDeliver`): accepted only from "ServiceBridge Server".
 * - Other calls: accepted unless a non-empty `allowedCallers` list omits the
 *   peer CN.
 *
 * @returns The accepted peer CN, or null when the call must be refused.
 */
function requirePeerCN(
  call: grpc.ServerUnaryCall<
    WorkerHandleRequest | WorkerDeliverRequest,
    unknown
  >,
  allowedCallers: string[] | undefined,
  isDeliver: boolean,
): string | null {
  const peerCN = extractPeerCN(call);
  if (!peerCN) return null;
  if (peerCN === "ServiceBridge Server") return peerCN;
  if (isDeliver) return null;
  if (
    allowedCallers &&
    allowedCallers.length > 0 &&
    !containsValue(allowedCallers, peerCN)
  ) {
    return null;
  }
  return peerCN;
}
1625
+
1626
/** True when the value carries a gRPC status code of NOT_FOUND or FAILED_PRECONDITION. */
function isRegistryResyncRequiredError(e: unknown): boolean {
  const code = (e as Partial<grpc.ServiceError> | undefined)?.code;
  switch (code) {
    case grpc.status.NOT_FOUND:
    case grpc.status.FAILED_PRECONDITION:
      return true;
    default:
      return false;
  }
}
1632
+
1633
/**
 * Sends one heartbeat for the serving instance, listing its registered
 * groups and function names plus CPU/RAM metrics when available.
 * Does nothing when the worker is not serving. Rejects with the normalized
 * service error when the RPC fails.
 */
async function sendHeartbeat(): Promise<void> {
  const state = serveState;
  if (!state) return;

  const { cpuPercent, ramMb } = getProcessMetrics();
  const req: Record<string, unknown> = {
    service_name: service,
    instance_id: state.instanceId,
    endpoint: state.endpoint,
    group_names: [...registeredGroups],
    function_names: [...fnHandlers.keys()],
    transport: state.transport,
  };
  if (cpuPercent != null) req.cpu_percent = cpuPercent;
  if (ramMb != null) req.ram_mb = ramMb;

  await new Promise<void>((resolve, reject) => {
    stub.Heartbeat(req, meta, unaryDeadlineOptions(), (err: Error | null) => {
      if (err) reject(normalizeServiceError(err, "heartbeat"));
      else resolve();
    });
  });
}
1653
+
1654
/**
 * Re-registers everything this worker serves with the control plane:
 * every function handler, then for every event handler its consumer group
 * followed by this instance's group membership, and finally a heartbeat.
 * Calls are issued one at a time, in that order; the first failure aborts
 * the sync with a normalized service error.
 *
 * Does nothing when the worker is not serving. Concurrent callers share the
 * in-flight sync instead of starting another one.
 *
 * @param reason Short tag embedded in error context labels.
 */
async function syncRegistrations(reason: string): Promise<void> {
  if (!serveState) return;
  if (registrationSyncPromise) {
    return registrationSyncPromise;
  }

  const state = serveState;

  // Adapts one callback-style registration call to a promise, normalizing
  // any error with the given context label.
  const unary = (
    label: string,
    invoke: (done: (err: Error | null) => void) => void,
  ): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      invoke((err) =>
        err ? reject(normalizeServiceError(err, label)) : resolve(),
      );
    });

  const run = async (): Promise<void> => {
    const nextGroups = new Set<string>();

    for (const [fn, fnEntry] of fnHandlers.entries()) {
      await unary(`register-function:${reason}:${fn}`, (done) =>
        stub.RegisterFunction(
          {
            fn_name: fn,
            service_name: service,
            instance_id: state.instanceId,
            endpoint: state.endpoint,
            transport: state.transport,
            weight: Math.max(1, state.opts.weight ?? 1),
            input_schema_json: fnEntry.opts.schema?.input
              ? JSON.stringify(fnEntry.opts.schema.input)
              : "",
            output_schema_json: fnEntry.opts.schema?.output
              ? JSON.stringify(fnEntry.opts.schema.output)
              : "",
          },
          meta,
          unaryDeadlineOptions(),
          done,
        ),
      );
    }

    for (const entry of eventHandlers.values()) {
      const groupName = entry.groupName;
      await unary(`register-group:${reason}:${groupName}`, (done) =>
        stub.RegisterConsumerGroup(
          {
            name: groupName,
            pattern: entry.pattern,
            mode: "shared",
            retry_policy_json: entry.opts.retryPolicyJson ?? "{}",
            active: true,
            filter_expr: entry.opts.filterExpr ?? "",
          },
          meta,
          unaryDeadlineOptions(),
          done,
        ),
      );
      await unary(`register-member:${reason}:${groupName}`, (done) =>
        stub.RegisterGroupMember(
          {
            group_name: groupName,
            service_name: service,
            instance_id: state.instanceId,
            endpoint: state.endpoint,
            transport: state.transport,
            weight: Math.max(1, state.opts.weight ?? 1),
          },
          meta,
          unaryDeadlineOptions(),
          done,
        ),
      );
      nextGroups.add(groupName);
    }

    // Replace the registered-group set only after every registration succeeded.
    registeredGroups.clear();
    for (const groupName of nextGroups) {
      registeredGroups.add(groupName);
    }
    await sendHeartbeat();
  };

  registrationSyncPromise = run().finally(() => {
    registrationSyncPromise = null;
  });

  return registrationSyncPromise;
}
1757
+
1758
/**
 * Picks the delay before the next heartbeat: the configured interval
 * (default 10 s, minimum 1 s) plus a random offset of up to ±20% of it
 * (at least ±250 ms). The result is never below 1 s.
 */
function nextHeartbeatDelayMs(): number {
  const interval = Math.max(1_000, globalOpts.heartbeatIntervalMs ?? 10_000);
  const spread = Math.max(250, Math.floor(interval * 0.2));
  const delta = Math.floor(Math.random() * (spread * 2 + 1)) - spread;
  return Math.max(1_000, interval + delta);
}
1764
+
1765
/**
 * Runs one heartbeat cycle and schedules the next one.
 *
 * A connection error marks the client offline and schedules an online
 * restore; a NOT_FOUND / FAILED_PRECONDITION error triggers a full
 * re-registration; any other error is passed to reportSDKError. The next
 * heartbeat is scheduled in every case, as long as the worker is still
 * serving and not stopped.
 */
async function heartbeatTick(): Promise<void> {
  if (stopped || !serveState) return;
  try {
    await sendHeartbeat();
  } catch (err) {
    if (isConnectionError(err)) {
      isOnline = false;
      scheduleOnlineRestore();
    } else if (isRegistryResyncRequiredError(err)) {
      await syncRegistrations("heartbeat-resync");
    } else {
      reportSDKError("heartbeat", err);
    }
  } finally {
    const stillServing = !stopped && serveState;
    if (stillServing) scheduleNextHeartbeat();
  }
}
1784
+
1785
/**
 * Arms the heartbeat timer, replacing any timer already pending.
 * Does nothing when the worker is not serving or has been stopped.
 *
 * @param delayMs Delay before the tick; defaults to a jittered interval.
 */
function scheduleNextHeartbeat(delayMs = nextHeartbeatDelayMs()): void {
  if (stopped || !serveState) return;
  if (heartbeatTimer) clearTimeout(heartbeatTimer);

  const fire = (): void => {
    heartbeatTimer = null;
    heartbeatTick().catch((err) => {
      reportSDKError("heartbeat", err);
    });
  };
  heartbeatTimer = setTimeout(fire, delayMs);
}
1795
+
1796
/**
 * Creates a stream writer bound to a run id.
 *
 * `write` JSON-serializes the data and appends it to the run's stream under
 * the given key (default "default"); it resolves without sending anything
 * when the client is offline or `runId` is empty, and rejects with a
 * normalized service error when the RPC fails. `end` resolves immediately
 * and sends nothing.
 */
function makeStreamWriter(runId: string): StreamWriter {
  const write = (data: unknown, key = "default"): Promise<void> => {
    if (!isOnline || !runId) return Promise.resolve();
    return new Promise<void>((resolve, reject) => {
      const request = {
        run_id: runId,
        key,
        data: Buffer.from(JSON.stringify(data)),
      };
      stub.AppendStream(
        request,
        meta,
        unaryDeadlineOptions(),
        (err: Error | null) => {
          if (err) reject(normalizeServiceError(err, `append-stream:${runId}`));
          else resolve();
        },
      );
    });
  };
  const end = (_key = "default"): Promise<void> => Promise.resolve();
  return { write, end };
}
1820
+
1821
/**
 * Worker-side handler for a direct RPC invocation.
 *
 * Order of checks:
 *  1. Peer identity / caller policy (requirePeerCN) -> UNAUTHENTICATED.
 *  2. Handler lookup by `request.fn` -> NOT_FOUND.
 *  3. Payload decoding (schema-based when the handler declares an input
 *     schema, JSON otherwise) -> INVALID_ARGUMENT.
 *
 * The handler runs inside the request's trace context. A handler error, or a
 * failure to encode its result, is returned as a normal response with
 * `success: false` and the error message, not as a gRPC error status.
 */
async function handleRpc(
  call: grpc.ServerUnaryCall<WorkerHandleRequest, WorkerHandleResponse>,
  cb: grpc.sendUnaryData<WorkerHandleResponse>,
) {
  const entry = fnHandlers.get(call.request.fn || "");
  // The caller policy is checked before the handler lookup; an unknown
  // function is checked with no allow-list.
  if (!requirePeerCN(call, entry?.opts.allowedCallers, false)) {
    cb({
      code: grpc.status.UNAUTHENTICATED,
      message: "mTLS client cert required",
    });
    return;
  }
  const { fn, payload: payloadBuf, trace_id, span_id } = call.request;
  const traceCtx: TraceCtx = {
    traceId: trace_id || "",
    spanId: span_id || "",
  };

  if (!entry) {
    cb({
      code: grpc.status.NOT_FOUND,
      message: `No handler registered for: ${fn}`,
      metadata: new grpc.Metadata(),
    });
    return;
  }

  // Decode: schema-based when the handler declares an input schema, else JSON.
  // An empty payload leaves the input as {}.
  let parsed: unknown = {};
  const inputSchema = entry.opts.schema?.input;
  const outputSchema = entry.opts.schema?.output;
  try {
    if (payloadBuf?.length) {
      parsed = inputSchema
        ? decodeWithSchema(inputSchema, payloadBuf as Buffer)
        : JSON.parse((payloadBuf as Buffer).toString());
    }
  } catch {
    cb({
      code: grpc.status.INVALID_ARGUMENT,
      message: "Invalid payload encoding",
    });
    return;
  }

  const rpcCtx: RpcContext = {
    traceId: traceCtx.traceId,
    spanId: traceCtx.spanId,
    // The trace id is passed as the stream writer's run id.
    stream: makeStreamWriter(traceCtx.traceId),
  };

  try {
    const result = await traceStorage.run(traceCtx, () =>
      entry.handler(parsed, rpcCtx),
    );
    // Encode: schema-based when an output schema is declared, else JSON.
    const outputBuf = outputSchema
      ? encodeWithSchema(outputSchema, result)
      : toJsonBuffer(result);
    cb(null, { output: outputBuf, success: true });
  } catch (e: unknown) {
    cb(null, {
      output: Buffer.alloc(0),
      success: false,
      error: normalizeUnknownErrorMessage(e),
    });
  }
}
1894
+
1895
/**
 * Worker-side handler for an event delivery.
 *
 * Only the peer "ServiceBridge Server" may deliver (UNAUTHENTICATED
 * otherwise). The payload is parsed as JSON and passed to the handler
 * registered for the request's consumer group, inside the message's trace
 * context.
 *
 * Response, in priority order:
 *  - no handler for the group -> ack: false, error "consumer_group_handler_missing"
 *  - payload is not valid JSON -> ack: false, reject_reason "invalid_json_payload"
 *  - handler called ctx.reject() -> ack: false with the reject reason
 *  - handler called ctx.retry() or threw -> ack: false with retry_after_ms
 *  - otherwise -> ack: true
 *
 * Error text is capped at 2048 characters on both the reject and the retry
 * path. A non-finite retry delay falls back to the 1000 ms default instead
 * of producing NaN or Infinity in `retry_after_ms`.
 */
async function handleDeliverMessage(
  call: grpc.ServerUnaryCall<WorkerDeliverRequest, WorkerDeliverResponse>,
  cb: grpc.sendUnaryData<WorkerDeliverResponse>,
): Promise<void> {
  if (!requirePeerCN(call, undefined, true)) {
    cb({
      code: grpc.status.UNAUTHENTICATED,
      message: "mTLS client cert required (ServiceBridge Server)",
    });
    return;
  }
  const { group_name, payload, trace_id, parent_span_id } = call.request;
  const traceCtx: TraceCtx = {
    traceId: trace_id || "",
    spanId: parent_span_id || "",
  };
  const entry =
    typeof group_name === "string"
      ? eventHandlers.get(group_name)
      : undefined;
  if (!entry) {
    cb(null, { ack: false, error: "consumer_group_handler_missing" });
    return;
  }

  // An empty payload leaves the input as {}.
  let parsed: unknown = {};
  try {
    if (payload?.length) parsed = JSON.parse((payload as Buffer).toString());
  } catch {
    cb(null, { ack: false, reject_reason: "invalid_json_payload" });
    return;
  }

  const defaultRetryMs = 1000;
  const maxErrorLength = 2048;
  let shouldRetry = false;
  let retryAfter = defaultRetryMs;
  let rejectReason = "";
  const errors: string[] = [];

  const ctx: EventContext = {
    traceId: traceCtx.traceId,
    spanId: traceCtx.spanId,
    refs: {
      topic: String(call.request?.topic ?? ""),
      groupName: entry.groupName,
      messageId: String(call.request?.message_id ?? ""),
      attempt: String(call.request?.attempt ?? ""),
      headers: JSON.stringify(call.request?.headers ?? {}),
    },
    retry(delayMs = defaultRetryMs) {
      shouldRetry = true;
      // Guard against NaN / Infinity, which Math.max would pass through.
      retryAfter = Number.isFinite(delayMs)
        ? Math.max(1, Math.floor(delayMs))
        : defaultRetryMs;
    },
    reject(reason: string) {
      rejectReason = reason || "rejected_by_consumer";
    },
    // The trace id is passed as the stream writer's run id.
    stream: makeStreamWriter(traceCtx.traceId),
  };

  try {
    await traceStorage.run(traceCtx, () => entry.handler(parsed, ctx));
  } catch (e: unknown) {
    // A thrown handler error requests a retry unless the handler rejected.
    errors.push(normalizeUnknownErrorMessage(e));
    shouldRetry = true;
  }

  const errorText = errors.join("; ").slice(0, maxErrorLength);

  // Rejection takes precedence over retry.
  if (rejectReason) {
    cb(null, {
      ack: false,
      reject_reason: rejectReason,
      error: errorText,
    });
    return;
  }
  if (shouldRetry || errors.length > 0) {
    cb(null, {
      ack: false,
      retry_after_ms: retryAfter,
      error: errorText,
    });
    return;
  }
  cb(null, { ack: true });
}
1978
+
1979
+ const svc: ServiceBridgeService = {
1980
/**
 * Calls a remote function on a worker and returns its decoded result.
 *
 * Flow: wait for the control plane, resolve `fn` to its canonical name (local metadata
 * first, then a server lookup), enforce the caller-side `allowedCallers` policy, encode
 * the payload (schema-encoded when an input schema is known, JSON otherwise), then invoke
 * the worker's `Handle` method with exponential backoff between attempts.
 *
 * Telemetry: one root span covers the whole call. When retries are enabled each attempt
 * additionally gets its own child span named `attempt:<canonical>`.
 *
 * @param fn - Function name or alias to call.
 * @param payload - Value to send to the handler.
 * @param opts - Per-call overrides (trace ids, retries, retry delay, timeout).
 * @returns The handler output, decoded with the output schema or parsed as JSON
 *          (`{}` when the output is empty and no schema is known).
 * @throws A normalized service error when the control plane is unavailable, no endpoint
 *         is known, or every attempt failed; a plain `Error` when the caller policy
 *         forbids the call.
 */
async rpc<T = unknown>(
  fn: string,
  payload?: unknown,
  opts?: RpcOpts,
): Promise<T> {
  try {
    await _controlReady;
  } catch (err) {
    throw normalizeServiceError(err, "control-plane");
  }
  const tc = traceStorage.getStore();
  const traceId = opts?.traceId ?? tc?.traceId ?? crypto.randomUUID();
  // Clamp to >= 0 (NaN becomes 0) so that at least one attempt is always made; a
  // negative value previously skipped the loop and failed with an undefined error.
  const maxRetries = Math.max(
    0,
    (opts?.retries ?? globalOpts.retries ?? 3) || 0,
  );
  const baseDelay = opts?.retryDelay ?? globalOpts.retryDelay ?? 300;
  const timeout = opts?.timeout ?? globalOpts.timeout ?? 30000;

  // Resolve canonical name — check local metadata first, then server lookup.
  let canonical = resolveCanonical(fn);
  if (!canonical) {
    await doLookupFunction(fn);
    canonical = resolveCanonical(fn);
  }
  if (!canonical)
    throw normalizeServiceError(
      new Error(`No endpoints available for RPC: ${fn}`),
      `rpc:${fn}`,
      "worker",
    );

  const fmeta = functionMeta.get(canonical);

  // Caller-side policy check
  if (fmeta && !containsOrAll(fmeta.allowedCallers, service)) {
    throw new Error(
      `Service "${service}" is not allowed to call "${fn}". ` +
        `Permitted callers: ${fmeta.allowedCallers.join(", ")}`,
    );
  }

  const inputSchema = fmeta?.inputSchema;
  const outputSchema = fmeta?.outputSchema;
  const inputBuf = inputSchema
    ? encodeWithSchema(inputSchema, payload)
    : toJsonBuffer(payload);
  // Telemetry always records the JSON form of the payload, regardless of wire encoding.
  const telemetryInputBuf = toJsonBuffer(payload);

  // Root span — represents the entire RPC call including all retry attempts.
  const rootSpanId = crypto.randomUUID();
  const parentSpanId = opts?.parentSpanId ?? tc?.spanId ?? "";
  const rootStartedAt = Date.now();
  const hasRetries = maxRetries > 0;

  // Send the short function name (e.g. "td.echo") to the worker, not the canonical
  // name (e.g. "dev0/td.echo") — workers register handlers by short name.
  const workerFn = fmeta?.fnName ?? canonical;

  // Report root span start once (covers all attempts).
  reportCallStartAsync(
    traceId,
    rootSpanId,
    parentSpanId,
    canonical,
    rootStartedAt,
    telemetryInputBuf,
    1,
  );

  // gRPC channel for this function — SbResolver handles LB + connection pool
  const channel = getOrCreateFunctionChannel(canonical);
  const workerClient = new servicebridgeProto.ServiceBridgeWorker(
    "__channel__",
    grpc.credentials.createInsecure(),
    { channelOverride: channel },
  ) as unknown as WorkerClient;

  let lastError: unknown;
  for (let attempt = 1; attempt <= maxRetries + 1; attempt++) {
    // With retries: each attempt is a child span of the root span.
    // Without retries: the root span is the only span, so no attempt span is reported.
    const attemptSpanId = hasRetries ? crypto.randomUUID() : rootSpanId;
    const attemptFn = hasRetries ? `attempt:${canonical}` : canonical;
    const attemptStartedAt = hasRetries ? Date.now() : rootStartedAt;

    if (hasRetries) {
      reportCallStartAsync(
        traceId,
        attemptSpanId,
        rootSpanId,
        attemptFn,
        attemptStartedAt,
        telemetryInputBuf,
        attempt,
      );
    }

    try {
      const deadline = new Date(Date.now() + timeout);
      if (!workerFn) throw new Error("unreachable");
      const res = await new Promise<WorkerHandleResponse>(
        (resolve, reject) => {
          workerClient.Handle(
            {
              fn: workerFn,
              payload: inputBuf,
              trace_id: traceId,
              span_id: attemptSpanId,
            },
            makeWorkerMeta(),
            // waitForReady: gRPC holds the call in CONNECTING state until a
            // subchannel becomes READY (bounded by deadline). This transparently
            // survives worker restarts without needing manual retry logic.
            { deadline, waitForReady: true },
            (err: Error | null, response?: WorkerHandleResponse) =>
              err
                ? reject(err)
                : resolve(
                    response ?? { success: false, error: "empty response" },
                  ),
          );
        },
      );
      if (!res?.success)
        throw new Error(res?.error || "handler returned failure");
      const outputBuf = res.output ?? Buffer.alloc(0);
      const result = outputSchema
        ? decodeWithSchema(outputSchema, outputBuf)
        : outputBuf?.length
          ? JSON.parse(outputBuf.toString())
          : {};

      const encodedOutput = outputBuf?.length
        ? toJsonBuffer(result)
        : Buffer.alloc(0);
      if (hasRetries) {
        // Close attempt span as success (with its own input+output).
        reportCallAsync(
          traceId,
          attemptSpanId,
          attemptFn,
          attemptStartedAt,
          telemetryInputBuf,
          true,
          attempt,
          encodedOutput,
        );
      }
      // Close root span as overall success with the output of the successful attempt.
      reportCallAsync(
        traceId,
        rootSpanId,
        canonical,
        rootStartedAt,
        telemetryInputBuf,
        true,
        attempt,
        encodedOutput,
      );
      return result as T;
    } catch (e: unknown) {
      lastError = e;
      const errMsg = normalizeUnknownErrorMessage(e);
      const isLastAttempt = attempt > maxRetries;
      if (hasRetries) {
        // Close attempt span as failure.
        reportCallAsync(
          traceId,
          attemptSpanId,
          attemptFn,
          attemptStartedAt,
          telemetryInputBuf,
          false,
          attempt,
          undefined,
          errMsg,
        );
      }
      if (isLastAttempt) {
        // Attempts exhausted — close root span as failure.
        reportCallAsync(
          traceId,
          rootSpanId,
          canonical,
          rootStartedAt,
          telemetryInputBuf,
          false,
          attempt,
          undefined,
          errMsg,
        );
      } else {
        // Exponential backoff: baseDelay, 2*baseDelay, 4*baseDelay, ...
        await new Promise((r) =>
          setTimeout(r, baseDelay * 2 ** (attempt - 1)),
        );
      }
    }
  }
  throw normalizeServiceError(lastError, `rpc:${fn}`, "worker");
},
2207
+
2208
/**
 * Publishes an event to `topic` through the control-plane `Publish` call.
 *
 * While offline the publish request is queued via `enqueueOffline` and the returned
 * promise resolves to an empty string.
 *
 * @returns The message id from the response, or `""` when none was returned.
 */
event(topic: string, payload?: unknown, opts?: EventOpts): Promise<string> {
  if (isOnline) {
    const activeTrace = traceStorage.getStore();
    return new Promise<string>((resolve, reject) => {
      // Built inside the executor so an encoding failure rejects the promise.
      const request = {
        topic,
        payload: toJsonBuffer(payload),
        headers: opts?.headers ?? {},
        trace_id: opts?.traceId ?? activeTrace?.traceId ?? "",
        parent_span_id: opts?.parentSpanId ?? activeTrace?.spanId ?? "",
        producer_service: service,
        idempotency_key: opts?.idempotencyKey ?? "",
      };
      const onReply = (
        err: Error | null,
        res?: { message_id?: string },
      ): void => {
        if (err) {
          reject(normalizeServiceError(err, `publish:${topic}`));
          return;
        }
        resolve(res?.message_id ?? "");
      };
      stub.Publish(request, meta, unaryDeadlineOptions(), onReply);
    });
  }

  enqueueOffline({ type: "event", topic, payload, opts });
  return Promise.resolve("");
},
2234
+
2235
/**
 * Registers (or replaces) the handler for RPC function `fn`.
 *
 * @returns The service itself, so registrations can be chained.
 */
handleRpc(
  fn: string,
  handler: FnHandler,
  opts?: HandleRpcOpts,
): ServiceBridgeService {
  const registration = { handler, opts: opts ?? {} };
  fnHandlers.set(fn, registration);
  return svc;
},
2243
+
2244
/**
 * Registers an event handler for `pattern` under a consumer group.
 *
 * The group name is `opts.groupName` when set, otherwise `<service>:<pattern>`.
 *
 * @returns The service itself, so registrations can be chained.
 * @throws Error when a handler is already registered under the same group name.
 */
handleEvent(
  pattern: string,
  handler: EventHandler,
  opts?: HandleEventOpts,
): ServiceBridgeService {
  const consumerOpts = opts ?? {};
  const groupName = consumerOpts.groupName || `${service}:${pattern}`;

  const alreadyRegistered = eventHandlers.has(groupName);
  if (alreadyRegistered) {
    throw new Error(
      `Duplicate event consumer group "${groupName}". ` +
        "Use a distinct groupName for each handleEvent() registration.",
    );
  }

  const registration = { groupName, pattern, handler, opts: consumerOpts };
  eventHandlers.set(groupName, registration);
  return svc;
},
2265
+
2266
/**
 * Starts the worker gRPC server that receives `Handle` and `DeliverMessage` calls,
 * then registers this instance and starts heartbeats.
 *
 * Steps: validate preconditions, create the server, resolve TLS material (explicit
 * options, then material cached in `effectiveWorkerTLS`, then auto-provisioning), bind
 * to `<host>:0` and use the port reported by `bindAsync`, record `serveState`, sync
 * registrations, schedule the heartbeat and finally wait for `_controlReady`.
 *
 * @param opts - Host, transport, TLS material and instance id overrides.
 * @throws Error when called twice, when no handlers are registered, or when the service
 *         name is blank; a normalized "serve" error when startup fails (all serve state
 *         is reset and the server is force-shut-down first).
 */
async serve(opts: ServeOpts = {}): Promise<void> {
  if (workerServer) throw new Error("serve() already called");
  if (fnHandlers.size === 0 && eventHandlers.size === 0) {
    throw new Error(
      "No handlers registered. Call handleRpc() or handleEvent() before serve().",
    );
  }
  if (!service.trim()) {
    throw new Error("serve() requires a non-empty service name");
  }

  try {
    workerServer = new grpc.Server(channelOpts);
    workerServer.addService(
      servicebridgeProto.ServiceBridgeWorker.service,
      {
        Handle: handleRpc,
        DeliverMessage: handleDeliverMessage,
      },
    );
    // Defaults to the loopback interface unless a host is given.
    const host = opts.host || "127.0.0.1";
    const requestedTransport = opts.transport ?? globalOpts.workerTransport;

    // Resolve effective TLS options: explicit opts → global workerTLS → auto-provision.
    const tlsOpts = opts.tls ?? globalOpts.workerTLS;
    let effectiveCert = toPemBuffer(tlsOpts?.cert);
    let effectiveKey = toPemBuffer(tlsOpts?.key);
    let effectiveCACert = toPemBuffer(tlsOpts?.caCert);

    if (!effectiveCert || !effectiveKey || !effectiveCACert) {
      // Reuse cert provisioned during _controlReady if available — avoids issuing two
      // separate key pairs for the same service instance (wasteful and confusing in logs).
      const cached = effectiveWorkerTLS;
      if (cached?.cert && cached?.key && cached?.caCert) {
        effectiveCert = toPemBuffer(cached.cert);
        effectiveKey = toPemBuffer(cached.key);
        effectiveCACert = toPemBuffer(cached.caCert);
      } else {
        // Auto-provision: generate key pair locally, post public key to server.
        // Private key stays in this process — server only sees the public key.
        const provisioned = await provisionWorkerTLS(
          adminBase,
          serviceKey,
          service,
        );
        effectiveCert = provisioned.cert;
        effectiveKey = provisioned.key;
        effectiveCACert = provisioned.caCert;
        // Cache the provisioned material so later lookups can reuse it.
        effectiveWorkerTLS = {
          caCert: effectiveCACert,
          cert: effectiveCert,
          key: effectiveKey,
        };
      }
    }

    if (!effectiveKey || !effectiveCert) {
      throw new Error("TLS key and cert are required for worker serve()");
    }
    // The recorded transport defaults to "tls"; the server credentials below are
    // always SSL regardless of the requested transport.
    const workerTransport: WorkerTransport = requestedTransport ?? "tls";
    const serverCreds = grpc.ServerCredentials.createSsl(
      effectiveCACert ?? null, // verify ServiceBridge's client cert against the shared CA
      [{ private_key: effectiveKey, cert_chain: effectiveCert }],
      true, // require mTLS — only ServiceBridge (CN="ServiceBridge Server") may call workers
    );
    // Port 0 in the bind address: the actual port is whatever bindAsync reports back.
    const port = await new Promise<number>((resolve, reject) => {
      const server = workerServer;
      if (!server) {
        reject(new Error("Worker server is not initialized"));
        return;
      }
      server.bindAsync(`${host}:0`, serverCreds, (err, p) =>
        err ? reject(err) : resolve(p),
      );
    });
    const endpoint = `${host}:${port}`;
    // Caller-supplied instance id, or three random bytes rendered as hex.
    const instanceId =
      opts.instanceId ||
      Array.from(crypto.getRandomValues(new Uint8Array(3)))
        .map((b) => b.toString(16).padStart(2, "0"))
        .join(""); // 6 hex chars
    serveState = { endpoint, transport: workerTransport, opts, instanceId };

    await syncRegistrations("initial");
    scheduleNextHeartbeat();

    await _controlReady;
  } catch (err) {
    // Roll back everything serve() set up so that it can be called again.
    if (heartbeatTimer) clearTimeout(heartbeatTimer);
    heartbeatTimer = null;
    registeredGroups.clear();
    serveState = null;
    workerServer?.forceShutdown();
    workerServer = null;
    throw normalizeServiceError(err, "serve");
  }
},
2363
+
2364
/**
 * Shuts the client down: marks it stopped and offline, cancels timers, flushes logs,
 * closes function channels, clears cached metadata and queued offline work, force-stops
 * the worker server and closes the control-plane client.
 *
 * A failure while closing the control-plane client is reported via `reportSDKError`
 * instead of being thrown.
 */
stop(): void {
  stopped = true;
  isOnline = false;

  // Cancel pending timers.
  if (onlineRestoreTimer) {
    clearTimeout(onlineRestoreTimer);
  }
  if (heartbeatTimer) {
    clearTimeout(heartbeatTimer);
  }
  onlineRestoreTimer = null;
  heartbeatTimer = null;
  registeredGroups.clear();

  flushLogs();

  // Drop per-function channels and everything cached about remote functions.
  closeAllFunctionChannels();
  functionMeta.clear();
  fnAliasMap.clear();
  _sbContexts.delete(clientId);
  offlineQueue.length = 0;

  // Tear down the worker server, if one was started.
  workerServer?.forceShutdown();
  workerServer = null;
  serveState = null;

  try {
    stub.close?.();
  } catch (closeErr) {
    reportSDKError("close-control-plane-client", closeErr);
  }
},
2389
+
2390
/**
 * Registers a scheduled or delayed job that targets `target`.
 *
 * While offline the registration is queued via `enqueueOffline` and the returned
 * promise resolves to an empty string.
 *
 * @returns The job id from the response, or `""` when none was returned.
 */
job(target: string, opts: ScheduleOpts): Promise<string> {
  if (isOnline) {
    return new Promise<string>((resolve, reject) => {
      const request = {
        cron_expr: opts.cron ?? "",
        timezone: opts.timezone ?? "UTC",
        misfire_policy: opts.misfire ?? "fire_now",
        target_type: opts.via ?? "rpc",
        target_ref: target,
        delay_ms: opts.delay ?? 0,
        service_name: service,
        retry_policy_json: opts.retryPolicyJson ?? "{}",
      };
      const onReply = (err: Error | null, res?: { id?: string }): void => {
        if (err) {
          reject(normalizeServiceError(err, `register-job:${target}`));
          return;
        }
        resolve(res?.id ?? "");
      };
      stub.RegisterJob(request, meta, unaryDeadlineOptions(), onReply);
    });
  }

  enqueueOffline({ type: "job", target, opts });
  return Promise.resolve("");
},
2416
+
2417
/**
 * Registers a workflow definition under `name`; the steps are sent as a JSON string.
 *
 * While offline the registration is queued via `enqueueOffline` and the returned
 * promise resolves to an empty string.
 *
 * @returns The workflow id from the response, or `""` when none was returned.
 */
workflow(name: string, steps: WorkflowStep[]): Promise<string> {
  if (isOnline) {
    return new Promise<string>((resolve, reject) => {
      const request = {
        name,
        definition: JSON.stringify(steps),
        opts: "{}",
        service_name: service,
      };
      const onReply = (err: Error | null, res?: { id?: string }): void => {
        if (err) {
          reject(normalizeServiceError(err, `register-workflow:${name}`));
          return;
        }
        resolve(res?.id ?? "");
      };
      stub.RegisterWorkflow(request, meta, unaryDeadlineOptions(), onReply);
    });
  }

  enqueueOffline({ type: "workflow", name, steps });
  return Promise.resolve("");
},
2439
+
2440
/**
 * Cancels a workflow run by sending `DELETE /api/workflow-runs/<runId>` to the HTTP
 * admin API.
 *
 * The base URL is `globalOpts.adminUrl` when set. Otherwise it is derived from the gRPC
 * server URL: the port is replaced with 7700, a `grpc://` / `grpcs://` scheme becomes
 * `http://`, and a URL without any scheme (e.g. `localhost:14445`) is prefixed with
 * `http://` so that `fetch` receives an absolute URL.
 *
 * NOTE(review): unlike registerHttpEndpoint, this request sends no `x-service-key`
 * header — confirm whether the admin API expects one here.
 *
 * @param runId - Identifier of the run to cancel (URL-encoded into the path).
 * @throws Error when the response status is not OK; the message includes the status
 *         code and the response body.
 */
async cancelWorkflowRun(runId: string): Promise<void> {
  const deriveAdminBase = (): string => {
    // Swap a trailing ":<port>" for the well-known HTTP admin port (7700 default).
    const withAdminPort = rawUrl.replace(/:\d+$/, ":7700");
    if (/^grpcs?:\/\//.test(withAdminPort)) {
      return withAdminPort.replace(/^grpcs?:\/\//, "http://");
    }
    // Any other explicit scheme is kept as-is.
    if (/^[a-z][a-z0-9+.-]*:\/\//i.test(withAdminPort)) {
      return withAdminPort;
    }
    // Bare "host:port" — fetch() needs an absolute URL, so add a scheme.
    return `http://${withAdminPort}`;
  };

  // Drop trailing slashes so the joined path never contains "//api".
  const base = (globalOpts.adminUrl ?? deriveAdminBase()).replace(/\/+$/, "");
  const endpoint = `${base}/api/workflow-runs/${encodeURIComponent(runId)}`;
  const res = await fetch(endpoint, { method: "DELETE" });
  if (!res.ok) {
    // Best effort: an unreadable body is reported as an empty string.
    const body = await res.text().catch(() => "");
    throw new Error(`cancelWorkflowRun failed (${res.status}): ${body}`);
  }
},
2453
+
2454
/**
 * Opens a telemetry span for an HTTP request and returns a handle used to close it.
 *
 * The span is named `http:<method>:<path>`. Trace and parent ids come from `opts`,
 * then from the ambient trace context; a new trace id is generated when neither exists.
 *
 * @returns An object exposing `traceId`, `spanId` and `end()`. `end()` reports the
 *          finished span; when `success` is not given it is derived from
 *          `statusCode < 400` (a missing status code counts as failure).
 */
startHttpSpan(opts: {
  method: string;
  path: string;
  traceId?: string;
  parentSpanId?: string;
}): HttpSpan {
  const ambient = traceStorage.getStore();
  const traceId = opts.traceId ?? ambient?.traceId ?? crypto.randomUUID();
  const spanId = crypto.randomUUID();
  const parentSpanId = opts.parentSpanId ?? ambient?.spanId ?? "";
  const spanName = `http:${opts.method}:${opts.path}`;
  const startedAt = Date.now();
  const requestInfo = toJsonBuffer({ method: opts.method, path: opts.path });

  reportCallStartAsync(
    traceId,
    spanId,
    parentSpanId,
    spanName,
    startedAt,
    requestInfo,
    1,
  );

  return {
    traceId,
    spanId,
    end(endOpts) {
      let success = endOpts.success;
      if (success == null) {
        success = endOpts.statusCode != null && endOpts.statusCode < 400;
      }
      const responseInfo = toJsonBuffer({
        statusCode: endOpts.statusCode ?? null,
        error: endOpts.error ?? null,
      });
      reportCallAsync(
        traceId,
        spanId,
        spanName,
        startedAt,
        requestInfo,
        success,
        1,
        responseInfo,
        endOpts.error ?? "",
      );
    },
  };
},
2501
+
2502
/**
 * Registers an HTTP route of this service by POSTing to `<adminBase>/api/http/register`,
 * authenticated with the `x-service-key` header.
 *
 * @param opts - Route description; `allowedCallers` defaults to an empty list.
 * @throws Error when the response status is not OK; the message includes the status
 *         code and the response body.
 */
async registerHttpEndpoint(opts: {
  method: string;
  route: string;
  instanceId?: string;
  endpoint?: string;
  allowedCallers?: string[];
}): Promise<void> {
  const registration = {
    service_name: service,
    method: opts.method,
    route_pattern: opts.route,
    instance_id: opts.instanceId,
    endpoint: opts.endpoint,
    allowed_callers: opts.allowedCallers ?? [],
  };
  const headers = {
    "Content-Type": "application/json",
    "x-service-key": serviceKey,
  };

  const res = await fetch(`${adminBase}/api/http/register`, {
    method: "POST",
    headers,
    body: JSON.stringify(registration),
  });
  if (res.ok) return;

  // Best effort: an unreadable body is reported as an empty string.
  const body = await res.text().catch(() => "");
  throw new Error(`registerHttpEndpoint failed (${res.status}): ${body}`);
},
2529
+
2530
/**
 * Returns an async iterable over the stream events of run `runId`.
 *
 * Each iterator opens its own `WatchRun` server stream and buffers events until the
 * consumer pulls them. Retryable stream errors trigger a reconnect that resumes from
 * the last sequence seen; other errors, a non-JSON chunk, or a consumer that lets the
 * buffer fill up end the iteration with a rejected `next()`.
 *
 * @param runId - Run to watch.
 * @param opts - Stream key (default "default") and starting sequence (default 0).
 */
watchRun(
  runId: string,
  opts?: WatchRunOpts,
): AsyncIterable<RunStreamEvent> {
  const key = opts?.key ?? "default";
  const fromSeq = opts?.fromSequence ?? 0;
  // Maximum number of undelivered events buffered per iterator.
  const WATCH_RUN_QUEUE_LIMIT = 256;
  // Reconnect backoff bounds; the delay doubles between consecutive reconnects.
  const WATCH_RUN_RETRY_MIN_MS = 500;
  const WATCH_RUN_RETRY_MAX_MS = 5_000;

  return {
    [Symbol.asyncIterator](): AsyncIterator<RunStreamEvent> {
      // Currently attached stream, or null while disconnected.
      let stream: RunWatchStream | null = null;
      let reconnectTimer: ReturnType<typeof setTimeout> | null = null;
      // Set by return(): the consumer stopped iterating.
      let cancelled = false;
      // Set once the iteration has ended normally.
      let done = false;
      // Set once the iteration has ended with an error.
      let fatalError: ServiceBridgeError | null = null;
      let reconnectDelay = WATCH_RUN_RETRY_MIN_MS;
      // Highest sequence delivered so far; used to resume and to drop duplicates.
      let lastSequence = fromSeq;
      // Events received but not yet pulled by the consumer.
      const queue: RunStreamEvent[] = [];
      // Pending next() calls waiting for an event.
      const waiters: Array<{
        resolve: (value: IteratorResult<RunStreamEvent>) => void;
        reject: (reason?: unknown) => void;
      }> = [];

      // Reads the first value stored under `keyName` in the error's gRPC metadata.
      function metadataValue(
        err: unknown,
        keyName: string,
      ): string | undefined {
        const metadata = (err as { metadata?: grpc.Metadata } | undefined)
          ?.metadata;
        if (!metadata || typeof metadata.get !== "function")
          return undefined;
        const values = metadata.get(keyName);
        if (!Array.isArray(values) || values.length === 0) return undefined;
        const value = values[0];
        if (typeof value === "string") return value;
        if (Buffer.isBuffer(value)) return value.toString();
        return value == null ? undefined : String(value);
      }

      // Converts a wire chunk into a RunStreamEvent; `data` is JSON-decoded when
      // present. Throws a fatal ServiceBridgeError when the data is not valid JSON.
      function parseChunk(chunk: RunChunkWire): RunStreamEvent {
        let data: unknown = null;
        const rawData = chunk.data
          ? Buffer.from(chunk.data)
          : Buffer.alloc(0);
        if (rawData.length > 0) {
          try {
            data = JSON.parse(rawData.toString());
          } catch (cause) {
            throw new ServiceBridgeError({
              message: `watchRun received non-JSON chunk for ${runId}`,
              component: "sdk",
              operation: `watch-run:${runId}`,
              severity: "fatal",
              cause,
            });
          }
        }
        return {
          type: chunk.type === "run_complete" ? "run_complete" : "chunk",
          runId,
          key: chunk.key || key,
          sequence: Number(chunk.sequence ?? 0),
          data,
          runStatus: chunk.run_status || undefined,
        };
      }

      function clearReconnectTimer() {
        if (reconnectTimer) {
          clearTimeout(reconnectTimer);
          reconnectTimer = null;
        }
      }

      // Detaches the current stream (removing its listeners) and optionally cancels it.
      function clearStream(cancelCurrent = false) {
        const current = stream;
        stream = null;
        if (!current) return;
        if (typeof current.removeAllListeners === "function") {
          current.removeAllListeners();
        }
        if (cancelCurrent && typeof current.cancel === "function") {
          current.cancel();
        }
      }

      // Ends the iteration with an error: cancels the stream and rejects pending
      // next() calls. Events already queued are still delivered by next() first.
      function fail(err: ServiceBridgeError) {
        if (fatalError || done) return;
        fatalError = err;
        clearReconnectTimer();
        clearStream(true);
        for (const waiter of waiters) {
          waiter.reject(err);
        }
        waiters.length = 0;
      }

      // Hands an event to a waiting next() call, or buffers it. Fails the iteration
      // when the buffer is already at WATCH_RUN_QUEUE_LIMIT.
      function enqueue(event: RunStreamEvent) {
        if (fatalError || done) return;
        if (waiters.length > 0) {
          const waiter = waiters.shift();
          if (waiter) {
            waiter.resolve({ value: event, done: false });
          }
          return;
        }

        if (queue.length >= WATCH_RUN_QUEUE_LIMIT) {
          fail(
            new ServiceBridgeError({
              message: `watchRun consumer is not draining fast enough for ${runId}`,
              component: "sdk",
              operation: `watch-run:${runId}`,
              severity: "fatal",
            }),
          );
          return;
        }

        queue.push(event);
      }

      // Ends the iteration normally and resolves pending next() calls with done.
      function finish() {
        if (done || fatalError) return;
        done = true;
        clearReconnectTimer();
        clearStream(false);
        for (const waiter of waiters) {
          waiter.resolve({ value: undefined, done: true });
        }
        waiters.length = 0;
      }

      // A stream error is retryable when the error metadata marks it as such, or
      // when its status code is one of the codes listed below.
      function shouldRetry(err: unknown): boolean {
        if (
          metadataValue(err, "servicebridge-run-stream-retryable") ===
          "true"
        ) {
          return true;
        }
        const code = (err as Partial<grpc.ServiceError> | undefined)?.code;
        return (
          code === grpc.status.UNAVAILABLE ||
          code === grpc.status.UNKNOWN ||
          code === grpc.status.DEADLINE_EXCEEDED ||
          code === grpc.status.RESOURCE_EXHAUSTED
        );
      }

      // Advances the resume position when the error metadata carries a higher
      // "resume-from" sequence; it never moves backwards.
      function maybeUpdateResumeSequence(err: unknown) {
        const parsed = Number(
          metadataValue(err, "servicebridge-run-stream-resume-from") ?? 0,
        );
        if (Number.isFinite(parsed) && parsed > lastSequence) {
          lastSequence = parsed;
        }
      }

      // Schedules a single reconnect after the current delay, then doubles the
      // delay (capped at WATCH_RUN_RETRY_MAX_MS) for the next one.
      function scheduleReconnect() {
        if (cancelled || done || fatalError || reconnectTimer) return;
        const delay = reconnectDelay;
        reconnectTimer = setTimeout(() => {
          reconnectTimer = null;
          start();
        }, delay);
        reconnectDelay = Math.min(
          reconnectDelay * 2,
          WATCH_RUN_RETRY_MAX_MS,
        );
      }

      // Opens a WatchRun stream from `lastSequence` and wires up its events. Every
      // listener ignores events from a stream that is no longer the current one.
      function start() {
        if (cancelled || done || fatalError || stream) return;

        try {
          const current = stub.WatchRun(
            { run_id: runId, key, from_sequence: lastSequence },
            meta,
          );
          let endedWithOK = false;
          stream = current;

          current.on("data", (chunk: RunChunkWire) => {
            if (stream !== current || cancelled || done || fatalError)
              return;
            // Receiving data means the connection works: reset the backoff.
            reconnectDelay = WATCH_RUN_RETRY_MIN_MS;

            let event: RunStreamEvent;
            try {
              event = parseChunk(chunk);
            } catch (err) {
              fail(
                err instanceof ServiceBridgeError
                  ? err
                  : normalizeServiceError(err, `watch-run:${runId}`),
              );
              return;
            }
            // Drop events at or below the last delivered sequence; events with a
            // sequence of 0 or less bypass this check.
            if (event.sequence > 0) {
              if (event.sequence <= lastSequence) return;
              lastSequence = event.sequence;
            }

            enqueue(event);
            if (event.type === "run_complete") {
              finish();
            }
          });

          current.on("error", (err: unknown) => {
            if (stream !== current || cancelled || done || fatalError)
              return;
            clearStream(false);
            maybeUpdateResumeSequence(err);
            if (!shouldRetry(err)) {
              fail(normalizeServiceError(err, `watch-run:${runId}`));
              return;
            }
            // When the error metadata names a disconnect reason, reconnect with the
            // minimum delay instead of the current backoff.
            reconnectDelay = metadataValue(
              err,
              "servicebridge-run-stream-disconnect-reason",
            )
              ? WATCH_RUN_RETRY_MIN_MS
              : reconnectDelay;
            scheduleReconnect();
          });

          current.on("status", (statusInfo: { code?: number }) => {
            // Also accepted when the stream has already been detached (stream === null).
            if (
              (stream !== current && stream !== null) ||
              cancelled ||
              done ||
              fatalError
            ) {
              return;
            }
            endedWithOK = statusInfo.code === grpc.status.OK;
            if (endedWithOK) {
              clearReconnectTimer();
              if (stream === null) {
                finish();
              }
            }
          });

          current.on("end", () => {
            if (stream !== current || cancelled || done || fatalError)
              return;
            clearStream(false);
            // An OK status ends the iteration; any other end triggers a reconnect.
            if (endedWithOK) {
              finish();
              return;
            }
            scheduleReconnect();
          });
        } catch (err) {
          // Opening the stream threw synchronously: same retry policy as stream errors.
          maybeUpdateResumeSequence(err);
          if (!shouldRetry(err)) {
            fail(normalizeServiceError(err, `watch-run:${runId}`));
            return;
          }
          scheduleReconnect();
        }
      }

      start();

      return {
        // Delivers buffered events first, then a fatal error or completion; otherwise
        // waits for the next event.
        next(): Promise<IteratorResult<RunStreamEvent>> {
          if (queue.length > 0) {
            const value = queue.shift();
            if (value) {
              return Promise.resolve({ value, done: false });
            }
          }
          if (fatalError) {
            return Promise.reject(fatalError);
          }
          if (done) {
            return Promise.resolve({ value: undefined, done: true });
          }
          return new Promise((resolve, reject) => {
            waiters.push({ resolve, reject });
          });
        },
        // Called when the consumer stops iterating early: cancels the stream and
        // resolves pending next() calls with done.
        return(): Promise<IteratorResult<RunStreamEvent>> {
          cancelled = true;
          clearReconnectTimer();
          clearStream(true);
          if (!done && !fatalError) {
            done = true;
            for (const waiter of waiters) {
              waiter.resolve({ value: undefined, done: true });
            }
            waiters.length = 0;
          }
          return Promise.resolve({ value: undefined, done: true });
        },
      };
    },
  };
},
2834
+
2835
// Internal logging hook: forwards a log entry to pushLog. captureConsole() calls it
// for every intercepted console call.
_log: (level, msg, attrs) => pushLog(level, msg, attrs),
2836
+ };
2837
+
2838
+ if (globalOpts.captureLogs !== false) {
2839
+ captureConsole(svc);
2840
+ }
2841
+
2842
+ return svc;
2843
+ }
2844
+
2845
+ // ── Console interception ──────────────────────────────────────────────────────
2846
+
2847
type ConsoleMethod = "log" | "info" | "warn" | "error" | "debug";

/** Services whose `_log` hook is already attached to the console wrappers. */
const consoleCapturedServices = new WeakSet<object>();

/**
 * Intercepts `console.log / .info / .warn / .error / .debug` so that every
 * call is **also** shipped to ServiceBridge as a structured log entry.
 *
 * Original console output is preserved (pass-through) — nothing changes on
 * the terminal side.
 *
 * Argument mapping: strings, errors (stack or message), arrays and other primitives
 * are joined with spaces into the message; the fields of object arguments become
 * string attributes (non-string values are JSON-encoded). `null`/`undefined`
 * arguments and fields are skipped.
 *
 * The call is idempotent per service: capturing the same `svc` again is a no-op.
 * The service factory in this file already calls this function unless `captureLogs`
 * is `false`, so an explicit call is only needed when that option is disabled.
 *
 * @param svc - Service whose `_log` hook receives the captured entries.
 *
 * @example
 * const svc = servicebridge("localhost:14445", "my-key", "orders");
 * captureConsole(svc);          // safe to call; a second capture is ignored
 * console.log("order created"); // appears on stdout AND in ServiceBridge
 */
export function captureConsole(svc: ServiceBridgeService): void {
  // Wrapping twice for the same service would ship every line twice.
  if (consoleCapturedServices.has(svc)) return;
  consoleCapturedServices.add(svc);

  const levelMap: Record<ConsoleMethod, string> = {
    log: "INFO",
    info: "INFO",
    warn: "WARN",
    error: "ERROR",
    debug: "DEBUG",
  };

  // JSON.stringify throws on circular structures and BigInt, and returns undefined
  // for functions/symbols; a console call must never throw because of that.
  const stringifyValue = (value: unknown): string => {
    try {
      const json = JSON.stringify(value);
      if (typeof json === "string") return json;
    } catch {
      // fall through to String()
    }
    try {
      return String(value);
    } catch {
      return "[unserializable]";
    }
  };

  const methods: ConsoleMethod[] = ["log", "info", "warn", "error", "debug"];

  for (const method of methods) {
    const original = console[method].bind(console);
    const sbLevel = levelMap[method];

    console[method] = (...args: unknown[]) => {
      // Always pass through to the original console
      original(...args);

      // Separate string/primitive args (→ message) from object args (→ attributes).
      // This preserves structure so the server can index and filter by fields.
      const parts: string[] = [];
      const attributes: Record<string, string> = {};

      for (const a of args) {
        if (a === null || a === undefined) {
          // skip nullish values
        } else if (typeof a === "string") {
          // Empty strings contribute nothing to the message.
          if (a) parts.push(a);
        } else if (a instanceof Error) {
          const errStr = a.stack ?? a.message;
          if (errStr) parts.push(errStr);
        } else if (typeof a === "object" && !Array.isArray(a)) {
          // Object → merge fields into attributes (stringify non-strings)
          for (const [k, v] of Object.entries(a as Record<string, unknown>)) {
            if (v === null || v === undefined) continue;
            attributes[k] = typeof v === "string" ? v : stringifyValue(v);
          }
        } else {
          // Arrays are JSON-encoded; other primitives use their string form.
          const s = typeof a === "object" ? stringifyValue(a) : String(a);
          if (s) parts.push(s);
        }
      }

      const message = parts.join(" ");
      const hasAttributes = Object.keys(attributes).length > 0;
      if (message || hasAttributes) {
        svc._log(sbLevel, message, hasAttributes ? attributes : undefined);
      }
    };
  }
}