@infersec/conduit 1.24.2 → 1.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -6,7 +6,7 @@ const __dirname = __pathDirname(__filename);
6
6
 
7
7
  import { parseArgs } from 'node:util';
8
8
  import 'node:crypto';
9
- import { a as asError, s as startInferenceAgent } from './start-CpPE5_K5.js';
9
+ import { a as asError, s as startInferenceAgent } from './start-DBk2G4SP.js';
10
10
  import 'argon2';
11
11
  import 'node:child_process';
12
12
  import 'node:stream';
package/dist/index.js CHANGED
@@ -5,7 +5,7 @@ const __filename = __fileURLToPath(import.meta.url);
5
5
  const __dirname = __pathDirname(__filename);
6
6
 
7
7
  import 'node:crypto';
8
- import { s as startInferenceAgent, a as asError } from './start-CpPE5_K5.js';
8
+ import { s as startInferenceAgent, a as asError } from './start-DBk2G4SP.js';
9
9
  import 'argon2';
10
10
  import 'node:child_process';
11
11
  import 'node:stream';
@@ -14914,9 +14914,32 @@ const API_SERVICE_CONDUIT_API_REFERENCE = {
14914
14914
  type: "rest"
14915
14915
  }
14916
14916
  }
14917
+ },
14918
+ "/conduit/api/v1/source/:sourceID/requests/:requestID/stream": {
14919
+ POST: {
14920
+ auth: {
14921
+ type: "api-key"
14922
+ },
14923
+ parameters: {
14924
+ requestID: ULIDSchema,
14925
+ sourceID: ULIDSchema
14926
+ },
14927
+ response: {
14928
+ schema: object({
14929
+ acknowledged: literal(true)
14930
+ }),
14931
+ type: "rest"
14932
+ }
14933
+ }
14917
14934
  }
14918
14935
  };
14919
14936
 
14937
+ /**
14938
+ * Coerce non-string values to JSON strings. Some LLM backends (e.g. llama.cpp)
14939
+ * return tool_calls arguments as parsed objects instead of JSON strings, which
14940
+ * violates the OpenAI spec. This schema field normalises them on parse.
14941
+ */
14942
+ const jsonStringCoerced = preprocess(val => (typeof val === "string" ? val : JSON.stringify(val)), string$1());
14920
14943
  // ==================== CHAT COMPLETION ROLES ====================
14921
14944
  _enum([
14922
14945
  "system",
@@ -14963,13 +14986,13 @@ const ChatCompletionAssistantMessageParamSchema = object({
14963
14986
  type: literal("function"),
14964
14987
  function: object({
14965
14988
  name: string$1(),
14966
- arguments: string$1()
14989
+ arguments: jsonStringCoerced
14967
14990
  })
14968
14991
  }))
14969
14992
  .optional(),
14970
14993
  function_call: object({
14971
14994
  name: string$1(),
14972
- arguments: string$1()
14995
+ arguments: jsonStringCoerced
14973
14996
  })
14974
14997
  .optional(),
14975
14998
  refusal: string$1().nullable().optional()
@@ -15006,13 +15029,13 @@ const ChatCompletionMessageSchema = object({
15006
15029
  type: literal("function"),
15007
15030
  function: object({
15008
15031
  name: string$1(),
15009
- arguments: string$1()
15032
+ arguments: jsonStringCoerced
15010
15033
  })
15011
15034
  }))
15012
15035
  .optional(),
15013
15036
  function_call: object({
15014
15037
  name: string$1(),
15015
- arguments: string$1()
15038
+ arguments: jsonStringCoerced
15016
15039
  })
15017
15040
  .optional(),
15018
15041
  refusal: string$1().nullable()
@@ -17595,6 +17618,9 @@ function createLogger({ attributes = {}, name }) {
17595
17618
  child: (attributes) => {
17596
17619
  return buildLogger(logger.child(processAttributes(attributes)));
17597
17620
  },
17621
+ debug: (message, attributes) => {
17622
+ logger.debug(processAttributes(attributes ?? {}), message);
17623
+ },
17598
17624
  error: (message, attributes) => {
17599
17625
  logger.error(processAttributes(attributes ?? {}), message);
17600
17626
  },
@@ -97834,7 +97860,6 @@ function implementSingleEndpoint({ endpoint, handler, method, mount, route }) {
97834
97860
  : []), (async (req, res) => {
97835
97861
  res.locals.requestID = ulid$2();
97836
97862
  try {
97837
- console.log("HANDLE REQ", method, route, req.params);
97838
97863
  // Extract and validate parameters with proper type assertion
97839
97864
  const parameters = endpoint.parameters
97840
97865
  ? validateAndExtract("params", req.params, endpoint.parameters)
@@ -97882,7 +97907,6 @@ function implementSingleEndpoint({ endpoint, handler, method, mount, route }) {
97882
97907
  res.status(output.status).send(output.statusText);
97883
97908
  return;
97884
97909
  }
97885
- console.log("GOT RESPONSE", method, route, output.status, typeof output.body);
97886
97910
  res.status(output.status);
97887
97911
  if (endpoint.response.type === "text-stream") {
97888
97912
  if (!res.getHeader("content-type")) {
@@ -108354,7 +108378,8 @@ async function handleSSERequests({ apiURL, configuration, logger, modelID, onReq
108354
108378
  onRequestEnd,
108355
108379
  onRequestStart,
108356
108380
  reportMetrics,
108357
- request: payload
108381
+ request: payload,
108382
+ signal
108358
108383
  }).catch(error => {
108359
108384
  logger.error("SSE request handler failed", {
108360
108385
  error: asError(error),
@@ -108388,7 +108413,7 @@ async function handleSSERequests({ apiURL, configuration, logger, modelID, onReq
108388
108413
  }
108389
108414
  }
108390
108415
  }
108391
- async function handleRequest({ apiURL, configuration, logger, modelID, onRequest, onRequestEnd, onRequestStart, reportMetrics, request }) {
108416
+ async function handleRequest({ apiURL, configuration, logger, modelID, onRequest, onRequestEnd, onRequestStart, reportMetrics, request, signal }) {
108392
108417
  function reportMetricsSafe(payload) {
108393
108418
  reportMetrics(payload).catch(error => {
108394
108419
  logger.warn("Failed to upload LLM prompt metrics", {
@@ -108408,7 +108433,8 @@ async function handleRequest({ apiURL, configuration, logger, modelID, onRequest
108408
108433
  logger,
108409
108434
  requestID: request.requestID,
108410
108435
  requestStartedAt,
108411
- response
108436
+ response,
108437
+ signal
108412
108438
  });
108413
108439
  const latencyMs = Math.max(0, Date.now() - requestStartedAt);
108414
108440
  const totalTokens = 0;
@@ -108448,26 +108474,23 @@ async function handleRequest({ apiURL, configuration, logger, modelID, onRequest
108448
108474
  durationMs: latencyMs,
108449
108475
  totalTokens
108450
108476
  });
108451
- await postChunk({
108477
+ const streamHandler = await sendChunkStream({
108452
108478
  apiURL,
108453
108479
  configuration,
108454
- payload: {
108455
- data: encodeBinaryChunk(Buffer.from(failureMessage)),
108456
- sequence: 0,
108457
- status: 502
108458
- },
108459
- requestID: request.requestID
108480
+ requestID: request.requestID,
108481
+ logger
108460
108482
  });
108461
- await postChunk({
108462
- apiURL,
108463
- configuration,
108464
- payload: {
108465
- data: null,
108466
- sequence: 1,
108467
- status: 502
108468
- },
108469
- requestID: request.requestID
108483
+ await streamHandler.sendChunk({
108484
+ data: encodeBinaryChunk(Buffer.from(failureMessage)),
108485
+ sequence: 0,
108486
+ status: 502
108487
+ });
108488
+ await streamHandler.sendChunk({
108489
+ data: null,
108490
+ sequence: 1,
108491
+ status: 502
108470
108492
  });
108493
+ await streamHandler.end();
108471
108494
  reportMetricsSafe({
108472
108495
  bytes: requestBytes + failureBytes,
108473
108496
  completionTokens: 0,
@@ -108491,12 +108514,22 @@ async function handleRequest({ apiURL, configuration, logger, modelID, onRequest
108491
108514
  await onRequestEnd?.(request);
108492
108515
  }
108493
108516
  }
108494
- async function streamResponse({ apiURL, configuration, logger, requestID, requestStartedAt, response }) {
108517
+ async function streamResponse({ apiURL, configuration, logger, requestID, requestStartedAt, response, signal }) {
108495
108518
  let sequence = 0;
108496
108519
  let responseBytes = 0;
108497
108520
  let timeToFirstTokenMs = null;
108521
+ const streamHandler = await sendChunkStream({
108522
+ apiURL,
108523
+ configuration,
108524
+ requestID,
108525
+ logger
108526
+ });
108498
108527
  if (response.body instanceof Readable) {
108499
108528
  for await (const chunk of response.body) {
108529
+ if (signal?.aborted) {
108530
+ streamHandler.abort();
108531
+ throw new Error("Request cancelled");
108532
+ }
108500
108533
  const buffer = Buffer.isBuffer(chunk)
108501
108534
  ? chunk
108502
108535
  : Buffer.from(chunk);
@@ -108504,28 +108537,19 @@ async function streamResponse({ apiURL, configuration, logger, requestID, reques
108504
108537
  timeToFirstTokenMs = Math.max(0, Date.now() - requestStartedAt);
108505
108538
  }
108506
108539
  responseBytes += buffer.length;
108507
- await postChunk({
108508
- apiURL,
108509
- configuration,
108510
- payload: {
108511
- data: encodeBinaryChunk(buffer),
108512
- sequence,
108513
- status: response.status
108514
- },
108515
- requestID
108540
+ await streamHandler.sendChunk({
108541
+ data: encodeBinaryChunk(buffer),
108542
+ sequence,
108543
+ status: response.status
108516
108544
  });
108517
108545
  sequence += 1;
108518
108546
  }
108519
- await postChunk({
108520
- apiURL,
108521
- configuration,
108522
- payload: {
108523
- data: null,
108524
- sequence,
108525
- status: response.status
108526
- },
108527
- requestID
108547
+ await streamHandler.sendChunk({
108548
+ data: null,
108549
+ sequence,
108550
+ status: response.status
108528
108551
  });
108552
+ await streamHandler.end();
108529
108553
  return {
108530
108554
  responseBytes,
108531
108555
  status: response.status,
@@ -108541,27 +108565,18 @@ async function streamResponse({ apiURL, configuration, logger, requestID, reques
108541
108565
  responseBytes = Buffer.byteLength(responsePayload, "utf8");
108542
108566
  timeToFirstTokenMs = Math.max(0, Date.now() - requestStartedAt);
108543
108567
  }
108544
- await postChunk({
108545
- apiURL,
108546
- configuration,
108547
- payload: {
108548
- data: encodeBinaryChunk(Buffer.from(responsePayload)),
108549
- headers: response.headers,
108550
- sequence,
108551
- status: response.status
108552
- },
108553
- requestID
108568
+ await streamHandler.sendChunk({
108569
+ data: encodeBinaryChunk(Buffer.from(responsePayload)),
108570
+ headers: response.headers,
108571
+ sequence,
108572
+ status: response.status
108554
108573
  });
108555
- await postChunk({
108556
- apiURL,
108557
- configuration,
108558
- payload: {
108559
- data: null,
108560
- sequence: sequence + 1,
108561
- status: response.status
108562
- },
108563
- requestID
108574
+ await streamHandler.sendChunk({
108575
+ data: null,
108576
+ sequence: sequence + 1,
108577
+ status: response.status
108564
108578
  });
108579
+ await streamHandler.end();
108565
108580
  logger.info("SSE response queued", {
108566
108581
  requestMethod: requestID
108567
108582
  });
@@ -108571,28 +108586,101 @@ async function streamResponse({ apiURL, configuration, logger, requestID, reques
108571
108586
  timeToFirstTokenMs
108572
108587
  };
108573
108588
  }
108574
- async function postChunk({ apiURL, configuration, payload, requestID }) {
108575
- const response = ClientToServerAPIResponseSchema.parse({
108576
- data: payload.data,
108577
- headers: payload.headers,
108578
- requestID,
108579
- status: payload.status
108580
- });
108581
- await fetch(`${apiURL}/conduit/api/v1/source/${configuration.inferenceSourceID}/requests/${requestID}/chunk`, {
108582
- body: JSON.stringify({
108583
- ...response,
108584
- sequence: payload.sequence
108585
- }),
108586
- headers: {
108587
- "content-type": "application/json",
108588
- "x-api-key": configuration.apiKey
108589
- },
108590
- method: "POST"
108591
- });
108592
- }
108593
108589
  function encodeBinaryChunk(chunk) {
108594
108590
  return chunk.toString("base64");
108595
108591
  }
108592
+ async function sendChunkStream({ apiURL, configuration, requestID, logger }) {
108593
+ const streamURL = `${apiURL}/conduit/api/v1/source/${configuration.inferenceSourceID}/requests/${requestID}/stream`;
108594
+ const maxFlushAttempts = 3;
108595
+ let isAborted = false;
108596
+ let isClosed = false;
108597
+ let activeAbortController = null;
108598
+ const chunks = [];
108599
+ const sendChunk = async (payload) => {
108600
+ if (isAborted || isClosed) {
108601
+ return;
108602
+ }
108603
+ const response = ClientToServerAPIResponseSchema.parse({
108604
+ data: payload.data,
108605
+ headers: payload.headers,
108606
+ requestID,
108607
+ status: payload.status
108608
+ });
108609
+ const chunk = JSON.stringify({
108610
+ ...response,
108611
+ sequence: payload.sequence
108612
+ });
108613
+ chunks.push(Buffer.from(chunk + "\n"));
108614
+ if (chunks.length >= 10) {
108615
+ await flushChunks();
108616
+ }
108617
+ };
108618
+ const flushChunks = async () => {
108619
+ if (chunks.length === 0 || isAborted) {
108620
+ return;
108621
+ }
108622
+ const batch = chunks.splice(0, chunks.length);
108623
+ const body = Buffer.concat(batch);
108624
+ for (let attempt = 1; attempt <= maxFlushAttempts; attempt += 1) {
108625
+ try {
108626
+ activeAbortController = new AbortController();
108627
+ const response = await fetch(streamURL, {
108628
+ body: body.toString(),
108629
+ headers: {
108630
+ "content-type": "application/json",
108631
+ "x-api-key": configuration.apiKey
108632
+ },
108633
+ method: "POST",
108634
+ signal: activeAbortController.signal
108635
+ });
108636
+ if (!response.ok) {
108637
+ throw new Error(`Chunk stream flush failed with status ${response.status}`);
108638
+ }
108639
+ return;
108640
+ }
108641
+ catch (error) {
108642
+ if (isAborted) {
108643
+ return;
108644
+ }
108645
+ if (attempt >= maxFlushAttempts) {
108646
+ chunks.unshift(...batch);
108647
+ throw asError(error);
108648
+ }
108649
+ logger.warn("Failed to send chunk batch", {
108650
+ error: asError(error)
108651
+ });
108652
+ await sleep(100 * attempt);
108653
+ }
108654
+ finally {
108655
+ activeAbortController = null;
108656
+ }
108657
+ }
108658
+ };
108659
+ const end = async () => {
108660
+ if (isClosed || isAborted) {
108661
+ return;
108662
+ }
108663
+ await flushChunks();
108664
+ isClosed = true;
108665
+ };
108666
+ const abort = (error) => {
108667
+ isAborted = true;
108668
+ if (activeAbortController) {
108669
+ activeAbortController.abort();
108670
+ }
108671
+ chunks.length = 0;
108672
+ if (error) {
108673
+ logger.error("Chunk stream aborted", {
108674
+ error: asError(error)
108675
+ });
108676
+ }
108677
+ };
108678
+ return {
108679
+ sendChunk,
108680
+ end,
108681
+ abort
108682
+ };
108683
+ }
108596
108684
  function calculateRequestBytes(body) {
108597
108685
  if (body === null || body === undefined) {
108598
108686
  return 0;
@@ -117945,6 +118033,42 @@ async function collectMachineMetadata() {
117945
118033
  return machineMetadata;
117946
118034
  }
117947
118035
 
118036
+ /**
118037
+ * Coerce non-string tool_calls function.arguments to JSON strings.
118038
+ * Some LLM backends return arguments as parsed objects instead of
118039
+ * JSON strings, violating the OpenAI spec. This mutates in place
118040
+ * and returns true if any coercion was performed.
118041
+ */
118042
+ function coerceToolCallArguments(parsed) {
118043
+ const choices = parsed.choices;
118044
+ if (!Array.isArray(choices))
118045
+ return false;
118046
+ let modified = false;
118047
+ for (const choice of choices) {
118048
+ if (!choice || typeof choice !== "object")
118049
+ continue;
118050
+ const choiceRecord = choice;
118051
+ const msg = choiceRecord.delta ?? choiceRecord.message;
118052
+ if (!msg || typeof msg !== "object")
118053
+ continue;
118054
+ const toolCalls = msg.tool_calls;
118055
+ if (!Array.isArray(toolCalls))
118056
+ continue;
118057
+ for (const tc of toolCalls) {
118058
+ if (!tc || typeof tc !== "object")
118059
+ continue;
118060
+ const fn = tc.function;
118061
+ if (!fn || typeof fn !== "object")
118062
+ continue;
118063
+ const fnRecord = fn;
118064
+ if (fnRecord.arguments !== undefined && typeof fnRecord.arguments !== "string") {
118065
+ fnRecord.arguments = JSON.stringify(fnRecord.arguments);
118066
+ modified = true;
118067
+ }
118068
+ }
118069
+ }
118070
+ return modified;
118071
+ }
117948
118072
  function isEngineUsageChunk(value) {
117949
118073
  if (!value || typeof value !== "object") {
117950
118074
  return false;
@@ -117980,6 +118104,10 @@ function monitorEngineResponseStream({ agentEngineType, body, contextLength, eng
117980
118104
  }
117981
118105
  try {
117982
118106
  const parsed = JSON.parse(payload);
118107
+ let modified = false;
118108
+ if (coerceToolCallArguments(parsed)) {
118109
+ modified = true;
118110
+ }
117983
118111
  if (parsed.usage) {
117984
118112
  const usageChunk = parsed.usage;
117985
118113
  const effectiveContext = getEffectiveContextLength({
@@ -117991,10 +118119,13 @@ function monitorEngineResponseStream({ agentEngineType, body, contextLength, eng
117991
118119
  usageChunk.prompt_tokens !== undefined &&
117992
118120
  effectiveContext !== null) {
117993
118121
  usageChunk.context_usage = usageChunk.prompt_tokens / effectiveContext;
117994
- modifiedLines.push("data: " + JSON.stringify(parsed));
117995
- continue;
118122
+ modified = true;
117996
118123
  }
117997
118124
  }
118125
+ if (modified) {
118126
+ modifiedLines.push("data: " + JSON.stringify(parsed));
118127
+ continue;
118128
+ }
117998
118129
  }
117999
118130
  catch (_error) {
118000
118131
  // Ignore malformed chunks
@@ -118070,13 +118201,14 @@ function monitorEngineResponseStream({ agentEngineType, body, contextLength, eng
118070
118201
  }
118071
118202
  }
118072
118203
  body.on("data", (chunk) => {
118204
+ const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
118073
118205
  if (firstChunkAt === null) {
118074
118206
  firstChunkAt = Date.now();
118075
118207
  }
118076
- responseBytes += chunk.length;
118077
- buffer += chunk.toString("utf8");
118208
+ responseBytes += chunkBuffer.length;
118209
+ buffer += chunkBuffer.toString("utf8");
118078
118210
  parseUsageFromBuffer();
118079
- passThrough.write(modifyChunkWithUsage(chunk));
118211
+ passThrough.write(modifyChunkWithUsage(chunkBuffer));
118080
118212
  });
118081
118213
  body.once("error", err => {
118082
118214
  logEngineMetrics({
@@ -118133,6 +118265,148 @@ function monitorEngineResponseStream({ agentEngineType, body, contextLength, eng
118133
118265
  stream: passThrough
118134
118266
  };
118135
118267
  }
118268
+ function monitorEngineResponseSingle({ agentEngineType, body, contextLength, engine, logger, onComplete, parallelism, requestBodyBytes, requestPath, requestStartedAt }) {
118269
+ const maxUsageCaptureBytes = 1024 * 1024;
118270
+ const startedAt = requestStartedAt ?? Date.now();
118271
+ const passThrough = new PassThrough();
118272
+ let responseBytes = 0;
118273
+ let firstChunkAt = null;
118274
+ let usage = null;
118275
+ const usageChunks = [];
118276
+ let usageBytes = 0;
118277
+ let usageCaptureEnabled = true;
118278
+ let completed = false;
118279
+ function finalize(error) {
118280
+ if (completed) {
118281
+ return;
118282
+ }
118283
+ completed = true;
118284
+ if (onComplete) {
118285
+ const completion = onComplete({
118286
+ durationMs: Math.max(0, Date.now() - startedAt),
118287
+ error,
118288
+ requestBodyBytes,
118289
+ responseBytes,
118290
+ timeToFirstTokenMs: firstChunkAt === null ? null : Math.max(0, firstChunkAt - startedAt),
118291
+ usage
118292
+ });
118293
+ if (completion && typeof completion.catch === "function") {
118294
+ completion.catch(error => {
118295
+ logger.error("Engine metrics completion failed", {
118296
+ error: asError(error),
118297
+ requestUrl: requestPath
118298
+ });
118299
+ });
118300
+ }
118301
+ }
118302
+ }
118303
+ body.on("data", (chunk) => {
118304
+ const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
118305
+ if (firstChunkAt === null) {
118306
+ firstChunkAt = Date.now();
118307
+ }
118308
+ responseBytes += chunkBuffer.length;
118309
+ if (usageCaptureEnabled) {
118310
+ const nextSize = usageBytes + chunkBuffer.length;
118311
+ if (nextSize <= maxUsageCaptureBytes) {
118312
+ usageChunks.push(chunkBuffer);
118313
+ usageBytes = nextSize;
118314
+ }
118315
+ else {
118316
+ usageCaptureEnabled = false;
118317
+ usageChunks.length = 0;
118318
+ }
118319
+ }
118320
+ passThrough.write(chunkBuffer);
118321
+ });
118322
+ body.once("error", err => {
118323
+ logEngineMetrics({
118324
+ agentEngineType,
118325
+ error: err,
118326
+ level: "error",
118327
+ logger,
118328
+ requestBodyBytes,
118329
+ requestPath,
118330
+ responseBytes,
118331
+ usage
118332
+ });
118333
+ finalize(err);
118334
+ passThrough.destroy(err);
118335
+ });
118336
+ body.once("end", () => {
118337
+ if (usageCaptureEnabled) {
118338
+ try {
118339
+ const parsed = JSON.parse(Buffer.concat(usageChunks).toString("utf8"));
118340
+ if (parsed.usage) {
118341
+ const usageChunk = parsed.usage;
118342
+ const completionTokens = usageChunk.completion_tokens ?? null;
118343
+ const promptTokens = usageChunk.prompt_tokens ?? null;
118344
+ const totalTokens = usageChunk.total_tokens ?? null;
118345
+ let contextUsage = usageChunk.context_usage ?? null;
118346
+ const effectiveContext = getEffectiveContextLength({
118347
+ contextLength,
118348
+ engine,
118349
+ parallelism
118350
+ });
118351
+ if (contextUsage === null &&
118352
+ promptTokens !== null &&
118353
+ effectiveContext !== null) {
118354
+ contextUsage = promptTokens / effectiveContext;
118355
+ }
118356
+ usage = {
118357
+ completionTokens,
118358
+ contextUsage,
118359
+ promptTokens,
118360
+ totalTokens
118361
+ };
118362
+ }
118363
+ }
118364
+ catch (error) {
118365
+ logger.error("Failed to parse engine response body", {
118366
+ error: asError(error),
118367
+ requestUrl: requestPath
118368
+ });
118369
+ }
118370
+ }
118371
+ logEngineMetrics({
118372
+ agentEngineType,
118373
+ level: "info",
118374
+ logger,
118375
+ requestBodyBytes,
118376
+ requestPath,
118377
+ responseBytes,
118378
+ usage
118379
+ });
118380
+ finalize(null);
118381
+ passThrough.end();
118382
+ });
118383
+ body.once("close", () => {
118384
+ if (completed) {
118385
+ if (!passThrough.writableEnded) {
118386
+ passThrough.end();
118387
+ }
118388
+ return;
118389
+ }
118390
+ const closeError = new Error("Engine response stream closed before completion");
118391
+ logEngineMetrics({
118392
+ agentEngineType,
118393
+ error: closeError,
118394
+ level: "error",
118395
+ logger,
118396
+ requestBodyBytes,
118397
+ requestPath,
118398
+ responseBytes,
118399
+ usage
118400
+ });
118401
+ finalize(closeError);
118402
+ if (!passThrough.writableEnded) {
118403
+ passThrough.end();
118404
+ }
118405
+ });
118406
+ return {
118407
+ stream: passThrough
118408
+ };
118409
+ }
118136
118410
  function logEngineMetrics({ agentEngineType, error, level, logger, requestBodyBytes, requestPath, responseBytes, usage }) {
118137
118411
  const metricsMessage = [
118138
118412
  "LLM engine stream metrics",
@@ -118205,6 +118479,35 @@ async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelID,
118205
118479
  }
118206
118480
  const { bytes: requestBodyBytes, payload: serializedBody } = serializeRequestBody(body);
118207
118481
  const requestStartedAt = Date.now();
118482
+ const requestBody = JSON.parse(serializedBody);
118483
+ const streamRequested = requestBody.stream === true;
118484
+ const onMonitoringComplete = ({ durationMs, error, responseBytes, timeToFirstTokenMs, usage }) => {
118485
+ const completionTokens = normalizeTokenCount(usage?.completionTokens);
118486
+ const promptTokens = normalizeTokenCount(usage?.promptTokens);
118487
+ const totalTokens = normalizeTokenCount(usage?.totalTokens ?? completionTokens + promptTokens);
118488
+ const latencyMs = Math.max(0, durationMs);
118489
+ reportMetricsSafe({
118490
+ bytes: requestBodyBytes + responseBytes,
118491
+ completionTokens,
118492
+ engine: configuration.agentEngineType,
118493
+ endpointId: null,
118494
+ latencyMs,
118495
+ modelId: modelID,
118496
+ promptTokens,
118497
+ requestBytes: requestBodyBytes,
118498
+ requestId: null,
118499
+ requestMethod: "POST",
118500
+ requestPath: path,
118501
+ responseBytes,
118502
+ successful: !error,
118503
+ timeToFirstTokenMs,
118504
+ tokensPerSecond: calculateTokensPerSecond({
118505
+ durationMs: latencyMs,
118506
+ totalTokens
118507
+ }),
118508
+ totalTokens
118509
+ });
118510
+ };
118208
118511
  const response = await modelManager
118209
118512
  .fetchOpenAI(path, {
118210
118513
  body: serializedBody,
@@ -118299,44 +118602,31 @@ async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelID,
118299
118602
  statusText: responseStatusText
118300
118603
  };
118301
118604
  }
118302
- const monitoredResponse = monitorEngineResponseStream({
118303
- agentEngineType: configuration.agentEngineType,
118304
- body: Readable.fromWeb(response.body),
118305
- contextLength: modelManager.contextLength,
118306
- engine: configuration.agentEngineType,
118307
- logger,
118308
- onComplete: ({ durationMs, error, responseBytes, timeToFirstTokenMs, usage }) => {
118309
- const completionTokens = normalizeTokenCount(usage?.completionTokens);
118310
- const promptTokens = normalizeTokenCount(usage?.promptTokens);
118311
- const totalTokens = normalizeTokenCount(usage?.totalTokens ?? completionTokens + promptTokens);
118312
- const latencyMs = Math.max(0, durationMs);
118313
- reportMetricsSafe({
118314
- bytes: requestBodyBytes + responseBytes,
118315
- completionTokens,
118316
- engine: configuration.agentEngineType,
118317
- endpointId: null,
118318
- latencyMs,
118319
- modelId: modelID,
118320
- promptTokens,
118321
- requestBytes: requestBodyBytes,
118322
- requestId: null,
118323
- requestMethod: "POST",
118324
- requestPath: path,
118325
- responseBytes,
118326
- successful: !error,
118327
- timeToFirstTokenMs,
118328
- tokensPerSecond: calculateTokensPerSecond({
118329
- durationMs: latencyMs,
118330
- totalTokens
118331
- }),
118332
- totalTokens
118333
- });
118334
- },
118335
- parallelism: modelManager.parallelism,
118336
- requestBodyBytes,
118337
- requestPath: path,
118338
- requestStartedAt
118339
- });
118605
+ const monitoredResponse = streamRequested
118606
+ ? monitorEngineResponseStream({
118607
+ agentEngineType: configuration.agentEngineType,
118608
+ body: Readable.fromWeb(response.body),
118609
+ contextLength: modelManager.contextLength,
118610
+ engine: configuration.agentEngineType,
118611
+ logger,
118612
+ onComplete: onMonitoringComplete,
118613
+ parallelism: modelManager.parallelism,
118614
+ requestBodyBytes,
118615
+ requestPath: path,
118616
+ requestStartedAt
118617
+ })
118618
+ : monitorEngineResponseSingle({
118619
+ agentEngineType: configuration.agentEngineType,
118620
+ body: Readable.fromWeb(response.body),
118621
+ contextLength: modelManager.contextLength,
118622
+ engine: configuration.agentEngineType,
118623
+ logger,
118624
+ onComplete: onMonitoringComplete,
118625
+ parallelism: modelManager.parallelism,
118626
+ requestBodyBytes,
118627
+ requestPath: path,
118628
+ requestStartedAt
118629
+ });
118340
118630
  return {
118341
118631
  body: monitoredResponse.stream,
118342
118632
  headers: Object.fromEntries(response.headers.entries()),
@@ -39,5 +39,6 @@ interface MonitorEngineResponseResult {
39
39
  stream: Readable;
40
40
  }
41
41
  export declare function monitorEngineResponseStream({ agentEngineType, body, contextLength, engine, logger, onComplete, parallelism, requestBodyBytes, requestPath, requestStartedAt }: MonitorEngineResponseOptions): MonitorEngineResponseResult;
42
+ export declare function monitorEngineResponseSingle({ agentEngineType, body, contextLength, engine, logger, onComplete, parallelism, requestBodyBytes, requestPath, requestStartedAt }: MonitorEngineResponseOptions): MonitorEngineResponseResult;
42
43
  export declare function logEngineMetrics({ agentEngineType, error, level, logger, requestBodyBytes, requestPath, responseBytes, usage }: EngineMetricsLogOptions): void;
43
44
  export {};
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@infersec/conduit",
3
3
  "description": "End user conduit agent for connecting local LLMs to the cloud.",
4
- "version": "1.24.2",
4
+ "version": "1.25.0",
5
5
  "bin": {
6
6
  "infersec-conduit": "./dist/cli.js"
7
7
  },
@@ -27,7 +27,7 @@
27
27
  "test:format": "prettier --check .",
28
28
  "test:lint": "eslint source/**/*.ts",
29
29
  "test:types": "tsc -p tsconfig.json --noEmit",
30
- "test:unit": "vitest run"
30
+ "test:unit": "vitest -c vitest.config.ts run"
31
31
  },
32
32
  "prettier": "@infersec/prettier",
33
33
  "publishConfig": {
@@ -1 +0,0 @@
1
- export {};