@infersec/conduit 1.17.5 → 1.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -6,7 +6,7 @@ const __dirname = __pathDirname(__filename);
6
6
 
7
7
  import { parseArgs } from 'node:util';
8
8
  import 'node:crypto';
9
- import { a as asError, s as startInferenceAgent } from './start-BDCrsqSt.js';
9
+ import { a as asError, s as startInferenceAgent } from './start-CopKwPN6.js';
10
10
  import 'argon2';
11
11
  import 'node:child_process';
12
12
  import 'node:stream';
package/dist/index.js CHANGED
@@ -5,7 +5,7 @@ const __filename = __fileURLToPath(import.meta.url);
5
5
  const __dirname = __pathDirname(__filename);
6
6
 
7
7
  import 'node:crypto';
8
- import { s as startInferenceAgent, a as asError } from './start-BDCrsqSt.js';
8
+ import { s as startInferenceAgent, a as asError } from './start-CopKwPN6.js';
9
9
  import 'argon2';
10
10
  import 'node:child_process';
11
11
  import 'node:stream';
@@ -1,12 +1,14 @@
1
- import { type APIResponse, type ServerToClientAPIRequest } from "@infersec/definitions";
1
+ import { InferenceAgentLLMMetricsPayload, type ULID, type APIResponse, type ServerToClientAPIRequest } from "@infersec/definitions";
2
2
  import { Logger } from "@infersec/logger";
3
3
  import { Configuration } from "../configuration.js";
4
- export declare function handleSSERequests({ apiURL, configuration, logger, onRequest, onRequestEnd, onRequestStart, signal }: {
4
+ export declare function handleSSERequests({ apiURL, configuration, logger, modelID, onRequest, onRequestEnd, onRequestStart, reportMetrics, signal }: {
5
5
  apiURL: string;
6
6
  configuration: Configuration;
7
7
  logger: Logger;
8
+ modelID: ULID;
8
9
  onRequest: (request: ServerToClientAPIRequest) => Promise<APIResponse>;
9
10
  onRequestEnd?: (request: ServerToClientAPIRequest) => Promise<void> | void;
10
11
  onRequestStart?: (request: ServerToClientAPIRequest) => Promise<void> | void;
12
+ reportMetrics: (payload: InferenceAgentLLMMetricsPayload) => Promise<void>;
11
13
  signal?: AbortSignal;
12
14
  }): Promise<void>;
@@ -14659,11 +14659,54 @@ custom((data) => {
14659
14659
  return result.success;
14660
14660
  }, "Invalid API reference structure");
14661
14661
 
14662
+ const LLMEngineSchema = _enum(["llama.cpp", "vllm"]);
14663
+ const LLMModelFormatSchema = _enum([
14664
+ // VLLM
14665
+ "safetensors",
14666
+ "pytorch",
14667
+ "awq",
14668
+ "gptq",
14669
+ // Llama.cpp
14670
+ "gguf"
14671
+ ]);
14672
+ const LLMModelSchema = object({
14673
+ format: LLMModelFormatSchema,
14674
+ id: string$1().min(1),
14675
+ multimodalEnabled: boolean$1(),
14676
+ source: discriminatedUnion("type", [
14677
+ object({
14678
+ irid: IRIDSchema,
14679
+ type: literal("storage")
14680
+ }),
14681
+ object({
14682
+ modelSecret: string$1().min(1).nullable(),
14683
+ slug: string$1().min(1),
14684
+ type: literal("huggingface")
14685
+ })
14686
+ ])
14687
+ });
14688
+ const QuantizationFileSchema = object({
14689
+ filePath: string$1().min(1),
14690
+ sizeBytes: number$1().int().nonnegative().nullable()
14691
+ });
14692
+ array(QuantizationFileSchema);
14693
+
14662
14694
  const InferenceAgentLLMMetricsPayloadSchema = object({
14663
14695
  bytes: number$1().int().nonnegative(),
14664
14696
  completionTokens: number$1().int().nonnegative(),
14697
+ engine: LLMEngineSchema.nullable(),
14698
+ endpointId: ULIDSchema.nullable(),
14699
+ latencyMs: number$1().int().nonnegative(),
14700
+ modelId: ULIDSchema.nullable(),
14665
14701
  promptTokens: number$1().int().nonnegative(),
14702
+ requestBytes: number$1().int().nonnegative(),
14703
+ requestId: ULIDSchema.nullable(),
14704
+ requestMethod: string$1().nullable(),
14705
+ requestPath: string$1().nullable(),
14706
+ responseBytes: number$1().int().nonnegative(),
14666
14707
  successful: boolean$1(),
14708
+ timeToFirstTokenMs: number$1().int().nonnegative().nullable(),
14709
+ tokensPerSecond: number$1().int().nonnegative(),
14667
14710
  totalTokens: number$1().int().nonnegative()
14668
14711
  });
14669
14712
  const InferenceAgentMachineGPUSchema = object({
@@ -14765,38 +14808,6 @@ const ConduitState = z.preprocess(value => {
14765
14808
  return value;
14766
14809
  }, ConduitStateSchema);
14767
14810
 
14768
- const LLMEngineSchema = _enum(["llama.cpp", "vllm"]);
14769
- const LLMModelFormatSchema = _enum([
14770
- // VLLM
14771
- "safetensors",
14772
- "pytorch",
14773
- "awq",
14774
- "gptq",
14775
- // Llama.cpp
14776
- "gguf"
14777
- ]);
14778
- const LLMModelSchema = object({
14779
- format: LLMModelFormatSchema,
14780
- id: string$1().min(1),
14781
- multimodalEnabled: boolean$1(),
14782
- source: discriminatedUnion("type", [
14783
- object({
14784
- irid: IRIDSchema,
14785
- type: literal("storage")
14786
- }),
14787
- object({
14788
- modelSecret: string$1().min(1).nullable(),
14789
- slug: string$1().min(1),
14790
- type: literal("huggingface")
14791
- })
14792
- ])
14793
- });
14794
- const QuantizationFileSchema = object({
14795
- filePath: string$1().min(1),
14796
- sizeBytes: number$1().int().nonnegative().nullable()
14797
- });
14798
- array(QuantizationFileSchema);
14799
-
14800
14811
  const InferenceAgentConfigurationSchema = object({
14801
14812
  contextLength: number$1().int().positive().nullable(),
14802
14813
  inferenceSourceID: ULIDSchema,
@@ -108293,7 +108304,7 @@ function sleep(ms) {
108293
108304
  });
108294
108305
  }
108295
108306
 
108296
- async function handleSSERequests({ apiURL, configuration, logger, onRequest, onRequestEnd, onRequestStart, signal }) {
108307
+ async function handleSSERequests({ apiURL, configuration, logger, modelID, onRequest, onRequestEnd, onRequestStart, reportMetrics, signal }) {
108297
108308
  const streamURL = `${apiURL}/conduit/api/v1/source/${configuration.inferenceSourceID}/requests/stream`;
108298
108309
  const maxReconnectDelayMs = 30000;
108299
108310
  let reconnectAttempt = 0;
@@ -108318,9 +108329,11 @@ async function handleSSERequests({ apiURL, configuration, logger, onRequest, onR
108318
108329
  apiURL,
108319
108330
  configuration,
108320
108331
  logger,
108332
+ modelID,
108321
108333
  onRequest,
108322
108334
  onRequestEnd,
108323
108335
  onRequestStart,
108336
+ reportMetrics,
108324
108337
  request: payload
108325
108338
  }).catch(error => {
108326
108339
  logger.error("SSE request handler failed", {
@@ -108350,28 +108363,71 @@ async function handleSSERequests({ apiURL, configuration, logger, onRequest, onR
108350
108363
  await sleep(reconnectDelayMs);
108351
108364
  }
108352
108365
  }
108353
- async function handleRequest({ apiURL, configuration, logger, onRequest, onRequestEnd, onRequestStart, request }) {
108366
+ async function handleRequest({ apiURL, configuration, logger, modelID, onRequest, onRequestEnd, onRequestStart, reportMetrics, request }) {
108367
+ function reportMetricsSafe(payload) {
108368
+ reportMetrics(payload).catch(error => {
108369
+ logger.warn("Failed to upload LLM prompt metrics", {
108370
+ error: asError(error),
108371
+ requestUrl: request.path
108372
+ });
108373
+ });
108374
+ }
108375
+ const requestStartedAt = Date.now();
108376
+ const requestBytes = calculateRequestBytes(request.body ?? null);
108354
108377
  try {
108355
108378
  await onRequestStart?.(request);
108356
108379
  const response = await onRequest(request);
108357
- await streamResponse({
108380
+ const responseMetrics = await streamResponse({
108358
108381
  apiURL,
108359
108382
  configuration,
108360
108383
  logger,
108361
108384
  requestID: request.requestID,
108385
+ requestStartedAt,
108362
108386
  response
108363
108387
  });
108388
+ const latencyMs = Math.max(0, Date.now() - requestStartedAt);
108389
+ const totalTokens = 0;
108390
+ const tokensPerSecond = calculateTokensPerSecond$1({
108391
+ durationMs: latencyMs,
108392
+ totalTokens
108393
+ });
108394
+ reportMetricsSafe({
108395
+ bytes: requestBytes + responseMetrics.responseBytes,
108396
+ completionTokens: 0,
108397
+ engine: configuration.agentEngineType,
108398
+ endpointId: null,
108399
+ latencyMs,
108400
+ modelId: modelID,
108401
+ promptTokens: 0,
108402
+ requestBytes,
108403
+ requestId: request.requestID,
108404
+ requestMethod: request.method,
108405
+ requestPath: request.path,
108406
+ responseBytes: responseMetrics.responseBytes,
108407
+ successful: responseMetrics.status < 400,
108408
+ timeToFirstTokenMs: responseMetrics.timeToFirstTokenMs,
108409
+ tokensPerSecond,
108410
+ totalTokens
108411
+ });
108364
108412
  }
108365
108413
  catch (error) {
108366
108414
  logger.error("SSE request failed", {
108367
108415
  error: asError(error),
108368
108416
  requestMethod: request.requestID
108369
108417
  });
108418
+ const failureMessage = "Bad gateway\n\nProxying failed";
108419
+ const failureBytes = Buffer.byteLength(failureMessage, "utf8");
108420
+ const latencyMs = Math.max(0, Date.now() - requestStartedAt);
108421
+ const totalTokens = 0;
108422
+ const tokensPerSecond = calculateTokensPerSecond$1({
108423
+ durationMs: latencyMs,
108424
+ totalTokens
108425
+ });
108370
108426
  await postChunk({
108371
108427
  apiURL,
108372
108428
  configuration,
108373
108429
  payload: {
108374
- data: encodeTextChunk("Bad gateway\n\nProxying failed"),
108430
+ data: encodeTextChunk(failureMessage),
108375
108431
  sequence: 0,
108376
108432
  status: 502
108377
108433
  },
@@ -108387,16 +108443,40 @@ async function handleRequest({ apiURL, configuration, logger, onRequest, onReque
108387
108443
  },
108388
108444
  requestID: request.requestID
108389
108445
  });
108446
+ reportMetricsSafe({
108447
+ bytes: requestBytes + failureBytes,
108448
+ completionTokens: 0,
108449
+ engine: configuration.agentEngineType,
108450
+ endpointId: null,
108451
+ latencyMs,
108452
+ modelId: modelID,
108453
+ promptTokens: 0,
108454
+ requestBytes,
108455
+ requestId: request.requestID,
108456
+ requestMethod: request.method,
108457
+ requestPath: request.path,
108458
+ responseBytes: failureBytes,
108459
+ successful: false,
108460
+ timeToFirstTokenMs: latencyMs,
108461
+ tokensPerSecond,
108462
+ totalTokens
108463
+ });
108390
108464
  }
108391
108465
  finally {
108392
108466
  await onRequestEnd?.(request);
108393
108467
  }
108394
108468
  }
108395
- async function streamResponse({ apiURL, configuration, logger, requestID, response }) {
108469
+ async function streamResponse({ apiURL, configuration, logger, requestID, requestStartedAt, response }) {
108396
108470
  let sequence = 0;
108471
+ let responseBytes = 0;
108472
+ let timeToFirstTokenMs = null;
108397
108473
  if (response.body instanceof Readable) {
108398
108474
  for await (const chunk of response.body) {
108399
108475
  const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
108476
+ if (timeToFirstTokenMs === null) {
108477
+ timeToFirstTokenMs = Math.max(0, Date.now() - requestStartedAt);
108478
+ }
108479
+ responseBytes += buffer.length;
108400
108480
  await postChunk({
108401
108481
  apiURL,
108402
108482
  configuration,
@@ -108419,17 +108499,26 @@ async function streamResponse({ apiURL, configuration, logger, requestID, respon
108419
108499
  },
108420
108500
  requestID
108421
108501
  });
108422
- return;
108502
+ return {
108503
+ responseBytes,
108504
+ status: response.status,
108505
+ timeToFirstTokenMs
108506
+ };
108507
+ }
108508
+ const responsePayload = response.body
108509
+ ? typeof response.body === "string"
108510
+ ? response.body
108511
+ : JSON.stringify(response.body)
108512
+ : "";
108513
+ if (responsePayload.length > 0) {
108514
+ responseBytes = Buffer.byteLength(responsePayload, "utf8");
108515
+ timeToFirstTokenMs = Math.max(0, Date.now() - requestStartedAt);
108423
108516
  }
108424
108517
  await postChunk({
108425
108518
  apiURL,
108426
108519
  configuration,
108427
108520
  payload: {
108428
- data: encodeTextChunk(response.body
108429
- ? typeof response.body === "string"
108430
- ? response.body
108431
- : JSON.stringify(response.body)
108432
- : ""),
108521
+ data: encodeTextChunk(responsePayload),
108433
108522
  headers: response.headers,
108434
108523
  sequence,
108435
108524
  status: response.status
@@ -108449,6 +108538,11 @@ async function streamResponse({ apiURL, configuration, logger, requestID, respon
108449
108538
  logger.info("SSE response queued", {
108450
108539
  requestMethod: requestID
108451
108540
  });
108541
+ return {
108542
+ responseBytes,
108543
+ status: response.status,
108544
+ timeToFirstTokenMs
108545
+ };
108452
108546
  }
108453
108547
  async function postChunk({ apiURL, configuration, payload, requestID }) {
108454
108548
  const response = ClientToServerAPIResponseSchema.parse({
@@ -108475,6 +108569,25 @@ function encodeTextChunk(chunk) {
108475
108569
  }
108476
108570
  return `data:text/plain;base64,${Buffer.from(chunk, "utf-8").toString("base64")}`;
108477
108571
  }
108572
+ function calculateRequestBytes(body) {
108573
+ if (body === null || body === undefined) {
108574
+ return 0;
108575
+ }
108576
+ if (typeof body === "string") {
108577
+ return Buffer.byteLength(body, "utf8");
108578
+ }
108579
+ return Buffer.byteLength(JSON.stringify(body), "utf8");
108580
+ }
108581
+ function calculateTokensPerSecond$1({ durationMs, totalTokens }) {
108582
+ if (durationMs <= 0) {
108583
+ return 0;
108584
+ }
108585
+ const tokensPerSecond = totalTokens / (durationMs / 1000);
108586
+ if (!Number.isFinite(tokensPerSecond) || tokensPerSecond <= 0) {
108587
+ return 0;
108588
+ }
108589
+ return Math.round(tokensPerSecond);
108590
+ }
108478
108591
 
108479
108592
  /**
108480
108593
  * Proxy server requests to the local inference HTTP server.
@@ -117837,9 +117950,11 @@ function logEngineMetrics({ agentEngineType, error, level, logger, requestBodyBy
117837
117950
  }
117838
117951
  logger[level](metricsMessage, attributes);
117839
117952
  }
117840
- function monitorEngineResponseStream({ agentEngineType, body, logger, onComplete, requestBodyBytes, requestPath }) {
117953
+ function monitorEngineResponseStream({ agentEngineType, body, logger, onComplete, requestBodyBytes, requestPath, requestStartedAt }) {
117954
+ const startedAt = requestStartedAt ?? Date.now();
117841
117955
  const passThrough = new PassThrough();
117842
117956
  let responseBytes = 0;
117957
+ let firstChunkAt = null;
117843
117958
  let usage = null;
117844
117959
  let buffer = "";
117845
117960
  let completed = false;
@@ -117849,12 +117964,22 @@ function monitorEngineResponseStream({ agentEngineType, body, logger, onComplete
117849
117964
  }
117850
117965
  completed = true;
117851
117966
  if (onComplete) {
117852
- onComplete({
117967
+ const completion = onComplete({
117968
+ durationMs: Math.max(0, Date.now() - startedAt),
117853
117969
  error,
117854
117970
  requestBodyBytes,
117855
117971
  responseBytes,
117972
+ timeToFirstTokenMs: firstChunkAt === null ? null : Math.max(0, firstChunkAt - startedAt),
117856
117973
  usage
117857
117974
  });
117975
+ if (completion && typeof completion.catch === "function") {
117976
+ completion.catch(error => {
117977
+ logger.error("Engine metrics completion failed", {
117978
+ error: asError(error),
117979
+ requestUrl: requestPath
117980
+ });
117981
+ });
117982
+ }
117858
117983
  }
117859
117984
  }
117860
117985
  function parseUsageFromBuffer() {
@@ -117885,6 +118010,9 @@ function monitorEngineResponseStream({ agentEngineType, body, logger, onComplete
117885
118010
  }
117886
118011
  }
117887
118012
  body.on("data", (chunk) => {
118013
+ if (firstChunkAt === null) {
118014
+ firstChunkAt = Date.now();
118015
+ }
117888
118016
  responseBytes += chunk.length;
117889
118017
  buffer += chunk.toString("utf8");
117890
118018
  parseUsageFromBuffer();
@@ -117970,25 +118098,33 @@ function serializeRequestBody(body) {
117970
118098
  payload
117971
118099
  };
117972
118100
  }
117973
- async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelManager, path, reportMetrics }) {
118101
+ function calculateTokensPerSecond({ durationMs, totalTokens }) {
118102
+ if (durationMs <= 0) {
118103
+ return 0;
118104
+ }
118105
+ const tokensPerSecond = totalTokens / (durationMs / 1000);
118106
+ if (!Number.isFinite(tokensPerSecond) || tokensPerSecond <= 0) {
118107
+ return 0;
118108
+ }
118109
+ return Math.round(tokensPerSecond);
118110
+ }
118111
+ async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelID, modelManager, path, reportMetrics }) {
117974
118112
  function normalizeTokenCount(value) {
117975
118113
  if (typeof value === "number" && Number.isFinite(value) && value >= 0) {
117976
118114
  return value;
117977
118115
  }
117978
118116
  return 0;
117979
118117
  }
117980
- async function safeReportMetrics(payload) {
117981
- try {
117982
- await reportMetrics(payload);
117983
- }
117984
- catch (error) {
118118
+ function reportMetricsSafe(payload) {
118119
+ reportMetrics(payload).catch(error => {
117985
118120
  logger.warn("Failed to upload LLM prompt metrics", {
117986
- error: error,
118121
+ error: asError(error),
117987
118122
  requestUrl: path
117988
118123
  });
117989
- }
118124
+ });
117990
118125
  }
117991
118126
  const { bytes: requestBodyBytes, payload: serializedBody } = serializeRequestBody(body);
118127
+ const requestStartedAt = Date.now();
117992
118128
  const response = await modelManager
117993
118129
  .fetchOpenAI(path, {
117994
118130
  body: serializedBody,
@@ -118008,11 +118144,23 @@ async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelMan
118008
118144
  responseBytes: 0,
118009
118145
  usage: null
118010
118146
  });
118011
- void safeReportMetrics({
118147
+ const latencyMs = Math.max(0, Date.now() - requestStartedAt);
118148
+ reportMetricsSafe({
118012
118149
  bytes: requestBodyBytes,
118013
118150
  completionTokens: 0,
118151
+ engine: configuration.agentEngineType,
118152
+ endpointId: null,
118153
+ latencyMs,
118154
+ modelId: modelID,
118014
118155
  promptTokens: 0,
118156
+ requestBytes: requestBodyBytes,
118157
+ requestId: null,
118158
+ requestMethod: "POST",
118159
+ requestPath: path,
118160
+ responseBytes: 0,
118015
118161
  successful: false,
118162
+ timeToFirstTokenMs: null,
118163
+ tokensPerSecond: 0,
118016
118164
  totalTokens: 0
118017
118165
  });
118018
118166
  throw error;
@@ -118027,10 +118175,17 @@ async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelMan
118027
118175
  error: responseError,
118028
118176
  requestUrl: path,
118029
118177
  statusCode: response.status,
118030
- statusText: responseStatusText
118178
+ statusText: responseStatusText,
118179
+ responseBody: responseBody ?? undefined
118031
118180
  });
118181
+ if (!response.body) {
118182
+ return {
118183
+ status: response.status,
118184
+ statusText: responseStatusText
118185
+ };
118186
+ }
118032
118187
  }
118033
- if (!response.body || !response.ok) {
118188
+ if (!response.body) {
118034
118189
  logEngineMetrics({
118035
118190
  agentEngineType: configuration.agentEngineType,
118036
118191
  level: response.ok ? "info" : "error",
@@ -118040,11 +118195,23 @@ async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelMan
118040
118195
  responseBytes: 0,
118041
118196
  usage: null
118042
118197
  });
118043
- void safeReportMetrics({
118198
+ const latencyMs = Math.max(0, Date.now() - requestStartedAt);
118199
+ reportMetricsSafe({
118044
118200
  bytes: requestBodyBytes,
118045
118201
  completionTokens: 0,
118202
+ engine: configuration.agentEngineType,
118203
+ endpointId: null,
118204
+ latencyMs,
118205
+ modelId: modelID,
118046
118206
  promptTokens: 0,
118207
+ requestBytes: requestBodyBytes,
118208
+ requestId: null,
118209
+ requestMethod: "POST",
118210
+ requestPath: path,
118211
+ responseBytes: 0,
118047
118212
  successful: false,
118213
+ timeToFirstTokenMs: null,
118214
+ tokensPerSecond: 0,
118048
118215
  totalTokens: 0
118049
118216
  });
118050
118217
  return {
@@ -118056,20 +118223,36 @@ async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelMan
118056
118223
  agentEngineType: configuration.agentEngineType,
118057
118224
  body: Readable.fromWeb(response.body),
118058
118225
  logger,
118059
- onComplete: ({ error, responseBytes, usage }) => {
118226
+ onComplete: ({ durationMs, error, responseBytes, timeToFirstTokenMs, usage }) => {
118060
118227
  const completionTokens = normalizeTokenCount(usage?.completionTokens);
118061
118228
  const promptTokens = normalizeTokenCount(usage?.promptTokens);
118062
118229
  const totalTokens = normalizeTokenCount(usage?.totalTokens ?? completionTokens + promptTokens);
118063
- void safeReportMetrics({
118230
+ const latencyMs = Math.max(0, durationMs);
118231
+ reportMetricsSafe({
118064
118232
  bytes: requestBodyBytes + responseBytes,
118065
118233
  completionTokens,
118234
+ engine: configuration.agentEngineType,
118235
+ endpointId: null,
118236
+ latencyMs,
118237
+ modelId: modelID,
118066
118238
  promptTokens,
118239
+ requestBytes: requestBodyBytes,
118240
+ requestId: null,
118241
+ requestMethod: "POST",
118242
+ requestPath: path,
118243
+ responseBytes,
118067
118244
  successful: !error,
118245
+ timeToFirstTokenMs,
118246
+ tokensPerSecond: calculateTokensPerSecond({
118247
+ durationMs: latencyMs,
118248
+ totalTokens
118249
+ }),
118068
118250
  totalTokens
118069
118251
  });
118070
118252
  },
118071
118253
  requestBodyBytes,
118072
- requestPath: path
118254
+ requestPath: path,
118255
+ requestStartedAt
118073
118256
  });
118074
118257
  return {
118075
118258
  body: monitoredResponse.stream,
@@ -118176,6 +118359,7 @@ async function createApplication({ abortController, apiClient, configuration, lo
118176
118359
  body,
118177
118360
  configuration,
118178
118361
  logger,
118362
+ modelID: conduitConfiguration.targetModel.id,
118179
118363
  modelManager,
118180
118364
  path: "/v1/chat/completions",
118181
118365
  reportMetrics: apiClient.reportPromptMetrics
@@ -118188,6 +118372,7 @@ async function createApplication({ abortController, apiClient, configuration, lo
118188
118372
  body,
118189
118373
  configuration,
118190
118374
  logger,
118375
+ modelID: conduitConfiguration.targetModel.id,
118191
118376
  modelManager,
118192
118377
  path: "/v1/completions",
118193
118378
  reportMetrics: apiClient.reportPromptMetrics
@@ -118247,6 +118432,7 @@ async function createApplication({ abortController, apiClient, configuration, lo
118247
118432
  apiURL: configuration.apiURL,
118248
118433
  configuration,
118249
118434
  logger,
118435
+ modelID: conduitConfiguration.targetModel.id,
118250
118436
  onRequest: async (request) => {
118251
118437
  return proxyRequest({
118252
118438
  configuration,
@@ -118265,6 +118451,7 @@ async function createApplication({ abortController, apiClient, configuration, lo
118265
118451
  setOnlineState();
118266
118452
  }
118267
118453
  },
118454
+ reportMetrics: apiClient.reportPromptMetrics,
118268
118455
  signal: abortController.signal
118269
118456
  }).catch(error => {
118270
118457
  logger.error("SSE handler failed", {
@@ -1,6 +1,6 @@
1
1
  import { Readable } from "node:stream";
2
- import { Logger } from "@infersec/logger";
3
2
  import { LLMEngine } from "@infersec/definitions";
3
+ import { Logger } from "@infersec/logger";
4
4
  export interface EngineUsageMetrics {
5
5
  completionTokens: number | null;
6
6
  promptTokens: number | null;
@@ -13,14 +13,17 @@ interface EngineMetricsLoggerOptions {
13
13
  requestPath: string;
14
14
  }
15
15
  interface EngineMetricsCompletion {
16
+ durationMs: number;
16
17
  error: Error | null;
17
18
  requestBodyBytes: number;
18
19
  responseBytes: number;
20
+ timeToFirstTokenMs: number | null;
19
21
  usage: EngineUsageMetrics | null;
20
22
  }
21
23
  interface MonitorEngineResponseOptions extends EngineMetricsLoggerOptions {
22
24
  body: Readable;
23
- onComplete?: (result: EngineMetricsCompletion) => void;
25
+ onComplete?: (result: EngineMetricsCompletion) => void | Promise<void>;
26
+ requestStartedAt?: number;
24
27
  }
25
28
  interface EngineMetricsLogOptions extends EngineMetricsLoggerOptions {
26
29
  error?: Error;
@@ -32,5 +35,5 @@ interface MonitorEngineResponseResult {
32
35
  stream: Readable;
33
36
  }
34
37
  export declare function logEngineMetrics({ agentEngineType, error, level, logger, requestBodyBytes, requestPath, responseBytes, usage }: EngineMetricsLogOptions): void;
35
- export declare function monitorEngineResponseStream({ agentEngineType, body, logger, onComplete, requestBodyBytes, requestPath }: MonitorEngineResponseOptions): MonitorEngineResponseResult;
38
+ export declare function monitorEngineResponseStream({ agentEngineType, body, logger, onComplete, requestBodyBytes, requestPath, requestStartedAt }: MonitorEngineResponseOptions): MonitorEngineResponseResult;
36
39
  export {};
@@ -1,12 +1,13 @@
1
1
  import { Readable } from "node:stream";
2
- import { InferenceAgentLLMMetricsPayload } from "@infersec/definitions";
2
+ import { InferenceAgentLLMMetricsPayload, type ULID } from "@infersec/definitions";
3
3
  import { Logger } from "@infersec/logger";
4
4
  import { Configuration } from "../configuration.js";
5
5
  import { ModelManager } from "../modelManagement/ModelManager.js";
6
- export declare function proxyOpenAIStreamingRoute({ body, configuration, logger, modelManager, path, reportMetrics }: {
6
+ export declare function proxyOpenAIStreamingRoute({ body, configuration, logger, modelID, modelManager, path, reportMetrics }: {
7
7
  body: unknown;
8
8
  configuration: Configuration;
9
9
  logger: Logger;
10
+ modelID: ULID;
10
11
  modelManager: ModelManager;
11
12
  path: "/v1/chat/completions" | "/v1/completions";
12
13
  reportMetrics: (payload: InferenceAgentLLMMetricsPayload) => Promise<void>;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@infersec/conduit",
3
3
  "description": "End user conduit agent for connecting local LLMs to the cloud.",
4
- "version": "1.17.5",
4
+ "version": "1.19.0",
5
5
  "bin": {
6
6
  "infersec-conduit": "./dist/cli.js"
7
7
  },