@infersec/conduit 1.17.5 → 1.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js
CHANGED
|
@@ -6,7 +6,7 @@ const __dirname = __pathDirname(__filename);
|
|
|
6
6
|
|
|
7
7
|
import { parseArgs } from 'node:util';
|
|
8
8
|
import 'node:crypto';
|
|
9
|
-
import { a as asError, s as startInferenceAgent } from './start-
|
|
9
|
+
import { a as asError, s as startInferenceAgent } from './start-BUDNvRkQ.js';
|
|
10
10
|
import 'argon2';
|
|
11
11
|
import 'node:child_process';
|
|
12
12
|
import 'node:stream';
|
package/dist/index.js
CHANGED
|
@@ -5,7 +5,7 @@ const __filename = __fileURLToPath(import.meta.url);
|
|
|
5
5
|
const __dirname = __pathDirname(__filename);
|
|
6
6
|
|
|
7
7
|
import 'node:crypto';
|
|
8
|
-
import { s as startInferenceAgent, a as asError } from './start-
|
|
8
|
+
import { s as startInferenceAgent, a as asError } from './start-BUDNvRkQ.js';
|
|
9
9
|
import 'argon2';
|
|
10
10
|
import 'node:child_process';
|
|
11
11
|
import 'node:stream';
|
package/dist/sse/handler.d.ts
CHANGED
|
@@ -1,12 +1,14 @@
|
|
|
1
|
-
import { type APIResponse, type ServerToClientAPIRequest } from "@infersec/definitions";
|
|
1
|
+
import { InferenceAgentLLMMetricsPayload, type ULID, type APIResponse, type ServerToClientAPIRequest } from "@infersec/definitions";
|
|
2
2
|
import { Logger } from "@infersec/logger";
|
|
3
3
|
import { Configuration } from "../configuration.js";
|
|
4
|
-
export declare function handleSSERequests({ apiURL, configuration, logger, onRequest, onRequestEnd, onRequestStart, signal }: {
|
|
4
|
+
export declare function handleSSERequests({ apiURL, configuration, logger, modelID, onRequest, onRequestEnd, onRequestStart, reportMetrics, signal }: {
|
|
5
5
|
apiURL: string;
|
|
6
6
|
configuration: Configuration;
|
|
7
7
|
logger: Logger;
|
|
8
|
+
modelID: ULID;
|
|
8
9
|
onRequest: (request: ServerToClientAPIRequest) => Promise<APIResponse>;
|
|
9
10
|
onRequestEnd?: (request: ServerToClientAPIRequest) => Promise<void> | void;
|
|
10
11
|
onRequestStart?: (request: ServerToClientAPIRequest) => Promise<void> | void;
|
|
12
|
+
reportMetrics: (payload: InferenceAgentLLMMetricsPayload) => Promise<void>;
|
|
11
13
|
signal?: AbortSignal;
|
|
12
14
|
}): Promise<void>;
|
|
@@ -14659,11 +14659,54 @@ custom((data) => {
|
|
|
14659
14659
|
return result.success;
|
|
14660
14660
|
}, "Invalid API reference structure");
|
|
14661
14661
|
|
|
14662
|
+
const LLMEngineSchema = _enum(["llama.cpp", "vllm"]);
|
|
14663
|
+
const LLMModelFormatSchema = _enum([
|
|
14664
|
+
// VLLM
|
|
14665
|
+
"safetensors",
|
|
14666
|
+
"pytorch",
|
|
14667
|
+
"awq",
|
|
14668
|
+
"gptq",
|
|
14669
|
+
// Llama.cpp
|
|
14670
|
+
"gguf"
|
|
14671
|
+
]);
|
|
14672
|
+
const LLMModelSchema = object({
|
|
14673
|
+
format: LLMModelFormatSchema,
|
|
14674
|
+
id: string$1().min(1),
|
|
14675
|
+
multimodalEnabled: boolean$1(),
|
|
14676
|
+
source: discriminatedUnion("type", [
|
|
14677
|
+
object({
|
|
14678
|
+
irid: IRIDSchema,
|
|
14679
|
+
type: literal("storage")
|
|
14680
|
+
}),
|
|
14681
|
+
object({
|
|
14682
|
+
modelSecret: string$1().min(1).nullable(),
|
|
14683
|
+
slug: string$1().min(1),
|
|
14684
|
+
type: literal("huggingface")
|
|
14685
|
+
})
|
|
14686
|
+
])
|
|
14687
|
+
});
|
|
14688
|
+
const QuantizationFileSchema = object({
|
|
14689
|
+
filePath: string$1().min(1),
|
|
14690
|
+
sizeBytes: number$1().int().nonnegative().nullable()
|
|
14691
|
+
});
|
|
14692
|
+
array(QuantizationFileSchema);
|
|
14693
|
+
|
|
14662
14694
|
const InferenceAgentLLMMetricsPayloadSchema = object({
|
|
14663
14695
|
bytes: number$1().int().nonnegative(),
|
|
14664
14696
|
completionTokens: number$1().int().nonnegative(),
|
|
14697
|
+
engine: LLMEngineSchema.nullable(),
|
|
14698
|
+
endpointId: ULIDSchema.nullable(),
|
|
14699
|
+
latencyMs: number$1().int().nonnegative(),
|
|
14700
|
+
modelId: ULIDSchema.nullable(),
|
|
14665
14701
|
promptTokens: number$1().int().nonnegative(),
|
|
14702
|
+
requestBytes: number$1().int().nonnegative(),
|
|
14703
|
+
requestId: ULIDSchema.nullable(),
|
|
14704
|
+
requestMethod: string$1().nullable(),
|
|
14705
|
+
requestPath: string$1().nullable(),
|
|
14706
|
+
responseBytes: number$1().int().nonnegative(),
|
|
14666
14707
|
successful: boolean$1(),
|
|
14708
|
+
timeToFirstTokenMs: number$1().int().nonnegative().nullable(),
|
|
14709
|
+
tokensPerSecond: number$1().int().nonnegative(),
|
|
14667
14710
|
totalTokens: number$1().int().nonnegative()
|
|
14668
14711
|
});
|
|
14669
14712
|
const InferenceAgentMachineGPUSchema = object({
|
|
@@ -14765,38 +14808,6 @@ const ConduitState = z.preprocess(value => {
|
|
|
14765
14808
|
return value;
|
|
14766
14809
|
}, ConduitStateSchema);
|
|
14767
14810
|
|
|
14768
|
-
const LLMEngineSchema = _enum(["llama.cpp", "vllm"]);
|
|
14769
|
-
const LLMModelFormatSchema = _enum([
|
|
14770
|
-
// VLLM
|
|
14771
|
-
"safetensors",
|
|
14772
|
-
"pytorch",
|
|
14773
|
-
"awq",
|
|
14774
|
-
"gptq",
|
|
14775
|
-
// Llama.cpp
|
|
14776
|
-
"gguf"
|
|
14777
|
-
]);
|
|
14778
|
-
const LLMModelSchema = object({
|
|
14779
|
-
format: LLMModelFormatSchema,
|
|
14780
|
-
id: string$1().min(1),
|
|
14781
|
-
multimodalEnabled: boolean$1(),
|
|
14782
|
-
source: discriminatedUnion("type", [
|
|
14783
|
-
object({
|
|
14784
|
-
irid: IRIDSchema,
|
|
14785
|
-
type: literal("storage")
|
|
14786
|
-
}),
|
|
14787
|
-
object({
|
|
14788
|
-
modelSecret: string$1().min(1).nullable(),
|
|
14789
|
-
slug: string$1().min(1),
|
|
14790
|
-
type: literal("huggingface")
|
|
14791
|
-
})
|
|
14792
|
-
])
|
|
14793
|
-
});
|
|
14794
|
-
const QuantizationFileSchema = object({
|
|
14795
|
-
filePath: string$1().min(1),
|
|
14796
|
-
sizeBytes: number$1().int().nonnegative().nullable()
|
|
14797
|
-
});
|
|
14798
|
-
array(QuantizationFileSchema);
|
|
14799
|
-
|
|
14800
14811
|
const InferenceAgentConfigurationSchema = object({
|
|
14801
14812
|
contextLength: number$1().int().positive().nullable(),
|
|
14802
14813
|
inferenceSourceID: ULIDSchema,
|
|
@@ -108293,7 +108304,7 @@ function sleep(ms) {
|
|
|
108293
108304
|
});
|
|
108294
108305
|
}
|
|
108295
108306
|
|
|
108296
|
-
async function handleSSERequests({ apiURL, configuration, logger, onRequest, onRequestEnd, onRequestStart, signal }) {
|
|
108307
|
+
async function handleSSERequests({ apiURL, configuration, logger, modelID, onRequest, onRequestEnd, onRequestStart, reportMetrics, signal }) {
|
|
108297
108308
|
const streamURL = `${apiURL}/conduit/api/v1/source/${configuration.inferenceSourceID}/requests/stream`;
|
|
108298
108309
|
const maxReconnectDelayMs = 30000;
|
|
108299
108310
|
let reconnectAttempt = 0;
|
|
@@ -108318,9 +108329,11 @@ async function handleSSERequests({ apiURL, configuration, logger, onRequest, onR
|
|
|
108318
108329
|
apiURL,
|
|
108319
108330
|
configuration,
|
|
108320
108331
|
logger,
|
|
108332
|
+
modelID,
|
|
108321
108333
|
onRequest,
|
|
108322
108334
|
onRequestEnd,
|
|
108323
108335
|
onRequestStart,
|
|
108336
|
+
reportMetrics,
|
|
108324
108337
|
request: payload
|
|
108325
108338
|
}).catch(error => {
|
|
108326
108339
|
logger.error("SSE request handler failed", {
|
|
@@ -108350,28 +108363,71 @@ async function handleSSERequests({ apiURL, configuration, logger, onRequest, onR
|
|
|
108350
108363
|
await sleep(reconnectDelayMs);
|
|
108351
108364
|
}
|
|
108352
108365
|
}
|
|
108353
|
-
async function handleRequest({ apiURL, configuration, logger, onRequest, onRequestEnd, onRequestStart, request }) {
|
|
108366
|
+
async function handleRequest({ apiURL, configuration, logger, modelID, onRequest, onRequestEnd, onRequestStart, reportMetrics, request }) {
|
|
108367
|
+
function reportMetricsSafe(payload) {
|
|
108368
|
+
reportMetrics(payload).catch(error => {
|
|
108369
|
+
logger.warn("Failed to upload LLM prompt metrics", {
|
|
108370
|
+
error: asError(error),
|
|
108371
|
+
requestUrl: request.path
|
|
108372
|
+
});
|
|
108373
|
+
});
|
|
108374
|
+
}
|
|
108375
|
+
const requestStartedAt = Date.now();
|
|
108376
|
+
const requestBytes = calculateRequestBytes(request.body ?? null);
|
|
108354
108377
|
try {
|
|
108355
108378
|
await onRequestStart?.(request);
|
|
108356
108379
|
const response = await onRequest(request);
|
|
108357
|
-
await streamResponse({
|
|
108380
|
+
const responseMetrics = await streamResponse({
|
|
108358
108381
|
apiURL,
|
|
108359
108382
|
configuration,
|
|
108360
108383
|
logger,
|
|
108361
108384
|
requestID: request.requestID,
|
|
108385
|
+
requestStartedAt,
|
|
108362
108386
|
response
|
|
108363
108387
|
});
|
|
108388
|
+
const latencyMs = Math.max(0, Date.now() - requestStartedAt);
|
|
108389
|
+
const totalTokens = 0;
|
|
108390
|
+
const tokensPerSecond = calculateTokensPerSecond$1({
|
|
108391
|
+
durationMs: latencyMs,
|
|
108392
|
+
totalTokens
|
|
108393
|
+
});
|
|
108394
|
+
reportMetricsSafe({
|
|
108395
|
+
bytes: requestBytes + responseMetrics.responseBytes,
|
|
108396
|
+
completionTokens: 0,
|
|
108397
|
+
engine: configuration.agentEngineType,
|
|
108398
|
+
endpointId: null,
|
|
108399
|
+
latencyMs,
|
|
108400
|
+
modelId: modelID,
|
|
108401
|
+
promptTokens: 0,
|
|
108402
|
+
requestBytes,
|
|
108403
|
+
requestId: request.requestID,
|
|
108404
|
+
requestMethod: request.method,
|
|
108405
|
+
requestPath: request.path,
|
|
108406
|
+
responseBytes: responseMetrics.responseBytes,
|
|
108407
|
+
successful: responseMetrics.status < 400,
|
|
108408
|
+
timeToFirstTokenMs: responseMetrics.timeToFirstTokenMs,
|
|
108409
|
+
tokensPerSecond,
|
|
108410
|
+
totalTokens
|
|
108411
|
+
});
|
|
108364
108412
|
}
|
|
108365
108413
|
catch (error) {
|
|
108366
108414
|
logger.error("SSE request failed", {
|
|
108367
108415
|
error: asError(error),
|
|
108368
108416
|
requestMethod: request.requestID
|
|
108369
108417
|
});
|
|
108418
|
+
const failureMessage = "Bad gateway\n\nProxying failed";
|
|
108419
|
+
const failureBytes = Buffer.byteLength(failureMessage, "utf8");
|
|
108420
|
+
const latencyMs = Math.max(0, Date.now() - requestStartedAt);
|
|
108421
|
+
const totalTokens = 0;
|
|
108422
|
+
const tokensPerSecond = calculateTokensPerSecond$1({
|
|
108423
|
+
durationMs: latencyMs,
|
|
108424
|
+
totalTokens
|
|
108425
|
+
});
|
|
108370
108426
|
await postChunk({
|
|
108371
108427
|
apiURL,
|
|
108372
108428
|
configuration,
|
|
108373
108429
|
payload: {
|
|
108374
|
-
data: encodeTextChunk(
|
|
108430
|
+
data: encodeTextChunk(failureMessage),
|
|
108375
108431
|
sequence: 0,
|
|
108376
108432
|
status: 502
|
|
108377
108433
|
},
|
|
@@ -108387,16 +108443,40 @@ async function handleRequest({ apiURL, configuration, logger, onRequest, onReque
|
|
|
108387
108443
|
},
|
|
108388
108444
|
requestID: request.requestID
|
|
108389
108445
|
});
|
|
108446
|
+
reportMetricsSafe({
|
|
108447
|
+
bytes: requestBytes + failureBytes,
|
|
108448
|
+
completionTokens: 0,
|
|
108449
|
+
engine: configuration.agentEngineType,
|
|
108450
|
+
endpointId: null,
|
|
108451
|
+
latencyMs,
|
|
108452
|
+
modelId: modelID,
|
|
108453
|
+
promptTokens: 0,
|
|
108454
|
+
requestBytes,
|
|
108455
|
+
requestId: request.requestID,
|
|
108456
|
+
requestMethod: request.method,
|
|
108457
|
+
requestPath: request.path,
|
|
108458
|
+
responseBytes: failureBytes,
|
|
108459
|
+
successful: false,
|
|
108460
|
+
timeToFirstTokenMs: latencyMs,
|
|
108461
|
+
tokensPerSecond,
|
|
108462
|
+
totalTokens
|
|
108463
|
+
});
|
|
108390
108464
|
}
|
|
108391
108465
|
finally {
|
|
108392
108466
|
await onRequestEnd?.(request);
|
|
108393
108467
|
}
|
|
108394
108468
|
}
|
|
108395
|
-
async function streamResponse({ apiURL, configuration, logger, requestID, response }) {
|
|
108469
|
+
async function streamResponse({ apiURL, configuration, logger, requestID, requestStartedAt, response }) {
|
|
108396
108470
|
let sequence = 0;
|
|
108471
|
+
let responseBytes = 0;
|
|
108472
|
+
let timeToFirstTokenMs = null;
|
|
108397
108473
|
if (response.body instanceof Readable) {
|
|
108398
108474
|
for await (const chunk of response.body) {
|
|
108399
108475
|
const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
|
|
108476
|
+
if (timeToFirstTokenMs === null) {
|
|
108477
|
+
timeToFirstTokenMs = Math.max(0, Date.now() - requestStartedAt);
|
|
108478
|
+
}
|
|
108479
|
+
responseBytes += buffer.length;
|
|
108400
108480
|
await postChunk({
|
|
108401
108481
|
apiURL,
|
|
108402
108482
|
configuration,
|
|
@@ -108419,17 +108499,26 @@ async function streamResponse({ apiURL, configuration, logger, requestID, respon
|
|
|
108419
108499
|
},
|
|
108420
108500
|
requestID
|
|
108421
108501
|
});
|
|
108422
|
-
return
|
|
108502
|
+
return {
|
|
108503
|
+
responseBytes,
|
|
108504
|
+
status: response.status,
|
|
108505
|
+
timeToFirstTokenMs
|
|
108506
|
+
};
|
|
108507
|
+
}
|
|
108508
|
+
const responsePayload = response.body
|
|
108509
|
+
? typeof response.body === "string"
|
|
108510
|
+
? response.body
|
|
108511
|
+
: JSON.stringify(response.body)
|
|
108512
|
+
: "";
|
|
108513
|
+
if (responsePayload.length > 0) {
|
|
108514
|
+
responseBytes = Buffer.byteLength(responsePayload, "utf8");
|
|
108515
|
+
timeToFirstTokenMs = Math.max(0, Date.now() - requestStartedAt);
|
|
108423
108516
|
}
|
|
108424
108517
|
await postChunk({
|
|
108425
108518
|
apiURL,
|
|
108426
108519
|
configuration,
|
|
108427
108520
|
payload: {
|
|
108428
|
-
data: encodeTextChunk(
|
|
108429
|
-
? typeof response.body === "string"
|
|
108430
|
-
? response.body
|
|
108431
|
-
: JSON.stringify(response.body)
|
|
108432
|
-
: ""),
|
|
108521
|
+
data: encodeTextChunk(responsePayload),
|
|
108433
108522
|
headers: response.headers,
|
|
108434
108523
|
sequence,
|
|
108435
108524
|
status: response.status
|
|
@@ -108449,6 +108538,11 @@ async function streamResponse({ apiURL, configuration, logger, requestID, respon
|
|
|
108449
108538
|
logger.info("SSE response queued", {
|
|
108450
108539
|
requestMethod: requestID
|
|
108451
108540
|
});
|
|
108541
|
+
return {
|
|
108542
|
+
responseBytes,
|
|
108543
|
+
status: response.status,
|
|
108544
|
+
timeToFirstTokenMs
|
|
108545
|
+
};
|
|
108452
108546
|
}
|
|
108453
108547
|
async function postChunk({ apiURL, configuration, payload, requestID }) {
|
|
108454
108548
|
const response = ClientToServerAPIResponseSchema.parse({
|
|
@@ -108475,6 +108569,25 @@ function encodeTextChunk(chunk) {
|
|
|
108475
108569
|
}
|
|
108476
108570
|
return `data:text/plain;base64,${Buffer.from(chunk, "utf-8").toString("base64")}`;
|
|
108477
108571
|
}
|
|
108572
|
+
function calculateRequestBytes(body) {
|
|
108573
|
+
if (body === null || body === undefined) {
|
|
108574
|
+
return 0;
|
|
108575
|
+
}
|
|
108576
|
+
if (typeof body === "string") {
|
|
108577
|
+
return Buffer.byteLength(body, "utf8");
|
|
108578
|
+
}
|
|
108579
|
+
return Buffer.byteLength(JSON.stringify(body), "utf8");
|
|
108580
|
+
}
|
|
108581
|
+
/**
 * Convert a token count and an elapsed duration into a whole-number
 * tokens-per-second rate.
 *
 * @param durationMs Elapsed time in milliseconds.
 * @param totalTokens Number of tokens produced in that time.
 * @returns The rate rounded to the nearest integer, or 0 when the duration is
 *          not positive or the computed rate is not a finite positive number.
 */
function calculateTokensPerSecond$1({ durationMs, totalTokens }) {
    // A non-positive duration short-circuits to a zero rate without dividing.
    const rate = durationMs <= 0 ? 0 : totalTokens / (durationMs / 1000);
    return Number.isFinite(rate) && rate > 0 ? Math.round(rate) : 0;
}
|
|
108478
108591
|
|
|
108479
108592
|
/**
|
|
108480
108593
|
* Proxy server requests to the local inference HTTP server.
|
|
@@ -117837,9 +117950,11 @@ function logEngineMetrics({ agentEngineType, error, level, logger, requestBodyBy
|
|
|
117837
117950
|
}
|
|
117838
117951
|
logger[level](metricsMessage, attributes);
|
|
117839
117952
|
}
|
|
117840
|
-
function monitorEngineResponseStream({ agentEngineType, body, logger, onComplete, requestBodyBytes, requestPath }) {
|
|
117953
|
+
function monitorEngineResponseStream({ agentEngineType, body, logger, onComplete, requestBodyBytes, requestPath, requestStartedAt }) {
|
|
117954
|
+
const startedAt = requestStartedAt ?? Date.now();
|
|
117841
117955
|
const passThrough = new PassThrough();
|
|
117842
117956
|
let responseBytes = 0;
|
|
117957
|
+
let firstChunkAt = null;
|
|
117843
117958
|
let usage = null;
|
|
117844
117959
|
let buffer = "";
|
|
117845
117960
|
let completed = false;
|
|
@@ -117849,12 +117964,22 @@ function monitorEngineResponseStream({ agentEngineType, body, logger, onComplete
|
|
|
117849
117964
|
}
|
|
117850
117965
|
completed = true;
|
|
117851
117966
|
if (onComplete) {
|
|
117852
|
-
onComplete({
|
|
117967
|
+
const completion = onComplete({
|
|
117968
|
+
durationMs: Math.max(0, Date.now() - startedAt),
|
|
117853
117969
|
error,
|
|
117854
117970
|
requestBodyBytes,
|
|
117855
117971
|
responseBytes,
|
|
117972
|
+
timeToFirstTokenMs: firstChunkAt === null ? null : Math.max(0, firstChunkAt - startedAt),
|
|
117856
117973
|
usage
|
|
117857
117974
|
});
|
|
117975
|
+
if (completion && typeof completion.catch === "function") {
|
|
117976
|
+
completion.catch(error => {
|
|
117977
|
+
logger.error("Engine metrics completion failed", {
|
|
117978
|
+
error: asError(error),
|
|
117979
|
+
requestUrl: requestPath
|
|
117980
|
+
});
|
|
117981
|
+
});
|
|
117982
|
+
}
|
|
117858
117983
|
}
|
|
117859
117984
|
}
|
|
117860
117985
|
function parseUsageFromBuffer() {
|
|
@@ -117885,6 +118010,9 @@ function monitorEngineResponseStream({ agentEngineType, body, logger, onComplete
|
|
|
117885
118010
|
}
|
|
117886
118011
|
}
|
|
117887
118012
|
body.on("data", (chunk) => {
|
|
118013
|
+
if (firstChunkAt === null) {
|
|
118014
|
+
firstChunkAt = Date.now();
|
|
118015
|
+
}
|
|
117888
118016
|
responseBytes += chunk.length;
|
|
117889
118017
|
buffer += chunk.toString("utf8");
|
|
117890
118018
|
parseUsageFromBuffer();
|
|
@@ -117970,25 +118098,33 @@ function serializeRequestBody(body) {
|
|
|
117970
118098
|
payload
|
|
117971
118099
|
};
|
|
117972
118100
|
}
|
|
117973
|
-
|
|
118101
|
+
/**
 * Derive an integer tokens-per-second figure from a token total and the time
 * taken to produce it.
 *
 * @param durationMs Elapsed time in milliseconds.
 * @param totalTokens Number of tokens counted over that time.
 * @returns The rounded rate; 0 when the duration is zero or negative, or when
 *          the division does not give a finite positive number.
 */
function calculateTokensPerSecond({ durationMs, totalTokens }) {
    let rounded = 0;
    if (!(durationMs <= 0)) {
        const elapsedSeconds = durationMs / 1000;
        const rate = totalTokens / elapsedSeconds;
        // Only a finite, strictly positive rate is reported; anything else stays 0.
        if (Number.isFinite(rate) && rate > 0) {
            rounded = Math.round(rate);
        }
    }
    return rounded;
}
|
|
118111
|
+
async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelID, modelManager, path, reportMetrics }) {
|
|
117974
118112
|
function normalizeTokenCount(value) {
|
|
117975
118113
|
if (typeof value === "number" && Number.isFinite(value) && value >= 0) {
|
|
117976
118114
|
return value;
|
|
117977
118115
|
}
|
|
117978
118116
|
return 0;
|
|
117979
118117
|
}
|
|
117980
|
-
|
|
117981
|
-
|
|
117982
|
-
await reportMetrics(payload);
|
|
117983
|
-
}
|
|
117984
|
-
catch (error) {
|
|
118118
|
+
function reportMetricsSafe(payload) {
|
|
118119
|
+
reportMetrics(payload).catch(error => {
|
|
117985
118120
|
logger.warn("Failed to upload LLM prompt metrics", {
|
|
117986
|
-
error: error,
|
|
118121
|
+
error: asError(error),
|
|
117987
118122
|
requestUrl: path
|
|
117988
118123
|
});
|
|
117989
|
-
}
|
|
118124
|
+
});
|
|
117990
118125
|
}
|
|
117991
118126
|
const { bytes: requestBodyBytes, payload: serializedBody } = serializeRequestBody(body);
|
|
118127
|
+
const requestStartedAt = Date.now();
|
|
117992
118128
|
const response = await modelManager
|
|
117993
118129
|
.fetchOpenAI(path, {
|
|
117994
118130
|
body: serializedBody,
|
|
@@ -118008,11 +118144,23 @@ async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelMan
|
|
|
118008
118144
|
responseBytes: 0,
|
|
118009
118145
|
usage: null
|
|
118010
118146
|
});
|
|
118011
|
-
|
|
118147
|
+
const latencyMs = Math.max(0, Date.now() - requestStartedAt);
|
|
118148
|
+
reportMetricsSafe({
|
|
118012
118149
|
bytes: requestBodyBytes,
|
|
118013
118150
|
completionTokens: 0,
|
|
118151
|
+
engine: configuration.agentEngineType,
|
|
118152
|
+
endpointId: null,
|
|
118153
|
+
latencyMs,
|
|
118154
|
+
modelId: modelID,
|
|
118014
118155
|
promptTokens: 0,
|
|
118156
|
+
requestBytes: requestBodyBytes,
|
|
118157
|
+
requestId: null,
|
|
118158
|
+
requestMethod: "POST",
|
|
118159
|
+
requestPath: path,
|
|
118160
|
+
responseBytes: 0,
|
|
118015
118161
|
successful: false,
|
|
118162
|
+
timeToFirstTokenMs: null,
|
|
118163
|
+
tokensPerSecond: 0,
|
|
118016
118164
|
totalTokens: 0
|
|
118017
118165
|
});
|
|
118018
118166
|
throw error;
|
|
@@ -118040,11 +118188,23 @@ async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelMan
|
|
|
118040
118188
|
responseBytes: 0,
|
|
118041
118189
|
usage: null
|
|
118042
118190
|
});
|
|
118043
|
-
|
|
118191
|
+
const latencyMs = Math.max(0, Date.now() - requestStartedAt);
|
|
118192
|
+
reportMetricsSafe({
|
|
118044
118193
|
bytes: requestBodyBytes,
|
|
118045
118194
|
completionTokens: 0,
|
|
118195
|
+
engine: configuration.agentEngineType,
|
|
118196
|
+
endpointId: null,
|
|
118197
|
+
latencyMs,
|
|
118198
|
+
modelId: modelID,
|
|
118046
118199
|
promptTokens: 0,
|
|
118200
|
+
requestBytes: requestBodyBytes,
|
|
118201
|
+
requestId: null,
|
|
118202
|
+
requestMethod: "POST",
|
|
118203
|
+
requestPath: path,
|
|
118204
|
+
responseBytes: 0,
|
|
118047
118205
|
successful: false,
|
|
118206
|
+
timeToFirstTokenMs: null,
|
|
118207
|
+
tokensPerSecond: 0,
|
|
118048
118208
|
totalTokens: 0
|
|
118049
118209
|
});
|
|
118050
118210
|
return {
|
|
@@ -118056,20 +118216,36 @@ async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelMan
|
|
|
118056
118216
|
agentEngineType: configuration.agentEngineType,
|
|
118057
118217
|
body: Readable.fromWeb(response.body),
|
|
118058
118218
|
logger,
|
|
118059
|
-
onComplete: ({ error, responseBytes, usage }) => {
|
|
118219
|
+
onComplete: ({ durationMs, error, responseBytes, timeToFirstTokenMs, usage }) => {
|
|
118060
118220
|
const completionTokens = normalizeTokenCount(usage?.completionTokens);
|
|
118061
118221
|
const promptTokens = normalizeTokenCount(usage?.promptTokens);
|
|
118062
118222
|
const totalTokens = normalizeTokenCount(usage?.totalTokens ?? completionTokens + promptTokens);
|
|
118063
|
-
|
|
118223
|
+
const latencyMs = Math.max(0, durationMs);
|
|
118224
|
+
reportMetricsSafe({
|
|
118064
118225
|
bytes: requestBodyBytes + responseBytes,
|
|
118065
118226
|
completionTokens,
|
|
118227
|
+
engine: configuration.agentEngineType,
|
|
118228
|
+
endpointId: null,
|
|
118229
|
+
latencyMs,
|
|
118230
|
+
modelId: modelID,
|
|
118066
118231
|
promptTokens,
|
|
118232
|
+
requestBytes: requestBodyBytes,
|
|
118233
|
+
requestId: null,
|
|
118234
|
+
requestMethod: "POST",
|
|
118235
|
+
requestPath: path,
|
|
118236
|
+
responseBytes,
|
|
118067
118237
|
successful: !error,
|
|
118238
|
+
timeToFirstTokenMs,
|
|
118239
|
+
tokensPerSecond: calculateTokensPerSecond({
|
|
118240
|
+
durationMs: latencyMs,
|
|
118241
|
+
totalTokens
|
|
118242
|
+
}),
|
|
118068
118243
|
totalTokens
|
|
118069
118244
|
});
|
|
118070
118245
|
},
|
|
118071
118246
|
requestBodyBytes,
|
|
118072
|
-
requestPath: path
|
|
118247
|
+
requestPath: path,
|
|
118248
|
+
requestStartedAt
|
|
118073
118249
|
});
|
|
118074
118250
|
return {
|
|
118075
118251
|
body: monitoredResponse.stream,
|
|
@@ -118176,6 +118352,7 @@ async function createApplication({ abortController, apiClient, configuration, lo
|
|
|
118176
118352
|
body,
|
|
118177
118353
|
configuration,
|
|
118178
118354
|
logger,
|
|
118355
|
+
modelID: conduitConfiguration.targetModel.id,
|
|
118179
118356
|
modelManager,
|
|
118180
118357
|
path: "/v1/chat/completions",
|
|
118181
118358
|
reportMetrics: apiClient.reportPromptMetrics
|
|
@@ -118188,6 +118365,7 @@ async function createApplication({ abortController, apiClient, configuration, lo
|
|
|
118188
118365
|
body,
|
|
118189
118366
|
configuration,
|
|
118190
118367
|
logger,
|
|
118368
|
+
modelID: conduitConfiguration.targetModel.id,
|
|
118191
118369
|
modelManager,
|
|
118192
118370
|
path: "/v1/completions",
|
|
118193
118371
|
reportMetrics: apiClient.reportPromptMetrics
|
|
@@ -118247,6 +118425,7 @@ async function createApplication({ abortController, apiClient, configuration, lo
|
|
|
118247
118425
|
apiURL: configuration.apiURL,
|
|
118248
118426
|
configuration,
|
|
118249
118427
|
logger,
|
|
118428
|
+
modelID: conduitConfiguration.targetModel.id,
|
|
118250
118429
|
onRequest: async (request) => {
|
|
118251
118430
|
return proxyRequest({
|
|
118252
118431
|
configuration,
|
|
@@ -118265,6 +118444,7 @@ async function createApplication({ abortController, apiClient, configuration, lo
|
|
|
118265
118444
|
setOnlineState();
|
|
118266
118445
|
}
|
|
118267
118446
|
},
|
|
118447
|
+
reportMetrics: apiClient.reportPromptMetrics,
|
|
118268
118448
|
signal: abortController.signal
|
|
118269
118449
|
}).catch(error => {
|
|
118270
118450
|
logger.error("SSE handler failed", {
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { Readable } from "node:stream";
|
|
2
|
-
import { Logger } from "@infersec/logger";
|
|
3
2
|
import { LLMEngine } from "@infersec/definitions";
|
|
3
|
+
import { Logger } from "@infersec/logger";
|
|
4
4
|
export interface EngineUsageMetrics {
|
|
5
5
|
completionTokens: number | null;
|
|
6
6
|
promptTokens: number | null;
|
|
@@ -13,14 +13,17 @@ interface EngineMetricsLoggerOptions {
|
|
|
13
13
|
requestPath: string;
|
|
14
14
|
}
|
|
15
15
|
interface EngineMetricsCompletion {
|
|
16
|
+
durationMs: number;
|
|
16
17
|
error: Error | null;
|
|
17
18
|
requestBodyBytes: number;
|
|
18
19
|
responseBytes: number;
|
|
20
|
+
timeToFirstTokenMs: number | null;
|
|
19
21
|
usage: EngineUsageMetrics | null;
|
|
20
22
|
}
|
|
21
23
|
interface MonitorEngineResponseOptions extends EngineMetricsLoggerOptions {
|
|
22
24
|
body: Readable;
|
|
23
|
-
onComplete?: (result: EngineMetricsCompletion) => void
|
|
25
|
+
onComplete?: (result: EngineMetricsCompletion) => void | Promise<void>;
|
|
26
|
+
requestStartedAt?: number;
|
|
24
27
|
}
|
|
25
28
|
interface EngineMetricsLogOptions extends EngineMetricsLoggerOptions {
|
|
26
29
|
error?: Error;
|
|
@@ -32,5 +35,5 @@ interface MonitorEngineResponseResult {
|
|
|
32
35
|
stream: Readable;
|
|
33
36
|
}
|
|
34
37
|
export declare function logEngineMetrics({ agentEngineType, error, level, logger, requestBodyBytes, requestPath, responseBytes, usage }: EngineMetricsLogOptions): void;
|
|
35
|
-
export declare function monitorEngineResponseStream({ agentEngineType, body, logger, onComplete, requestBodyBytes, requestPath }: MonitorEngineResponseOptions): MonitorEngineResponseResult;
|
|
38
|
+
export declare function monitorEngineResponseStream({ agentEngineType, body, logger, onComplete, requestBodyBytes, requestPath, requestStartedAt }: MonitorEngineResponseOptions): MonitorEngineResponseResult;
|
|
36
39
|
export {};
|
package/dist/utils/openai.d.ts
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import { Readable } from "node:stream";
|
|
2
|
-
import { InferenceAgentLLMMetricsPayload } from "@infersec/definitions";
|
|
2
|
+
import { InferenceAgentLLMMetricsPayload, type ULID } from "@infersec/definitions";
|
|
3
3
|
import { Logger } from "@infersec/logger";
|
|
4
4
|
import { Configuration } from "../configuration.js";
|
|
5
5
|
import { ModelManager } from "../modelManagement/ModelManager.js";
|
|
6
|
-
export declare function proxyOpenAIStreamingRoute({ body, configuration, logger, modelManager, path, reportMetrics }: {
|
|
6
|
+
export declare function proxyOpenAIStreamingRoute({ body, configuration, logger, modelID, modelManager, path, reportMetrics }: {
|
|
7
7
|
body: unknown;
|
|
8
8
|
configuration: Configuration;
|
|
9
9
|
logger: Logger;
|
|
10
|
+
modelID: ULID;
|
|
10
11
|
modelManager: ModelManager;
|
|
11
12
|
path: "/v1/chat/completions" | "/v1/completions";
|
|
12
13
|
reportMetrics: (payload: InferenceAgentLLMMetricsPayload) => Promise<void>;
|