@infersec/conduit 1.28.1 → 1.28.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,7 +12,7 @@ import require$$3$1 from 'assert';
12
12
  import require$$4$1 from 'events';
13
13
  import require$$1$1 from 'stream';
14
14
  import crypto, { createHash } from 'node:crypto';
15
- import require$$0$7, { Readable, Transform, getDefaultHighWaterMark, Duplex, Writable, PassThrough } from 'node:stream';
15
+ import require$$0$7, { Readable, Transform, PassThrough, getDefaultHighWaterMark, Duplex, Writable } from 'node:stream';
16
16
  import 'argon2';
17
17
  import { spawn, ChildProcess, execFile, spawnSync } from 'node:child_process';
18
18
  import require$$0$6 from 'node:assert';
@@ -108649,352 +108649,551 @@ class ModelManager extends EventEmitter {
108649
108649
  }
108650
108650
  }
108651
108651
 
108652
- async function handleSSERequests({ apiURL, configuration, logger, modelID, onRequest, onRequestEnd, onRequestStart, reportMetrics, signal }) {
108653
- const streamURL = `${apiURL}/conduit/api/v1/source/${configuration.inferenceSourceID}/requests/stream`;
108654
- const maxReconnectDelayMs = 30000;
108655
- let reconnectAttempt = 0;
108656
- while (!signal?.aborted) {
108657
- const connectionStartedAt = Date.now();
108658
- try {
108659
- await connectSSE(streamURL, {
108660
- headers: {
108661
- "x-api-key": configuration.apiKey
108662
- },
108663
- onError: (error) => {
108664
- logger.error("SSE connection error", {
108665
- error
108666
- });
108667
- },
108668
- onMessage: (message) => {
108669
- if (message.event !== "request") {
108670
- return;
108652
+ function createConduitGeneralAPIReferenceHandlers({ cycleEngine, conduitStateManager, getModelManager, logger, setErrorState, startEngine, stopEngine, stopRequestedByControl }) {
108653
+ return {
108654
+ "/conduit/engine/start": {
108655
+ POST: async () => {
108656
+ const modelManager = getModelManager();
108657
+ if (conduitStateManager.getState().state !== "idle") {
108658
+ return {
108659
+ status: 409,
108660
+ statusText: "Engine can only be started from idle state"
108661
+ };
108662
+ }
108663
+ if (!modelManager.canStart) {
108664
+ return {
108665
+ status: 409,
108666
+ statusText: `Engine cannot be started from current state: ${modelManager.state}`
108667
+ };
108668
+ }
108669
+ try {
108670
+ logger.info("Received remote engine start request");
108671
+ await startEngine();
108672
+ return {
108673
+ body: {
108674
+ acknowledged: true
108675
+ },
108676
+ status: 202
108677
+ };
108678
+ }
108679
+ catch (error) {
108680
+ if (stopRequestedByControl() || modelManager.state === "stopped") {
108681
+ return {
108682
+ status: 409,
108683
+ statusText: "Engine start was interrupted"
108684
+ };
108671
108685
  }
108672
- const payload = ServerToClientAPIRequestSchema.parse(JSON.parse(message.data));
108673
- handleRequest({
108674
- apiURL,
108675
- configuration,
108676
- logger,
108677
- modelID,
108678
- onRequest,
108679
- onRequestEnd,
108680
- onRequestStart,
108681
- reportMetrics,
108682
- request: payload,
108683
- signal
108684
- }).catch(error => {
108685
- logger.error("SSE request handler failed", {
108686
- error: asError(error),
108687
- requestMethod: payload.requestID
108688
- });
108689
- });
108690
- },
108691
- signal
108692
- });
108693
- }
108694
- catch (error) {
108695
- if (signal?.aborted) {
108696
- return;
108686
+ const parsedError = asError(error);
108687
+ setErrorState({ error: parsedError.message });
108688
+ return {
108689
+ status: 500,
108690
+ statusText: parsedError.message
108691
+ };
108692
+ }
108697
108693
  }
108698
- const isTerminated = isTerminatedError(error);
108699
- if (!isTerminated) {
108700
- logger.error("SSE connection failed", {
108701
- ...getNetworkErrorAttributes(error),
108702
- error: asError(error)
108694
+ },
108695
+ "/conduit/engine/stop": {
108696
+ POST: async () => {
108697
+ const modelManager = getModelManager();
108698
+ const sourceState = conduitStateManager.getState().state;
108699
+ if (sourceState !== "bootingEngine" && sourceState !== "online") {
108700
+ return {
108701
+ status: 409,
108702
+ statusText: "Engine can only be stopped while booting or online"
108703
+ };
108704
+ }
108705
+ if (!modelManager.canStop) {
108706
+ return {
108707
+ status: 409,
108708
+ statusText: `Engine cannot be stopped from current state: ${modelManager.state}`
108709
+ };
108710
+ }
108711
+ logger.info("Received remote engine stop request");
108712
+ stopEngine({
108713
+ reason: "Remote shutdown requested"
108714
+ }).catch(error => {
108715
+ const parsedError = asError(error);
108716
+ logger.error("Remote engine stop request failed", {
108717
+ error: parsedError
108718
+ });
108719
+ setErrorState({ error: parsedError.message });
108703
108720
  });
108721
+ return {
108722
+ body: {
108723
+ acknowledged: true
108724
+ },
108725
+ status: 202
108726
+ };
108704
108727
  }
108705
- if (signal?.aborted) {
108706
- return;
108728
+ },
108729
+ "/conduit/engine/cycle": {
108730
+ POST: async () => {
108731
+ const modelManager = getModelManager();
108732
+ const sourceState = conduitStateManager.getState().state;
108733
+ if (sourceState !== "bootingEngine" &&
108734
+ sourceState !== "online" &&
108735
+ sourceState !== "idle") {
108736
+ return {
108737
+ status: 409,
108738
+ statusText: "Engine can only be cycled while booting, online, or idle"
108739
+ };
108740
+ }
108741
+ if (sourceState !== "idle" && !modelManager.canStop) {
108742
+ return {
108743
+ status: 409,
108744
+ statusText: `Engine cannot be cycled from current state: ${modelManager.state}`
108745
+ };
108746
+ }
108747
+ try {
108748
+ logger.info("Received remote engine cycle request");
108749
+ await cycleEngine();
108750
+ return {
108751
+ body: {
108752
+ acknowledged: true
108753
+ },
108754
+ status: 202
108755
+ };
108756
+ }
108757
+ catch (error) {
108758
+ const parsedError = asError(error);
108759
+ setErrorState({ error: parsedError.message });
108760
+ return {
108761
+ status: 500,
108762
+ statusText: parsedError.message
108763
+ };
108764
+ }
108707
108765
  }
108708
- if (!isTerminated) {
108709
- const connectionDurationMs = Date.now() - connectionStartedAt;
108710
- reconnectAttempt = connectionDurationMs > 10000 ? 0 : reconnectAttempt + 1;
108711
- const reconnectDelayMs = Math.min(maxReconnectDelayMs, Math.max(1000, 1000 * 2 ** Math.min(6, reconnectAttempt)));
108712
- logger.warn("SSE disconnected, retrying");
108713
- await sleep(reconnectDelayMs);
108766
+ }
108767
+ };
108768
+ }
108769
/**
 * Build the POST handler for the "/conduit/engine/cycle" route.
 *
 * The full handler map is created from `options` and only the cycle
 * route's POST entry is returned.
 *
 * @param {object} options - Forwarded unchanged to createConduitGeneralAPIReferenceHandlers.
 * @returns {Function} The POST handler registered under "/conduit/engine/cycle".
 */
function createPostCycleEngineHandler(options) {
    const handlers = createConduitGeneralAPIReferenceHandlers(options);
    const { POST } = handlers["/conduit/engine/cycle"];
    return POST;
}
108772
/**
 * Build the POST handler for the "/conduit/engine/start" route.
 *
 * The full handler map is created from `options` and only the start
 * route's POST entry is returned.
 *
 * @param {object} options - Forwarded unchanged to createConduitGeneralAPIReferenceHandlers.
 * @returns {Function} The POST handler registered under "/conduit/engine/start".
 */
function createPostStartEngineHandler(options) {
    const handlers = createConduitGeneralAPIReferenceHandlers(options);
    const startRoute = handlers["/conduit/engine/start"];
    return startRoute.POST;
}
108775
/**
 * Build the POST handler for the "/conduit/engine/stop" route.
 *
 * The full handler map is created from `options` and only the stop
 * route's POST entry is returned.
 *
 * @param {object} options - Forwarded unchanged to createConduitGeneralAPIReferenceHandlers.
 * @returns {Function} The POST handler registered under "/conduit/engine/stop".
 */
function createPostStopEngineHandler(options) {
    const routeKey = "/conduit/engine/stop";
    const handlers = createConduitGeneralAPIReferenceHandlers(options);
    return handlers[routeKey].POST;
}
108778
+
108779
/**
 * Coerce non-string tool_calls function.arguments to JSON strings.
 * Some LLM backends return arguments as parsed objects instead of
 * JSON strings, violating the OpenAI spec. This mutates in place
 * and returns true if any coercion was performed.
 *
 * For each entry of `parsed.choices`, the tool calls are read from
 * `choice.delta` when it is non-nullish, otherwise from `choice.message`.
 * Anything that does not have the expected shape is skipped silently.
 *
 * @param {unknown} parsed - A decoded completion payload or stream chunk.
 * @returns {boolean} True if at least one `function.arguments` value was rewritten.
 */
function coerceToolCallArguments(parsed) {
    // JSON.parse can legitimately yield null (payload "null"); treat a
    // missing payload as "nothing to coerce" instead of throwing on `.choices`.
    if (parsed == null)
        return false;
    const choices = parsed.choices;
    if (!Array.isArray(choices))
        return false;
    let modified = false;
    for (const choice of choices) {
        if (!choice || typeof choice !== "object")
            continue;
        // Streaming chunks carry `delta`; complete responses carry `message`.
        const msg = choice.delta ?? choice.message;
        if (!msg || typeof msg !== "object")
            continue;
        const toolCalls = msg.tool_calls;
        if (!Array.isArray(toolCalls))
            continue;
        for (const tc of toolCalls) {
            if (!tc || typeof tc !== "object")
                continue;
            const fn = tc.function;
            if (!fn || typeof fn !== "object")
                continue;
            // An absent `arguments` key is left alone; any other non-string
            // value (objects, arrays, numbers, null) is serialized to JSON text.
            if (fn.arguments !== undefined && typeof fn.arguments !== "string") {
                fn.arguments = JSON.stringify(fn.arguments);
                modified = true;
            }
        }
    }
    return modified;
}
108718
- async function handleRequest({ apiURL, configuration, logger, modelID, onRequest, onRequestEnd, onRequestStart, reportMetrics, request, signal }) {
108719
- function reportMetricsSafe(payload) {
108720
- reportMetrics(payload).catch(error => {
108721
- logger.warn("Failed to upload LLM prompt metrics", {
108722
- error: asError(error),
108723
- requestUrl: request.path
108724
- });
108725
- });
108726
- }
108727
- const requestStartedAt = Date.now();
108728
- const requestBytes = calculateRequestBytes(request.body ?? null);
108729
- try {
108730
- await onRequestStart?.(request);
108731
- const response = await onRequest(request);
108732
- const responseMetrics = await streamResponse({
108733
- apiURL,
108734
- configuration,
108735
- logger,
108736
- requestID: request.requestID,
108737
- requestStartedAt,
108738
- response,
108739
- signal
108740
- });
108741
- const latencyMs = Math.max(0, Date.now() - requestStartedAt);
108742
- const totalTokens = 0;
108743
- const tokensPerSecond = calculateTokensPerSecond$1({
108744
- durationMs: latencyMs,
108745
- totalTokens
108746
- });
108747
- reportMetricsSafe({
108748
- bytes: requestBytes + responseMetrics.responseBytes,
108749
- completionTokens: 0,
108750
- engine: configuration.agentEngineType,
108751
- endpointId: null,
108752
- latencyMs,
108753
- modelId: modelID,
108754
- promptTokens: 0,
108755
- requestBytes,
108756
- requestId: request.requestID,
108757
- requestMethod: request.method,
108758
- requestPath: request.path,
108759
- responseBytes: responseMetrics.responseBytes,
108760
- successful: responseMetrics.status < 400,
108761
- timeToFirstTokenMs: responseMetrics.timeToFirstTokenMs,
108762
- tokensPerSecond,
108763
- totalTokens
108764
- });
108765
- }
108766
- catch (error) {
108767
- logger.error("SSE request failed", {
108768
- error: asError(error),
108769
- requestMethod: request.requestID
108770
- });
108771
- const failureMessage = "Bad gateway\n\nProxying failed";
108772
- const failureBytes = Buffer.byteLength(failureMessage, "utf8");
108773
- const latencyMs = Math.max(0, Date.now() - requestStartedAt);
108774
- const totalTokens = 0;
108775
- const tokensPerSecond = calculateTokensPerSecond$1({
108776
- durationMs: latencyMs,
108777
- totalTokens
108778
- });
108779
- const streamHandler = await sendChunkStream({
108780
- apiURL,
108781
- configuration,
108782
- requestID: request.requestID,
108783
- logger
108784
- });
108785
- await streamHandler.sendChunk({
108786
- data: encodeBinaryChunk(Buffer.from(failureMessage)),
108787
- sequence: 0,
108788
- status: 502
108789
- });
108790
- await streamHandler.sendChunk({
108791
- data: null,
108792
- sequence: 1,
108793
- status: 502
108794
- });
108795
- await streamHandler.end();
108796
- reportMetricsSafe({
108797
- bytes: requestBytes + failureBytes,
108798
- completionTokens: 0,
108799
- engine: configuration.agentEngineType,
108800
- endpointId: null,
108801
- latencyMs,
108802
- modelId: modelID,
108803
- promptTokens: 0,
108804
- requestBytes,
108805
- requestId: request.requestID,
108806
- requestMethod: request.method,
108807
- requestPath: request.path,
108808
- responseBytes: failureBytes,
108809
- successful: false,
108810
- timeToFirstTokenMs: latencyMs,
108811
- tokensPerSecond,
108812
- totalTokens
108813
- });
108815
+ function isEngineUsageChunk(value) {
108816
+ if (!value || typeof value !== "object") {
108817
+ return false;
108814
108818
  }
108815
- finally {
108816
- await onRequestEnd?.(request);
108819
+ const record = value;
108820
+ if (!record.usage || typeof record.usage !== "object") {
108821
+ return false;
108817
108822
  }
108823
+ return true;
108818
108824
  }
108819
- async function streamResponse({ apiURL, configuration, logger, requestID, requestStartedAt, response, signal }) {
108820
- let sequence = 0;
108825
+ function monitorEngineResponseStream({ agentEngineType, body, contextLength, engine, logger, onComplete, parallelism, requestBodyBytes, requestPath, requestStartedAt }) {
108826
+ const startedAt = requestStartedAt ?? Date.now();
108827
+ const passThrough = new PassThrough();
108821
108828
  let responseBytes = 0;
108822
- let timeToFirstTokenMs = null;
108823
- const streamHandler = await sendChunkStream({
108824
- apiURL,
108825
- configuration,
108826
- requestID,
108827
- logger
108828
- });
108829
- if (response.body instanceof Readable) {
108830
- for await (const chunk of response.body) {
108831
- if (signal?.aborted) {
108832
- streamHandler.abort();
108833
- throw new Error("Request cancelled");
108829
+ let firstChunkAt = null;
108830
+ let usage = null;
108831
+ let buffer = "";
108832
+ let completed = false;
108833
+ function modifyChunkWithUsage(chunk) {
108834
+ const text = chunk.toString("utf8");
108835
+ const lines = text.split("\n");
108836
+ const modifiedLines = [];
108837
+ for (const rawLine of lines) {
108838
+ const line = rawLine.trim();
108839
+ if (!line.startsWith("data:")) {
108840
+ modifiedLines.push(rawLine);
108841
+ continue;
108834
108842
  }
108835
- const buffer = Buffer.isBuffer(chunk)
108836
- ? chunk
108837
- : Buffer.from(chunk);
108838
- if (timeToFirstTokenMs === null) {
108839
- timeToFirstTokenMs = Math.max(0, Date.now() - requestStartedAt);
108843
+ const payload = line.slice(5).trim();
108844
+ if (!payload || payload === "[DONE]") {
108845
+ modifiedLines.push(rawLine);
108846
+ continue;
108840
108847
  }
108841
- responseBytes += buffer.length;
108842
- await streamHandler.sendChunk({
108843
- data: encodeBinaryChunk(buffer),
108844
- sequence,
108845
- status: response.status
108846
- });
108847
- sequence += 1;
108848
- }
108849
- await streamHandler.sendChunk({
108850
- data: null,
108851
- sequence,
108852
- status: response.status
108853
- });
108854
- await streamHandler.end();
108855
- return {
108856
- responseBytes,
108857
- status: response.status,
108858
- timeToFirstTokenMs
108859
- };
108860
- }
108861
- const responsePayload = response.body
108862
- ? typeof response.body === "string"
108863
- ? response.body
108864
- : JSON.stringify(response.body)
108865
- : "";
108866
- if (responsePayload.length > 0) {
108867
- responseBytes = Buffer.byteLength(responsePayload, "utf8");
108868
- timeToFirstTokenMs = Math.max(0, Date.now() - requestStartedAt);
108869
- }
108870
- await streamHandler.sendChunk({
108871
- data: encodeBinaryChunk(Buffer.from(responsePayload)),
108872
- headers: response.headers,
108873
- sequence,
108874
- status: response.status
108875
- });
108876
- await streamHandler.sendChunk({
108877
- data: null,
108878
- sequence: sequence + 1,
108879
- status: response.status
108880
- });
108881
- await streamHandler.end();
108882
- logger.info("SSE response queued", {
108883
- requestMethod: requestID
108884
- });
108885
- return {
108886
- responseBytes,
108887
- status: response.status,
108888
- timeToFirstTokenMs
108889
- };
108890
- }
108891
- function encodeBinaryChunk(chunk) {
108892
- return chunk.toString("base64");
108893
- }
108894
- async function sendChunkStream({ apiURL, configuration, requestID, logger }) {
108895
- const streamURL = `${apiURL}/conduit/api/v1/source/${configuration.inferenceSourceID}/requests/${requestID}/stream`;
108896
- const maxFlushAttempts = 3;
108897
- let isAborted = false;
108898
- let isClosed = false;
108899
- let activeAbortController = null;
108900
- const chunks = [];
108901
- const sendChunk = async (payload) => {
108902
- if (isAborted || isClosed) {
108903
- return;
108904
- }
108905
- const response = ClientToServerAPIResponseSchema.parse({
108906
- data: payload.data,
108907
- headers: payload.headers,
108908
- requestID,
108909
- status: payload.status
108910
- });
108911
- const chunk = JSON.stringify({
108912
- ...response,
108913
- sequence: payload.sequence
108914
- });
108915
- chunks.push(Buffer.from(chunk + "\n"));
108916
- if (chunks.length >= 10) {
108917
- await flushChunks();
108918
- }
108919
- };
108920
- const flushChunks = async () => {
108921
- if (chunks.length === 0 || isAborted) {
108922
- return;
108923
- }
108924
- const batch = chunks.splice(0, chunks.length);
108925
- const body = Buffer.concat(batch);
108926
- for (let attempt = 1; attempt <= maxFlushAttempts; attempt += 1) {
108927
108848
  try {
108928
- activeAbortController = new AbortController();
108929
- const response = await fetchWithRetry(streamURL, {
108930
- body: body.toString(),
108931
- headers: {
108932
- "content-type": "application/json",
108933
- "x-api-key": configuration.apiKey
108934
- },
108935
- method: "POST",
108936
- signal: activeAbortController.signal
108937
- }, {
108938
- maxAttempts: 2,
108939
- timeoutMs: 15000
108940
- });
108941
- if (!response.ok) {
108942
- throw new Error(`Chunk stream flush failed with status ${response.status}`);
108849
+ const parsed = JSON.parse(payload);
108850
+ let modified = false;
108851
+ if (coerceToolCallArguments(parsed)) {
108852
+ modified = true;
108943
108853
  }
108944
- return;
108945
- }
108946
- catch (error) {
108947
- if (isAborted) {
108948
- return;
108854
+ if (parsed.usage) {
108855
+ const usageChunk = parsed.usage;
108856
+ const effectiveContext = getEffectiveContextLength({
108857
+ contextLength,
108858
+ engine,
108859
+ parallelism
108860
+ });
108861
+ if (usageChunk.context_usage === undefined &&
108862
+ usageChunk.prompt_tokens !== undefined &&
108863
+ effectiveContext !== null) {
108864
+ usageChunk.context_usage = usageChunk.prompt_tokens / effectiveContext;
108865
+ modified = true;
108866
+ }
108949
108867
  }
108950
- if (attempt >= maxFlushAttempts) {
108951
- chunks.unshift(...batch);
108952
- throw asError(error);
108868
+ if (modified) {
108869
+ modifiedLines.push("data: " + JSON.stringify(parsed));
108870
+ continue;
108953
108871
  }
108954
- logger.warn("Failed to send chunk batch", {
108955
- ...getNetworkErrorAttributes(error),
108956
- error: asError(error)
108957
- });
108958
- await sleep(100 * attempt);
108959
108872
  }
108960
- finally {
108961
- activeAbortController = null;
108873
+ catch (_error) {
108874
+ // Ignore malformed chunks
108962
108875
  }
108876
+ modifiedLines.push(rawLine);
108963
108877
  }
108964
- };
108965
- const end = async () => {
108966
- if (isClosed || isAborted) {
108967
- return;
108878
+ return Buffer.from(modifiedLines.join("\n"), "utf8");
108879
+ }
108880
+ function parseUsageFromBuffer() {
108881
+ const lines = buffer.split("\n");
108882
+ buffer = lines.pop() ?? "";
108883
+ for (const rawLine of lines) {
108884
+ const line = rawLine.trim();
108885
+ if (!line.startsWith("data:")) {
108886
+ continue;
108887
+ }
108888
+ const payload = line.slice(5).trim();
108889
+ if (!payload || payload === "[DONE]") {
108890
+ continue;
108891
+ }
108892
+ try {
108893
+ const parsed = JSON.parse(payload);
108894
+ if (isEngineUsageChunk(parsed)) {
108895
+ const completionTokens = parsed.usage?.completion_tokens ?? null;
108896
+ const promptTokens = parsed.usage?.prompt_tokens ?? null;
108897
+ const totalTokens = parsed.usage?.total_tokens ?? null;
108898
+ let contextUsage = parsed.usage?.context_usage ?? null;
108899
+ const effectiveContextForUsage = getEffectiveContextLength({
108900
+ contextLength,
108901
+ engine,
108902
+ parallelism
108903
+ });
108904
+ if (contextUsage === null &&
108905
+ promptTokens !== null &&
108906
+ effectiveContextForUsage !== null) {
108907
+ contextUsage = promptTokens / effectiveContextForUsage;
108908
+ }
108909
+ usage = {
108910
+ completionTokens,
108911
+ contextUsage,
108912
+ promptTokens,
108913
+ totalTokens
108914
+ };
108915
+ }
108916
+ }
108917
+ catch (_error) {
108918
+ // Ignore malformed chunks
108919
+ }
108968
108920
  }
108969
- await flushChunks();
108970
- isClosed = true;
108971
- };
108972
- const abort = (error) => {
108973
- isAborted = true;
108974
- if (activeAbortController) {
108975
- activeAbortController.abort();
108921
+ }
108922
+ function finalize(error) {
108923
+ if (completed) {
108924
+ return;
108976
108925
  }
108977
- chunks.length = 0;
108978
- if (error) {
108979
- logger.error("Chunk stream aborted", {
108980
- error: asError(error)
108926
+ completed = true;
108927
+ if (onComplete) {
108928
+ const completion = onComplete({
108929
+ durationMs: Math.max(0, Date.now() - startedAt),
108930
+ error,
108931
+ requestBodyBytes,
108932
+ responseBytes,
108933
+ timeToFirstTokenMs: firstChunkAt === null ? null : Math.max(0, firstChunkAt - startedAt),
108934
+ usage
108981
108935
  });
108936
+ if (completion && typeof completion.catch === "function") {
108937
+ completion.catch(error => {
108938
+ logger.error("Engine metrics completion failed", {
108939
+ error: asError(error),
108940
+ requestUrl: requestPath
108941
+ });
108942
+ });
108943
+ }
108982
108944
  }
108983
- };
108984
- return {
108985
- sendChunk,
108986
- end,
108987
- abort
108988
- };
108989
- }
108990
- function calculateRequestBytes(body) {
108991
- if (body === null || body === undefined) {
108992
- return 0;
108993
108945
  }
108994
- if (typeof body === "string") {
108995
- return Buffer.byteLength(body, "utf8");
108946
+ body.on("data", (chunk) => {
108947
+ const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
108948
+ if (firstChunkAt === null) {
108949
+ firstChunkAt = Date.now();
108950
+ }
108951
+ responseBytes += chunkBuffer.length;
108952
+ buffer += chunkBuffer.toString("utf8");
108953
+ parseUsageFromBuffer();
108954
+ passThrough.write(modifyChunkWithUsage(chunkBuffer));
108955
+ });
108956
+ body.once("error", err => {
108957
+ logEngineMetrics({
108958
+ agentEngineType,
108959
+ error: err,
108960
+ level: "error",
108961
+ logger,
108962
+ requestBodyBytes,
108963
+ requestPath,
108964
+ responseBytes,
108965
+ usage
108966
+ });
108967
+ finalize(err);
108968
+ passThrough.destroy(err);
108969
+ });
108970
+ body.once("end", () => {
108971
+ parseUsageFromBuffer();
108972
+ logEngineMetrics({
108973
+ agentEngineType,
108974
+ level: "info",
108975
+ logger,
108976
+ requestBodyBytes,
108977
+ requestPath,
108978
+ responseBytes,
108979
+ usage
108980
+ });
108981
+ finalize(null);
108982
+ passThrough.end();
108983
+ });
108984
+ body.once("close", () => {
108985
+ if (completed) {
108986
+ if (!passThrough.writableEnded) {
108987
+ passThrough.end();
108988
+ }
108989
+ return;
108990
+ }
108991
+ const closeError = new Error("Engine response stream closed before completion");
108992
+ logEngineMetrics({
108993
+ agentEngineType,
108994
+ error: closeError,
108995
+ level: "error",
108996
+ logger,
108997
+ requestBodyBytes,
108998
+ requestPath,
108999
+ responseBytes,
109000
+ usage
109001
+ });
109002
+ finalize(closeError);
109003
+ if (!passThrough.writableEnded) {
109004
+ passThrough.end();
109005
+ }
109006
+ });
109007
+ return {
109008
+ stream: passThrough
109009
+ };
109010
+ }
109011
+ function monitorEngineResponseSingle({ agentEngineType, body, contextLength, engine, logger, onComplete, parallelism, requestBodyBytes, requestPath, requestStartedAt }) {
109012
+ const maxUsageCaptureBytes = 1024 * 1024;
109013
+ const startedAt = requestStartedAt ?? Date.now();
109014
+ const passThrough = new PassThrough();
109015
+ let responseBytes = 0;
109016
+ let firstChunkAt = null;
109017
+ let usage = null;
109018
+ const usageChunks = [];
109019
+ let usageBytes = 0;
109020
+ let usageCaptureEnabled = true;
109021
+ let completed = false;
109022
+ function finalize(error) {
109023
+ if (completed) {
109024
+ return;
109025
+ }
109026
+ completed = true;
109027
+ if (onComplete) {
109028
+ const completion = onComplete({
109029
+ durationMs: Math.max(0, Date.now() - startedAt),
109030
+ error,
109031
+ requestBodyBytes,
109032
+ responseBytes,
109033
+ timeToFirstTokenMs: firstChunkAt === null ? null : Math.max(0, firstChunkAt - startedAt),
109034
+ usage
109035
+ });
109036
+ if (completion && typeof completion.catch === "function") {
109037
+ completion.catch(error => {
109038
+ logger.error("Engine metrics completion failed", {
109039
+ error: asError(error),
109040
+ requestUrl: requestPath
109041
+ });
109042
+ });
109043
+ }
109044
+ }
108996
109045
  }
108997
- return Buffer.byteLength(JSON.stringify(body), "utf8");
109046
+ body.on("data", (chunk) => {
109047
+ const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
109048
+ if (firstChunkAt === null) {
109049
+ firstChunkAt = Date.now();
109050
+ }
109051
+ responseBytes += chunkBuffer.length;
109052
+ if (usageCaptureEnabled) {
109053
+ const nextSize = usageBytes + chunkBuffer.length;
109054
+ if (nextSize <= maxUsageCaptureBytes) {
109055
+ usageChunks.push(chunkBuffer);
109056
+ usageBytes = nextSize;
109057
+ }
109058
+ else {
109059
+ usageCaptureEnabled = false;
109060
+ usageChunks.length = 0;
109061
+ }
109062
+ }
109063
+ passThrough.write(chunkBuffer);
109064
+ });
109065
+ body.once("error", err => {
109066
+ logEngineMetrics({
109067
+ agentEngineType,
109068
+ error: err,
109069
+ level: "error",
109070
+ logger,
109071
+ requestBodyBytes,
109072
+ requestPath,
109073
+ responseBytes,
109074
+ usage
109075
+ });
109076
+ finalize(err);
109077
+ passThrough.destroy(err);
109078
+ });
109079
+ body.once("end", () => {
109080
+ if (usageCaptureEnabled) {
109081
+ try {
109082
+ const parsed = JSON.parse(Buffer.concat(usageChunks).toString("utf8"));
109083
+ if (parsed.usage) {
109084
+ const usageChunk = parsed.usage;
109085
+ const completionTokens = usageChunk.completion_tokens ?? null;
109086
+ const promptTokens = usageChunk.prompt_tokens ?? null;
109087
+ const totalTokens = usageChunk.total_tokens ?? null;
109088
+ let contextUsage = usageChunk.context_usage ?? null;
109089
+ const effectiveContext = getEffectiveContextLength({
109090
+ contextLength,
109091
+ engine,
109092
+ parallelism
109093
+ });
109094
+ if (contextUsage === null &&
109095
+ promptTokens !== null &&
109096
+ effectiveContext !== null) {
109097
+ contextUsage = promptTokens / effectiveContext;
109098
+ }
109099
+ usage = {
109100
+ completionTokens,
109101
+ contextUsage,
109102
+ promptTokens,
109103
+ totalTokens
109104
+ };
109105
+ }
109106
+ }
109107
+ catch (error) {
109108
+ logger.error("Failed to parse engine response body", {
109109
+ error: asError(error),
109110
+ requestUrl: requestPath
109111
+ });
109112
+ }
109113
+ }
109114
+ logEngineMetrics({
109115
+ agentEngineType,
109116
+ level: "info",
109117
+ logger,
109118
+ requestBodyBytes,
109119
+ requestPath,
109120
+ responseBytes,
109121
+ usage
109122
+ });
109123
+ finalize(null);
109124
+ passThrough.end();
109125
+ });
109126
+ body.once("close", () => {
109127
+ if (completed) {
109128
+ if (!passThrough.writableEnded) {
109129
+ passThrough.end();
109130
+ }
109131
+ return;
109132
+ }
109133
+ const closeError = new Error("Engine response stream closed before completion");
109134
+ logEngineMetrics({
109135
+ agentEngineType,
109136
+ error: closeError,
109137
+ level: "error",
109138
+ logger,
109139
+ requestBodyBytes,
109140
+ requestPath,
109141
+ responseBytes,
109142
+ usage
109143
+ });
109144
+ finalize(closeError);
109145
+ if (!passThrough.writableEnded) {
109146
+ passThrough.end();
109147
+ }
109148
+ });
109149
+ return {
109150
+ stream: passThrough
109151
+ };
109152
+ }
109153
/**
 * Emit one log line summarizing byte and token counters for an engine request.
 *
 * The message is a space-separated list of `key=value` pairs prefixed with
 * "LLM engine stream metrics"; usage counters that are null/undefined are
 * rendered as "n/a". The log attributes always carry `agentEngineType` and
 * `requestUrl`, plus `error` when a truthy error is supplied.
 *
 * @param {object} params
 * @param {*} params.agentEngineType - Copied into the log attributes.
 * @param {*} [params.error] - Attached to the attributes only when truthy.
 * @param {string} params.level - Name of the logger method to invoke (e.g. "info", "error").
 * @param {object} params.logger - Object exposing a method named by `level`.
 * @param {*} params.requestBodyBytes - Rendered as `bytesTo`.
 * @param {*} params.requestPath - Rendered as `path` and copied to `requestUrl`.
 * @param {*} params.responseBytes - Rendered as `bytesFrom`.
 * @param {?object} params.usage - Optional counters: promptTokens, completionTokens, totalTokens, contextUsage.
 * @returns {void}
 */
function logEngineMetrics({ agentEngineType, error, level, logger, requestBodyBytes, requestPath, responseBytes, usage }) {
    const orNotAvailable = (value) => value ?? "n/a";
    const fields = [
        `path=${requestPath}`,
        `bytesTo=${requestBodyBytes}`,
        `bytesFrom=${responseBytes}`,
        `promptTokens=${orNotAvailable(usage?.promptTokens)}`,
        `completionTokens=${orNotAvailable(usage?.completionTokens)}`,
        `totalTokens=${orNotAvailable(usage?.totalTokens)}`,
        `contextUsage=${orNotAvailable(usage?.contextUsage)}`
    ];
    const metricsMessage = `LLM engine stream metrics ${fields.join(" ")}`;
    const baseAttributes = { agentEngineType, requestUrl: requestPath };
    const attributes = error ? { ...baseAttributes, error } : baseAttributes;
    logger[level](metricsMessage, attributes);
}
109173
+
109174
/**
 * Report whether `value` is a non-null, non-array object.
 *
 * Note that this is a loose check: any object that is not an array passes,
 * including class instances such as Date. Functions and primitives do not.
 *
 * @param {*} value - Value to classify.
 * @returns {boolean} True when `typeof value` is "object" and it is neither null nor an array.
 */
function isPlainObject$1(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
109177
/**
 * Serialize an outgoing engine request body and measure its UTF-8 size.
 *
 * - Strings are passed through untouched.
 * - Other non-object values (and arrays) are JSON-encoded as-is.
 * - Plain-object bodies are shallow-copied (the caller's object is never
 *   mutated). When the body asks for streaming (`stream === true`),
 *   `stream_options.include_usage` is forced to true so the final stream
 *   chunk carries token usage; any other `stream_options` keys are kept.
 *
 * `stream_options` is only injected for streaming requests: the OpenAI API
 * documents it as valid only together with `stream: true`, and strict
 * OpenAI-compatible servers reject it otherwise. Non-streaming bodies are
 * therefore serialized exactly as given.
 *
 * @param {*} body - Request body as received from the caller.
 * @returns {{ bytes: number, payload: string }} The serialized payload and its byte length in UTF-8.
 */
function serializeRequestBody(body) {
    if (!isPlainObject$1(body)) {
        const payload = typeof body === "string" ? body : JSON.stringify(body);
        return {
            bytes: Buffer.byteLength(payload, "utf8"),
            payload
        };
    }
    const requestPayload = { ...body };
    if (requestPayload.stream === true) {
        const streamOptions = requestPayload.stream_options;
        // Copy rather than mutate: the nested object still belongs to the caller.
        requestPayload.stream_options = {
            ...(isPlainObject$1(streamOptions) ? streamOptions : {}),
            include_usage: true
        };
    }
    const payload = JSON.stringify(requestPayload);
    return {
        bytes: Buffer.byteLength(payload, "utf8"),
        payload
    };
}
108999
109198
  function calculateTokensPerSecond$1({ durationMs, totalTokens }) {
109000
109199
  if (durationMs <= 0) {
@@ -109006,107 +109205,714 @@ function calculateTokensPerSecond$1({ durationMs, totalTokens }) {
109006
109205
  }
109007
109206
  return Math.round(tokensPerSecond);
109008
109207
  }
109009
-
109010
- /**
109011
- * Proxy server requests to the local inference HTTP server.
109012
- */
109013
- async function proxyRequest({ configuration, request }) {
109014
- let finalPath = request.path;
109015
- if (request.parameters) {
109016
- Object.entries(request.parameters).forEach(([key, value]) => {
109017
- finalPath = finalPath.replace(`:${key}`, String(value));
109208
/**
 * Forward a POST request to the engine's OpenAI-compatible endpoint and return
 * the response wrapped in a monitoring stream that reports usage metrics.
 *
 * Flow:
 *  1. Serialise the body and record its size.
 *  2. Call `modelManager.fetchOpenAI`; a rejected call is logged, reported as a
 *     failed request and re-thrown.
 *  3. Non-2xx responses are logged with their body text. The text is read from
 *     a clone so the original body can still be streamed back to the caller.
 *  4. Responses without a body are logged/reported and returned as
 *     `{ status, statusText }`.
 *  5. Otherwise the body is handed to the stream or single-response monitor
 *     (chosen by `stream === true` in the request) and returned.
 *
 * @param {object} options
 * @param {unknown} options.body - Request body to forward.
 * @param {object} options.configuration - Reads `agentEngineType`.
 * @param {object} options.logger - Logger with `warn`/`error` (and the levels used by `logEngineMetrics`).
 * @param {string} options.modelID - Model identifier attached to metrics.
 * @param {object} options.modelManager - Provides `fetchOpenAI`, `contextLength` and `parallelism`.
 *   NOTE(review): `fetchOpenAI` is assumed to resolve to a fetch `Response` — confirm.
 * @param {string} options.path - Engine path, e.g. "/v1/chat/completions".
 * @param {(payload: object) => Promise<unknown>} options.reportMetrics - Metrics uploader; failures are only logged.
 * @returns {Promise<object>} `{ body, headers, status }`, or `{ status, statusText }` when the engine sent no body.
 */
async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelID, modelManager, path, reportMetrics }) {
    const engine = configuration.agentEngineType;

    function normalizeTokenCount(value) {
        if (typeof value === "number" && Number.isFinite(value) && value >= 0) {
            return value;
        }
        return 0;
    }

    // Metrics upload is fire-and-forget: a failed upload must never fail the request.
    function reportMetricsSafe(payload) {
        reportMetrics(payload).catch(error => {
            logger.warn("Failed to upload LLM prompt metrics", {
                error: asError(error),
                requestUrl: path
            });
        });
    }

    const { bytes: requestBodyBytes, payload: serializedBody } = serializeRequestBody(body);
    const requestStartedAt = Date.now();
    const requestBody = JSON.parse(serializedBody);
    const streamRequested = requestBody.stream === true;

    // Shared payload for requests that produced no response bytes at all.
    function reportFailureWithoutResponse() {
        reportMetricsSafe({
            bytes: requestBodyBytes,
            completionTokens: 0,
            engine,
            endpointId: null,
            latencyMs: Math.max(0, Date.now() - requestStartedAt),
            modelId: modelID,
            promptTokens: 0,
            requestBytes: requestBodyBytes,
            requestId: null,
            requestMethod: "POST",
            requestPath: path,
            responseBytes: 0,
            successful: false,
            timeToFirstTokenMs: null,
            tokensPerSecond: 0,
            totalTokens: 0
        });
    }

    const response = await modelManager
        .fetchOpenAI(path, {
            body: serializedBody,
            headers: {
                "Content-Type": "application/json"
            },
            method: "POST"
        })
        .catch(error => {
            logEngineMetrics({
                agentEngineType: engine,
                error,
                level: "error",
                logger,
                requestBodyBytes,
                requestPath: path,
                responseBytes: 0,
                usage: null
            });
            reportFailureWithoutResponse();
            throw error;
        });

    const responseStatusText = response.statusText ?? "Upstream request failed";

    if (!response.ok) {
        // Read the error text from a clone: consuming the original here would
        // leave `response.body` locked and unusable for Readable.fromWeb below.
        const responseBody = await response.clone().text().catch(() => null);
        const responseError = new Error(responseBody
            ? `Upstream error response: ${responseBody}`
            : "Upstream error response: empty body");
        logger.error("LLM engine request failed", {
            error: responseError,
            requestUrl: path,
            statusCode: response.status,
            statusText: responseStatusText,
            responseBody: responseBody ?? undefined
        });
    }

    if (!response.body) {
        // Reached for both successful and failed body-less responses so that
        // every request ends up in the metrics.
        logEngineMetrics({
            agentEngineType: engine,
            level: response.ok ? "info" : "error",
            logger,
            requestBodyBytes,
            requestPath: path,
            responseBytes: 0,
            usage: null
        });
        reportFailureWithoutResponse();
        return {
            status: response.status,
            statusText: responseStatusText
        };
    }

    const onMonitoringComplete = ({ durationMs, error, responseBytes, timeToFirstTokenMs, usage }) => {
        const completionTokens = normalizeTokenCount(usage?.completionTokens);
        const promptTokens = normalizeTokenCount(usage?.promptTokens);
        const totalTokens = normalizeTokenCount(usage?.totalTokens ?? completionTokens + promptTokens);
        const latencyMs = Math.max(0, durationMs);
        reportMetricsSafe({
            bytes: requestBodyBytes + responseBytes,
            completionTokens,
            engine,
            endpointId: null,
            latencyMs,
            modelId: modelID,
            promptTokens,
            requestBytes: requestBodyBytes,
            requestId: null,
            requestMethod: "POST",
            requestPath: path,
            responseBytes,
            // An upstream HTTP error is a failure even if its body streamed cleanly.
            successful: !error && response.ok,
            timeToFirstTokenMs,
            tokensPerSecond: calculateTokensPerSecond$1({
                durationMs: latencyMs,
                totalTokens
            }),
            totalTokens
        });
    };

    const monitor = streamRequested ? monitorEngineResponseStream : monitorEngineResponseSingle;
    const monitoredResponse = monitor({
        agentEngineType: engine,
        body: Readable.fromWeb(response.body),
        contextLength: modelManager.contextLength,
        engine,
        logger,
        onComplete: onMonitoringComplete,
        parallelism: modelManager.parallelism,
        requestBodyBytes,
        requestPath: path,
        requestStartedAt
    });

    return {
        body: monitoredResponse.stream,
        headers: Object.fromEntries(response.headers.entries()),
        status: response.status
    };
}
109042
109379
 
109043
- class ConduitStateReportManager {
109044
- apiClient;
109045
- conduitStateManager;
109046
- downloadProgressReportIntervalMs;
109047
- logger;
109048
- stateIntervalMs;
109049
- conduitStateReportInFlight = false;
109050
- lastConduitStateReportAt = 0;
109051
- pendingConduitStateReport = null;
109052
- stateInterval = null;
109053
- constructor({ apiClient, conduitStateManager, downloadProgressReportIntervalMs, logger, stateIntervalMs }) {
109054
- this.apiClient = apiClient;
109055
- this.conduitStateManager = conduitStateManager;
109056
- this.downloadProgressReportIntervalMs = downloadProgressReportIntervalMs;
109057
- this.logger = logger;
109058
- this.stateIntervalMs = stateIntervalMs;
109059
- }
109060
- async start() {
109061
- await this.sendConduitState();
109062
- this.stateInterval = setInterval(() => {
109063
- this.sendConduitState().catch(error => {
109064
- this.logger.error("Conduit state update failed", {
109065
- error: asError(error)
109380
/**
 * Build the route table for the OpenAI-compatible API exposed by the conduit.
 *
 * Routes:
 *  - POST /v1/chat/completions and POST /v1/completions proxy the body to the
 *    engine through `proxyOpenAIStreamingRoute`.
 *  - GET /v1/models returns a single-entry model list describing the loaded model.
 *
 * @param {object} options
 * @param {object} options.apiClient - Must expose `reportPromptMetrics(payload)`.
 * @param {object} options.configuration - Reads `agentEngineType`; forwarded to the proxy.
 * @param {() => string} options.getModelID - Resolved on every request.
 * @param {() => object} options.getModelManager - Resolved on every request.
 * @param {object} options.logger
 * @param {number} options.startup - Divided by 1000 for the model's `created` field
 *   (presumably a millisecond timestamp — confirm with the caller).
 * @returns {object} Map of path -> { METHOD: handler }.
 */
function createConduitOpenAIAPIReferenceHandlers({ apiClient, configuration, getModelID, getModelManager, logger, startup }) {
    // Call through the client rather than passing the bare method, so `this`
    // is preserved if reportPromptMetrics is a class method.
    const reportMetrics = (payload) => apiClient.reportPromptMetrics(payload);

    // Both completion routes differ only in the engine path they proxy to.
    const createProxyHandler = (path) => async ({ body }) => {
        const modelID = getModelID();
        const modelManager = getModelManager();
        return proxyOpenAIStreamingRoute({
            body,
            configuration,
            logger,
            modelID,
            modelManager,
            path,
            reportMetrics
        });
    };

    return {
        "/v1/chat/completions": {
            POST: createProxyHandler("/v1/chat/completions")
        },
        "/v1/completions": {
            POST: createProxyHandler("/v1/completions")
        },
        "/v1/models": {
            GET: async () => {
                const modelID = getModelID();
                const modelManager = getModelManager();
                const effectiveContextLength = getEffectiveContextLength({
                    contextLength: modelManager.contextLength,
                    engine: configuration.agentEngineType,
                    parallelism: modelManager.parallelism
                });
                return {
                    body: {
                        data: [
                            {
                                // OpenAI model objects carry whole seconds.
                                created: Math.floor(startup / 1000),
                                id: modelID,
                                limit: {
                                    context: effectiveContextLength
                                },
                                object: "model",
                                owned_by: "infersec"
                            }
                        ],
                        object: "list"
                    },
                    status: 200
                };
            }
        }
    };
}
109442
/**
 * Build the full OpenAI route table and return only its GET /v1/models handler.
 *
 * @param {object} options - Forwarded unchanged to `createConduitOpenAIAPIReferenceHandlers`.
 * @returns {Function} The GET handler registered under "/v1/models".
 */
function createGetModelsHandler(options) {
    const { "/v1/models": modelsRoute } = createConduitOpenAIAPIReferenceHandlers(options);
    return modelsRoute.GET;
}
109445
/**
 * Build the full OpenAI route table and return only its POST /v1/chat/completions handler.
 *
 * @param {object} options - Forwarded unchanged to `createConduitOpenAIAPIReferenceHandlers`.
 * @returns {Function} The POST handler registered under "/v1/chat/completions".
 */
function createPostChatCompletionsHandler(options) {
    const { "/v1/chat/completions": chatRoute } = createConduitOpenAIAPIReferenceHandlers(options);
    return chatRoute.POST;
}
109448
/**
 * Build the full OpenAI route table and return only its POST /v1/completions handler.
 *
 * @param {object} options - Forwarded unchanged to `createConduitOpenAIAPIReferenceHandlers`.
 * @returns {Function} The POST handler registered under "/v1/completions".
 */
function createPostCompletionsHandler(options) {
    const { "/v1/completions": completionsRoute } = createConduitOpenAIAPIReferenceHandlers(options);
    return completionsRoute.POST;
}
109451
+
109452
/**
 * Create a liveness handler that always answers 200 with the text "OK".
 *
 * @returns {(req: unknown, res: object) => void} Handler that calls
 *   `res.status(200)` and then `send("OK")` on the value it returns.
 */
function createHealthHandler() {
    const respondHealthy = (_req, res) => {
        const okResponse = res.status(200);
        okResponse.send("OK");
    };
    return respondHealthy;
}
109457
+
109458
/**
 * Maintain the SSE subscription on which the API pushes requests to this
 * conduit, dispatching every "request" event to `handleRequest`.
 *
 * The loop runs until `signal` is aborted. When the connection attempt
 * rejects with a non-terminated error, the failure is logged and the next
 * attempt is delayed with exponential back-off (1s minimum, 30s maximum); the
 * attempt counter restarts once a connection lasted longer than 10 seconds.
 * Errors recognised by `isTerminatedError` reconnect immediately and silently.
 *
 * NOTE(review): a `connectSSE` call that resolves normally also reconnects
 * immediately, with no delay — confirm that is intended.
 *
 * @param {object} options
 * @param {string} options.apiURL - Base URL of the API.
 * @param {object} options.configuration - Reads `inferenceSourceID` and `apiKey`.
 * @param {object} options.logger
 * @param {string} options.modelID - Forwarded to `handleRequest`.
 * @param {Function} options.onRequest - Forwarded to `handleRequest`.
 * @param {Function} [options.onRequestEnd] - Forwarded to `handleRequest`.
 * @param {Function} [options.onRequestStart] - Forwarded to `handleRequest`.
 * @param {Function} options.reportMetrics - Forwarded to `handleRequest`.
 * @param {AbortSignal} [options.signal] - Stops the loop when aborted.
 * @returns {Promise<void>}
 */
async function handleSSERequests({ apiURL, configuration, logger, modelID, onRequest, onRequestEnd, onRequestStart, reportMetrics, signal }) {
    const MAX_BACKOFF_MS = 30000;
    const endpoint = `${apiURL}/conduit/api/v1/source/${configuration.inferenceSourceID}/requests/stream`;

    const reportConnectionError = (error) => {
        logger.error("SSE connection error", {
            error
        });
    };

    const dispatch = (message) => {
        if (message.event !== "request") {
            return;
        }
        const payload = ServerToClientAPIRequestSchema.parse(JSON.parse(message.data));
        // Requests are handled concurrently; a failure is logged, never thrown
        // back into the SSE reader.
        handleRequest({
            apiURL,
            configuration,
            logger,
            modelID,
            onRequest,
            onRequestEnd,
            onRequestStart,
            reportMetrics,
            request: payload,
            signal
        }).catch(error => {
            logger.error("SSE request handler failed", {
                error: asError(error),
                requestMethod: payload.requestID
            });
        });
    };

    let failedAttempts = 0;
    for (;;) {
        if (signal?.aborted) {
            return;
        }
        const connectedAt = Date.now();
        try {
            await connectSSE(endpoint, {
                headers: {
                    "x-api-key": configuration.apiKey
                },
                onError: reportConnectionError,
                onMessage: dispatch,
                signal
            });
        }
        catch (failure) {
            if (signal?.aborted) {
                return;
            }
            if (isTerminatedError(failure)) {
                // Terminated connections are expected; reconnect right away.
                continue;
            }
            logger.error("SSE connection failed", {
                ...getNetworkErrorAttributes(failure),
                error: asError(failure)
            });
            if (signal?.aborted) {
                return;
            }
            const connectionLifetimeMs = Date.now() - connectedAt;
            failedAttempts = connectionLifetimeMs > 10000 ? 0 : failedAttempts + 1;
            const exponent = Math.min(6, failedAttempts);
            const backoffMs = Math.min(MAX_BACKOFF_MS, Math.max(1000, 1000 * 2 ** exponent));
            logger.warn("SSE disconnected, retrying");
            await sleep(backoffMs);
        }
    }
}
109524
/**
 * Execute one request received over SSE: run it through `onRequest`, stream
 * the response back to the API, and report metrics for the attempt.
 *
 * On any failure a two-chunk 502 "Bad gateway" response is sent instead.
 * Failure metrics are reported even when delivering that 502 fails as well;
 * the delivery error still propagates to the caller. `onRequestEnd` always
 * runs last.
 *
 * Token counts are reported as 0 here; this path does not inspect the payload.
 *
 * @param {object} options
 * @param {string} options.apiURL
 * @param {object} options.configuration - Reads `agentEngineType`; forwarded to the chunk stream.
 * @param {object} options.logger
 * @param {string} options.modelID
 * @param {(request: object) => Promise<object>} options.onRequest - Produces the response to stream back.
 * @param {(request: object) => unknown} [options.onRequestEnd]
 * @param {(request: object) => unknown} [options.onRequestStart]
 * @param {(payload: object) => Promise<unknown>} options.reportMetrics - Failures are only logged.
 * @param {object} options.request - Reads `requestID`, `method`, `path` and `body`.
 * @param {AbortSignal} [options.signal]
 * @returns {Promise<void>}
 */
async function handleRequest({ apiURL, configuration, logger, modelID, onRequest, onRequestEnd, onRequestStart, reportMetrics, request, signal }) {
    // Metrics upload is fire-and-forget: a failed upload must never fail the request.
    function reportMetricsSafe(payload) {
        reportMetrics(payload).catch(error => {
            logger.warn("Failed to upload LLM prompt metrics", {
                error: asError(error),
                requestUrl: request.path
            });
        });
    }

    const requestStartedAt = Date.now();
    const requestBytes = calculateRequestBytes(request.body ?? null);
    const elapsedMs = () => Math.max(0, Date.now() - requestStartedAt);

    // Single place that shapes the metrics payload for both outcomes.
    const buildMetrics = ({ latencyMs, responseBytes, successful, timeToFirstTokenMs }) => {
        const totalTokens = 0;
        return {
            bytes: requestBytes + responseBytes,
            completionTokens: 0,
            engine: configuration.agentEngineType,
            endpointId: null,
            latencyMs,
            modelId: modelID,
            promptTokens: 0,
            requestBytes,
            requestId: request.requestID,
            requestMethod: request.method,
            requestPath: request.path,
            responseBytes,
            successful,
            timeToFirstTokenMs,
            tokensPerSecond: calculateTokensPerSecond({
                durationMs: latencyMs,
                totalTokens
            }),
            totalTokens
        };
    };

    try {
        await onRequestStart?.(request);
        const response = await onRequest(request);
        const responseMetrics = await streamResponse({
            apiURL,
            configuration,
            logger,
            requestID: request.requestID,
            requestStartedAt,
            response,
            signal
        });
        reportMetricsSafe(buildMetrics({
            latencyMs: elapsedMs(),
            responseBytes: responseMetrics.responseBytes,
            successful: responseMetrics.status < 400,
            timeToFirstTokenMs: responseMetrics.timeToFirstTokenMs
        }));
    }
    catch (error) {
        logger.error("SSE request failed", {
            error: asError(error),
            requestMethod: request.requestID
        });
        const failureMessage = "Bad gateway\n\nProxying failed";
        const failureBytes = Buffer.byteLength(failureMessage, "utf8");
        const latencyMs = elapsedMs();
        try {
            const streamHandler = await sendChunkStream({
                apiURL,
                configuration,
                requestID: request.requestID,
                logger
            });
            await streamHandler.sendChunk({
                data: encodeBinaryChunk(Buffer.from(failureMessage)),
                sequence: 0,
                status: 502
            });
            await streamHandler.sendChunk({
                data: null,
                sequence: 1,
                status: 502
            });
            await streamHandler.end();
        }
        finally {
            // Runs whether or not the 502 could be delivered, so a failed
            // request is never missing from the metrics.
            reportMetricsSafe(buildMetrics({
                latencyMs,
                responseBytes: failureBytes,
                successful: false,
                timeToFirstTokenMs: latencyMs
            }));
        }
    }
    finally {
        await onRequestEnd?.(request);
    }
}
109625
/**
 * Send a handler response back to the API as a sequence of numbered chunks,
 * finishing with a `data: null` chunk, and return transfer statistics.
 *
 * - A `Readable` body is forwarded chunk by chunk. If `signal` is aborted
 *   while iterating, the chunk stream is aborted and "Request cancelled" is thrown.
 * - Any other body is sent as one chunk: strings verbatim, other truthy values
 *   JSON-encoded, falsy values as an empty payload. Only this branch attaches
 *   `response.headers` to the chunk.
 *
 * NOTE(review): the streaming branch never forwards `response.headers` —
 * confirm the receiving side does not need them.
 *
 * @param {object} options
 * @param {string} options.apiURL
 * @param {object} options.configuration
 * @param {object} options.logger
 * @param {string} options.requestID
 * @param {number} options.requestStartedAt - Epoch milliseconds used as the latency baseline.
 * @param {object} options.response - Reads `body`, `headers` and `status`.
 * @param {AbortSignal} [options.signal]
 * @returns {Promise<{ responseBytes: number, status: number, timeToFirstTokenMs: number|null }>}
 */
async function streamResponse({ apiURL, configuration, logger, requestID, requestStartedAt, response, signal }) {
    const elapsedSinceStart = () => Math.max(0, Date.now() - requestStartedAt);
    let nextSequence = 0;
    let bytesSent = 0;
    let firstByteMs = null;

    const uplink = await sendChunkStream({
        apiURL,
        configuration,
        requestID,
        logger
    });

    // Every response ends with an empty terminator chunk followed by a flush.
    const terminate = async (sequence) => {
        await uplink.sendChunk({
            data: null,
            sequence,
            status: response.status
        });
        await uplink.end();
    };
    const summarise = () => ({
        responseBytes: bytesSent,
        status: response.status,
        timeToFirstTokenMs: firstByteMs
    });

    if (response.body instanceof Readable) {
        for await (const piece of response.body) {
            if (signal?.aborted) {
                uplink.abort();
                throw new Error("Request cancelled");
            }
            const bytes = Buffer.isBuffer(piece) ? piece : Buffer.from(piece);
            if (firstByteMs === null) {
                firstByteMs = elapsedSinceStart();
            }
            bytesSent += bytes.length;
            await uplink.sendChunk({
                data: encodeBinaryChunk(bytes),
                sequence: nextSequence,
                status: response.status
            });
            nextSequence += 1;
        }
        await terminate(nextSequence);
        return summarise();
    }

    let text = "";
    if (response.body) {
        text = typeof response.body === "string" ? response.body : JSON.stringify(response.body);
    }
    if (text.length > 0) {
        bytesSent = Buffer.byteLength(text, "utf8");
        firstByteMs = elapsedSinceStart();
    }
    await uplink.sendChunk({
        data: encodeBinaryChunk(Buffer.from(text)),
        headers: response.headers,
        sequence: nextSequence,
        status: response.status
    });
    await terminate(nextSequence + 1);
    logger.info("SSE response queued", {
        requestMethod: requestID
    });
    return summarise();
}
109697
/**
 * Encode a binary chunk as base64 text so it can travel inside a JSON payload.
 *
 * @param {Buffer} chunk
 * @returns {string} Base64 representation of `chunk`.
 */
function encodeBinaryChunk(chunk) {
    const encoded = chunk.toString("base64");
    return encoded;
}
109700
/**
 * Create a batching uploader for the response chunks of one request.
 *
 * Chunks are validated, serialised as newline-delimited JSON and buffered.
 * The buffer is POSTed to the API once it holds 10 chunks, and again on
 * `end()`. A failed POST is retried up to 3 times with a short linear
 * back-off; after the last attempt the batch is put back at the front of the
 * buffer and the error is thrown.
 *
 * `abort()` cancels the in-flight POST, drops buffered chunks and turns all
 * later calls into no-ops. The abort flag is re-checked before every attempt,
 * so an abort arriving during the retry back-off does not trigger another POST.
 *
 * @param {object} options
 * @param {string} options.apiURL
 * @param {object} options.configuration - Reads `inferenceSourceID` and `apiKey`.
 * @param {string} options.requestID
 * @param {object} options.logger
 * @returns {Promise<{ sendChunk: Function, end: Function, abort: Function }>}
 */
async function sendChunkStream({ apiURL, configuration, requestID, logger }) {
    const streamURL = `${apiURL}/conduit/api/v1/source/${configuration.inferenceSourceID}/requests/${requestID}/stream`;
    const maxFlushAttempts = 3;
    const flushThreshold = 10;
    let isAborted = false;
    let isClosed = false;
    let activeAbortController = null;
    const chunks = [];

    const flushChunks = async () => {
        if (chunks.length === 0 || isAborted) {
            return;
        }
        const batch = chunks.splice(0, chunks.length);
        // Serialise once; every retry sends the same payload.
        const body = Buffer.concat(batch).toString();
        for (let attempt = 1; attempt <= maxFlushAttempts; attempt += 1) {
            if (isAborted) {
                // Aborted while waiting between attempts: send nothing more.
                return;
            }
            try {
                activeAbortController = new AbortController();
                const response = await fetchWithRetry(streamURL, {
                    body,
                    headers: {
                        "content-type": "application/json",
                        "x-api-key": configuration.apiKey
                    },
                    method: "POST",
                    signal: activeAbortController.signal
                }, {
                    maxAttempts: 2,
                    timeoutMs: 15000
                });
                if (!response.ok) {
                    throw new Error(`Chunk stream flush failed with status ${response.status}`);
                }
                return;
            }
            catch (error) {
                if (isAborted) {
                    return;
                }
                if (attempt >= maxFlushAttempts) {
                    // Put the batch back so the buffer still reflects unsent data.
                    chunks.unshift(...batch);
                    throw asError(error);
                }
                logger.warn("Failed to send chunk batch", {
                    ...getNetworkErrorAttributes(error),
                    error: asError(error)
                });
                await sleep(100 * attempt);
            }
            finally {
                activeAbortController = null;
            }
        }
    };

    const sendChunk = async (payload) => {
        if (isAborted || isClosed) {
            return;
        }
        const response = ClientToServerAPIResponseSchema.parse({
            data: payload.data,
            headers: payload.headers,
            requestID,
            status: payload.status
        });
        const chunk = JSON.stringify({
            ...response,
            sequence: payload.sequence
        });
        chunks.push(Buffer.from(chunk + "\n"));
        if (chunks.length >= flushThreshold) {
            await flushChunks();
        }
    };

    const end = async () => {
        if (isClosed || isAborted) {
            return;
        }
        await flushChunks();
        isClosed = true;
    };

    const abort = (error) => {
        isAborted = true;
        if (activeAbortController) {
            activeAbortController.abort();
        }
        chunks.length = 0;
        if (error) {
            logger.error("Chunk stream aborted", {
                error: asError(error)
            });
        }
    };

    return {
        sendChunk,
        end,
        abort
    };
}
109796
/**
 * Measure the UTF-8 size of a request body.
 *
 * @param {unknown} body - `null`/`undefined` count as 0 bytes; strings are
 *   measured as-is; anything else is measured after JSON encoding.
 * @returns {number} Size in bytes.
 */
function calculateRequestBytes(body) {
    if (body == null) {
        return 0;
    }
    const text = typeof body === "string" ? body : JSON.stringify(body);
    return Buffer.byteLength(text, "utf8");
}
109805
/**
 * Convert a token count and a duration into a rounded tokens-per-second rate.
 *
 * @param {object} options
 * @param {number} options.durationMs - Elapsed time in milliseconds.
 * @param {number} options.totalTokens - Tokens produced during that time.
 * @returns {number} Rounded rate, or 0 when the duration is not positive or
 *   the rate is not a positive finite number.
 */
function calculateTokensPerSecond({ durationMs, totalTokens }) {
    const rate = durationMs > 0 ? totalTokens / (durationMs / 1000) : 0;
    const isUsable = Number.isFinite(rate) && rate > 0;
    return isUsable ? Math.round(rate) : 0;
}
109815
+
109816
/**
 * Proxy a server request to the local inference HTTP server.
 *
 * `:name` placeholders in `request.path` are substituted from
 * `request.parameters`, `request.query` becomes the query string, and object
 * bodies are JSON-encoded. The target is always
 * `http://localhost:<configuration.port>`.
 *
 * NOTE(review): parameter values are inserted without URL-encoding and only
 * the first occurrence of each placeholder is replaced — confirm callers
 * never pass values containing "/" or "?".
 *
 * @param {object} options
 * @param {object} options.configuration - Reads `port`.
 * @param {object} options.request - Reads `path`, `parameters`, `query`,
 *   `method`, `headers`, `body` and `requestID`.
 * @returns {Promise<{ body: import('node:stream').Readable|null, headers: object, requestID: string, status: number }>}
 */
async function proxyRequest({ configuration, request }) {
    const substitutions = request.parameters ? Object.entries(request.parameters) : [];
    const resolvedPath = substitutions.reduce(
        (path, [key, value]) => path.replace(`:${key}`, String(value)),
        request.path
    );

    const target = new URL(resolvedPath, `http://localhost:${configuration.port}`);
    if (request.query) {
        Object.entries(request.query).forEach(([key, value]) => {
            target.searchParams.set(key, value);
        });
    }

    const init = {
        method: request.method,
        headers: request.headers,
        // A body key is only present when the request actually carries one.
        ...(request.body
            ? { body: typeof request.body === "object" ? JSON.stringify(request.body) : request.body }
            : {})
    };

    const upstream = await undiciExports.fetch(target, init);
    const body = upstream.body ? Readable.fromWeb(upstream.body) : null;
    return {
        body,
        headers: Object.fromEntries(upstream.headers.entries()),
        requestID: request.requestID,
        status: upstream.status
    };
}
109848
+
109849
+ class ConduitStateReportManager {
109850
+ apiClient;
109851
+ conduitStateManager;
109852
+ downloadProgressReportIntervalMs;
109853
+ logger;
109854
+ stateIntervalMs;
109855
+ conduitStateReportInFlight = false;
109856
+ lastConduitStateReportAt = 0;
109857
+ pendingConduitStateReport = null;
109858
+ stateInterval = null;
109859
+ constructor({ apiClient, conduitStateManager, downloadProgressReportIntervalMs, logger, stateIntervalMs }) {
109860
+ this.apiClient = apiClient;
109861
+ this.conduitStateManager = conduitStateManager;
109862
+ this.downloadProgressReportIntervalMs = downloadProgressReportIntervalMs;
109863
+ this.logger = logger;
109864
+ this.stateIntervalMs = stateIntervalMs;
109865
+ }
109866
+ async start() {
109867
+ await this.sendConduitState();
109868
+ this.stateInterval = setInterval(() => {
109869
+ this.sendConduitState().catch(error => {
109870
+ this.logger.error("Conduit state update failed", {
109871
+ error: asError(error)
109872
+ });
109873
+ });
109874
+ }, this.stateIntervalMs);
109875
+ }
109876
+ stop() {
109877
+ if (this.stateInterval) {
109878
+ clearInterval(this.stateInterval);
109879
+ this.stateInterval = null;
109880
+ }
109881
+ if (this.pendingConduitStateReport) {
109882
+ clearTimeout(this.pendingConduitStateReport);
109883
+ this.pendingConduitStateReport = null;
109884
+ }
109885
+ }
109886
+ reportDownloadProgress() {
109887
+ this.scheduleConduitStateReport();
109888
+ }
109889
+ async reportNow() {
109890
+ if (this.pendingConduitStateReport) {
109891
+ clearTimeout(this.pendingConduitStateReport);
109892
+ this.pendingConduitStateReport = null;
109893
+ }
109894
+ await this.triggerConduitStateReport();
109895
+ }
109896
+ reportStateChange() {
109897
+ if (this.pendingConduitStateReport) {
109898
+ clearTimeout(this.pendingConduitStateReport);
109899
+ this.pendingConduitStateReport = null;
109900
+ }
109901
+ this.triggerConduitStateReport().catch(error => {
109902
+ this.logger.error("Conduit state update failed", {
109903
+ error: asError(error)
109904
+ });
109905
+ });
109906
+ }
109907
+ async sendConduitState() {
109908
+ try {
109909
+ await this.apiClient.reportConduitState(this.conduitStateManager.touch());
109910
+ this.lastConduitStateReportAt = Date.now();
109911
+ }
109912
+ catch (error) {
109913
+ this.logger.error("Conduit state update failed", {
109914
+ ...getNetworkErrorAttributes(error),
109915
+ error: asError(error)
109110
109916
  });
109111
109917
  }
109112
109918
  }
@@ -109171,7 +109977,7 @@ class ConduitStateManager {
109171
109977
  }
109172
109978
  }
109173
109979
 
109174
- function isPlainObject$1(value) {
109980
+ function isPlainObject(value) {
109175
109981
  if (typeof value !== 'object' || value === null) {
109176
109982
  return false;
109177
109983
  }
@@ -109208,7 +110014,7 @@ const normalizeFileUrl = file => file instanceof URL ? fileURLToPath(file) : fil
109208
110014
  // This also does basic validation on them and on the command file.
109209
110015
  const normalizeParameters = (rawFile, rawArguments = [], rawOptions = {}) => {
109210
110016
  const filePath = safeNormalizeFileUrl(rawFile, 'First argument');
109211
- const [commandArguments, options] = isPlainObject$1(rawArguments)
110017
+ const [commandArguments, options] = isPlainObject(rawArguments)
109212
110018
  ? [[], rawArguments]
109213
110019
  : [rawArguments, rawOptions];
109214
110020
 
@@ -109226,7 +110032,7 @@ const normalizeParameters = (rawFile, rawArguments = [], rawOptions = {}) => {
109226
110032
  throw new TypeError(`Arguments cannot contain null bytes ("\\0"): ${nullByteArgument}`);
109227
110033
  }
109228
110034
 
109229
- if (!isPlainObject$1(options)) {
110035
+ if (!isPlainObject(options)) {
109230
110036
  throw new TypeError(`Last argument must be an options object: ${options}`);
109231
110037
  }
109232
110038
 
@@ -109423,7 +110229,7 @@ const parseExpression = expression => {
109423
110229
  return String(expression);
109424
110230
  }
109425
110231
 
109426
- if (isPlainObject$1(expression) && ('stdout' in expression || 'isMaxBuffer' in expression)) {
110232
+ if (isPlainObject(expression) && ('stdout' in expression || 'isMaxBuffer' in expression)) {
109427
110233
  return getSubprocessResult(expression);
109428
110234
  }
109429
110235
 
@@ -109479,7 +110285,7 @@ const getStdioLength = ({stdio}) => Array.isArray(stdio)
109479
110285
  ? Math.max(stdio.length, STANDARD_STREAMS_ALIASES.length)
109480
110286
  : STANDARD_STREAMS_ALIASES.length;
109481
110287
 
109482
- const normalizeFdSpecificValue = (optionValue, optionArray, optionName) => isPlainObject$1(optionValue)
110288
+ const normalizeFdSpecificValue = (optionValue, optionArray, optionName) => isPlainObject(optionValue)
109483
110289
  ? normalizeOptionObject(optionValue, optionArray, optionName)
109484
110290
  : optionArray.fill(optionValue);
109485
110291
 
@@ -113962,13 +114768,13 @@ const checkBooleanOption = (value, optionName) => {
113962
114768
  const isGenerator = value => isAsyncGenerator(value) || isSyncGenerator(value);
113963
114769
  const isAsyncGenerator = value => Object.prototype.toString.call(value) === '[object AsyncGeneratorFunction]';
113964
114770
  const isSyncGenerator = value => Object.prototype.toString.call(value) === '[object GeneratorFunction]';
113965
- const isTransformOptions = value => isPlainObject$1(value)
114771
+ const isTransformOptions = value => isPlainObject(value)
113966
114772
  && (value.transform !== undefined || value.final !== undefined);
113967
114773
 
113968
114774
  const isUrl = value => Object.prototype.toString.call(value) === '[object URL]';
113969
114775
  const isRegularUrl = value => isUrl(value) && value.protocol !== 'file:';
113970
114776
 
113971
- const isFilePathObject = value => isPlainObject$1(value)
114777
+ const isFilePathObject = value => isPlainObject(value)
113972
114778
  && Object.keys(value).length > 0
113973
114779
  && Object.keys(value).every(key => FILE_PATH_KEYS.has(key))
113974
114780
  && isFilePathString(value.file);
@@ -114131,7 +114937,7 @@ const normalizeDuplex = ({
114131
114937
  };
114132
114938
 
114133
114939
  const normalizeTransformStream = ({stdioItem, stdioItem: {value}, index, newTransforms, direction}) => {
114134
- const {transform, objectMode} = isPlainObject$1(value) ? value : {transform: value};
114940
+ const {transform, objectMode} = isPlainObject(value) ? value : {transform: value};
114135
114941
  const {writableObjectMode, readableObjectMode} = getTransformObjectModes(objectMode, index, newTransforms, direction);
114136
114942
  return ({
114137
114943
  ...stdioItem,
@@ -114146,7 +114952,7 @@ const normalizeGenerator = ({stdioItem, stdioItem: {value}, index, newTransforms
114146
114952
  binary: binaryOption = false,
114147
114953
  preserveNewlines = false,
114148
114954
  objectMode,
114149
- } = isPlainObject$1(value) ? value : {transform: value};
114955
+ } = isPlainObject(value) ? value : {transform: value};
114150
114956
  const binary = binaryOption || BINARY_ENCODINGS.has(encoding);
114151
114957
  const {writableObjectMode, readableObjectMode} = getTransformObjectModes(objectMode, index, newTransforms, direction);
114152
114958
  return {
@@ -116909,7 +117715,7 @@ const unpipeOnSignalAbort = async (unpipeSignal, {sourceStream, mergedStream, fi
116909
117715
 
116910
117716
  // Pipe a subprocess' `stdout`/`stderr`/`stdio` into another subprocess' `stdin`
116911
117717
  const pipeToSubprocess = (sourceInfo, ...pipeArguments) => {
116912
- if (isPlainObject$1(pipeArguments[0])) {
117718
+ if (isPlainObject(pipeArguments[0])) {
116913
117719
  return pipeToSubprocess.bind(undefined, {
116914
117720
  ...sourceInfo,
116915
117721
  boundOptions: {...sourceInfo.boundOptions, ...pipeArguments[0]},
@@ -118109,7 +118915,7 @@ const mergeOptions = (boundOptions, options) => {
118109
118915
  };
118110
118916
 
118111
118917
  const mergeOption = (optionName, boundOptionValue, optionValue) => {
118112
- if (DEEP_OPTIONS.has(optionName) && isPlainObject$1(boundOptionValue) && isPlainObject$1(optionValue)) {
118918
+ if (DEEP_OPTIONS.has(optionName) && isPlainObject(boundOptionValue) && isPlainObject(optionValue)) {
118113
118919
  return {...boundOptionValue, ...optionValue};
118114
118920
  }
118115
118921
 
@@ -118141,7 +118947,7 @@ const createExeca = (mapArguments, boundOptions, deepOptions, setBoundExeca) =>
118141
118947
  };
118142
118948
 
118143
118949
  const callBoundExeca = ({mapArguments, deepOptions = {}, boundOptions = {}, setBoundExeca, createNested}, firstArgument, ...nextArguments) => {
118144
- if (isPlainObject$1(firstArgument)) {
118950
+ if (isPlainObject(firstArgument)) {
118145
118951
  return createNested(mapArguments, mergeOptions(boundOptions, firstArgument), setBoundExeca);
118146
118952
  }
118147
118953
 
@@ -118172,795 +118978,193 @@ const parseArguments = ({mapArguments, firstArgument, nextArguments, deepOptions
118172
118978
  return {
118173
118979
  file,
118174
118980
  commandArguments,
118175
- options,
118176
- isSync,
118177
- };
118178
- };
118179
-
118180
- // Main logic for `execaCommand()`
118181
- const mapCommandAsync = ({file, commandArguments}) => parseCommand(file, commandArguments);
118182
-
118183
- // Main logic for `execaCommandSync()`
118184
- const mapCommandSync = ({file, commandArguments}) => ({...parseCommand(file, commandArguments), isSync: true});
118185
-
118186
- // Convert `execaCommand(command)` into `execa(file, ...commandArguments)`
118187
- const parseCommand = (command, unusedArguments) => {
118188
- if (unusedArguments.length > 0) {
118189
- throw new TypeError(`The command and its arguments must be passed as a single string: ${command} ${unusedArguments}.`);
118190
- }
118191
-
118192
- const [file, ...commandArguments] = parseCommandString(command);
118193
- return {file, commandArguments};
118194
- };
118195
-
118196
- // Convert `command` string into an array of file or arguments to pass to $`${...fileOrCommandArguments}`
118197
- const parseCommandString = command => {
118198
- if (typeof command !== 'string') {
118199
- throw new TypeError(`The command must be a string: ${String(command)}.`);
118200
- }
118201
-
118202
- const trimmedCommand = command.trim();
118203
- if (trimmedCommand === '') {
118204
- return [];
118205
- }
118206
-
118207
- const tokens = [];
118208
- for (const token of trimmedCommand.split(SPACES_REGEXP)) {
118209
- // Allow spaces to be escaped by a backslash if not meant as a delimiter
118210
- const previousToken = tokens.at(-1);
118211
- if (previousToken && previousToken.endsWith('\\')) {
118212
- // Merge previous token with current one
118213
- tokens[tokens.length - 1] = `${previousToken.slice(0, -1)} ${token}`;
118214
- } else {
118215
- tokens.push(token);
118216
- }
118217
- }
118218
-
118219
- return tokens;
118220
- };
118221
-
118222
- const SPACES_REGEXP = / +/g;
118223
-
118224
- // Sets `$.sync` and `$.s`
118225
- const setScriptSync = (boundExeca, createNested, boundOptions) => {
118226
- boundExeca.sync = createNested(mapScriptSync, boundOptions);
118227
- boundExeca.s = boundExeca.sync;
118228
- };
118229
-
118230
- // Main logic for `$`
118231
- const mapScriptAsync = ({options}) => getScriptOptions(options);
118232
-
118233
- // Main logic for `$.sync`
118234
- const mapScriptSync = ({options}) => ({...getScriptOptions(options), isSync: true});
118235
-
118236
- // `$` is like `execa` but with script-friendly options: `{stdin: 'inherit', preferLocal: true}`
118237
- const getScriptOptions = options => ({options: {...getScriptStdinOption(options), ...options}});
118238
-
118239
- const getScriptStdinOption = ({input, inputFile, stdio}) => input === undefined && inputFile === undefined && stdio === undefined
118240
- ? {stdin: 'inherit'}
118241
- : {};
118242
-
118243
- // When using $(...).pipe(...), most script-friendly options should apply to both commands.
118244
- // However, some options (like `stdin: 'inherit'`) would create issues with piping, i.e. cannot be deep.
118245
- const deepScriptOptions = {preferLocal: true};
118246
-
118247
- const execa = createExeca(() => ({}));
118248
- createExeca(() => ({isSync: true}));
118249
- createExeca(mapCommandAsync);
118250
- createExeca(mapCommandSync);
118251
- createExeca(mapNode);
118252
- createExeca(mapScriptAsync, {}, deepScriptOptions, setScriptSync);
118253
-
118254
- getIpcExport();
118255
-
118256
- const MACHINE_ID_PATHS = ["/etc/machine-id", "/var/lib/dbus/machine-id"];
118257
- async function readMachineIdentifier() {
118258
- for (const path of MACHINE_ID_PATHS) {
118259
- try {
118260
- const contents = await readFile(path, "utf8");
118261
- const trimmed = contents.trim();
118262
- if (trimmed.length > 0) {
118263
- return trimmed;
118264
- }
118265
- }
118266
- catch {
118267
- // Ignore and continue to next candidate
118268
- }
118269
- }
118270
- return os.hostname();
118271
- }
118272
- async function detectLlamaCppVersion() {
118273
- try {
118274
- const { stdout } = await execa("llama-server", ["--version"]);
118275
- const versionLine = stdout.trim();
118276
- return versionLine.length > 0 ? (versionLine.split("\n")[0] ?? null) : null;
118277
- }
118278
- catch {
118279
- return null;
118280
- }
118281
- }
118282
- async function detectVLLMVersion() {
118283
- try {
118284
- const { stdout } = await execa("python3", [
118285
- "-c",
118286
- "import importlib.metadata as md; print(md.version('vllm'))"
118287
- ]);
118288
- const version = stdout.trim();
118289
- return version.length > 0 ? version : null;
118290
- }
118291
- catch {
118292
- return null;
118293
- }
118294
- }
118295
- function normalizeMegabytes(value) {
118296
- if (typeof value !== "number" || Number.isNaN(value)) {
118297
- return null;
118298
- }
118299
- return Math.round(value * 1024 * 1024);
118300
- }
118301
- function resolveCpuValue(value) {
118302
- if (typeof value === "number" && Number.isFinite(value)) {
118303
- return value;
118304
- }
118305
- if (typeof value === "string") {
118306
- const parsed = Number(value);
118307
- return Number.isFinite(parsed) ? parsed : null;
118308
- }
118309
- return null;
118310
- }
118311
- async function collectMachineMetadata() {
118312
- const [cpuResult, memResult, osResult, graphicsResult] = await Promise.allSettled([
118313
- si.cpu(),
118314
- si.mem(),
118315
- si.osInfo(),
118316
- si.graphics()
118317
- ]);
118318
- const cpuInfo = cpuResult.status === "fulfilled" ? cpuResult.value : null;
118319
- const memInfo = memResult.status === "fulfilled" ? memResult.value : null;
118320
- const osInfo = osResult.status === "fulfilled" ? osResult.value : null;
118321
- const graphicsInfo = graphicsResult.status === "fulfilled"
118322
- ? graphicsResult.value
118323
- : { controllers: [] };
118324
- const gpus = (graphicsInfo.controllers ?? []).map((controller) => ({
118325
- bus: controller.bus ?? null,
118326
- driverVersion: controller.driverVersion ?? null,
118327
- memoryFreeBytes: normalizeMegabytes(controller.memoryFree ?? null),
118328
- memoryTotalBytes: normalizeMegabytes(controller.memoryTotal ?? null),
118329
- model: controller.model ?? controller.name ?? null,
118330
- temperatureCelsius: controller.temperatureGpu ?? null,
118331
- vendor: controller.vendor ?? null
118332
- }));
118333
- const machineMetadata = {
118334
- cpu: {
118335
- baseClockGHz: resolveCpuValue(cpuInfo?.speed ?? null),
118336
- logicalCores: cpuInfo?.cores ?? null,
118337
- maxClockGHz: resolveCpuValue(cpuInfo?.speedMax ?? null),
118338
- model: cpuInfo?.brand ?? null,
118339
- physicalCores: cpuInfo?.physicalCores ?? null
118340
- },
118341
- gpus,
118342
- hostname: os.hostname(),
118343
- llamaCppVersion: await detectLlamaCppVersion(),
118344
- machineID: await readMachineIdentifier(),
118345
- memory: {
118346
- availableBytes: memInfo?.available ?? null,
118347
- totalBytes: memInfo?.total ?? null
118348
- },
118349
- os: {
118350
- arch: osInfo?.arch ?? os.arch(),
118351
- platform: osInfo?.platform ?? os.platform(),
118352
- release: osInfo?.release ?? os.release(),
118353
- type: osInfo?.kernel ?? null,
118354
- version: osInfo?.build ?? null
118355
- },
118356
- vllmVersion: await detectVLLMVersion()
118357
- };
118358
- return machineMetadata;
118359
- }
118360
-
118361
- /**
118362
- * Coerce non-string tool_calls function.arguments to JSON strings.
118363
- * Some LLM backends return arguments as parsed objects instead of
118364
- * JSON strings, violating the OpenAI spec. This mutates in place
118365
- * and returns true if any coercion was performed.
118366
- */
118367
- function coerceToolCallArguments(parsed) {
118368
- const choices = parsed.choices;
118369
- if (!Array.isArray(choices))
118370
- return false;
118371
- let modified = false;
118372
- for (const choice of choices) {
118373
- if (!choice || typeof choice !== "object")
118374
- continue;
118375
- const choiceRecord = choice;
118376
- const msg = choiceRecord.delta ?? choiceRecord.message;
118377
- if (!msg || typeof msg !== "object")
118378
- continue;
118379
- const toolCalls = msg.tool_calls;
118380
- if (!Array.isArray(toolCalls))
118381
- continue;
118382
- for (const tc of toolCalls) {
118383
- if (!tc || typeof tc !== "object")
118384
- continue;
118385
- const fn = tc.function;
118386
- if (!fn || typeof fn !== "object")
118387
- continue;
118388
- const fnRecord = fn;
118389
- if (fnRecord.arguments !== undefined && typeof fnRecord.arguments !== "string") {
118390
- fnRecord.arguments = JSON.stringify(fnRecord.arguments);
118391
- modified = true;
118392
- }
118393
- }
118394
- }
118395
- return modified;
118396
- }
118397
- function isEngineUsageChunk(value) {
118398
- if (!value || typeof value !== "object") {
118399
- return false;
118400
- }
118401
- const record = value;
118402
- if (!record.usage || typeof record.usage !== "object") {
118403
- return false;
118404
- }
118405
- return true;
118406
- }
118407
- function monitorEngineResponseStream({ agentEngineType, body, contextLength, engine, logger, onComplete, parallelism, requestBodyBytes, requestPath, requestStartedAt }) {
118408
- const startedAt = requestStartedAt ?? Date.now();
118409
- const passThrough = new PassThrough();
118410
- let responseBytes = 0;
118411
- let firstChunkAt = null;
118412
- let usage = null;
118413
- let buffer = "";
118414
- let completed = false;
118415
- function modifyChunkWithUsage(chunk) {
118416
- const text = chunk.toString("utf8");
118417
- const lines = text.split("\n");
118418
- const modifiedLines = [];
118419
- for (const rawLine of lines) {
118420
- const line = rawLine.trim();
118421
- if (!line.startsWith("data:")) {
118422
- modifiedLines.push(rawLine);
118423
- continue;
118424
- }
118425
- const payload = line.slice(5).trim();
118426
- if (!payload || payload === "[DONE]") {
118427
- modifiedLines.push(rawLine);
118428
- continue;
118429
- }
118430
- try {
118431
- const parsed = JSON.parse(payload);
118432
- let modified = false;
118433
- if (coerceToolCallArguments(parsed)) {
118434
- modified = true;
118435
- }
118436
- if (parsed.usage) {
118437
- const usageChunk = parsed.usage;
118438
- const effectiveContext = getEffectiveContextLength({
118439
- contextLength,
118440
- engine,
118441
- parallelism
118442
- });
118443
- if (usageChunk.context_usage === undefined &&
118444
- usageChunk.prompt_tokens !== undefined &&
118445
- effectiveContext !== null) {
118446
- usageChunk.context_usage = usageChunk.prompt_tokens / effectiveContext;
118447
- modified = true;
118448
- }
118449
- }
118450
- if (modified) {
118451
- modifiedLines.push("data: " + JSON.stringify(parsed));
118452
- continue;
118453
- }
118454
- }
118455
- catch (_error) {
118456
- // Ignore malformed chunks
118457
- }
118458
- modifiedLines.push(rawLine);
118459
- }
118460
- return Buffer.from(modifiedLines.join("\n"), "utf8");
118461
- }
118462
- function parseUsageFromBuffer() {
118463
- const lines = buffer.split("\n");
118464
- buffer = lines.pop() ?? "";
118465
- for (const rawLine of lines) {
118466
- const line = rawLine.trim();
118467
- if (!line.startsWith("data:")) {
118468
- continue;
118469
- }
118470
- const payload = line.slice(5).trim();
118471
- if (!payload || payload === "[DONE]") {
118472
- continue;
118473
- }
118474
- try {
118475
- const parsed = JSON.parse(payload);
118476
- if (isEngineUsageChunk(parsed)) {
118477
- const completionTokens = parsed.usage?.completion_tokens ?? null;
118478
- const promptTokens = parsed.usage?.prompt_tokens ?? null;
118479
- const totalTokens = parsed.usage?.total_tokens ?? null;
118480
- let contextUsage = parsed.usage?.context_usage ?? null;
118481
- const effectiveContextForUsage = getEffectiveContextLength({
118482
- contextLength,
118483
- engine,
118484
- parallelism
118485
- });
118486
- if (contextUsage === null &&
118487
- promptTokens !== null &&
118488
- effectiveContextForUsage !== null) {
118489
- contextUsage = promptTokens / effectiveContextForUsage;
118490
- }
118491
- usage = {
118492
- completionTokens,
118493
- contextUsage,
118494
- promptTokens,
118495
- totalTokens
118496
- };
118497
- }
118498
- }
118499
- catch (_error) {
118500
- // Ignore malformed chunks
118501
- }
118502
- }
118503
- }
118504
- function finalize(error) {
118505
- if (completed) {
118506
- return;
118507
- }
118508
- completed = true;
118509
- if (onComplete) {
118510
- const completion = onComplete({
118511
- durationMs: Math.max(0, Date.now() - startedAt),
118512
- error,
118513
- requestBodyBytes,
118514
- responseBytes,
118515
- timeToFirstTokenMs: firstChunkAt === null ? null : Math.max(0, firstChunkAt - startedAt),
118516
- usage
118517
- });
118518
- if (completion && typeof completion.catch === "function") {
118519
- completion.catch(error => {
118520
- logger.error("Engine metrics completion failed", {
118521
- error: asError(error),
118522
- requestUrl: requestPath
118523
- });
118524
- });
118525
- }
118526
- }
118527
- }
118528
- body.on("data", (chunk) => {
118529
- const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
118530
- if (firstChunkAt === null) {
118531
- firstChunkAt = Date.now();
118532
- }
118533
- responseBytes += chunkBuffer.length;
118534
- buffer += chunkBuffer.toString("utf8");
118535
- parseUsageFromBuffer();
118536
- passThrough.write(modifyChunkWithUsage(chunkBuffer));
118537
- });
118538
- body.once("error", err => {
118539
- logEngineMetrics({
118540
- agentEngineType,
118541
- error: err,
118542
- level: "error",
118543
- logger,
118544
- requestBodyBytes,
118545
- requestPath,
118546
- responseBytes,
118547
- usage
118548
- });
118549
- finalize(err);
118550
- passThrough.destroy(err);
118551
- });
118552
- body.once("end", () => {
118553
- parseUsageFromBuffer();
118554
- logEngineMetrics({
118555
- agentEngineType,
118556
- level: "info",
118557
- logger,
118558
- requestBodyBytes,
118559
- requestPath,
118560
- responseBytes,
118561
- usage
118562
- });
118563
- finalize(null);
118564
- passThrough.end();
118565
- });
118566
- body.once("close", () => {
118567
- if (completed) {
118568
- if (!passThrough.writableEnded) {
118569
- passThrough.end();
118570
- }
118571
- return;
118572
- }
118573
- const closeError = new Error("Engine response stream closed before completion");
118574
- logEngineMetrics({
118575
- agentEngineType,
118576
- error: closeError,
118577
- level: "error",
118578
- logger,
118579
- requestBodyBytes,
118580
- requestPath,
118581
- responseBytes,
118582
- usage
118583
- });
118584
- finalize(closeError);
118585
- if (!passThrough.writableEnded) {
118586
- passThrough.end();
118587
- }
118588
- });
118589
- return {
118590
- stream: passThrough
118591
- };
118592
- }
118593
- function monitorEngineResponseSingle({ agentEngineType, body, contextLength, engine, logger, onComplete, parallelism, requestBodyBytes, requestPath, requestStartedAt }) {
118594
- const maxUsageCaptureBytes = 1024 * 1024;
118595
- const startedAt = requestStartedAt ?? Date.now();
118596
- const passThrough = new PassThrough();
118597
- let responseBytes = 0;
118598
- let firstChunkAt = null;
118599
- let usage = null;
118600
- const usageChunks = [];
118601
- let usageBytes = 0;
118602
- let usageCaptureEnabled = true;
118603
- let completed = false;
118604
- function finalize(error) {
118605
- if (completed) {
118606
- return;
118607
- }
118608
- completed = true;
118609
- if (onComplete) {
118610
- const completion = onComplete({
118611
- durationMs: Math.max(0, Date.now() - startedAt),
118612
- error,
118613
- requestBodyBytes,
118614
- responseBytes,
118615
- timeToFirstTokenMs: firstChunkAt === null ? null : Math.max(0, firstChunkAt - startedAt),
118616
- usage
118617
- });
118618
- if (completion && typeof completion.catch === "function") {
118619
- completion.catch(error => {
118620
- logger.error("Engine metrics completion failed", {
118621
- error: asError(error),
118622
- requestUrl: requestPath
118623
- });
118624
- });
118625
- }
118626
- }
118627
- }
118628
- body.on("data", (chunk) => {
118629
- const chunkBuffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
118630
- if (firstChunkAt === null) {
118631
- firstChunkAt = Date.now();
118632
- }
118633
- responseBytes += chunkBuffer.length;
118634
- if (usageCaptureEnabled) {
118635
- const nextSize = usageBytes + chunkBuffer.length;
118636
- if (nextSize <= maxUsageCaptureBytes) {
118637
- usageChunks.push(chunkBuffer);
118638
- usageBytes = nextSize;
118639
- }
118640
- else {
118641
- usageCaptureEnabled = false;
118642
- usageChunks.length = 0;
118643
- }
118644
- }
118645
- passThrough.write(chunkBuffer);
118646
- });
118647
- body.once("error", err => {
118648
- logEngineMetrics({
118649
- agentEngineType,
118650
- error: err,
118651
- level: "error",
118652
- logger,
118653
- requestBodyBytes,
118654
- requestPath,
118655
- responseBytes,
118656
- usage
118657
- });
118658
- finalize(err);
118659
- passThrough.destroy(err);
118660
- });
118661
- body.once("end", () => {
118662
- if (usageCaptureEnabled) {
118663
- try {
118664
- const parsed = JSON.parse(Buffer.concat(usageChunks).toString("utf8"));
118665
- if (parsed.usage) {
118666
- const usageChunk = parsed.usage;
118667
- const completionTokens = usageChunk.completion_tokens ?? null;
118668
- const promptTokens = usageChunk.prompt_tokens ?? null;
118669
- const totalTokens = usageChunk.total_tokens ?? null;
118670
- let contextUsage = usageChunk.context_usage ?? null;
118671
- const effectiveContext = getEffectiveContextLength({
118672
- contextLength,
118673
- engine,
118674
- parallelism
118675
- });
118676
- if (contextUsage === null &&
118677
- promptTokens !== null &&
118678
- effectiveContext !== null) {
118679
- contextUsage = promptTokens / effectiveContext;
118680
- }
118681
- usage = {
118682
- completionTokens,
118683
- contextUsage,
118684
- promptTokens,
118685
- totalTokens
118686
- };
118687
- }
118688
- }
118689
- catch (error) {
118690
- logger.error("Failed to parse engine response body", {
118691
- error: asError(error),
118692
- requestUrl: requestPath
118693
- });
118694
- }
118695
- }
118696
- logEngineMetrics({
118697
- agentEngineType,
118698
- level: "info",
118699
- logger,
118700
- requestBodyBytes,
118701
- requestPath,
118702
- responseBytes,
118703
- usage
118704
- });
118705
- finalize(null);
118706
- passThrough.end();
118707
- });
118708
- body.once("close", () => {
118709
- if (completed) {
118710
- if (!passThrough.writableEnded) {
118711
- passThrough.end();
118981
+ options,
118982
+ isSync,
118983
+ };
118984
+ };
118985
+
118986
+ // Main logic for `execaCommand()`
118987
+ const mapCommandAsync = ({file, commandArguments}) => parseCommand(file, commandArguments);
118988
+
118989
+ // Main logic for `execaCommandSync()`
118990
+ const mapCommandSync = ({file, commandArguments}) => ({...parseCommand(file, commandArguments), isSync: true});
118991
+
118992
+ // Convert `execaCommand(command)` into `execa(file, ...commandArguments)`
118993
+ const parseCommand = (command, unusedArguments) => {
118994
+ if (unusedArguments.length > 0) {
118995
+ throw new TypeError(`The command and its arguments must be passed as a single string: ${command} ${unusedArguments}.`);
118996
+ }
118997
+
118998
+ const [file, ...commandArguments] = parseCommandString(command);
118999
+ return {file, commandArguments};
119000
+ };
119001
+
119002
+ // Convert `command` string into an array of file or arguments to pass to $`${...fileOrCommandArguments}`
119003
+ const parseCommandString = command => {
119004
+ if (typeof command !== 'string') {
119005
+ throw new TypeError(`The command must be a string: ${String(command)}.`);
119006
+ }
119007
+
119008
+ const trimmedCommand = command.trim();
119009
+ if (trimmedCommand === '') {
119010
+ return [];
119011
+ }
119012
+
119013
+ const tokens = [];
119014
+ for (const token of trimmedCommand.split(SPACES_REGEXP)) {
119015
+ // Allow spaces to be escaped by a backslash if not meant as a delimiter
119016
+ const previousToken = tokens.at(-1);
119017
+ if (previousToken && previousToken.endsWith('\\')) {
119018
+ // Merge previous token with current one
119019
+ tokens[tokens.length - 1] = `${previousToken.slice(0, -1)} ${token}`;
119020
+ } else {
119021
+ tokens.push(token);
119022
+ }
119023
+ }
119024
+
119025
+ return tokens;
119026
+ };
119027
+
119028
+ const SPACES_REGEXP = / +/g;
119029
+
119030
+ // Sets `$.sync` and `$.s`
119031
+ const setScriptSync = (boundExeca, createNested, boundOptions) => {
119032
+ boundExeca.sync = createNested(mapScriptSync, boundOptions);
119033
+ boundExeca.s = boundExeca.sync;
119034
+ };
119035
+
119036
+ // Main logic for `$`
119037
+ const mapScriptAsync = ({options}) => getScriptOptions(options);
119038
+
119039
+ // Main logic for `$.sync`
119040
+ const mapScriptSync = ({options}) => ({...getScriptOptions(options), isSync: true});
119041
+
119042
+ // `$` is like `execa` but with script-friendly options: `{stdin: 'inherit', preferLocal: true}`
119043
+ const getScriptOptions = options => ({options: {...getScriptStdinOption(options), ...options}});
119044
+
119045
+ const getScriptStdinOption = ({input, inputFile, stdio}) => input === undefined && inputFile === undefined && stdio === undefined
119046
+ ? {stdin: 'inherit'}
119047
+ : {};
119048
+
119049
+ // When using $(...).pipe(...), most script-friendly options should apply to both commands.
119050
+ // However, some options (like `stdin: 'inherit'`) would create issues with piping, i.e. cannot be deep.
119051
+ const deepScriptOptions = {preferLocal: true};
119052
+
119053
+ const execa = createExeca(() => ({}));
119054
+ createExeca(() => ({isSync: true}));
119055
+ createExeca(mapCommandAsync);
119056
+ createExeca(mapCommandSync);
119057
+ createExeca(mapNode);
119058
+ createExeca(mapScriptAsync, {}, deepScriptOptions, setScriptSync);
119059
+
119060
+ getIpcExport();
119061
+
119062
+ const MACHINE_ID_PATHS = ["/etc/machine-id", "/var/lib/dbus/machine-id"];
119063
+ async function readMachineIdentifier() {
119064
+ for (const path of MACHINE_ID_PATHS) {
119065
+ try {
119066
+ const contents = await readFile(path, "utf8");
119067
+ const trimmed = contents.trim();
119068
+ if (trimmed.length > 0) {
119069
+ return trimmed;
118712
119070
  }
118713
- return;
118714
119071
  }
118715
- const closeError = new Error("Engine response stream closed before completion");
118716
- logEngineMetrics({
118717
- agentEngineType,
118718
- error: closeError,
118719
- level: "error",
118720
- logger,
118721
- requestBodyBytes,
118722
- requestPath,
118723
- responseBytes,
118724
- usage
118725
- });
118726
- finalize(closeError);
118727
- if (!passThrough.writableEnded) {
118728
- passThrough.end();
119072
+ catch {
119073
+ // Ignore and continue to next candidate
118729
119074
  }
118730
- });
118731
- return {
118732
- stream: passThrough
118733
- };
118734
- }
118735
- function logEngineMetrics({ agentEngineType, error, level, logger, requestBodyBytes, requestPath, responseBytes, usage }) {
118736
- const metricsMessage = [
118737
- "LLM engine stream metrics",
118738
- `path=${requestPath}`,
118739
- `bytesTo=${requestBodyBytes}`,
118740
- `bytesFrom=${responseBytes}`,
118741
- `promptTokens=${usage?.promptTokens ?? "n/a"}`,
118742
- `completionTokens=${usage?.completionTokens ?? "n/a"}`,
118743
- `totalTokens=${usage?.totalTokens ?? "n/a"}`,
118744
- `contextUsage=${usage?.contextUsage ?? "n/a"}`
118745
- ].join(" ");
118746
- const attributes = {
118747
- agentEngineType,
118748
- requestUrl: requestPath
118749
- };
118750
- if (error) {
118751
- attributes.error = error;
118752
119075
  }
118753
- logger[level](metricsMessage, attributes);
118754
- }
118755
-
118756
- function isPlainObject(value) {
118757
- return typeof value === "object" && value !== null && !Array.isArray(value);
119076
+ return os.hostname();
118758
119077
  }
118759
- function serializeRequestBody(body) {
118760
- if (!isPlainObject(body)) {
118761
- const payload = typeof body === "string" ? body : JSON.stringify(body);
118762
- return {
118763
- bytes: Buffer.byteLength(payload, "utf8"),
118764
- payload
118765
- };
119078
+ async function detectLlamaCppVersion() {
119079
+ try {
119080
+ const { stdout } = await execa("llama-server", ["--version"]);
119081
+ const versionLine = stdout.trim();
119082
+ return versionLine.length > 0 ? (versionLine.split("\n")[0] ?? null) : null;
119083
+ }
119084
+ catch {
119085
+ return null;
118766
119086
  }
118767
- const requestPayload = { ...body };
118768
- const streamOptions = requestPayload.stream_options;
118769
- const normalizedStreamOptions = isPlainObject(streamOptions)
118770
- ? { ...streamOptions }
118771
- : {};
118772
- normalizedStreamOptions.include_usage = true;
118773
- requestPayload.stream_options = normalizedStreamOptions;
118774
- const payload = JSON.stringify(requestPayload);
118775
- return {
118776
- bytes: Buffer.byteLength(payload, "utf8"),
118777
- payload
118778
- };
118779
119087
  }
118780
- function calculateTokensPerSecond({ durationMs, totalTokens }) {
118781
- if (durationMs <= 0) {
118782
- return 0;
119088
+ async function detectVLLMVersion() {
119089
+ try {
119090
+ const { stdout } = await execa("python3", [
119091
+ "-c",
119092
+ "import importlib.metadata as md; print(md.version('vllm'))"
119093
+ ]);
119094
+ const version = stdout.trim();
119095
+ return version.length > 0 ? version : null;
118783
119096
  }
118784
- const tokensPerSecond = totalTokens / (durationMs / 1000);
118785
- if (!Number.isFinite(tokensPerSecond) || tokensPerSecond <= 0) {
118786
- return 0;
119097
+ catch {
119098
+ return null;
118787
119099
  }
118788
- return Math.round(tokensPerSecond);
118789
119100
  }
118790
- async function proxyOpenAIStreamingRoute({ body, configuration, logger, modelID, modelManager, path, reportMetrics }) {
118791
- function normalizeTokenCount(value) {
118792
- if (typeof value === "number" && Number.isFinite(value) && value >= 0) {
118793
- return value;
118794
- }
118795
- return 0;
118796
- }
118797
- function reportMetricsSafe(payload) {
118798
- reportMetrics(payload).catch(error => {
118799
- logger.warn("Failed to upload LLM prompt metrics", {
118800
- error: asError(error),
118801
- requestUrl: path
118802
- });
118803
- });
119101
+ function normalizeMegabytes(value) {
119102
+ if (typeof value !== "number" || Number.isNaN(value)) {
119103
+ return null;
118804
119104
  }
118805
- const { bytes: requestBodyBytes, payload: serializedBody } = serializeRequestBody(body);
118806
- const requestStartedAt = Date.now();
118807
- const requestBody = JSON.parse(serializedBody);
118808
- const streamRequested = requestBody.stream === true;
118809
- const onMonitoringComplete = ({ durationMs, error, responseBytes, timeToFirstTokenMs, usage }) => {
118810
- const completionTokens = normalizeTokenCount(usage?.completionTokens);
118811
- const promptTokens = normalizeTokenCount(usage?.promptTokens);
118812
- const totalTokens = normalizeTokenCount(usage?.totalTokens ?? completionTokens + promptTokens);
118813
- const latencyMs = Math.max(0, durationMs);
118814
- reportMetricsSafe({
118815
- bytes: requestBodyBytes + responseBytes,
118816
- completionTokens,
118817
- engine: configuration.agentEngineType,
118818
- endpointId: null,
118819
- latencyMs,
118820
- modelId: modelID,
118821
- promptTokens,
118822
- requestBytes: requestBodyBytes,
118823
- requestId: null,
118824
- requestMethod: "POST",
118825
- requestPath: path,
118826
- responseBytes,
118827
- successful: !error,
118828
- timeToFirstTokenMs,
118829
- tokensPerSecond: calculateTokensPerSecond({
118830
- durationMs: latencyMs,
118831
- totalTokens
118832
- }),
118833
- totalTokens
118834
- });
118835
- };
118836
- const response = await modelManager
118837
- .fetchOpenAI(path, {
118838
- body: serializedBody,
118839
- headers: {
118840
- "Content-Type": "application/json"
118841
- },
118842
- method: "POST"
118843
- })
118844
- .catch(error => {
118845
- logEngineMetrics({
118846
- agentEngineType: configuration.agentEngineType,
118847
- error: error,
118848
- level: "error",
118849
- logger,
118850
- requestBodyBytes,
118851
- requestPath: path,
118852
- responseBytes: 0,
118853
- usage: null
118854
- });
118855
- const latencyMs = Math.max(0, Date.now() - requestStartedAt);
118856
- reportMetricsSafe({
118857
- bytes: requestBodyBytes,
118858
- completionTokens: 0,
118859
- engine: configuration.agentEngineType,
118860
- endpointId: null,
118861
- latencyMs,
118862
- modelId: modelID,
118863
- promptTokens: 0,
118864
- requestBytes: requestBodyBytes,
118865
- requestId: null,
118866
- requestMethod: "POST",
118867
- requestPath: path,
118868
- responseBytes: 0,
118869
- successful: false,
118870
- timeToFirstTokenMs: null,
118871
- tokensPerSecond: 0,
118872
- totalTokens: 0
118873
- });
118874
- throw error;
118875
- });
118876
- const responseStatusText = response.statusText ?? "Upstream request failed";
118877
- if (!response.ok) {
118878
- const responseBody = await response.text().catch(() => null);
118879
- const responseError = new Error(responseBody
118880
- ? `Upstream error response: ${responseBody}`
118881
- : "Upstream error response: empty body");
118882
- logger.error("LLM engine request failed", {
118883
- error: responseError,
118884
- requestUrl: path,
118885
- statusCode: response.status,
118886
- statusText: responseStatusText,
118887
- responseBody: responseBody ?? undefined
118888
- });
118889
- if (!response.body) {
118890
- return {
118891
- status: response.status,
118892
- statusText: responseStatusText
118893
- };
118894
- }
119105
+ return Math.round(value * 1024 * 1024);
119106
+ }
119107
/**
 * Normalizes a CPU clock reading into a finite number.
 *
 * Accepts either a number or a numeric string. Anything that cannot be
 * interpreted as a finite number yields `null`.
 *
 * @param {unknown} value - Raw reading (number, numeric string, or anything else).
 * @returns {number | null} The finite numeric value, or `null` when unavailable.
 */
function resolveCpuValue(value) {
    if (typeof value === "number") {
        return Number.isFinite(value) ? value : null;
    }
    if (typeof value === "string") {
        const trimmed = value.trim();
        // Number("") and Number("   ") are 0, which would be reported as a
        // real 0 reading; treat blank strings as "unknown" instead.
        if (trimmed === "") {
            return null;
        }
        const parsed = Number(trimmed);
        return Number.isFinite(parsed) ? parsed : null;
    }
    return null;
}
119117
/**
 * Gathers a snapshot of host metadata: CPU, GPUs, memory, OS, hostname,
 * machine identifier and detected engine versions.
 *
 * The four `si.*` probes are collected with `Promise.allSettled`, so a
 * failing probe degrades to `null` fields (or an empty GPU list) instead of
 * failing the whole collection. The version / machine-identifier probes run
 * concurrently; if any of them rejects, this function rejects as well.
 *
 * @returns {Promise<object>} Object with `cpu`, `gpus`, `hostname`,
 *   `llamaCppVersion`, `machineID`, `memory`, `os` and `vllmVersion` keys.
 */
async function collectMachineMetadata() {
    const [cpuResult, memResult, osResult, graphicsResult] = await Promise.allSettled([
        si.cpu(),
        si.mem(),
        si.osInfo(),
        si.graphics()
    ]);
    const cpuInfo = cpuResult.status === "fulfilled" ? cpuResult.value : null;
    const memInfo = memResult.status === "fulfilled" ? memResult.value : null;
    const osInfo = osResult.status === "fulfilled" ? osResult.value : null;
    const graphicsInfo = graphicsResult.status === "fulfilled"
        ? graphicsResult.value
        : { controllers: [] };
    const gpus = (graphicsInfo.controllers ?? []).map((controller) => ({
        bus: controller.bus ?? null,
        driverVersion: controller.driverVersion ?? null,
        memoryFreeBytes: normalizeMegabytes(controller.memoryFree ?? null),
        memoryTotalBytes: normalizeMegabytes(controller.memoryTotal ?? null),
        model: controller.model ?? controller.name ?? null,
        temperatureCelsius: controller.temperatureGpu ?? null,
        vendor: controller.vendor ?? null
    }));
    // These probes do not depend on each other, so run them concurrently
    // rather than awaiting them one after another.
    const [llamaCppVersion, machineID, vllmVersion] = await Promise.all([
        detectLlamaCppVersion(),
        readMachineIdentifier(),
        detectVLLMVersion()
    ]);
    return {
        cpu: {
            baseClockGHz: resolveCpuValue(cpuInfo?.speed ?? null),
            logicalCores: cpuInfo?.cores ?? null,
            maxClockGHz: resolveCpuValue(cpuInfo?.speedMax ?? null),
            model: cpuInfo?.brand ?? null,
            physicalCores: cpuInfo?.physicalCores ?? null
        },
        gpus,
        hostname: os.hostname(),
        llamaCppVersion,
        machineID,
        memory: {
            availableBytes: memInfo?.available ?? null,
            totalBytes: memInfo?.total ?? null
        },
        os: {
            // Fall back to Node's own view of the host when the osInfo probe failed.
            arch: osInfo?.arch ?? os.arch(),
            platform: osInfo?.platform ?? os.platform(),
            release: osInfo?.release ?? os.release(),
            type: osInfo?.kernel ?? null,
            version: osInfo?.build ?? null
        },
        vllmVersion
    };
}
118961
119166
 
118962
119167
  async function createApplication({ abortController, apiClient, configuration, logger }) {
118963
- // Fetch configuration
118964
119168
  logger.info("Fetching conduit configuration");
118965
119169
  let conduitConfiguration = await apiClient.getConduitConfiguration();
118966
119170
  logger.info("Received configuration", {
@@ -118986,7 +119190,6 @@ async function createApplication({ abortController, apiClient, configuration, lo
118986
119190
  let modelFileName = getConduitModelFileName(conduitConfiguration);
118987
119191
  let modelName = getConduitModelName(conduitConfiguration);
118988
119192
  const startup = Date.now();
118989
- // Initialise model manager
118990
119193
  let modelManager = new ModelManager({
118991
119194
  contextLength: conduitConfiguration.contextLength ?? null,
118992
119195
  engine: configuration.agentEngineType,
@@ -119027,6 +119230,7 @@ async function createApplication({ abortController, apiClient, configuration, lo
119027
119230
  });
119028
119231
  conduitStateReportManager.reportStateChange();
119029
119232
  };
119233
+ let stopRequestedByControl = false;
119030
119234
  const attachLifecycleListeners = () => {
119031
119235
  modelManager.on("engineError", err => {
119032
119236
  logger.error("LLM engine error", {
@@ -119035,6 +119239,9 @@ async function createApplication({ abortController, apiClient, configuration, lo
119035
119239
  stopRequestedByControl = false;
119036
119240
  setErrorState({ error: err.message });
119037
119241
  });
119242
+ modelManager.on("engineReady", () => {
119243
+ setOnlineState();
119244
+ });
119038
119245
  modelManager.on("engineTerminated", () => {
119039
119246
  if (stopRequestedByControl) {
119040
119247
  stopRequestedByControl = false;
@@ -119046,9 +119253,6 @@ async function createApplication({ abortController, apiClient, configuration, lo
119046
119253
  });
119047
119254
  conduitStateReportManager.reportStateChange();
119048
119255
  });
119049
- modelManager.on("engineReady", () => {
119050
- setOnlineState();
119051
- });
119052
119256
  };
119053
119257
  attachLifecycleListeners();
119054
119258
  let lastDownloadKey = "";
@@ -119074,7 +119278,6 @@ async function createApplication({ abortController, apiClient, configuration, lo
119074
119278
  });
119075
119279
  conduitStateReportManager.reportDownloadProgress();
119076
119280
  };
119077
- let stopRequestedByControl = false;
119078
119281
  async function startEngine() {
119079
119282
  logger.info("Engine start requested");
119080
119283
  conduitStateManager.setState({
@@ -119106,31 +119309,49 @@ async function createApplication({ abortController, apiClient, configuration, lo
119106
119309
  });
119107
119310
  await conduitStateReportManager.reportNow();
119108
119311
  logger.info("Stopping engine process");
119109
- await modelManager.stop();
119312
+ try {
119313
+ await modelManager.stop();
119314
+ }
119315
+ catch (error) {
119316
+ stopRequestedByControl = false;
119317
+ throw error;
119318
+ }
119110
119319
  logger.info("Engine process stopped");
119111
119320
  setIdleState({ reason });
119112
119321
  }
119113
- modelManager.on("engineError", err => {
119114
- logger.error("LLM engine error", {
119115
- error: err
119116
- });
119117
- stopRequestedByControl = false;
119118
- setErrorState({ error: err.message });
119119
- });
119120
- modelManager.on("engineTerminated", () => {
119121
- if (stopRequestedByControl) {
119122
- stopRequestedByControl = false;
119123
- setIdleState({ reason: "Remote shutdown requested" });
119124
- return;
119322
/**
 * Cycles the engine: stops it unless the conduit is currently idle, fetches
 * the conduit configuration again, rebuilds the model manager from it,
 * re-attaches the lifecycle listeners and starts the engine.
 *
 * Reassigns the enclosing `conduitConfiguration`, `modelFileName`,
 * `modelName` and `modelManager` bindings. Errors from stopping, fetching or
 * starting propagate to the caller.
 *
 * @returns {Promise<void>}
 */
async function cycleEngine() {
    const { state: sourceState } = conduitStateManager.getState();
    const wasIdle = sourceState === "idle";
    if (!wasIdle) {
        await stopEngine({ reason: "Remote cycle requested" });
    }

    logger.info("Fetching new configuration");
    const newConduitConfiguration = await apiClient.getConduitConfiguration();
    logger.info("Received new configuration", {
        modelID: newConduitConfiguration.targetModel.id
    });

    logger.info("Updating configuration and model manager");
    conduitConfiguration = newConduitConfiguration;
    modelFileName = getConduitModelFileName(conduitConfiguration);
    modelName = getConduitModelName(conduitConfiguration);
    const managerOptions = {
        contextLength: conduitConfiguration.contextLength ?? null,
        engine: configuration.agentEngineType,
        logger,
        model: conduitConfiguration.targetModel,
        parallelism: conduitConfiguration.parallelism ?? null,
        root: configuration.rootDirectory
    };
    modelManager = new ModelManager(managerOptions);
    // The replacement manager is a new instance, so the lifecycle listeners
    // have to be attached to it again.
    attachLifecycleListeners();

    logger.info(wasIdle ? "Restarting engine from idle" : "Restarting engine");
    await startEngine();
}
119134
119355
  if (configuration.startMode === "idle") {
119135
119356
  setIdleState({ reason: "Startup mode is idle" });
119136
119357
  }
@@ -119143,165 +119364,47 @@ async function createApplication({ abortController, apiClient, configuration, lo
119143
119364
  setErrorState({ error: parsedError.message });
119144
119365
  });
119145
119366
  }
119146
- // #region API routes
119147
119367
  const app = express();
119148
119368
  const publicRouter = createRouter();
119149
119369
  app.use(publicRouter);
119150
- publicRouter.get("/health", (_req, res) => {
119151
- res.status(200).send("OK");
119152
- });
119370
+ publicRouter.get("/health", createHealthHandler());
119153
119371
  implementAPIReference({
119154
119372
  api: {
119373
+ "/conduit/engine/cycle": {
119374
+ POST: createPostCycleEngineHandler({
119375
+ cycleEngine,
119376
+ conduitStateManager,
119377
+ getModelManager: () => modelManager,
119378
+ logger,
119379
+ setErrorState,
119380
+ startEngine,
119381
+ stopEngine,
119382
+ stopRequestedByControl: () => stopRequestedByControl
119383
+ })
119384
+ },
119155
119385
  "/conduit/engine/start": {
119156
- POST: async () => {
119157
- if (conduitStateManager.getState().state !== "idle") {
119158
- return {
119159
- status: 409,
119160
- statusText: "Engine can only be started from idle state"
119161
- };
119162
- }
119163
- if (!modelManager.canStart) {
119164
- return {
119165
- status: 409,
119166
- statusText: `Engine cannot be started from current state: ${modelManager.state}`
119167
- };
119168
- }
119169
- try {
119170
- logger.info("Received remote engine start request");
119171
- await startEngine();
119172
- return {
119173
- body: {
119174
- acknowledged: true
119175
- },
119176
- status: 202
119177
- };
119178
- }
119179
- catch (error) {
119180
- if (stopRequestedByControl || modelManager.state === "stopped") {
119181
- return {
119182
- status: 409,
119183
- statusText: "Engine start was interrupted"
119184
- };
119185
- }
119186
- const parsedError = asError(error);
119187
- setErrorState({ error: parsedError.message });
119188
- return {
119189
- status: 500,
119190
- statusText: parsedError.message
119191
- };
119192
- }
119193
- }
119386
+ POST: createPostStartEngineHandler({
119387
+ cycleEngine,
119388
+ conduitStateManager,
119389
+ getModelManager: () => modelManager,
119390
+ logger,
119391
+ setErrorState,
119392
+ startEngine,
119393
+ stopEngine,
119394
+ stopRequestedByControl: () => stopRequestedByControl
119395
+ })
119194
119396
  },
119195
119397
  "/conduit/engine/stop": {
119196
- POST: async () => {
119197
- const sourceState = conduitStateManager.getState().state;
119198
- if (sourceState !== "bootingEngine" && sourceState !== "online") {
119199
- return {
119200
- status: 409,
119201
- statusText: "Engine can only be stopped while booting or online"
119202
- };
119203
- }
119204
- if (!modelManager.canStop) {
119205
- return {
119206
- status: 409,
119207
- statusText: `Engine cannot be stopped from current state: ${modelManager.state}`
119208
- };
119209
- }
119210
- try {
119211
- logger.info("Received remote engine stop request");
119212
- stopEngine({
119213
- reason: "Remote shutdown requested"
119214
- }).catch(error => {
119215
- const parsedError = asError(error);
119216
- logger.error("Remote engine stop request failed", {
119217
- error: parsedError
119218
- });
119219
- setErrorState({ error: parsedError.message });
119220
- });
119221
- return {
119222
- body: {
119223
- acknowledged: true
119224
- },
119225
- status: 202
119226
- };
119227
- }
119228
- catch (error) {
119229
- const parsedError = asError(error);
119230
- setErrorState({ error: parsedError.message });
119231
- return {
119232
- status: 500,
119233
- statusText: parsedError.message
119234
- };
119235
- }
119236
- }
119237
- },
119238
- "/conduit/engine/cycle": {
119239
- POST: async ({ body }) => {
119240
- const sourceState = conduitStateManager.getState().state;
119241
- if (sourceState !== "bootingEngine" &&
119242
- sourceState !== "online" &&
119243
- sourceState !== "idle") {
119244
- return {
119245
- status: 409,
119246
- statusText: "Engine can only be cycled while booting, online, or idle"
119247
- };
119248
- }
119249
- if (sourceState !== "idle" && !modelManager.canStop) {
119250
- return {
119251
- status: 409,
119252
- statusText: `Engine cannot be cycled from current state: ${modelManager.state}`
119253
- };
119254
- }
119255
- try {
119256
- logger.info("Received remote engine cycle request");
119257
- const sourceState = conduitStateManager.getState().state;
119258
- if (sourceState !== "idle") {
119259
- await stopEngine({
119260
- reason: "Remote cycle requested"
119261
- });
119262
- }
119263
- logger.info("Fetching new configuration");
119264
- const newConduitConfiguration = await apiClient.getConduitConfiguration();
119265
- logger.info("Received new configuration", {
119266
- modelID: newConduitConfiguration.targetModel.id
119267
- });
119268
- logger.info("Updating configuration and model manager");
119269
- conduitConfiguration = newConduitConfiguration;
119270
- modelFileName = getConduitModelFileName(conduitConfiguration);
119271
- modelName = getConduitModelName(conduitConfiguration);
119272
- modelManager = new ModelManager({
119273
- contextLength: conduitConfiguration.contextLength ?? null,
119274
- engine: configuration.agentEngineType,
119275
- logger,
119276
- model: conduitConfiguration.targetModel,
119277
- parallelism: conduitConfiguration.parallelism ?? null,
119278
- root: configuration.rootDirectory
119279
- });
119280
- attachLifecycleListeners();
119281
- if (sourceState === "idle") {
119282
- logger.info("Restarting engine from idle");
119283
- await startEngine();
119284
- }
119285
- else {
119286
- logger.info("Restarting engine");
119287
- await startEngine();
119288
- }
119289
- return {
119290
- body: {
119291
- acknowledged: true
119292
- },
119293
- status: 202
119294
- };
119295
- }
119296
- catch (error) {
119297
- const parsedError = asError(error);
119298
- setErrorState({ error: parsedError.message });
119299
- return {
119300
- status: 500,
119301
- statusText: parsedError.message
119302
- };
119303
- }
119304
- }
119398
+ POST: createPostStopEngineHandler({
119399
+ cycleEngine,
119400
+ conduitStateManager,
119401
+ getModelManager: () => modelManager,
119402
+ logger,
119403
+ setErrorState,
119404
+ startEngine,
119405
+ stopEngine,
119406
+ stopRequestedByControl: () => stopRequestedByControl
119407
+ })
119305
119408
  }
119306
119409
  },
119307
119410
  logger,
@@ -119311,56 +119414,34 @@ async function createApplication({ abortController, apiClient, configuration, lo
119311
119414
  implementAPIReference({
119312
119415
  api: {
119313
119416
  "/v1/chat/completions": {
119314
- POST: async ({ body }) => {
119315
- return proxyOpenAIStreamingRoute({
119316
- body,
119317
- configuration,
119318
- logger,
119319
- modelID: conduitConfiguration.targetModel.id,
119320
- modelManager,
119321
- path: "/v1/chat/completions",
119322
- reportMetrics: apiClient.reportPromptMetrics
119323
- });
119324
- }
119417
+ POST: createPostChatCompletionsHandler({
119418
+ apiClient,
119419
+ configuration,
119420
+ getModelID: () => conduitConfiguration.targetModel.id,
119421
+ getModelManager: () => modelManager,
119422
+ logger,
119423
+ startup
119424
+ })
119325
119425
  },
119326
119426
  "/v1/completions": {
119327
- POST: async ({ body }) => {
119328
- return proxyOpenAIStreamingRoute({
119329
- body,
119330
- configuration,
119331
- logger,
119332
- modelID: conduitConfiguration.targetModel.id,
119333
- modelManager,
119334
- path: "/v1/completions",
119335
- reportMetrics: apiClient.reportPromptMetrics
119336
- });
119337
- }
119427
+ POST: createPostCompletionsHandler({
119428
+ apiClient,
119429
+ configuration,
119430
+ getModelID: () => conduitConfiguration.targetModel.id,
119431
+ getModelManager: () => modelManager,
119432
+ logger,
119433
+ startup
119434
+ })
119338
119435
  },
119339
119436
  "/v1/models": {
119340
- GET: async () => {
119341
- const effectiveContextLength = getEffectiveContextLength({
119342
- contextLength: modelManager.contextLength,
119343
- engine: configuration.agentEngineType,
119344
- parallelism: modelManager.parallelism
119345
- });
119346
- return {
119347
- body: {
119348
- object: "list",
119349
- data: [
119350
- {
119351
- id: conduitConfiguration.targetModel.id,
119352
- object: "model",
119353
- created: startup / 1000,
119354
- owned_by: "infersec",
119355
- limit: {
119356
- context: effectiveContextLength
119357
- }
119358
- }
119359
- ]
119360
- },
119361
- status: 200
119362
- };
119363
- }
119437
+ GET: createGetModelsHandler({
119438
+ apiClient,
119439
+ configuration,
119440
+ getModelID: () => conduitConfiguration.targetModel.id,
119441
+ getModelManager: () => modelManager,
119442
+ logger,
119443
+ startup
119444
+ })
119364
119445
  }
119365
119446
  },
119366
119447
  logger,
@@ -119440,7 +119521,6 @@ async function createApplication({ abortController, apiClient, configuration, lo
119440
119521
  app,
119441
119522
  shutdown
119442
119523
  };
119443
- // #endregion
119444
119524
  }
119445
119525
  function getConduitModelFileName(configuration) {
119446
119526
  const { source } = configuration.targetModel;