npm - lynkr - Versions diffs - 9.1.2 → 9.1.4 - Mend

lynkr 9.1.2 → 9.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

package/README.md +21 -10
package/package.json +3 -1
package/scripts/build-knn-index.js +130 -0
package/scripts/calibrate-thresholds.js +197 -0
package/scripts/compare-policies.js +67 -0
package/scripts/learn-output-ratios.js +162 -0
package/scripts/refresh-pricing.js +122 -0
package/scripts/run-routerarena.js +26 -0
package/scripts/sample-regret.js +84 -0
package/scripts/train-risk-classifier.js +191 -0
package/src/api/middleware/budget-enforcer.js +60 -0
package/src/api/middleware/load-shedding.js +11 -1
package/src/api/middleware/tenant.js +21 -0
package/src/api/router.js +19 -40
package/src/budget/hierarchical-budget.js +159 -0
package/src/cache/semantic.js +28 -2
package/src/clients/databricks.js +59 -5
package/src/config/index.js +239 -43
package/src/context/toon.js +5 -4
package/src/orchestrator/index.js +44 -6
package/src/prompts/system.js +34 -6
package/src/routing/bandit.js +246 -0
package/src/routing/cascade.js +106 -0
package/src/routing/complexity-analyzer.js +7 -15
package/src/routing/confidence-scorer.js +121 -0
package/src/routing/context-validator.js +71 -0
package/src/routing/cost-optimizer.js +5 -2
package/src/routing/deadline.js +52 -0
package/src/routing/drift-monitor.js +113 -0
package/src/routing/embedding-cache.js +77 -0
package/src/routing/index.js +314 -5
package/src/routing/knn-router.js +206 -0
package/src/routing/latency-tracker.js +113 -71
package/src/routing/model-tiers.js +156 -6
package/src/routing/output-ratios.js +57 -0
package/src/routing/regret-estimator.js +91 -0
package/src/routing/reward-pipeline.js +62 -0
package/src/routing/risk-classifier.js +130 -0
package/src/routing/shadow-mode.js +77 -0
package/src/routing/tenant-policy.js +96 -0
package/src/routing/tokenizer.js +162 -0
package/src/server.js +9 -0

package/src/context/toon.js CHANGED Viewed

@@ -15,11 +15,12 @@ function normaliseSettings(settings = {}) {
   };
 }
-function resolveEncodeFn(overrideEncode) {
+async function resolveEncodeFn(overrideEncode) {
   if (typeof overrideEncode === "function") return overrideEncode;
   if (cachedEncode !== undefined) return cachedEncode;
   try {
-    const toon = require("@toon-format/toon");
+    // Use dynamic import for ES module compatibility
+    const toon = await import("@toon-format/toon");
     cachedEncode = typeof toon?.encode === "function" ? toon.encode : null;
     cachedLoadError = cachedEncode ? null : new Error("Missing encode() export from @toon-format/toon");
   } catch (err) {
@@ -89,7 +90,7 @@ function compressStringContent(content, cfg, encodeFn, stats) {
   return toonText;
 }
-function applyToonCompression(payload, settings = {}, options = {}) {
+async function applyToonCompression(payload, settings = {}, options = {}) {
   const cfg = normaliseSettings(settings);
   const stats = {
     enabled: cfg.enabled,
@@ -109,7 +110,7 @@ function applyToonCompression(payload, settings = {}, options = {}) {
     return { payload, stats };
   }
-  const encodeFn = resolveEncodeFn(options.encode);
+  const encodeFn = await resolveEncodeFn(options.encode);
   if (typeof encodeFn !== "function") {
     stats.available = false;
     const err = cachedLoadError ?? new Error("TOON encoder unavailable");

package/src/orchestrator/index.js CHANGED Viewed

@@ -1101,7 +1101,7 @@ function toAnthropicResponse(openai, requestedModel, wantsThinking) {
   };
 }
-function sanitizePayload(payload) {
+async function sanitizePayload(payload) {
   const { clonePayloadSmart } = require("../utils/payload");
   const providerType = config.modelProvider?.type ?? "databricks";
   const willFlatten = providerType !== "azure-anthropic";
@@ -1418,7 +1418,7 @@ function sanitizePayload(payload) {
   // Optional TOON conversion for large JSON message payloads (prompt context only).
   // Run this BEFORE message coalescing to preserve parseable JSON boundaries.
-  applyToonCompression(clean, config.toon, { logger });
+  await applyToonCompression(clean, config.toon, { logger });
   // FIX: Handle consecutive messages with the same role (causes llama.cpp 400 error)
   // Strategy: Merge consecutive same-role messages, but NEVER merge messages
@@ -1529,12 +1529,35 @@ function getToolCallSignature(toolCall) {
 }
 function buildNonJsonResponse(databricksResponse) {
+  // Convert plain text response to Anthropic message format
+  // so SSE handler can properly render it
+  const textContent = databricksResponse.text || "";
   return {
     status: databricksResponse.status,
     headers: {
-      "Content-Type": databricksResponse.contentType ?? "text/plain",
+      "Content-Type": "application/json", // Changed from text/plain
+    },
+    body: {
+      id: `msg_${Date.now()}`,
+      type: "message",
+      role: "assistant",
+      model: "unknown",
+      content: [
+        {
+          type: "text",
+          text: textContent
+        }
+      ],
+      stop_reason: "end_turn",
+      stop_sequence: null,
+      usage: {
+        input_tokens: 0,
+        output_tokens: 0,
+        cache_creation_input_tokens: 0,
+        cache_read_input_tokens: 0,
+      }
     },
-    body: databricksResponse.text,
     terminationReason: "non_json_response",
   };
 }
@@ -1966,6 +1989,17 @@ IMPORTANT TOOL USAGE RULES:
     cleanPayload._workspace = headers["x-lynkr-workspace"];
   }
+  // Phase 6.3 — thread deadline for latency-aware routing.
+  if (headers?.["lynkr-deadline-ms"]) {
+    const dl = parseInt(headers["lynkr-deadline-ms"], 10);
+    if (!isNaN(dl) && dl > 0) cleanPayload._deadlineMs = dl;
+  }
+  // Phase 6.1 — thread tenant policy for per-tenant routing overrides.
+  if (options?.tenantPolicy) {
+    cleanPayload._tenantPolicy = options.tenantPolicy;
+  }
   // RTK-inspired tool result compression: compress large tool_results
   // before they reach the model (saves 60-90% on test/git/lint output)
   if (config.toolResultCompression?.enabled !== false) {
@@ -3895,7 +3929,7 @@ async function processMessage({ payload, headers, session, cwd, options = {} })
   const { createTimer } = require("../utils/perf-timer");
   const pTimer = createTimer("processMessage");
-  const cleanPayload = sanitizePayload(payload);
+  const cleanPayload = await sanitizePayload(payload);
   pTimer.mark("sanitizePayload");
   // Proactively load tools based on prompt content (lazy loading)
@@ -4033,7 +4067,11 @@ async function processMessage({ payload, headers, session, cwd, options = {} })
   if (semanticCache.isEnabled() && semanticLookupResult && !semanticLookupResult.hit) {
     if (loopResult.response?.status === 200 && loopResult.response?.body) {
       try {
-        await semanticCache.store(semanticLookupResult, loopResult.response.body);
+        // Only cache valid JSON responses, not HTML error pages
+        const body = loopResult.response.body;
+        if (typeof body === 'object' || (typeof body === 'string' && body.trim().startsWith('{'))) {
+          await semanticCache.store(semanticLookupResult, body);
+        }
       } catch (err) {
         logger.debug({ error: err.message }, "Semantic cache store failed");
       }

package/src/prompts/system.js CHANGED Viewed

@@ -70,13 +70,41 @@ function compressToolDescriptions(tools, mode = null) {
     return tools; // Return unmodified if not in minimal mode
   }
-  return tools.map(tool => {
+  const validTools = tools.filter(tool => {
+    // Handle both Anthropic format (name + input_schema) and OpenAI format (function.name)
+    const hasAnthropicFormat = tool && tool.name && tool.input_schema;
+    const hasOpenAIFormat = tool && tool.function && tool.function.name;
+    const isValid = hasAnthropicFormat || hasOpenAIFormat;
+    if (!isValid) {
+      logger.debug({
+        hasName: !!tool?.name,
+        hasSchema: !!tool?.input_schema,
+        hasFunctionName: !!tool?.function?.name,
+        toolType: typeof tool
+      }, 'Filtered out malformed tool');
+    }
+    return isValid;
+  });
+  if (validTools.length === 0 && tools.length > 0) {
+    logger.warn({ originalCount: tools.length }, 'All tools filtered out as malformed - returning original');
+    return tools;
+  }
+  return validTools.map(tool => {
+    // If already in OpenAI format, return as-is (no compression for OpenAI format)
+    if (tool.function && !tool.input_schema) {
+      return tool;
+    }
+    // Compress Anthropic format
     const compressed = {
       name: tool.name,
       input_schema: {
-        type: tool.input_schema.type,
+        type: tool.input_schema?.type || "object",
         properties: {},
-        required: tool.input_schema.required || [],
+        required: tool.input_schema?.required || [],
       }
     };
@@ -190,7 +218,7 @@ function optimizeSystemPrompt(system, context = {}, mode = null) {
   // 2. Remove file operation guidelines if no file tools
   const hasFileTools = context.tools?.some(t =>
-    ['Read', 'Write', 'Edit', 'Glob', 'Grep'].includes(t.name)
+    t?.name && ['Read', 'Write', 'Edit', 'Glob', 'Grep'].includes(t.name)
   );
   if (!hasFileTools) {
     text = removeSection(text, /# File Operations?[\s\S]*?(?=\n#|\n\n[A-Z]|$)/gi, optimizations, 'file operations');
@@ -198,7 +226,7 @@ function optimizeSystemPrompt(system, context = {}, mode = null) {
   // 3. Remove git guidelines if no git tools
   const hasGitTools = context.tools?.some(t =>
-    t.name.toLowerCase().includes('git')
+    t?.name && t.name.toLowerCase().includes('git')
   );
   if (!hasGitTools) {
     text = removeSection(text, /# Git.*?[\s\S]*?(?=\n#|\n\n[A-Z]|$)/gi, optimizations, 'git guidelines');
@@ -207,7 +235,7 @@ function optimizeSystemPrompt(system, context = {}, mode = null) {
   // 4. Remove web search guidelines if no web tools
   const hasWebTools = context.tools?.some(t =>
-    ['WebSearch', 'WebFetch'].includes(t.name)
+    t?.name && ['WebSearch', 'WebFetch'].includes(t.name)
   );
   if (!hasWebTools) {
     text = removeSection(text, /# Web.*?[\s\S]*?(?=\n#|\n\n[A-Z]|$)/gi, optimizations, 'web guidelines');

package/src/routing/bandit.js ADDED Viewed

@@ -0,0 +1,246 @@
+/**
+ * LinUCB contextual bandit for intra-tier model selection (Phase 4.1).
+ *
+ * Standard LinUCB-with-disjoint-models algorithm (Li et al. 2010).
+ *   - One arm per (provider, model) pair in a tier
+ *   - Context = numerical feature vector for the request
+ *   - Reward = quality_score - λ·norm_cost - μ·norm_latency
+ *   - Per-arm A (d×d ridge-regression matrix) and b (d-vector) stored to disk
+ *
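+ * State persists to data/bandit-state.json. Loaded when a bandit is
+ * constructed; saved after every 25th `update()` call (to limit IO). This
+ * module registers no shutdown hook, so a save on graceful shutdown has to
+ * be triggered from outside it.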
+ */
+const fs = require('fs');
+const path = require('path');
+const logger = require('../logger');
+const STATE_PATH = path.join(__dirname, '../../data/bandit-state.json'); // file read by _load() and written by _save()
+const DEFAULT_ALPHA = 1.5; // default multiplier on the sqrt(variance) exploration bonus in pick()
+const DEFAULT_LAMBDA = 0.3; // cost penalty weight
+const DEFAULT_MU = 0.1;     // latency penalty weight
+const FEATURE_DIM = 12; // default context length (constructor `dim`); pick()/update() truncate or zero-pad to the instance's dim
+const EXPLORATION_RATE = 0.05; // probability that pick() returns a uniformly random candidate
+function _identity(d) { // Build a d×d identity matrix as an array of row arrays.
+  const m = new Array(d);
+  for (let i = 0; i < d; i++) {
+    m[i] = new Array(d).fill(0); // a fresh zero row per iteration, so rows are never shared
+    m[i][i] = 1;
+  }
+  return m;
+}
+function _zeros(d) { // Return a new length-d array filled with 0.
+  return new Array(d).fill(0);
+}
+function _matVec(M, v) { // Matrix-vector product M·v; returns a new array of length v.length.
+  const d = v.length; // both loops run to v.length, so M is read as a d×d matrix
+  const out = new Array(d).fill(0);
+  for (let i = 0; i < d; i++) {
+    for (let j = 0; j < d; j++) out[i] += M[i][j] * v[j];
+  }
+  return out;
+}
+function _dot(a, b) { // Dot product: sum of a[i] * b[i] over the indices of a.
+  let s = 0;
+  for (let i = 0; i < a.length; i++) s += a[i] * b[i];
+  return s;
+}
+function _outer(a, b) { // Outer product: new a.length × b.length matrix with out[i][j] = a[i] * b[j].
+  const out = new Array(a.length);
+  for (let i = 0; i < a.length; i++) {
+    out[i] = new Array(b.length);
+    for (let j = 0; j < b.length; j++) out[i][j] = a[i] * b[j];
+  }
+  return out;
+}
+function _addMat(A, B) { // In-place A += B, element by element; mutates A and returns nothing.
+  for (let i = 0; i < A.length; i++) {
+    for (let j = 0; j < A[i].length; j++) A[i][j] += B[i][j];
+  }
+}
+function _addVec(a, b) { // In-place a += b, element by element; mutates a and returns nothing.
+  for (let i = 0; i < a.length; i++) a[i] += b[i];
+}
+/**
+ * Invert a small dense matrix via Gauss-Jordan. For d=12 this is plenty fast
+ * and saves us a dependency on a linear algebra library.
+ *
+ * Works on an augmented copy [M | I] built from sliced rows, so M itself is
+ * not modified.
+ *
+ * @param {number[][]} M — square matrix; d is taken from M.length
+ * @returns {number[][]} a new d×d matrix holding the inverse
+ * @throws {Error} 'matrix singular' when a column has no entry with
+ *   magnitude above 1e-12 at or below the diagonal
+ */
+function _inv(M) {
+  const d = M.length;
+  const aug = M.map((row, i) => { // build the augmented matrix [M | I] on copied rows
+    const r = row.slice();
+    for (let j = 0; j < d; j++) r.push(i === j ? 1 : 0);
+    return r;
+  });
+  for (let i = 0; i < d; i++) {
+    let pivot = aug[i][i];
+    if (Math.abs(pivot) < 1e-12) { // pivot is numerically zero: look below for a row to swap in
+      let swap = -1;
+      for (let k = i + 1; k < d; k++) {
+        if (Math.abs(aug[k][i]) > 1e-12) { swap = k; break; } // first usable row wins
+      }
+      if (swap < 0) throw new Error('matrix singular');
+      [aug[i], aug[swap]] = [aug[swap], aug[i]];
+      pivot = aug[i][i];
+    }
+    for (let j = 0; j < 2 * d; j++) aug[i][j] /= pivot; // scale the pivot row so the pivot becomes 1
+    for (let k = 0; k < d; k++) {
+      if (k === i) continue;
+      const factor = aug[k][i];
+      for (let j = 0; j < 2 * d; j++) aug[k][j] -= factor * aug[i][j]; // clear column i in every other row
+    }
+  }
+  return aug.map(row => row.slice(d)); // the right half now holds the inverse
+}
+class LinUCBBandit {
+  constructor({ alpha = DEFAULT_ALPHA, lambda = DEFAULT_LAMBDA, mu = DEFAULT_MU, dim = FEATURE_DIM } = {}) {
+    this.alpha = alpha;
+    this.lambda = lambda;
+    this.mu = mu;
+    this.dim = dim;
+    /** arms: Map<armKey, { A: number[][], b: number[], count: number }> */
+    this.arms = new Map();
+    this.steps = 0;
+    this._load();
+  }
+  _armKey(tier, provider, model) {
+    return `${tier}|${provider}:${model}`;
+  }
+  _ensureArm(armKey) {
+    if (!this.arms.has(armKey)) {
+      this.arms.set(armKey, { A: _identity(this.dim), b: _zeros(this.dim), count: 0 });
+    }
+    return this.arms.get(armKey);
+  }
+  /**
+   * Pick an arm for a given tier and context.
+   * @param {string} tier
+   * @param {Array<{ provider: string, model: string }>} candidates — qualifying arms
+   * @param {number[]} context — feature vector
+   * @returns {{ provider, model, ucb, explored }} chosen arm
+   */
+  pick(tier, candidates, context) {
+    if (!candidates || candidates.length === 0) return null;
+    if (context.length !== this.dim) {
+      // Pad or truncate to dim
+      context = context.slice(0, this.dim);
+      while (context.length < this.dim) context.push(0);
+    }
+    // ε-greedy: 5% pure exploration
+    if (Math.random() < EXPLORATION_RATE) {
+      const random = candidates[Math.floor(Math.random() * candidates.length)];
+      return { ...random, ucb: null, explored: true };
+    }
+    let best = null;
+    let bestUcb = -Infinity;
+    for (const c of candidates) {
+      const key = this._armKey(tier, c.provider, c.model);
+      const arm = this._ensureArm(key);
+      let Ainv;
+      try {
+        Ainv = _inv(arm.A);
+      } catch (err) {
+        continue;
+      }
+      const theta = _matVec(Ainv, arm.b);
+      const mean = _dot(theta, context);
+      const variance = _dot(context, _matVec(Ainv, context));
+      const ucb = mean + this.alpha * Math.sqrt(Math.max(0, variance));
+      if (ucb > bestUcb) {
+        bestUcb = ucb;
+        best = { ...c, ucb, explored: false };
+      }
+    }
+    return best;
+  }
+  /**
+   * Update the chosen arm with the observed reward.
+   * @param {string} tier
+   * @param {string} provider
+   * @param {string} model
+   * @param {number[]} context
+   * @param {number} reward — typically in [0, 100]; will be rescaled to [0, 1] internally
+   */
+  update(tier, provider, model, context, reward) {
+    const key = this._armKey(tier, provider, model);
+    const arm = this._ensureArm(key);
+    let ctx = context;
+    if (ctx.length !== this.dim) {
+      ctx = ctx.slice(0, this.dim);
+      while (ctx.length < this.dim) ctx.push(0);
+    }
+    const r = Math.max(0, Math.min(1, reward / 100));
+    _addMat(arm.A, _outer(ctx, ctx));
+    _addVec(arm.b, ctx.map(x => x * r));
+    arm.count++;
+    this.steps++;
+    // Save periodically (not every step to limit IO)
+    if (this.steps % 25 === 0) this._save();
+  }
+  _save() {
+    try {
+      fs.mkdirSync(path.dirname(STATE_PATH), { recursive: true });
+      const arms = {};
+      for (const [k, v] of this.arms) arms[k] = v;
+      fs.writeFileSync(STATE_PATH, JSON.stringify({
+        savedAt: Date.now(),
+        steps: this.steps,
+        alpha: this.alpha,
+        lambda: this.lambda,
+        mu: this.mu,
+        dim: this.dim,
+        arms,
+      }, null, 0));
+    } catch (err) {
+      logger.debug({ err: err.message }, '[Bandit] State save failed');
+    }
+  }
+  _load() {
+    try {
+      if (!fs.existsSync(STATE_PATH)) return;
+      const raw = JSON.parse(fs.readFileSync(STATE_PATH, 'utf8'));
+      if (raw.dim && raw.dim === this.dim) {
+        for (const [k, v] of Object.entries(raw.arms || {})) {
+          this.arms.set(k, v);
+        }
+        this.steps = raw.steps || 0;
+        logger.info({ arms: this.arms.size, steps: this.steps }, '[Bandit] State loaded');
+      }
+    } catch (err) {
+      logger.debug({ err: err.message }, '[Bandit] State load failed');
+    }
+  }
+  getStats() {
+    const armStats = {};
+    for (const [k, v] of this.arms) {
+      armStats[k] = { count: v.count };
+    }
+    return { steps: this.steps, arms: armStats, alpha: this.alpha };
+  }
+}
+let _instance = null; // module-level LinUCBBandit, created lazily by getBandit()
+function getBandit() { // Return the shared instance, constructing it with default options on first call.
+  if (!_instance) _instance = new LinUCBBandit();
+  return _instance;
+}
+module.exports = { LinUCBBandit, getBandit, FEATURE_DIM };

package/src/routing/cascade.js ADDED Viewed

@@ -0,0 +1,106 @@
+/**
+ * Small-first cascade with confidence-based deferral (Phase 3.3).
+ *
+ * For tier-MEDIUM/COMPLEX requests, optionally try a smaller model first.
+ * If the response confidence (from confidence-scorer) ≥ threshold, accept it.
+ * Otherwise, escalate to the originally-routed tier model.
+ *
+ * Off by default for streaming (can't retry mid-stream cleanly).
+ * Opt-in via LYNKR_CASCADE_ENABLED=true.
+ */
+const logger = require('../logger');
+const confidenceScorer = require('./confidence-scorer');
+const DEFAULT_THRESHOLD = 0.85; // confidence cut-off used by run() when args.threshold is null/undefined
+const TIERS_ELIGIBLE = ['MEDIUM', 'COMPLEX']; // only these tiers pass shouldCascade()
+function isEnabled() { // True only when LYNKR_CASCADE_ENABLED is exactly the string 'true'; the env var is read on every call.
+  return process.env.LYNKR_CASCADE_ENABLED === 'true';
+}
+/**
+ * Decide whether a request qualifies for the small-first cascade.
+ *
+ * All of the following must hold: the feature is enabled, the request is
+ * not streaming, it carries no tools, and its tier is in TIERS_ELIGIBLE.
+ *
+ * @param {object} args
+ * @param {string} args.tier — the originally selected tier
+ * @param {boolean} args.streaming — true if the request is streaming
+ * @param {boolean} args.hasTools — true if tools are present
+ * @returns {boolean}
+ */
+function shouldCascade(args) {
+  if (!isEnabled()) return false;
+  if (args.streaming) return false; // streaming responses can't be retried cleanly
+  if (args.hasTools) return false; // tool calls have side effects; don't double-run
+  if (!TIERS_ELIGIBLE.includes(args.tier)) return false;
+  return true;
+}
+/**
+ * Run a small-first cascade.
+ *
+ * The small model is invoked first. If that call throws, the big model is
+ * invoked straight away. Otherwise the small response is scored and kept
+ * when its confidence is >= threshold; below the threshold the big model is
+ * invoked and its response returned instead.
+ *
+ * Only the small-model call is wrapped in try/catch: errors from the big
+ * model or from the confidence scorer reject the returned promise.
+ *
+ * @param {object} args
+ * @param {object} args.payload — the request payload
+ * @param {object} args.smallModel — { provider, model }
+ * @param {object} args.bigModel — { provider, model }
+ * @param {function} args.invoke — async (provider, model, payload) → response
+ * @param {string} args.taskType — used by confidence scorer
+ * @param {number} args.threshold — confidence threshold, defaults to 0.85
+ * @param {function} args.judge — optional judge LLM for reasoning tasks
+ * @returns {Promise<{ response, usedModel, cascadeStats }>} cascadeStats always
+ *   has accepted, smallLatency, bigLatency and totalLatency (milliseconds);
+ *   it also has `reason: 'small_failed'` when the small call threw,
+ *   `confidence` once scoring ran, and `threshold` on a low-confidence escalation
+ */
+async function run(args) {
+  const threshold = args.threshold ?? DEFAULT_THRESHOLD;
+  const start = Date.now();
+  let smallLatency = 0;
+  let bigLatency = 0;
+  // Try small model
+  let smallResponse;
+  try {
+    const t0 = Date.now();
+    smallResponse = await args.invoke(args.smallModel.provider, args.smallModel.model, args.payload);
+    smallLatency = Date.now() - t0;
+  } catch (err) {
+    logger.debug({ err: err.message }, '[Cascade] Small model failed, escalating');
+    const t0 = Date.now();
+    const bigResponse = await args.invoke(args.bigModel.provider, args.bigModel.model, args.payload);
+    bigLatency = Date.now() - t0;
+    return {
+      response: bigResponse,
+      usedModel: args.bigModel,
+      cascadeStats: { accepted: false, reason: 'small_failed', smallLatency, bigLatency, totalLatency: Date.now() - start }, // smallLatency is still 0 here: the small call threw before it was measured
+    };
+  }
+  const confidence = await confidenceScorer.score(smallResponse, {
+    taskType: args.taskType,
+    question: args.payload?.messages?.[args.payload.messages.length - 1]?.content, // content of the last message, passed through as-is
+    judge: args.judge,
+  });
+  if (confidence >= threshold) {
+    return {
+      response: smallResponse,
+      usedModel: args.smallModel,
+      cascadeStats: { accepted: true, confidence, smallLatency, bigLatency: 0, totalLatency: Date.now() - start },
+    };
+  }
+  // Escalate
+  const t0 = Date.now();
+  const bigResponse = await args.invoke(args.bigModel.provider, args.bigModel.model, args.payload);
+  bigLatency = Date.now() - t0;
+  return {
+    response: bigResponse,
+    usedModel: args.bigModel,
+    cascadeStats: {
+      accepted: false,
+      confidence,
+      threshold,
+      smallLatency,
+      bigLatency,
+      totalLatency: Date.now() - start, // includes the small call, scoring and the big call
+    },
+  };
+}
+module.exports = { run, shouldCascade, isEnabled, DEFAULT_THRESHOLD };

package/src/routing/complexity-analyzer.js CHANGED Viewed

@@ -395,24 +395,16 @@ function extractContent(payload) {
 }
 /**
- * Estimate token count (rough approximation)
+ * Estimate token count.
+ *
+ * Phase 1.1: delegates to the tiktoken-backed tokenizer (graceful fallback to
+ * chars/4 if js-tiktoken is unavailable).
  */
+const { countPayloadTokens } = require('./tokenizer');
 function estimateTokens(payload) {
   if (!payload?.messages) return 0;
-  let totalChars = 0;
-  for (const msg of payload.messages) {
-    if (typeof msg.content === 'string') {
-      totalChars += msg.content.length;
-    } else if (Array.isArray(msg.content)) {
-      for (const block of msg.content) {
-        if (block?.text) totalChars += block.text.length;
-      }
-    }
-  }
-  // Rough approximation: 4 chars per token
-  return Math.ceil(totalChars / 4);
+  return countPayloadTokens(payload, payload?.model);
 }
 /**