lynkr 9.1.2 → 9.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/README.md +21 -10
  2. package/package.json +3 -1
  3. package/scripts/build-knn-index.js +130 -0
  4. package/scripts/calibrate-thresholds.js +197 -0
  5. package/scripts/compare-policies.js +67 -0
  6. package/scripts/learn-output-ratios.js +162 -0
  7. package/scripts/refresh-pricing.js +122 -0
  8. package/scripts/run-routerarena.js +26 -0
  9. package/scripts/sample-regret.js +84 -0
  10. package/scripts/train-risk-classifier.js +191 -0
  11. package/src/api/middleware/budget-enforcer.js +60 -0
  12. package/src/api/middleware/load-shedding.js +11 -1
  13. package/src/api/middleware/tenant.js +21 -0
  14. package/src/api/router.js +19 -40
  15. package/src/budget/hierarchical-budget.js +159 -0
  16. package/src/cache/semantic.js +28 -2
  17. package/src/clients/databricks.js +59 -5
  18. package/src/config/index.js +239 -43
  19. package/src/context/toon.js +5 -4
  20. package/src/orchestrator/index.js +44 -6
  21. package/src/prompts/system.js +34 -6
  22. package/src/routing/bandit.js +246 -0
  23. package/src/routing/cascade.js +106 -0
  24. package/src/routing/complexity-analyzer.js +7 -15
  25. package/src/routing/confidence-scorer.js +121 -0
  26. package/src/routing/context-validator.js +71 -0
  27. package/src/routing/cost-optimizer.js +5 -2
  28. package/src/routing/deadline.js +52 -0
  29. package/src/routing/drift-monitor.js +113 -0
  30. package/src/routing/embedding-cache.js +77 -0
  31. package/src/routing/index.js +314 -5
  32. package/src/routing/knn-router.js +206 -0
  33. package/src/routing/latency-tracker.js +113 -71
  34. package/src/routing/model-tiers.js +156 -6
  35. package/src/routing/output-ratios.js +57 -0
  36. package/src/routing/regret-estimator.js +91 -0
  37. package/src/routing/reward-pipeline.js +62 -0
  38. package/src/routing/risk-classifier.js +130 -0
  39. package/src/routing/shadow-mode.js +77 -0
  40. package/src/routing/tenant-policy.js +96 -0
  41. package/src/routing/tokenizer.js +162 -0
  42. package/src/server.js +9 -0
@@ -0,0 +1,121 @@
1
+ /**
2
+ * Confidence scoring for cascade responses (Phase 3.3).
3
+ *
4
+ * Given a response from a smaller model, estimate whether it's confident
5
+ * enough to return as-is or whether we should escalate to a bigger model.
6
+ *
7
+ * Three strategies, picked by task type:
8
+ * - Factoid: detect refusal/uncertainty markers
9
+ * - Code: parse-validity check, completeness markers
10
+ * - Reasoning: optional judge-LLM (heuristic fallback when judge unavailable)
11
+ *
12
+ * Returns a [0, 1] confidence score. Caller compares against a threshold
13
+ * (default 0.85).
14
+ */
15
+
16
+ const logger = require('../logger');
17
+
18
/**
 * Phrases that indicate the model hedged or lacked the information to answer.
 * Contractions accept both the ASCII apostrophe (') and the typographic one
 * (’, U+2019) so that "I don’t know" is detected just like "I don't know".
 */
const UNCERTAINTY_MARKERS = [
  /\bi don['’]t know\b/i,
  /\bi['’]m not sure\b/i,
  /\bi cannot\b/i,
  /\bi am unable\b/i,
  /\bunable to\b/i,
  /\bnot certain\b/i,
  /\bunclear\b/i,
  /\bambiguous\b/i,
  /\b(?:no|insufficient) (?:information|context|details)\b/i,
];

/**
 * Phrases that indicate the model declined to answer.
 * Apostrophes are matched in both ASCII and typographic form (see above).
 */
const REFUSAL_MARKERS = [
  /\bi can['’]t help\b/i,
  /\bi won['’]t\b/i,
  /\bagainst (?:my )?(?:guidelines|policy)\b/i,
];

/**
 * Patterns that suggest generated code was left unfinished: TODO comments,
 * a line ending in an ellipsis, "implement this/here/me", and placeholder tags.
 */
const CODE_INCOMPLETE_MARKERS = [
  /\/\/\s*TODO\b/,
  /\/\*\s*TODO\b/,
  /#\s*TODO\b/i,
  /\.\.\.\s*$/m,
  /\bimplement (?:this|here|me)\b/i,
  /<replace[_ -]?this>/i,
  /<your[_ -]?code>/i,
];
45
+
46
/**
 * Flatten a response `content` value into a single string.
 *
 * @param {string|Array<{type?: string, text?: string}>|*} content
 *   Either a plain string, or an array of content blocks.
 * @returns {string} The string itself, or the `text` of every block whose
 *   `type` is 'text' joined with single spaces. Anything else yields ''.
 */
function _extractText(content) {
  if (typeof content === 'string') return content;
  if (!Array.isArray(content)) return '';

  const pieces = [];
  for (const block of content) {
    // Only text blocks contribute; a text block without text adds an empty piece.
    if (block?.type === 'text') pieces.push(block.text || '');
  }
  return pieces.join(' ');
}
57
+
58
/**
 * Report whether any of the given regular expressions matches the text.
 *
 * @param {string} text - Text to scan.
 * @param {Iterable<RegExp>} patterns - Patterns to try, in order.
 * @returns {boolean} true as soon as one pattern matches, otherwise false.
 */
function _hasMarkers(text, patterns) {
  return [...patterns].some((pattern) => pattern.test(text));
}
64
+
65
/**
 * Confidence score for a factoid-style answer.
 *
 * @param {{content?: *}} response - Model response; text is read from `content`.
 * @returns {number} 0 for an empty answer, 0.2 for a refusal, 0.5 for a hedged
 *   answer, otherwise 0.9 / 0.85 / 0.8 depending on answer length.
 */
function scoreFactoid(response) {
  const answer = _extractText(response?.content);
  if (!answer) return 0;
  if (_hasMarkers(answer, REFUSAL_MARKERS)) return 0.2;
  if (_hasMarkers(answer, UNCERTAINTY_MARKERS)) return 0.5;

  // Shorter answers are trusted more; each tier is [exclusive length limit, score].
  const lengthTiers = [
    [200, 0.9],
    [500, 0.85],
  ];
  for (const [limit, tierScore] of lengthTiers) {
    if (answer.length < limit) return tierScore;
  }
  return 0.8;
}
75
+
76
/**
 * Confidence score for a code-generation answer.
 *
 * @param {{content?: *}} response - Model response; text is read from `content`.
 * @returns {number} 0 for an empty answer, 0.4 when incompleteness markers are
 *   present, 0.55 when the answer hedges, 0.6 when it has no fenced code block,
 *   0.5 when brackets inside the fenced code are unbalanced by more than two,
 *   otherwise 0.9.
 */
function scoreCode(response) {
  const answer = _extractText(response?.content);
  if (!answer) return 0;
  if (_hasMarkers(answer, CODE_INCOMPLETE_MARKERS)) return 0.4;
  if (_hasMarkers(answer, UNCERTAINTY_MARKERS)) return 0.55;

  // Gather every fenced block; an answer to a code request without one is suspicious.
  const blocks = answer.match(/```[\s\S]*?```/g) || [];
  const code = blocks.join('\n');
  if (!code) return 0.6;

  // Crude balance check: net count of opening minus closing brackets.
  let imbalance = 0;
  for (const ch of code) {
    if (ch === '{' || ch === '[' || ch === '(') imbalance += 1;
    else if (ch === '}' || ch === ']' || ch === ')') imbalance -= 1;
  }
  return Math.abs(imbalance) > 2 ? 0.5 : 0.9;
}
90
+
91
/**
 * Confidence score for a reasoning-style answer.
 *
 * Refusal and uncertainty markers short-circuit first. Otherwise an optional
 * judge callback decides; when it is absent, throws, or returns something that
 * is not a usable number, a sentence-count heuristic is used.
 *
 * @param {{content?: *}} response - Model response; text is read from `content`.
 * @param {{judge?: Function, question?: *}} [opts]
 *   `judge({ question, answer })` may return (or resolve to) a number, which
 *   is clamped to [0, 1].
 * @returns {Promise<number>} Confidence in [0, 1].
 */
async function scoreReasoning(response, opts = {}) {
  const text = _extractText(response?.content);
  if (!text) return 0;
  if (_hasMarkers(text, REFUSAL_MARKERS)) return 0.2;
  if (_hasMarkers(text, UNCERTAINTY_MARKERS)) return 0.5;

  // Optional judge LLM via opts.judge({ question, answer }) → [0, 1]
  if (typeof opts?.judge === 'function') {
    try {
      const judged = await opts.judge({ question: opts.question, answer: text });
      // NaN is typeof 'number' and survives Math.min/Math.max, so it must be
      // rejected explicitly or the caller would receive NaN as a score.
      if (typeof judged === 'number' && !Number.isNaN(judged)) {
        return Math.max(0, Math.min(1, judged));
      }
    } catch (err) {
      // The judge may throw a non-Error value; never fail while logging.
      logger.debug(
        { err: err?.message ?? String(err) },
        '[ConfidenceScorer] Judge LLM failed, using heuristic'
      );
    }
  }

  // Heuristic: count sentence terminators that are followed by whitespace.
  const sentenceCount = (text.match(/[.!?]+\s/g) || []).length;
  if (sentenceCount < 2) return 0.6;
  if (sentenceCount > 30) return 0.7; // very long answers are often padding
  return 0.85;
}
111
+
112
/**
 * Dispatch to the scoring strategy that matches the task type.
 *
 * The task type is matched case-insensitively by substring: anything
 * containing 'code' uses the code scorer; anything containing 'factoid' or
 * 'qa' (which also covers 'simple_qa') uses the factoid scorer; everything
 * else, including a missing or non-string task type, uses the reasoning scorer.
 *
 * @param {{content?: *}} response - Model response to score.
 * @param {{taskType?: string, judge?: Function, question?: *}} [opts]
 *   Forwarded to the reasoning scorer when that strategy is selected.
 * @returns {Promise<number>} Confidence score from the selected strategy.
 */
async function score(response, opts = {}) {
  // A default parameter only covers `undefined`; tolerate an explicit null too.
  const options = opts ?? {};
  const requested = options.taskType;
  // Guard against non-string task types, which would otherwise throw on toLowerCase().
  const taskType = (typeof requested === 'string' && requested ? requested : 'reasoning').toLowerCase();

  if (taskType.includes('code')) return scoreCode(response);
  if (taskType.includes('factoid') || taskType.includes('qa')) {
    return scoreFactoid(response);
  }
  return scoreReasoning(response, options);
}
120
+
121
+ module.exports = { score, scoreFactoid, scoreCode, scoreReasoning };
@@ -0,0 +1,71 @@
1
+ /**
2
+ * Context-window validation for routed models.
3
+ *
4
+ * After tier selection, verifies the chosen model can hold the estimated
5
+ * input tokens (plus response headroom). When it can't, callers should
6
+ * escalate to a context-capable model.
7
+ *
8
+ * Phase 1.3 of the routing overhaul.
9
+ *
10
+ * @module routing/context-validator
11
+ */
12
+
13
+ const logger = require('../logger');
14
+ const { getModelRegistrySync } = require('./model-registry');
15
+
16
+ /** Fraction of the context window reserved for the prompt (rest left for response). */
17
+ const HEADROOM_FRACTION = 0.85;
18
+
19
/**
 * Look up a model's context-window size in the model registry.
 *
 * @param {string} model - Model identifier.
 * @returns {number|null} The registry entry's `context` value, or null when the
 *   model is falsy, the entry has no truthy `context`, or the lookup throws.
 */
function getContextLimit(model) {
  if (!model) return null;
  try {
    const entry = getModelRegistrySync().getCost(model);
    const windowSize = entry?.context;
    return windowSize ? windowSize : null;
  } catch (err) {
    // Best effort: an unavailable registry is treated as "unknown".
    return null;
  }
}
29
+
30
/**
 * Boolean fit check for a model's context window.
 *
 * @param {string} model - Model identifier.
 * @param {number} estimatedTokens - Estimated input tokens.
 * @param {number} [fraction=HEADROOM_FRACTION] - Share of the window the
 *   prompt may occupy.
 * @returns {boolean} true when the tokens fit within `context * fraction`, and
 *   also when the context size is unknown (there is no data to reject on).
 */
function fits(model, estimatedTokens, fraction = HEADROOM_FRACTION) {
  const windowSize = getContextLimit(model);
  return !windowSize || estimatedTokens <= windowSize * fraction;
}
39
+
40
/**
 * Validate estimated input tokens against a model's context window and
 * report the numbers behind the decision.
 *
 * @param {string} model - Model identifier.
 * @param {number} estimatedTokens - Estimated input tokens.
 * @returns {{ ok: boolean, context: number|null, required: number, limit: number|null, reason?: string }}
 *   `limit` is the context size scaled by HEADROOM_FRACTION and rounded down.
 *   When the context size is unknown the result is ok with
 *   `reason: 'unknown_context'` and null `context`/`limit`.
 */
function validate(model, estimatedTokens) {
  const context = getContextLimit(model);
  if (!context) {
    return {
      ok: true,
      reason: 'unknown_context',
      context: null,
      required: estimatedTokens,
      limit: null,
    };
  }

  const limit = Math.floor(context * HEADROOM_FRACTION);
  const ok = estimatedTokens <= limit;
  if (!ok) {
    logger.debug(
      { model, context, required: estimatedTokens, limit },
      '[ContextValidator] Estimated tokens exceed model context'
    );
  }
  return { ok, context, required: estimatedTokens, limit };
}
65
+
66
+ module.exports = {
67
+ validate,
68
+ fits,
69
+ getContextLimit,
70
+ HEADROOM_FRACTION,
71
+ };
@@ -8,6 +8,7 @@ const logger = require('../logger');
8
8
  const config = require('../config');
9
9
  const { getModelRegistry, getModelRegistrySync } = require('./model-registry');
10
10
  const { getModelTierSelector, TIER_DEFINITIONS } = require('./model-tiers');
11
+ const { ratioFor } = require('./output-ratios');
11
12
 
12
13
  // Session cost tracking (in-memory)
13
14
  const sessionCosts = new Map(); // sessionId -> { total, requests, byModel, byProvider }
@@ -62,12 +63,14 @@ class CostOptimizer {
62
63
  * @param {number} outputTokens - Estimated output tokens (optional)
63
64
  * @returns {Object} Cost estimate
64
65
  */
65
- estimateCost(model, inputTokens, outputTokens = null) {
66
+ estimateCost(model, inputTokens, outputTokens = null, taskType = null) {
66
67
  const registry = this._getRegistry();
67
68
  const costs = registry.getCost(model);
68
69
 
69
70
  const inputCost = (inputTokens / 1_000_000) * costs.input;
70
- const estimatedOutputTokens = outputTokens || Math.min(inputTokens * 0.5, 4096);
71
+ // Phase 2.3: per-task-type output ratio learned from telemetry
72
+ const ratio = taskType ? ratioFor(taskType) : 0.5;
73
+ const estimatedOutputTokens = outputTokens || Math.min(inputTokens * ratio, costs.maxOutput || 4096);
71
74
  const outputCost = (estimatedOutputTokens / 1_000_000) * costs.output;
72
75
 
73
76
  return {
@@ -0,0 +1,52 @@
1
+ /**
2
+ * Deadline-aware routing (Phase 6.3).
3
+ *
4
+ * Reads LYNKR-Deadline-Ms from the request, filters out candidate models
5
+ * whose P95 latency exceeds the deadline. If the originally-routed model
6
+ * is too slow, find a faster qualifying alternative.
7
+ */
8
+
9
+ const { getLatencyTracker } = require('./latency-tracker');
10
+
11
+ const SAFETY_FACTOR = 1.2; // leave 20% safety margin against P95 estimates
12
+
13
/**
 * Read the request deadline, in milliseconds, from the LYNKR-Deadline-Ms header.
 *
 * @param {{headers?: Object}|Object} req - A request carrying `headers`, or a
 *   bare header map.
 * @returns {number|null} The deadline as a positive finite number, or null
 *   when the header is absent or not a positive finite number.
 */
function getDeadlineMs(req) {
  if (!req) return null;

  const headers = req.headers || req;
  const value = headers['lynkr-deadline-ms'] || headers['LYNKR-Deadline-Ms'];
  if (!value) return null;

  const ms = Number(value);
  if (!Number.isFinite(ms) || ms <= 0) return null;
  return ms;
}
21
+
22
/**
 * Check whether a routed model is fast enough for the deadline.
 *
 * @param {string} provider - Provider name passed to the latency tracker.
 * @param {string} model - Model name passed to the latency tracker.
 * @param {number|null} deadlineMs - Deadline in ms; falsy means "no deadline".
 * @returns {boolean} true when there is no deadline, when no P95 is known for
 *   the model, or when P95 scaled by SAFETY_FACTOR is within the deadline.
 */
function fits(provider, model, deadlineMs) {
  if (!deadlineMs) return true;
  const p95 = getLatencyTracker().getModelP95(provider, model);
  // Treat both null and undefined as "unknown", matching chooseFastest's `??`
  // fallback. NOTE(review): the tracker's return value for an unseen model is
  // not visible here — confirm which of the two it uses.
  if (p95 == null) return true; // unknown — assume yes
  return p95 * SAFETY_FACTOR <= deadlineMs;
}
32
+
33
/**
 * Pick the candidate with the lowest P95 latency that still meets the deadline.
 *
 * @param {Array<{provider: string, model: string}>} candidates - Models to consider.
 * @param {number|null} deadlineMs - Deadline in ms; falsy disables the filter.
 * @returns {Object|null} A copy of the winning candidate with its `p95`
 *   attached, or null when the list is empty/not an array or nothing
 *   qualifies. Candidates with no recorded P95 are assumed to take 5000 ms.
 *   On equal P95 the earlier candidate wins.
 */
function chooseFastest(candidates, deadlineMs) {
  if (!Array.isArray(candidates) || candidates.length === 0) return null;

  const tracker = getLatencyTracker();
  let winner = null;
  for (const candidate of candidates) {
    const p95 = tracker.getModelP95(candidate.provider, candidate.model) ?? 5000;
    const meetsDeadline = !deadlineMs || p95 * SAFETY_FACTOR <= deadlineMs;
    if (!meetsDeadline) continue;

    const bestSoFar = winner === null ? Infinity : winner.p95;
    if (p95 < bestSoFar) {
      winner = { ...candidate, p95 };
    }
  }
  return winner;
}
51
+
52
+ module.exports = { getDeadlineMs, fits, chooseFastest, SAFETY_FACTOR };
@@ -0,0 +1,113 @@
1
+ /**
2
+ * Drift monitor (Phase 4.3).
3
+ *
4
+ * Tracks two kinds of drift:
5
+ * - Input drift: distribution of query embeddings week-over-week via PSI
6
+ * (Population Stability Index) over coarse bucket assignments.
7
+ * - Output drift: refusal rate, average response length, latency
8
+ * distribution per model.
9
+ *
10
+ * Computes a PSI per metric; alerts when PSI > 0.2 (warning) or > 0.3
11
+ * (full retrain recommended). Writes alerts to data/drift-alerts.json.
12
+ *
13
+ * Auto-retrain is gated on LYNKR_AUTO_RETRAIN=true and not implemented here —
14
+ * the consumer (a cron job or the dashboard) decides what to do.
15
+ */
16
+
17
+ const fs = require('fs');
18
+ const path = require('path');
19
+ const logger = require('../logger');
20
+
21
+ const ALERTS_PATH = path.join(__dirname, '../../data/drift-alerts.json');
22
+ const WARN_THRESHOLD = 0.2;
23
+ const RETRAIN_THRESHOLD = 0.3;
24
+
25
/**
 * Count how many values fall into each of `bucketCount` equal-width buckets.
 *
 * @param {number[]} values - Samples to histogram.
 * @param {number} [bucketCount=10] - Number of buckets.
 * @param {number} [min] - Lower bound; defaults to the smallest finite value.
 * @param {number} [max] - Upper bound; defaults to the largest finite value.
 * @returns {number[]} Per-bucket counts. Values outside [min, max] are clamped
 *   into the first or last bucket; NaN values are ignored.
 */
function _bucketize(values, bucketCount = 10, min, max) {
  const counts = new Array(bucketCount).fill(0);
  if (values.length === 0) return counts;

  let lo = min;
  let hi = max;
  if (lo == null || hi == null) {
    // Scan with a loop instead of Math.min(...values): spreading a large
    // array into a call exceeds the engine's argument limit (RangeError).
    let seenMin = Infinity;
    let seenMax = -Infinity;
    for (const v of values) {
      const n = Number(v);
      if (!Number.isFinite(n)) continue;
      if (n < seenMin) seenMin = n;
      if (n > seenMax) seenMax = n;
    }
    // No finite sample at all: fall back to a degenerate [0, 0] range.
    if (seenMin > seenMax) {
      seenMin = 0;
      seenMax = 0;
    }
    lo = lo ?? seenMin;
    hi = hi ?? seenMax;
  }

  // Guard against a zero-width range so the division below stays finite.
  const range = Math.max(1e-9, hi - lo);
  for (const v of values) {
    const raw = Math.floor(((v - lo) / range) * bucketCount);
    // NaN has no bucket; indexing with it would add a stray "NaN" property.
    if (Number.isNaN(raw)) continue;
    counts[Math.max(0, Math.min(bucketCount - 1, raw))]++;
  }
  return counts;
}
37
+
38
/**
 * Population Stability Index between two bucketed distributions.
 * PSI = Σ (p_new - p_old) · ln(p_new / p_old)
 *
 * Each bucket share is smoothed by adding 0.5 to its count so that empty
 * buckets do not produce ln(0) or a division by zero.
 *
 * @param {number[]} oldCounts - Bucket counts of the reference window.
 * @param {number[]} newCounts - Bucket counts of the current window.
 * @returns {number} The PSI, or 0 when either window has no samples. If the
 *   arrays differ in length, the shorter one is treated as having empty
 *   trailing buckets instead of yielding NaN.
 */
function psi(oldCounts, newCounts) {
  const total = (counts) => counts.reduce((sum, count) => sum + count, 0);
  const oldTotal = total(oldCounts);
  const newTotal = total(newCounts);
  if (oldTotal === 0 || newTotal === 0) return 0;

  const SMOOTHING = 0.5;
  const buckets = Math.max(oldCounts.length, newCounts.length);
  let index = 0;
  for (let i = 0; i < buckets; i++) {
    const p = ((oldCounts[i] ?? 0) + SMOOTHING) / (oldTotal + buckets * SMOOTHING);
    const q = ((newCounts[i] ?? 0) + SMOOTHING) / (newTotal + buckets * SMOOTHING);
    index += (q - p) * Math.log(q / p);
  }
  return index;
}
54
+
55
/**
 * Append a drift alert to the alerts file at ALERTS_PATH.
 *
 * The alert is stamped with the current time, added to whatever array the
 * file already holds (unreadable or non-array content counts as empty), and
 * only the most recent 200 entries are written back. Any failure is logged
 * as a warning and never thrown.
 *
 * @param {Object} alert - Alert fields to record.
 * @returns {void}
 */
function _writeAlert(alert) {
  const readHistory = () => {
    if (!fs.existsSync(ALERTS_PATH)) return [];
    let parsed = [];
    try {
      parsed = JSON.parse(fs.readFileSync(ALERTS_PATH, 'utf8'));
    } catch {
      // Unreadable or corrupt history is discarded and restarted.
    }
    return Array.isArray(parsed) ? parsed : [];
  };

  try {
    fs.mkdirSync(path.dirname(ALERTS_PATH), { recursive: true });
    const entries = [...readHistory(), { ...alert, timestamp: Date.now() }];
    fs.writeFileSync(ALERTS_PATH, JSON.stringify(entries.slice(-200), null, 2));
  } catch (err) {
    logger.warn({ err: err.message }, '[DriftMonitor] Alert write failed');
  }
}
69
+
70
/**
 * Detect drift between two value series.
 *
 * Both series are bucketed over their combined range, compared with PSI, and
 * classified against the warn/retrain thresholds. Any level other than 'ok'
 * is also recorded through the alert writer.
 *
 * @param {string} metric - name for logging
 * @param {number[]} oldValues - reference window (e.g. last week)
 * @param {number[]} newValues - current window (e.g. last 24h)
 * @returns {{ psi: number, level: string, metric: string }} `level` is
 *   'insufficient_data' (fewer than 50 old or 20 new samples), 'ok', 'warn'
 *   or 'retrain'.
 */
function detect(metric, oldValues, newValues) {
  if (oldValues.length < 50 || newValues.length < 20) {
    return { psi: 0, level: 'insufficient_data', metric };
  }

  // Find the shared range with a loop rather than Math.min(...a, ...b):
  // spreading large windows into a call exceeds the engine's argument limit
  // and throws a RangeError.
  let combinedMin = Infinity;
  let combinedMax = -Infinity;
  for (const series of [oldValues, newValues]) {
    for (const v of series) {
      if (v < combinedMin) combinedMin = v;
      if (v > combinedMax) combinedMax = v;
    }
  }

  const oldB = _bucketize(oldValues, 10, combinedMin, combinedMax);
  const newB = _bucketize(newValues, 10, combinedMin, combinedMax);
  const p = psi(oldB, newB);

  let level = 'ok';
  if (p >= RETRAIN_THRESHOLD) level = 'retrain';
  else if (p >= WARN_THRESHOLD) level = 'warn';

  if (level !== 'ok') {
    _writeAlert({ metric, psi: p, level, oldSize: oldValues.length, newSize: newValues.length });
  }
  return { psi: p, level, metric };
}
95
+
96
/**
 * Compare refusal rates between two windows of response texts.
 *
 * A response counts as a refusal when it matches any refusal marker.
 * Contractions accept both the ASCII apostrophe (') and the typographic one
 * (’, U+2019) so that "I can’t help" is counted like "I can't help".
 *
 * @param {string[]} oldResponses - Response texts of the reference window.
 * @param {string[]} newResponses - Response texts of the current window.
 * @returns {{ old: number, new: number }} Share of refusals in each window
 *   (0 for an empty window).
 */
function refusalRateShift(oldResponses, newResponses) {
  const markers = [
    /i can['’]t help/i,
    /i won['’]t/i,
    /against (?:my )?guidelines/i,
    /i cannot/i,
  ];
  const isRefusal = (text) => markers.some((marker) => marker.test(text));
  const rate = (responses) => responses.filter(isRefusal).length / Math.max(1, responses.length);
  return { old: rate(oldResponses), new: rate(newResponses) };
}
105
+
106
+ module.exports = {
107
+ psi,
108
+ detect,
109
+ refusalRateShift,
110
+ _bucketize,
111
+ WARN_THRESHOLD,
112
+ RETRAIN_THRESHOLD,
113
+ };
@@ -0,0 +1,77 @@
1
+ /**
2
+ * In-memory LRU cache for query embeddings.
3
+ *
4
+ * Used by Phase 3.1 (kNN router) and Phase 4.3 (drift detector) to avoid
5
+ * repeated embedding calls for queries we've already seen.
6
+ */
7
+
8
+ const crypto = require('crypto');
9
+ const logger = require('../logger');
10
+
11
+ const DEFAULT_MAX = 5000;
12
+
13
/**
 * Bounded in-memory cache of embeddings keyed by the SHA-1 of the query text,
 * with least-recently-used eviction and hit/miss counters.
 */
class EmbeddingCache {
  /**
   * @param {number} [maxSize=DEFAULT_MAX] - Maximum number of entries kept.
   */
  constructor(maxSize = DEFAULT_MAX) {
    this.maxSize = maxSize;
    this.cache = new Map(); // hash -> { embedding, lastAccess }
    this.hits = 0;
    this.misses = 0;
  }

  /**
   * @param {string} text - Text to hash.
   * @returns {string} Hex SHA-1 digest used as the cache key.
   */
  _hash(text) {
    return crypto.createHash('sha1').update(text).digest('hex');
  }

  /**
   * Look up the embedding cached for a text.
   *
   * @param {string} text - Query text.
   * @returns {*} The cached embedding, or null on a miss or when `text` is
   *   empty or not a string (invalid keys do not count as misses).
   */
  get(text) {
    if (!text || typeof text !== 'string') return null;
    const key = this._hash(text);
    const entry = this.cache.get(key);
    if (!entry) {
      this.misses++;
      return null;
    }
    // LRU touch: re-inserting moves the key to the end of the Map's order.
    this.cache.delete(key);
    entry.lastAccess = Date.now();
    this.cache.set(key, entry);
    this.hits++;
    return entry.embedding;
  }

  /**
   * Store an embedding for a text, evicting the least-recently-used entry
   * when the cache grows past `maxSize`.
   *
   * @param {string} text - Query text.
   * @param {*} embedding - Embedding to cache.
   * @returns {void} Silently ignores an empty or non-string `text` and a
   *   falsy `embedding`.
   */
  set(text, embedding) {
    // Same key validation as get(): hashing a non-string key would throw.
    if (!text || typeof text !== 'string' || !embedding) return;
    const key = this._hash(text);
    if (this.cache.has(key)) this.cache.delete(key);
    this.cache.set(key, { embedding, lastAccess: Date.now() });
    if (this.cache.size > this.maxSize) {
      // Evict least-recently-used (Map keeps insertion/access order)
      const oldest = this.cache.keys().next().value;
      this.cache.delete(oldest);
    }
  }

  /**
   * @returns {{ size: number, maxSize: number, hits: number, misses: number, hitRate: string }}
   *   Counters plus the hit rate formatted to three decimals ('0' before any
   *   counted lookup).
   */
  getStats() {
    const total = this.hits + this.misses;
    return {
      size: this.cache.size,
      maxSize: this.maxSize,
      hits: this.hits,
      misses: this.misses,
      hitRate: total > 0 ? (this.hits / total).toFixed(3) : '0',
    };
  }

  /** Drop every entry and reset the hit/miss counters. */
  clear() {
    this.cache.clear();
    this.hits = 0;
    this.misses = 0;
  }
}
70
+
71
// Module-wide cache instance, created on first request.
let _instance = null;

/**
 * Return the shared EmbeddingCache, constructing it on the first call.
 *
 * @returns {EmbeddingCache} The same instance on every call.
 */
function getEmbeddingCache() {
  if (_instance) return _instance;
  _instance = new EmbeddingCache();
  return _instance;
}
76
+
77
+ module.exports = { EmbeddingCache, getEmbeddingCache };