tachibot-mcp 2.19.3 → 2.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/CHANGELOG.md +48 -0
  2. package/dist/evals/analysis/aggregator.js +92 -0
  3. package/dist/evals/analysis/bootstrap-ci.js +58 -0
  4. package/dist/evals/analysis/csv-reader.js +137 -0
  5. package/dist/evals/analysis/delta-table.js +60 -0
  6. package/dist/evals/cli.js +328 -0
  7. package/dist/evals/config.js +55 -0
  8. package/dist/evals/datasets/gsm8k.js +41 -0
  9. package/dist/evals/datasets/hf-fetcher.js +141 -0
  10. package/dist/evals/datasets/humaneval.js +36 -0
  11. package/dist/evals/datasets/livecodebench.js +80 -0
  12. package/dist/evals/datasets/sample.js +37 -0
  13. package/dist/evals/datasets/truthfulqa.js +53 -0
  14. package/dist/evals/models/adapters.js +341 -0
  15. package/dist/evals/models/pricing.js +29 -0
  16. package/dist/evals/models/snapshots.js +19 -0
  17. package/dist/evals/protocols/adversarial-debate.js +51 -0
  18. package/dist/evals/protocols/best-of-n.js +77 -0
  19. package/dist/evals/protocols/parallel-council.js +76 -0
  20. package/dist/evals/protocols/prompts.js +41 -0
  21. package/dist/evals/protocols/self-consistency.js +87 -0
  22. package/dist/evals/protocols/sequential-pipeline.js +68 -0
  23. package/dist/evals/protocols/single-baseline.js +30 -0
  24. package/dist/evals/protocols/types.js +35 -0
  25. package/dist/evals/reporters/methodology-md.js +179 -0
  26. package/dist/evals/reporters/results-md.js +154 -0
  27. package/dist/evals/runners/benchmark-runner.js +223 -0
  28. package/dist/evals/runners/cost-guard.js +117 -0
  29. package/dist/evals/runners/loader-registry.js +23 -0
  30. package/dist/evals/runners/p-limit.js +29 -0
  31. package/dist/evals/runners/protocol-registry.js +61 -0
  32. package/dist/evals/runners/scorer-registry.js +26 -0
  33. package/dist/evals/runners/types.js +13 -0
  34. package/dist/evals/sandbox/python-exec.js +138 -0
  35. package/dist/evals/scorers/extract.js +85 -0
  36. package/dist/evals/scorers/gsm8k-scorer.js +19 -0
  37. package/dist/evals/scorers/humaneval-scorer.js +42 -0
  38. package/dist/evals/scorers/livecodebench-scorer.js +111 -0
  39. package/dist/evals/scorers/truthfulqa-scorer.js +15 -0
  40. package/dist/evals/types.js +7 -0
  41. package/dist/evals/writers/csv-writer.js +84 -0
  42. package/dist/evals/writers/jsonl-writer.js +68 -0
  43. package/dist/src/collaborative-orchestrator.js +4 -4
  44. package/dist/src/config/model-constants.js +66 -43
  45. package/dist/src/config/model-defaults.js +6 -6
  46. package/dist/src/config/timeout-config.js +4 -4
  47. package/dist/src/config.js +1 -1
  48. package/dist/src/modes/architect.js +4 -4
  49. package/dist/src/modes/scout.js +2 -1
  50. package/dist/src/optimization/cost-monitor.js +1 -1
  51. package/dist/src/orchestrators/collaborative/registries/ModelProviderRegistry.js +1 -1
  52. package/dist/src/server.js +4 -0
  53. package/dist/src/tools/grok-enhanced.js +15 -29
  54. package/dist/src/tools/grok-tools.js +46 -31
  55. package/dist/src/tools/openai-tools.js +17 -4
  56. package/dist/src/tools/openrouter-tools.js +2 -0
  57. package/dist/src/tools/tachi-tool.js +3 -3
  58. package/dist/src/utils/ansi-styles.js +12 -0
  59. package/dist/src/utils/openrouter-gateway.js +3 -0
  60. package/dist/src/utils/param-aliases.js +68 -0
  61. package/dist/src/workflows/model-router.js +29 -2
  62. package/dist/src/workflows/tool-mapper.js +3 -3
  63. package/docs/API_KEYS.md +9 -8
  64. package/docs/TOOL_PARAMETERS.md +1 -1
  65. package/docs/superpowers/plans/2026-04-10-grok-420-upgrade.md +498 -0
  66. package/package.json +6 -6
  67. package/dist/scripts/demo-all-components.js +0 -340
  68. package/dist/scripts/test-usage-stats.js +0 -6
package/CHANGELOG.md CHANGED
@@ -5,6 +5,54 @@ All notable changes to TachiBot MCP will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [2.21.1] - 2026-04-26
9
+
10
+ ### Changed
11
+ - **OpenAI: gpt-5.4 → gpt-5.5** (released 2026-04-23). Agentic-focused, 1.1M context, omnimodal. Pricing $5/$30 per M tokens. `gpt-5.5-pro` ($30/$180) for premium tier. `gpt-5.4-mini` retained for `code`/`explain` tools (no `gpt-5.5-mini` released yet).
12
+ - **Kimi: kimi-k2.5 → kimi-k2.6** (released 2026-04-20). 1T MoE, leads SWE-bench Pro for long-horizon coding. Pricing ~$0.74/$4.65 per M tokens. K2.5 retained as fallback.
13
+
14
+ ### Added
15
+ - **Qwen3.6-Plus** (`qwen/qwen3.6-plus`) registered in `QWEN_MODELS.PLUS_3_6`. New April 2026 general-purpose flagship at $0.325/$1.95 per M. Not yet wired as default — `qwen3-coder-next` (coder) and `qwen3-235b-thinking-2507` (reason) remain primary; awaiting `qwen3.6-coder` variant.
16
+ - Display names + pricing for `gpt-5.5`, `gpt-5.5-pro`, `kimi-k2.6`, `qwen3.6-plus`, `qwen3-235b-a22b-thinking-2507`.
17
+ - Auto-fallback: `kimi-k2.6` → `kimi-k2.5` on quota errors.
18
+
19
+ ### Notes
20
+ - Grok 5 not released (Q2 2026 expected). Keeping `grok-4.20-0309-reasoning`.
21
+ - Gemini 3.5 in preview, GA expected at Google I/O May 2026. Keeping `gemini-3.1-pro-preview`.
22
+ - Verified all model IDs against live OpenAI `/v1/models` and OpenRouter `/v1/models` endpoints before release.
23
+
24
+ ## [2.21.0] - 2026-04-13
25
+
26
+ ### Added
27
+ - **Auto-alias param names** — `z.preprocess()` hook in `safeAddTool` remaps `query` ↔ `problem` ↔ `prompt` ↔ `question` ↔ `topic` before Zod validation. LLMs that reach for the wrong synonym now succeed instead of hard-failing with `-32602 InvalidParams`.
28
+ - **Zero per-tool changes** — single source of truth in `src/utils/param-aliases.ts`; every tool benefits automatically.
29
+ - **11 unit tests** in `src/utils/__tests__/param-aliases.test.ts` covering directional aliasing, primary-wins precedence, and missing-key behavior.
30
+
31
+ ### Notes
32
+ - Primary param value always wins when both primary and alias are provided.
33
+ - Aliasing is transparent to tool implementations — the Zod schema sees the canonical key.
34
+
35
+ ## [2.20.0] - 2026-04-10
36
+
37
+ ### Changed
38
+ - **Grok 4 → 4.20** — all defaults moved to flagship.
39
+ - `grok_reason` / `grok_search` → `grok-4.20-0309-reasoning` (low hallucination, 2M context)
40
+ - `grok_architect` → `grok-4.20-multi-agent-0309` (4–16 parallel agents)
41
+ - `grok_code` / `grok_debug` / `grok_brainstorm` → `grok-4.20-0309-non-reasoning` (fast turn-around)
42
+ - **Smart timeout defaults bumped** — OpenAI 20→60s base, Grok max 90→120s.
43
+
44
+ ### Added
45
+ - **AbortController on OpenAI** — 90s default, 180s for high-reasoning. No more hung calls.
46
+ - **AbortController on Grok** — 60–180s based on model.
47
+ - **`reasoning` param** on `callGrok` for multi-agent invocation; unified `GrokModel` enum.
48
+
49
+ ### Fixed
50
+ - Stale `gpt-4-mini` → `gpt-5.4-mini` in architect + workflows.
51
+ - Hardcoded `grok-4-0709` references across 6 scattered files (OpenRouter gateway, ANSI badges, model-router, tool-mapper).
52
+
53
+ ### Docs
54
+ - Updated `docs/API_KEYS.md` and `docs/TOOL_PARAMETERS.md` for Grok 4.20.
55
+
8
56
  ## [2.19.3] - 2026-03-21
9
57
 
10
58
  ### Fixed
@@ -0,0 +1,92 @@
1
+ /**
2
+ * P9: Aggregator — turns raw RunRows into one CellStats per
3
+ * (benchmark, displayed-protocol, model_set) group.
4
+ *
5
+ * Displayed protocol:
6
+ * - For single-model protocol families (`single-baseline`, `best-of-n`,
7
+ * `self-consistency`) → `${protocol}[${model_set}]`. This makes
8
+ * `single-baseline[openai]` distinguishable from `single-baseline[kimi]`
9
+ * downstream (and lets `delta-table` rank baselines independently).
10
+ * - For multi-model protocols (`parallel-council`, `sequential-pipeline`,
11
+ * `adversarial-debate`) → just `${protocol}`. The lineup is implied.
12
+ *
13
+ * Skip semantics:
14
+ * - Rows with `error !== null` OR `score === null` are excluded from
15
+ * accuracy and cost-per-correct computations.
16
+ * - errorRate is the fraction of *all* rows in the cell that had `error` set
17
+ * (independent of score).
18
+ */
19
+ import { bootstrapCi } from './bootstrap-ci.js';
20
/** Protocol families whose rows are reported per model set. */
const SINGLE_MODEL_FAMILIES = new Set(['single-baseline', 'best-of-n', 'self-consistency']);

/**
 * Returns the protocol label used for grouping and display.
 *
 * Rows from a single-model family get a `[model_set]` suffix so that, for
 * example, `single-baseline[openai]` and `single-baseline[kimi]` end up in
 * separate cells. Every other protocol is returned unchanged.
 *
 * @param {{protocol: string, model_set: string}} row
 * @returns {string}
 */
function displayProtocol(row) {
    const { protocol, model_set: modelSet } = row;
    return SINGLE_MODEL_FAMILIES.has(protocol) ? `${protocol}[${modelSet}]` : protocol;
}
31
/**
 * Collapses raw run rows into one stats cell per
 * (benchmark, displayed protocol, model_set) group.
 *
 * Counting rules:
 *  - every row counts towards the cell's `errorRate` denominator;
 *  - a row is an error row when `error` is neither null/undefined nor '';
 *  - only error-free rows whose `score` is a finite number feed accuracy,
 *    cost and latency. Rows with a missing, NaN or non-numeric score are
 *    skipped so a single malformed row cannot turn the whole cell's mean
 *    and confidence interval into NaN;
 *  - a "correct" answer is a score of exactly 1.
 *
 * @param {Iterable<object>} rows - run rows carrying `benchmark`, `protocol`,
 *   `model_set`, `score`, `error`, `cost_usd` and `latency_ms`.
 * @param {{bootstrapResamples?: number}} [opts] - `bootstrapResamples` is
 *   forwarded to `bootstrapCi` (default 1000); the bootstrap seed is fixed at 1.
 * @returns {object[]} one cell per group, sorted by benchmark, then protocol.
 */
export function aggregate(rows, opts = {}) {
    const buckets = new Map();
    for (const row of rows) {
        const displayed = displayProtocol(row);
        const key = `${row.benchmark}|${displayed}|${row.model_set}`;
        let bucket = buckets.get(key);
        if (!bucket) {
            bucket = {
                benchmark: row.benchmark,
                displayedProtocol: displayed,
                model_set: row.model_set,
                scores: [],
                costs: [],
                latencies: [],
                totalRows: 0,
                errorRows: 0,
            };
            buckets.set(key, bucket);
        }
        bucket.totalRows += 1;
        // `!= null` deliberately matches both null and undefined.
        const hasError = row.error != null && row.error !== '';
        if (hasError) {
            bucket.errorRows += 1;
            continue;
        }
        // Number.isFinite does not coerce, so null, undefined, NaN and
        // string scores are all rejected here.
        if (!Number.isFinite(row.score)) {
            continue;
        }
        bucket.scores.push(row.score);
        bucket.costs.push(row.cost_usd ?? 0);
        bucket.latencies.push(row.latency_ms ?? 0);
    }
    const sum = (values) => values.reduce((acc, v) => acc + v, 0);
    const out = [];
    for (const bucket of buckets.values()) {
        const ci = bootstrapCi(bucket.scores, {
            resamples: opts.bootstrapResamples ?? 1000,
            seed: 1,
        });
        const totalCostUsd = sum(bucket.costs);
        const correctCount = bucket.scores.filter((s) => s === 1).length;
        const avgLatencyMs = bucket.latencies.length === 0
            ? 0
            : sum(bucket.latencies) / bucket.latencies.length;
        // No correct answers: cost-per-correct is reported as Infinity.
        const costPerCorrect = correctCount === 0 ? Infinity : totalCostUsd / correctCount;
        const errorRate = bucket.totalRows === 0 ? 0 : bucket.errorRows / bucket.totalRows;
        out.push({
            benchmark: bucket.benchmark,
            protocol: bucket.displayedProtocol,
            model_set: bucket.model_set,
            n: bucket.scores.length,
            mean: ci.mean,
            ciLow: ci.ciLow,
            ciHigh: ci.ciHigh,
            totalCostUsd,
            avgLatencyMs,
            correctCount,
            costPerCorrect,
            errorRate,
        });
    }
    // Order: benchmark ascending, then protocol ascending.
    out.sort((a, b) => a.benchmark === b.benchmark
        ? a.protocol.localeCompare(b.protocol)
        : a.benchmark.localeCompare(b.benchmark));
    return out;
}
@@ -0,0 +1,58 @@
1
+ /**
2
+ * P9: Bootstrap confidence interval for binary scores.
3
+ *
4
+ * Standard percentile-of-resamples bootstrap. Input is an array of 0/1
5
+ * (correct/incorrect). Output is the observed mean and the alpha/2 and
6
+ * 1-alpha/2 quantiles of the resampled-mean distribution.
7
+ *
8
+ * Determinism: a `seed` parameter feeds the same mulberry32 PRNG used by
9
+ * `evals/datasets/sample.ts`, so the same seed → identical CI bounds across
10
+ * Node versions.
11
+ *
12
+ * Edge cases:
13
+ * scores.length === 0 → all NaN, n=0
14
+ * scores.length === 1 → CI collapses to [score, score]
15
+ */
16
+ import { mulberry32 } from '../datasets/sample.js';
17
/**
 * Percentile bootstrap confidence interval for an array of scores.
 *
 * Draws `resamples` samples of size n with replacement, takes the mean of
 * each, and reports the alpha/2 and 1-alpha/2 quantiles of those means
 * together with the observed mean.
 *
 * @param {number[]} scores - observed scores.
 * @param {{resamples?: number, alpha?: number, seed?: number}} [opts]
 *   - `resamples`: number of bootstrap draws (default 1000). Fractional
 *     values are floored and the result is raised to at least 1.
 *   - `alpha`: two-sided significance level (default 0.05).
 *   - `seed`: seed handed to `mulberry32` (default 1).
 * @returns {{mean: number, ciLow: number, ciHigh: number, n: number}}
 *   All-NaN with n=0 for empty input; a collapsed interval for one score.
 * @throws {RangeError} if `resamples` is not a finite number (only checked
 *   when there are at least two scores, i.e. when resampling happens).
 */
export function bootstrapCi(scores, opts = {}) {
    const n = scores.length;
    if (n === 0) {
        return { mean: NaN, ciLow: NaN, ciHigh: NaN, n: 0 };
    }
    if (n === 1) {
        const only = scores[0];
        return { mean: only, ciLow: only, ciHigh: only, n };
    }
    const mean = scores.reduce((acc, s) => acc + s, 0) / n;
    // `new Array(k)` throws an opaque "Invalid array length" for fractional
    // or NaN k, so normalise the count up front and fail with a clear message.
    const requested = Number(opts.resamples ?? 1000);
    if (!Number.isFinite(requested)) {
        throw new RangeError(`bootstrapCi: resamples must be a finite number, got ${opts.resamples}`);
    }
    const resamples = Math.max(1, Math.floor(requested));
    const alpha = opts.alpha ?? 0.05;
    const seed = opts.seed ?? 1;
    // NOTE(review): assumes rand() yields floats in [0, 1) so the index
    // below stays within bounds — confirm against datasets/sample.
    const rand = mulberry32(seed);
    const means = new Array(resamples);
    for (let r = 0; r < resamples; r++) {
        let sum = 0;
        for (let i = 0; i < n; i++) {
            sum += scores[Math.floor(rand() * n)];
        }
        means[r] = sum / n;
    }
    means.sort((a, b) => a - b);
    return {
        mean,
        ciLow: quantile(means, alpha / 2),
        ciHigh: quantile(means, 1 - alpha / 2),
        n,
    };
}
45
/**
 * Linear-interpolated quantile of a pre-sorted (ascending) array.
 *
 * @param {number[]} sorted - values already sorted ascending.
 * @param {number} q - requested quantile. Values outside [0, 1] are clamped
 *   so the lookup can never index past either end of the array.
 * @returns {number} NaN for an empty array, the sole element for a
 *   one-element array, otherwise the interpolated value.
 */
function quantile(sorted, q) {
    const last = sorted.length - 1;
    if (last < 0) {
        return NaN;
    }
    if (last === 0) {
        return sorted[0];
    }
    // Clamp so an out-of-range q cannot produce `undefined` via sorted[lo].
    const clamped = Math.min(1, Math.max(0, q));
    const pos = clamped * last;
    const lo = Math.floor(pos);
    const hi = Math.ceil(pos);
    if (lo === hi) {
        return sorted[lo];
    }
    return sorted[lo] + (pos - lo) * (sorted[hi] - sorted[lo]);
}
@@ -0,0 +1,137 @@
1
+ /**
2
+ * P9: CSV reader — round-trips files written by `evals/writers/csv-writer.ts`.
3
+ *
4
+ * Implements RFC 4180 quoting:
5
+ * - Fields may be wrapped in double-quotes.
6
+ * - Inside a quoted field, `""` represents a literal `"`.
7
+ * - Quoted fields may contain commas and newlines verbatim.
8
+ *
9
+ * Also handles the schema-specific coercion: numeric fields revert from
10
+ * strings to numbers, nullable fields render an empty CSV cell as null.
11
+ */
12
+ import { promises as fs } from 'node:fs';
13
+ import { CSV_COLUMNS } from '../writers/csv-writer.js';
14
/** Columns that always hold a number. */
const NUMERIC_FIELDS = new Set(['k_reps', 'run_idx', 'tokens_in', 'tokens_out', 'cost_usd', 'latency_ms']);

/** Columns that hold a number or null (score / ci_low / ci_high). */
const NULLABLE_NUMERIC_FIELDS = new Set(['score', 'ci_low', 'ci_high']);

/** String columns that may be null when empty. */
const NULLABLE_STRING_FIELDS = new Set(['judge_id', 'error']);
33
/**
 * Reads a results CSV and returns one typed row object per data record.
 *
 * The first record must match `CSV_COLUMNS` name-for-name, in order.
 * Records that consist of a single empty field (blank lines) are skipped.
 * Each remaining record is mapped column-by-column through `coerce`; a
 * record with fewer fields than columns is padded with empty strings.
 *
 * @param {string} filepath - path of the CSV file to read.
 * @returns {Promise<object[]>} parsed rows; empty when the file has no records.
 * @throws {Error} when a header cell differs from the expected column name.
 */
export async function readCsv(filepath) {
    const contents = await fs.readFile(filepath, 'utf8');
    // Node's utf8 decoding keeps a leading byte-order mark. Without this a
    // file re-saved by a BOM-writing editor would fail the header check on
    // its very first column with a confusing, visually identical name.
    const text = contents.charCodeAt(0) === 0xfeff ? contents.slice(1) : contents;
    const records = parseCsv(text);
    if (records.length === 0) {
        return [];
    }
    const [header, ...dataRecords] = records;
    // Sanity: header must match expected columns in order.
    CSV_COLUMNS.forEach((expected, i) => {
        if (header[i] !== expected) {
            throw new Error(`csv-reader: unexpected header at column ${i}: got "${header[i]}", expected "${expected}"`);
        }
    });
    const rows = [];
    for (const fields of dataRecords) {
        const isBlankLine = fields.length === 1 && fields[0] === '';
        if (isBlankLine) {
            continue;
        }
        const row = {};
        CSV_COLUMNS.forEach((col, c) => {
            row[col] = coerce(col, fields[c] ?? '');
        });
        rows.push(row);
    }
    return rows;
}
60
/**
 * Converts one raw CSV cell to the runtime type of its column.
 *
 *  - numeric columns: empty → 0, otherwise `Number(raw)`;
 *  - nullable numeric columns: empty → null, otherwise `Number(raw)`;
 *  - nullable string columns: empty → null, otherwise the text itself;
 *  - every other column: the text unchanged.
 *
 * @param {string} col - column name.
 * @param {string} raw - cell text as parsed from the file.
 * @returns {number|string|null}
 */
function coerce(col, raw) {
    const blank = raw === '';
    const alwaysNumeric = NUMERIC_FIELDS.has(col);
    if (alwaysNumeric || NULLABLE_NUMERIC_FIELDS.has(col)) {
        if (!blank) {
            return Number(raw);
        }
        return alwaysNumeric ? 0 : null;
    }
    if (blank && NULLABLE_STRING_FIELDS.has(col)) {
        return null;
    }
    // Plain string fields, and non-empty nullable strings.
    return raw;
}
73
/**
 * Character-by-character CSV parser with RFC 4180 quoting.
 *
 * Inside double quotes, `""` is a literal quote and commas / line breaks are
 * kept verbatim. Outside quotes, `,` ends a field and `\n`, `\r` or `\r\n`
 * ends a record. A final record without a trailing line break is still
 * emitted.
 *
 * @param {string} text - full CSV text.
 * @returns {string[][]} records, each an array of field strings.
 */
function parseCsv(text) {
    const records = [];
    let record = [];
    let field = '';
    let quoted = false;
    const endField = () => {
        record.push(field);
        field = '';
    };
    const endRecord = () => {
        endField();
        records.push(record);
        record = [];
    };
    for (let pos = 0; pos < text.length; pos++) {
        const ch = text[pos];
        if (quoted) {
            if (ch !== '"') {
                field += ch;
            }
            else if (text[pos + 1] === '"') {
                // Escaped quote: consume both characters.
                field += '"';
                pos++;
            }
            else {
                // Closing quote of the field.
                quoted = false;
            }
            continue;
        }
        switch (ch) {
            case '"':
                quoted = true;
                break;
            case ',':
                endField();
                break;
            case '\r':
            case '\n':
                // Treat CRLF as a single record terminator.
                if (ch === '\r' && text[pos + 1] === '\n') {
                    pos++;
                }
                endRecord();
                break;
            default:
                field += ch;
        }
    }
    // Flush trailing field/record (file may not end with newline).
    if (field.length > 0 || record.length > 0) {
        endRecord();
    }
    return records;
}
@@ -0,0 +1,60 @@
1
+ /**
2
+ * P9: Delta table — for each benchmark, picks the best single-baseline cell
3
+ * and the best protocol-overall cell, then computes Δaccuracy and
4
+ * Δcost-per-correct between them.
5
+ *
6
+ * Note: "best protocol overall" is allowed to BE a single-baseline. In that
7
+ * case the delta is 0 — the multi-model machinery didn't help on that bench.
8
+ * That's a legitimate finding and should be reported, not hidden.
9
+ *
10
+ * CI overlap: simple interval-overlap test. Two intervals [a, b] and [c, d]
11
+ * overlap iff a <= d AND c <= b. When CIs overlap, the absolute Δ is not
12
+ * statistically significant at the chosen alpha (95% by default).
13
+ */
14
/** True when the closed intervals [a, b] and [c, d] share at least one point. */
function intervalsOverlap(a, b, c, d) {
    return a <= d && c <= b;
}

/** Ranking key for a cell mean: NaN (a cell with no scored rows) sorts below every real mean. */
function meanRank(mean) {
    return Number.isNaN(mean) ? -Infinity : mean;
}

/** Returns the cell with the highest mean; the earliest cell wins ties. */
function pickBestCell(cells) {
    return cells.reduce((best, cell) => (meanRank(cell.mean) > meanRank(best.mean) ? cell : best));
}

/**
 * Builds one comparison row per benchmark: the best `single-baseline*` cell
 * versus the best cell of any protocol.
 *
 * Benchmarks without any single-baseline cell are omitted. Cells whose mean
 * is NaN never beat a cell with a real mean, regardless of input order.
 * `deltaCostPerCorrect` is 0 when both costs are equal — including the case
 * where both are Infinity, e.g. when the best protocol is the baseline
 * itself — instead of the NaN that `Infinity - Infinity` would give.
 *
 * @param {object[]} stats - cells carrying `benchmark`, `protocol`,
 *   `model_set`, `mean`, `ciLow`, `ciHigh` and `costPerCorrect`.
 * @returns {object[]} delta rows sorted by benchmark name.
 */
export function buildDeltaTable(stats) {
    // Group by benchmark.
    const byBench = new Map();
    for (const cell of stats) {
        const group = byBench.get(cell.benchmark);
        if (group) {
            group.push(cell);
        }
        else {
            byBench.set(cell.benchmark, [cell]);
        }
    }
    const out = [];
    for (const [benchmark, cells] of byBench) {
        const baselines = cells.filter((c) => c.protocol.startsWith('single-baseline'));
        if (baselines.length === 0) {
            continue; // Nothing to compare against.
        }
        const bestBaseline = pickBestCell(baselines);
        const bestOverall = pickBestCell(cells);
        const deltaCostPerCorrect = bestOverall.costPerCorrect === bestBaseline.costPerCorrect
            ? 0
            : bestOverall.costPerCorrect - bestBaseline.costPerCorrect;
        out.push({
            benchmark,
            bestSingleBaseline: {
                protocol: bestBaseline.protocol,
                model: bestBaseline.model_set,
                accuracy: bestBaseline.mean,
                ciLow: bestBaseline.ciLow,
                ciHigh: bestBaseline.ciHigh,
                costPerCorrect: bestBaseline.costPerCorrect,
            },
            bestProtocol: {
                protocol: bestOverall.protocol,
                model_set: bestOverall.model_set,
                accuracy: bestOverall.mean,
                ciLow: bestOverall.ciLow,
                ciHigh: bestOverall.ciHigh,
                costPerCorrect: bestOverall.costPerCorrect,
            },
            deltaAccuracy: bestOverall.mean - bestBaseline.mean,
            deltaCostPerCorrect,
            ciOverlap: intervalsOverlap(bestBaseline.ciLow, bestBaseline.ciHigh, bestOverall.ciLow, bestOverall.ciHigh),
        });
    }
    // Ordering: benchmark name ascending.
    out.sort((a, b) => a.benchmark.localeCompare(b.benchmark));
    return out;
}