nex-code 0.4.24 → 0.4.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1001 @@
1
+ /**
2
+ * cli/skills/autoresearch.js — Autoresearch Skill
3
+ * Autonomous optimization loops: edit -> test -> log -> keep/revert
4
+ * Inspired by Karpathy's autoresearch pattern.
5
+ *
6
+ * Key design choices (aligned with Karpathy's autoresearch):
7
+ * - Dedicated branch per run (autoresearch/<tag>) for isolation
8
+ * - Git reset (not checkout) for discards — only successes in history
9
+ * - Fixed time budget per experiment for comparable results
10
+ * - Output redirection + metric grep to protect context window
11
+ * - Simplicity criterion: complexity cost weighed against metric gain
12
+ * - Crash triage: trivial bugs retried, broken ideas skipped
13
+ * - Resource tracking (memory/CPU alongside primary metric)
14
+ * - No iteration cap by default — runs until stopped
15
+ */
16
+
17
+ const { execSync } = require("child_process");
18
+ const fs = require("fs");
19
+ const path = require("path");
20
+
21
/**
 * Best-effort reset of the agent's session read tracking between experiments.
 *
 * The agent module is required lazily, at call time. If the require or the
 * call throws (e.g. the agent is not available in tests), the error is
 * deliberately ignored and the function is a no-op.
 */
function resetReadGuards() {
  try {
    const reset = require("../agent").resetSessionTracking;
    reset();
  } catch {
    // agent not available (e.g. in tests) — no-op
  }
}
30
+
31
// Cached benchmark module. Loaded lazily to avoid circular deps and keep
// startup fast; stays null until a require succeeds.
let _benchmark = null;

/**
 * Return the benchmark module, requiring it on first use.
 * @returns {object|null} The cached module, or null when it cannot be loaded.
 */
function getBenchmark() {
  if (_benchmark) {
    return _benchmark;
  }
  try {
    _benchmark = require("../benchmark");
  } catch {
    _benchmark = null;
  }
  return _benchmark;
}
43
+
44
// Session state: the in-memory experiment history and a flag set by the
// loop-starting commands and cleared by /ar-clear.
let experiments = [];
let loopActive = false;

/**
 * Resolve the experiment log path under the current working directory,
 * creating the `.nex/autoresearch` directory when it does not exist yet.
 * @returns {string} Path to experiments.json.
 */
function getLogPath() {
  const logDir = path.join(process.cwd(), ".nex", "autoresearch");
  const alreadyThere = fs.existsSync(logDir);
  if (!alreadyThere) {
    fs.mkdirSync(logDir, { recursive: true });
  }
  return path.join(logDir, "experiments.json");
}
53
+
54
/**
 * Refresh the in-memory experiment history from the on-disk log.
 *
 * When the log file does not exist the current in-memory history is left
 * untouched. When the file is unreadable, is not valid JSON, or holds a JSON
 * value that is not an array, the history is reset to an empty list so that
 * callers can always rely on array methods.
 *
 * @returns {Array<object>} The (possibly refreshed) experiment history.
 */
function loadExperiments() {
  const logPath = getLogPath();
  if (!fs.existsSync(logPath)) {
    return experiments;
  }
  try {
    const parsed = JSON.parse(fs.readFileSync(logPath, "utf-8"));
    // A hand-edited or corrupted log may parse to an object/number/null.
    experiments = Array.isArray(parsed) ? parsed : [];
  } catch {
    // Corrupt log — start over with an empty history rather than crash.
    experiments = [];
  }
  return experiments;
}
65
+
66
/** Persist the in-memory experiment history to the log as 2-space-indented JSON. */
function saveExperiments() {
  const target = getLogPath();
  const serialized = JSON.stringify(experiments, null, 2);
  fs.writeFileSync(target, serialized);
}
70
+
71
/**
 * Abbreviated hash of the current HEAD commit.
 * @returns {string|null} The trimmed hash, or null when the git command fails.
 */
function gitHash() {
  const options = {
    cwd: process.cwd(),
    encoding: "utf-8",
    stdio: ["pipe", "pipe", "pipe"],
  };
  let stdout;
  try {
    stdout = execSync("git rev-parse --short HEAD", options);
  } catch {
    return null;
  }
  return stdout.trim();
}
83
+
84
/**
 * Name of the currently checked-out branch, as printed by
 * `git rev-parse --abbrev-ref HEAD`.
 * @returns {string|null} The trimmed output, or null when the git command fails.
 */
function gitBranch() {
  const options = {
    cwd: process.cwd(),
    encoding: "utf-8",
    stdio: ["pipe", "pipe", "pipe"],
  };
  let stdout;
  try {
    stdout = execSync("git rev-parse --abbrev-ref HEAD", options);
  } catch {
    return null;
  }
  return stdout.trim();
}
96
+
97
/**
 * Extract numeric metric values from text using regex patterns.
 *
 * Each pattern must contain one capture group holding the number. A metric is
 * omitted from the result when its pattern is not a valid regular expression,
 * does not match, captures an empty string, or captures text that does not
 * parse as a number — so the result only ever contains real numbers.
 *
 * @param {string} output - Text to search (non-strings are coerced; null/undefined become "").
 * @param {Object<string, string>} patterns - Map of metric name to regex source.
 * @returns {Object<string, number>} Map of metric name to parsed value.
 */
function extractMetrics(output, patterns) {
  const text = typeof output === "string" ? output : String(output ?? "");
  const results = {};
  for (const [name, pattern] of Object.entries(patterns ?? {})) {
    let re;
    try {
      re = new RegExp(pattern);
    } catch {
      // Patterns come from tool arguments; one bad pattern must not abort the rest.
      continue;
    }
    const match = text.match(re);
    if (!match || !match[1]) {
      continue;
    }
    const value = Number.parseFloat(match[1]);
    if (!Number.isNaN(value)) {
      results[name] = value;
    }
  }
  return results;
}
109
+
110
/**
 * Scan process output for a peak-memory figure.
 *
 * Recognised forms: "peak_vram_mb: 1234", "MaxRSS: 1234" (treated as KB and
 * converted to MB) and "memory: 1234MB" (case-insensitive). The probes run in
 * that order and each match overwrites the previous one, so the last
 * matching form wins.
 *
 * @param {string} output - Text to scan.
 * @returns {{peak_memory_mb?: number}} Empty object when nothing matches.
 */
function parseResourceUsage(output) {
  // [pattern, divisor that converts the captured number to MB]
  const probes = [
    [/peak_vram_mb:\s*([\d.]+)/, 1],
    [/MaxRSS:\s*([\d.]+)/, 1024], // KB to MB
    [/memory:\s*([\d.]+)\s*MB/i, 1],
  ];
  const resources = {};
  probes.forEach(([pattern, divisor]) => {
    const hit = output.match(pattern);
    if (hit) {
      resources.peak_memory_mb = parseFloat(hit[1]) / divisor;
    }
  });
  return resources;
}
122
+
123
+ module.exports = {
124
+ name: "autoresearch",
125
+ description:
126
+ "Autonomous optimization loops: edit -> test -> log -> keep/revert. " +
127
+ "Run experiments on a dedicated branch, track results, and automatically keep improvements or revert failures.",
128
+
129
+ instructions: `You have access to autoresearch tools for running autonomous optimization loops.
130
+
131
+ ## Workflow
132
+
133
+ When the user starts an autoresearch loop with /autoresearch <goal>, follow this cycle:
134
+
135
+ 1. **Setup branch** using skill_ar_setup_branch to create a dedicated autoresearch/<tag> branch
136
+ 2. **Baseline**: run ONE measurement command (e.g. wc -c, npm run build, a benchmark script) to get the starting metric — do NOT read every file first
137
+ 3. **Hypothesize** a specific, small change to ONE file
138
+ 4. **Commit checkpoint** using skill_ar_checkpoint before making changes
139
+ 5. **Edit** the code — make the change immediately, do not investigate further
140
+ 6. **Run experiment** using skill_ar_run_experiment with the same measurement command
141
+ 7. **Log result** using skill_ar_log_experiment with the outcome
142
+ 8. **Decide**: If improved, keep. If worse, skill_ar_revert immediately
143
+ 9. **Repeat** from step 3 — do NOT stop unless the user interrupts
144
+
145
+ ## CRITICAL: Move Fast, Investigate Less
146
+
147
+ You are a researcher running rapid experiments, NOT a code reviewer.
148
+ - **Baseline first**: measure the metric BEFORE reading any code
149
+ - **One file per experiment**: pick the most promising file, read it ONCE, make ONE targeted change
150
+ - **Never read all files** before making your first change — that wastes the entire context window
151
+ - **Max 3 reads before editing**: if you have read 3 files/ranges without making an edit, STOP reading and make a change based on what you know
152
+ - **Each experiment should take under 2 minutes**: read one file, edit it, measure, log, move on
153
+ - **Prefer bash for metrics**: use bash commands (wc -c, time, du) for measurements — they are fast and don't consume context
154
+
155
+ ## Simplicity Criterion
156
+
157
+ Not every metric improvement is worth keeping. Weigh complexity cost against improvement:
158
+ - A tiny improvement that adds 20 lines of hacky code? Probably not worth it.
159
+ - Deleting code and getting equal or better results? Definitely keep — that's a simplification win.
160
+ - An improvement of ~0 but much simpler code? Keep.
161
+ When logging experiments, note the complexity impact in the notes field.
162
+
163
+ ## Crash Triage
164
+
165
+ When an experiment crashes:
166
+ - **Trivial bug** (typo, missing import, off-by-one): fix it and re-run the same experiment
167
+ - **Fundamentally broken idea** (OOM, architectural incompatibility): log as crash, revert, move on
168
+ - Use your judgment — if you can't fix a crash in 2 attempts, skip the idea
169
+
170
+ ## Output Efficiency
171
+
172
+ When running experiments, redirect output to a log file and only grep for the target metric.
173
+ This protects the context window from being flooded with training output.
174
+ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to read just the result.
175
+
176
+ ## Rules
177
+ - Always create a checkpoint before making changes
178
+ - Always run the experiment after editing
179
+ - Always log the result (even failures and crashes)
180
+ - Revert immediately if the metric worsens
181
+ - NEVER STOP: keep running experiments until the user interrupts — they may be away
182
+ - If you run out of ideas, re-read the code for new angles, try combining previous near-misses, or try more radical changes
183
+ - Show a summary table after every 5 iterations`,
184
+
185
+ commands: [
186
+ {
187
+ cmd: "/autoresearch",
188
+ desc: "Start an autonomous optimization loop: /autoresearch <goal>",
189
+ handler: (args) => {
190
+ const goal = args.trim();
191
+ if (!goal) {
192
+ console.log("Usage: /autoresearch <optimization goal>");
193
+ console.log(
194
+ 'Example: /autoresearch "reduce test runtime while maintaining correctness"',
195
+ );
196
+ console.log(
197
+ 'Example: /autoresearch "optimize bundle size under 500kb"',
198
+ );
199
+ return;
200
+ }
201
+ loopActive = true;
202
+ loadExperiments();
203
+ console.log(`Autoresearch started: ${goal}`);
204
+ console.log(
205
+ "The agent will run autonomous optimization loops until you interrupt (Ctrl+C).",
206
+ );
207
+ console.log("Experiments run on a dedicated branch for isolation.\n");
208
+ return `AUTORESEARCH_GOAL: ${goal}\n\nStart the autoresearch loop. First, set up a dedicated branch using ar_setup_branch. Then analyze the current state and establish a baseline metric. Then begin the edit->test->log->keep/revert cycle. Do NOT stop — keep running experiments indefinitely until I interrupt.`;
209
+ },
210
+ },
211
+ {
212
+ cmd: "/ar-self-improve",
213
+ desc: "Self-improvement loop: optimize nex-code's own benchmark score",
214
+ handler: (args) => {
215
+ const focus = args.trim() || "overall benchmark score";
216
+ loopActive = true;
217
+ loadExperiments();
218
+ console.log(`Self-improvement loop started.`);
219
+ console.log(`Focus: ${focus}`);
220
+ console.log(
221
+ "The agent will optimize nex-code's benchmark score autonomously.",
222
+ );
223
+ console.log("Ctrl+C to stop.\n");
224
+ return [
225
+ `AUTORESEARCH_GOAL: Improve nex-code's ${focus}`,
226
+ "",
227
+ "## Self-Improvement Protocol",
228
+ "",
229
+ "You are optimizing nex-code itself. The benchmark suite is your eval harness — DO NOT modify it.",
230
+ "",
231
+ "### Setup",
232
+ "1. Call ar_setup_branch with a tag like 'self-improve-<date>'",
233
+ "2. Call ar_run_benchmark with quick=true to establish baseline score",
234
+ "3. Read the category breakdown — identify the weakest category",
235
+ "",
236
+ "### Loop",
237
+ "1. Pick ONE targeted improvement to address the weakest benchmark area",
238
+ "2. ar_checkpoint before making changes",
239
+ "3. Edit nex-code source files (agent.js, orchestrator.js, context-engine.js, etc.)",
240
+ "4. Run npm test to verify nothing breaks — if tests fail, fix or revert immediately",
241
+ "5. npm run build to update dist/",
242
+ "6. ar_run_benchmark with quick=true to measure the new score",
243
+ "7. ar_log_experiment with the benchmark score as metric",
244
+ "8. If score improved: keep. If score same or worse: ar_revert",
245
+ "9. Repeat — do NOT stop",
246
+ "",
247
+ "### What you CAN modify",
248
+ "- cli/agent.js — guard thresholds, system prompts, tool handling",
249
+ "- cli/orchestrator.js — sub-agent behavior, decomposition logic",
250
+ "- cli/context-engine.js — compression, token estimation",
251
+ "- cli/sub-agent.js — retry logic, error classification",
252
+ "- cli/task-router.js — routing logic",
253
+ "- Any other cli/ source file that affects agent quality",
254
+ "",
255
+ "### What you CANNOT modify",
256
+ "- cli/benchmark.js — this is the eval harness, modifying it is cheating",
257
+ "- tests/ — test files are not the optimization target",
258
+ "- Do not modify the scoring weights or task definitions",
259
+ "",
260
+ "### Quality rules",
261
+ "- Simplicity criterion: prefer removing code over adding it",
262
+ "- Each change must pass npm test before benchmarking",
263
+ "- Track which category you targeted and whether it improved",
264
+ "- If 3 consecutive experiments fail to improve, shift focus to a different category",
265
+ ].join("\n");
266
+ },
267
+ },
268
+ {
269
+ cmd: "/ar-status",
270
+ desc: "Show autoresearch experiment history",
271
+ handler: () => {
272
+ const exps = loadExperiments();
273
+ if (exps.length === 0) {
274
+ console.log("No experiments recorded yet.");
275
+ return;
276
+ }
277
+ console.log(`\nExperiment History (${exps.length} total):\n`);
278
+ console.log(
279
+ " # | Status | Metric | Memory MB | Commit | Description",
280
+ );
281
+ console.log(
282
+ " ---|----------|---------------|-----------|---------|----------------------------------",
283
+ );
284
+ for (let i = 0; i < exps.length; i++) {
285
+ const e = exps[i];
286
+ const status =
287
+ e.status === "crash"
288
+ ? "CRASH "
289
+ : e.kept
290
+ ? "KEPT "
291
+ : "REVERTED";
292
+ const metric =
293
+ e.metric != null ? String(e.metric).padEnd(13) : "N/A ";
294
+ const memory =
295
+ e.peak_memory_mb != null
296
+ ? String(e.peak_memory_mb.toFixed(1)).padEnd(9)
297
+ : "N/A ";
298
+ const commit = (e.commit || "N/A").padEnd(7);
299
+ const desc = (e.description || "").substring(0, 34);
300
+ console.log(
301
+ ` ${String(i + 1).padStart(2)} | ${status} | ${metric} | ${memory} | ${commit} | ${desc}`,
302
+ );
303
+ }
304
+ // Show trend
305
+ const kept = exps.filter((e) => e.kept);
306
+ if (kept.length >= 2) {
307
+ const first = kept[0].metric;
308
+ const last = kept[kept.length - 1].metric;
309
+ if (first != null && last != null) {
310
+ const diff = last - first;
311
+ const arrow = diff > 0 ? "+" : "";
312
+ console.log(
313
+ `\n Trend: ${first} -> ${last} (${arrow}${diff.toFixed(2)})`,
314
+ );
315
+ }
316
+ }
317
+ const crashes = exps.filter((e) => e.status === "crash");
318
+ if (crashes.length > 0) {
319
+ console.log(` Crashes: ${crashes.length}`);
320
+ }
321
+ console.log();
322
+ },
323
+ },
324
+ {
325
+ cmd: "/ar-clear",
326
+ desc: "Clear autoresearch experiment history",
327
+ handler: () => {
328
+ experiments = [];
329
+ saveExperiments();
330
+ loopActive = false;
331
+ console.log("Autoresearch history cleared.");
332
+ },
333
+ },
334
+ ],
335
+
336
+ tools: [
337
+ {
338
+ type: "function",
339
+ function: {
340
+ name: "ar_setup_branch",
341
+ description:
342
+ "Create a dedicated autoresearch branch for this experiment run. " +
343
+ "Creates 'autoresearch/<tag>' from the current branch. " +
344
+ "Call this ONCE at the start of each autoresearch session.",
345
+ parameters: {
346
+ type: "object",
347
+ properties: {
348
+ tag: {
349
+ type: "string",
350
+ description:
351
+ "Short tag for this run (e.g. 'mar31', 'perf-opt'). " +
352
+ "Used as branch name: autoresearch/<tag>",
353
+ },
354
+ },
355
+ required: ["tag"],
356
+ },
357
+ },
358
+ execute: async (args) => {
359
+ const tag = (args.tag || "").replace(/[^a-zA-Z0-9_-]/g, "-");
360
+ const branchName = `autoresearch/${tag}`;
361
+
362
+ try {
363
+ // Check if branch already exists
364
+ try {
365
+ execSync(`git rev-parse --verify ${branchName}`, {
366
+ cwd: process.cwd(),
367
+ stdio: ["pipe", "pipe", "pipe"],
368
+ });
369
+ // Branch exists — check it out
370
+ execSync(`git checkout ${branchName}`, {
371
+ cwd: process.cwd(),
372
+ stdio: ["pipe", "pipe", "pipe"],
373
+ });
374
+ return JSON.stringify({
375
+ status: "resumed",
376
+ branch: branchName,
377
+ note: "Branch already existed — resuming experiments on it.",
378
+ });
379
+ } catch {
380
+ // Branch doesn't exist — create it
381
+ }
382
+
383
+ const sourceBranch = gitBranch() || "unknown";
384
+ execSync(`git checkout -b ${branchName}`, {
385
+ cwd: process.cwd(),
386
+ stdio: ["pipe", "pipe", "pipe"],
387
+ });
388
+
389
+ return JSON.stringify({
390
+ status: "created",
391
+ branch: branchName,
392
+ source_branch: sourceBranch,
393
+ note: `Experiment branch created. All experiments will be isolated here. Merge back to '${sourceBranch}' when done.`,
394
+ });
395
+ } catch (err) {
396
+ return JSON.stringify({
397
+ status: "branch_failed",
398
+ error: err.message,
399
+ note: "Could not create branch. Continuing on current branch.",
400
+ });
401
+ }
402
+ },
403
+ },
404
+ {
405
+ type: "function",
406
+ function: {
407
+ name: "ar_checkpoint",
408
+ description:
409
+ "Create a git checkpoint before making experimental changes. " +
410
+ "This allows reverting via git reset if the experiment fails. " +
411
+ "Call this BEFORE editing any files in an autoresearch loop.",
412
+ parameters: {
413
+ type: "object",
414
+ properties: {
415
+ message: {
416
+ type: "string",
417
+ description:
418
+ "Short description of what you are about to try (e.g. 'replace forEach with for-of loop')",
419
+ },
420
+ },
421
+ required: ["message"],
422
+ },
423
+ },
424
+ execute: async (args) => {
425
+ try {
426
+ // Stage all current changes and create a checkpoint commit
427
+ execSync("git add -A", { cwd: process.cwd(), stdio: "pipe" });
428
+ const hasChanges = execSync("git diff --cached --stat", {
429
+ cwd: process.cwd(),
430
+ encoding: "utf-8",
431
+ }).trim();
432
+
433
+ if (hasChanges) {
434
+ execSync(
435
+ `git commit -m "autoresearch: checkpoint before: ${(args.message || "experiment").replace(/"/g, '\\"')}"`,
436
+ { cwd: process.cwd(), stdio: "pipe" },
437
+ );
438
+ }
439
+
440
+ const hash = gitHash();
441
+
442
+ // Reset read guards so the agent can re-read files in the next experiment
443
+ resetReadGuards();
444
+
445
+ return JSON.stringify({
446
+ status: "checkpoint_created",
447
+ commit: hash,
448
+ message: args.message,
449
+ });
450
+ } catch (err) {
451
+ return JSON.stringify({
452
+ status: "checkpoint_skipped",
453
+ reason: err.message,
454
+ note: "Working tree may be clean or git unavailable. Proceeding anyway.",
455
+ });
456
+ }
457
+ },
458
+ },
459
+ {
460
+ type: "function",
461
+ function: {
462
+ name: "ar_run_experiment",
463
+ description:
464
+ "Run a test/benchmark command to measure the effect of changes. " +
465
+ "Returns stdout, stderr, exit code, execution time, and resource usage. " +
466
+ "Supports output redirection to a log file to protect context window. " +
467
+ "Call this AFTER making changes to measure their impact.",
468
+ parameters: {
469
+ type: "object",
470
+ properties: {
471
+ command: {
472
+ type: "string",
473
+ description:
474
+ 'The shell command to run (e.g. "npm test", "time npm run build", "node bench.js")',
475
+ },
476
+ timeout_seconds: {
477
+ type: "number",
478
+ description:
479
+ "Max seconds to wait (default: 300). Kill the process if exceeded.",
480
+ },
481
+ output_file: {
482
+ type: "string",
483
+ description:
484
+ "Optional: redirect all output to this file instead of capturing in context. " +
485
+ 'Use with ar_extract_metric to read only the metric. (e.g. "run.log")',
486
+ },
487
+ metric_pattern: {
488
+ type: "string",
489
+ description:
490
+ "Optional: regex pattern to extract the primary metric from output. " +
491
+ "Must have one capture group for the numeric value. " +
492
+ '(e.g. "val_bpb:\\\\s*([\\\\d.]+)")',
493
+ },
494
+ },
495
+ required: ["command"],
496
+ },
497
+ },
498
+ execute: async (args) => {
499
+ const timeout = (args.timeout_seconds || 300) * 1000;
500
+ const start = Date.now();
501
+ const outputFile = args.output_file;
502
+
503
+ // Build the actual command — redirect if output_file specified
504
+ const cmd = outputFile
505
+ ? `${args.command} > ${outputFile} 2>&1`
506
+ : args.command;
507
+
508
+ try {
509
+ const output = execSync(cmd, {
510
+ cwd: process.cwd(),
511
+ encoding: "utf-8",
512
+ timeout,
513
+ maxBuffer: 2 * 1024 * 1024, // 2MB
514
+ stdio: ["pipe", "pipe", "pipe"],
515
+ });
516
+
517
+ const elapsed = ((Date.now() - start) / 1000).toFixed(2);
518
+ const rawOutput = outputFile
519
+ ? fs.existsSync(path.resolve(process.cwd(), outputFile))
520
+ ? fs.readFileSync(
521
+ path.resolve(process.cwd(), outputFile),
522
+ "utf-8",
523
+ )
524
+ : ""
525
+ : output;
526
+
527
+ // Extract resource usage from output
528
+ const resources = parseResourceUsage(rawOutput);
529
+
530
+ // Extract metric if pattern provided
531
+ let extractedMetric = null;
532
+ if (args.metric_pattern) {
533
+ const metrics = extractMetrics(rawOutput, {
534
+ primary: args.metric_pattern,
535
+ });
536
+ extractedMetric = metrics.primary ?? null;
537
+ }
538
+
539
+ // For redirected output, only return summary + metric
540
+ const stdout = outputFile
541
+ ? `[Output redirected to ${outputFile}]`
542
+ : output.substring(0, 4000);
543
+
544
+ return JSON.stringify({
545
+ status: "success",
546
+ exit_code: 0,
547
+ elapsed_seconds: parseFloat(elapsed),
548
+ stdout,
549
+ stderr: "",
550
+ extracted_metric: extractedMetric,
551
+ resources,
552
+ });
553
+ } catch (err) {
554
+ const elapsed = ((Date.now() - start) / 1000).toFixed(2);
555
+
556
+ // Try to read output file even on failure
557
+ let resources = {};
558
+ let extractedMetric = null;
559
+ if (outputFile) {
560
+ const outPath = path.resolve(process.cwd(), outputFile);
561
+ if (fs.existsSync(outPath)) {
562
+ const rawOutput = fs.readFileSync(outPath, "utf-8");
563
+ resources = parseResourceUsage(rawOutput);
564
+ if (args.metric_pattern) {
565
+ const metrics = extractMetrics(rawOutput, {
566
+ primary: args.metric_pattern,
567
+ });
568
+ extractedMetric = metrics.primary ?? null;
569
+ }
570
+ }
571
+ }
572
+
573
+ return JSON.stringify({
574
+ status: err.killed ? "timeout" : "failure",
575
+ exit_code: err.status || 1,
576
+ elapsed_seconds: parseFloat(elapsed),
577
+ stdout: outputFile
578
+ ? `[Output redirected to ${outputFile}]`
579
+ : (err.stdout || "").substring(0, 4000),
580
+ stderr: (err.stderr || "").substring(0, 2000),
581
+ extracted_metric: extractedMetric,
582
+ resources,
583
+ });
584
+ }
585
+ },
586
+ },
587
+ {
588
+ type: "function",
589
+ function: {
590
+ name: "ar_extract_metric",
591
+ description:
592
+ "Extract specific metrics from an experiment log file using grep patterns. " +
593
+ "Use this after ar_run_experiment with output_file to read only the metrics " +
594
+ "without loading the entire output into context.",
595
+ parameters: {
596
+ type: "object",
597
+ properties: {
598
+ file: {
599
+ type: "string",
600
+ description:
601
+ 'Path to the log file (e.g. "run.log")',
602
+ },
603
+ patterns: {
604
+ type: "object",
605
+ description:
606
+ 'Map of metric name to regex pattern with one capture group. ' +
607
+ 'Example: {"val_bpb": "val_bpb:\\\\s*([\\\\d.]+)", "memory": "peak_vram_mb:\\\\s*([\\\\d.]+)"}',
608
+ additionalProperties: { type: "string" },
609
+ },
610
+ tail_lines: {
611
+ type: "number",
612
+ description:
613
+ "If the file is large, only read the last N lines (default: 100). " +
614
+ "Set to 0 to read the entire file.",
615
+ },
616
+ },
617
+ required: ["file", "patterns"],
618
+ },
619
+ },
620
+ execute: async (args) => {
621
+ try {
622
+ const filePath = path.resolve(process.cwd(), args.file);
623
+ if (!fs.existsSync(filePath)) {
624
+ return JSON.stringify({
625
+ status: "file_not_found",
626
+ file: args.file,
627
+ });
628
+ }
629
+
630
+ let content = fs.readFileSync(filePath, "utf-8");
631
+ const tailLines =
632
+ args.tail_lines !== undefined ? args.tail_lines : 100;
633
+ if (tailLines > 0) {
634
+ const lines = content.split("\n");
635
+ content = lines.slice(-tailLines).join("\n");
636
+ }
637
+
638
+ const metrics = extractMetrics(content, args.patterns);
639
+ const resources = parseResourceUsage(content);
640
+
641
+ return JSON.stringify({
642
+ status: "extracted",
643
+ metrics,
644
+ resources,
645
+ lines_read: tailLines > 0 ? tailLines : content.split("\n").length,
646
+ });
647
+ } catch (err) {
648
+ return JSON.stringify({
649
+ status: "extract_failed",
650
+ error: err.message,
651
+ });
652
+ }
653
+ },
654
+ },
655
+ {
656
+ type: "function",
657
+ function: {
658
+ name: "ar_run_benchmark",
659
+ description:
660
+ "Run nex-code's built-in benchmark suite and return scores. " +
661
+ "This is the primary metric for self-improvement loops. " +
662
+ "Returns overall score (0-100), per-category scores, and model details. " +
663
+ "Use quick=true for fast iteration (~1-2 min), full for comprehensive evaluation.",
664
+ parameters: {
665
+ type: "object",
666
+ properties: {
667
+ quick: {
668
+ type: "boolean",
669
+ description:
670
+ "If true, run 7 tasks on 3 models (fast). If false, run all 59 tasks (thorough). Default: true.",
671
+ },
672
+ models: {
673
+ type: "array",
674
+ items: { type: "string" },
675
+ description:
676
+ "Optional: specific models to benchmark. Default: top models from previous results.",
677
+ },
678
+ },
679
+ },
680
+ },
681
+ execute: async (args) => {
682
+ const benchmark = getBenchmark();
683
+ if (!benchmark) {
684
+ return JSON.stringify({
685
+ status: "unavailable",
686
+ error: "Benchmark module not found. Make sure cli/benchmark.js exists.",
687
+ });
688
+ }
689
+
690
+ const quick = args.quick !== false; // default true
691
+ const start = Date.now();
692
+
693
+ try {
694
+ const summary = await benchmark.runBenchmark({
695
+ quick,
696
+ models: args.models || undefined,
697
+ onProgress: () => {}, // silent
698
+ });
699
+
700
+ const elapsed = ((Date.now() - start) / 1000).toFixed(1);
701
+
702
+ // Extract the key metrics for autoresearch
703
+ const results = summary.map((s) => ({
704
+ model: s.model,
705
+ score: s.score,
706
+ categoryScores: s.categoryScores || {},
707
+ toolCallRate: s.toolCallRate,
708
+ correctRate: s.correctRate,
709
+ validArgsRate: s.validArgsRate,
710
+ avgLatency: s.avgLatency,
711
+ }));
712
+
713
+ // Compute aggregate score across all models
714
+ const avgScore =
715
+ results.length > 0
716
+ ? Math.round(
717
+ (results.reduce((a, r) => a + r.score, 0) / results.length) *
718
+ 10,
719
+ ) / 10
720
+ : 0;
721
+
722
+ // Find weakest category across all models
723
+ const categoryTotals = {};
724
+ const categoryCounts = {};
725
+ for (const r of results) {
726
+ for (const [cat, score] of Object.entries(r.categoryScores)) {
727
+ categoryTotals[cat] = (categoryTotals[cat] || 0) + score;
728
+ categoryCounts[cat] = (categoryCounts[cat] || 0) + 1;
729
+ }
730
+ }
731
+ const categoryAvgs = {};
732
+ for (const cat of Object.keys(categoryTotals)) {
733
+ categoryAvgs[cat] =
734
+ Math.round((categoryTotals[cat] / categoryCounts[cat]) * 10) / 10;
735
+ }
736
+
737
+ // Sort categories by score to find weakest
738
+ const sortedCategories = Object.entries(categoryAvgs)
739
+ .sort((a, b) => a[1] - b[1]);
740
+
741
+ const weakestCategory =
742
+ sortedCategories.length > 0 ? sortedCategories[0] : null;
743
+
744
+ return JSON.stringify({
745
+ status: "success",
746
+ quick,
747
+ elapsed_seconds: parseFloat(elapsed),
748
+ models_tested: results.length,
749
+ average_score: avgScore,
750
+ category_averages: categoryAvgs,
751
+ weakest_category: weakestCategory
752
+ ? { name: weakestCategory[0], score: weakestCategory[1] }
753
+ : null,
754
+ per_model: results,
755
+ });
756
+ } catch (err) {
757
+ return JSON.stringify({
758
+ status: "benchmark_failed",
759
+ error: err.message,
760
+ elapsed_seconds:
761
+ parseFloat(((Date.now() - start) / 1000).toFixed(1)),
762
+ });
763
+ }
764
+ },
765
+ },
766
+ {
767
+ type: "function",
768
+ function: {
769
+ name: "ar_log_experiment",
770
+ description:
771
+ "Log the result of an experiment. Call this after running the experiment " +
772
+ "to record whether the change was an improvement. This builds the experiment history.",
773
+ parameters: {
774
+ type: "object",
775
+ properties: {
776
+ description: {
777
+ type: "string",
778
+ description:
779
+ "What was changed (e.g. 'replaced Array.map with for loop in parser')",
780
+ },
781
+ metric: {
782
+ type: "number",
783
+ description:
784
+ "The measured metric value (e.g. test runtime in seconds, bundle size in KB, score). Use 0 for crashes.",
785
+ },
786
+ metric_name: {
787
+ type: "string",
788
+ description:
789
+ 'Name of the metric (e.g. "runtime_seconds", "bundle_size_kb", "val_bpb")',
790
+ },
791
+ kept: {
792
+ type: "boolean",
793
+ description:
794
+ "Whether you decided to keep (true) or revert (false) this change",
795
+ },
796
+ status: {
797
+ type: "string",
798
+ enum: ["keep", "discard", "crash"],
799
+ description:
800
+ "Experiment outcome: 'keep' if metric improved, 'discard' if worse, 'crash' if it failed to run",
801
+ },
802
+ peak_memory_mb: {
803
+ type: "number",
804
+ description:
805
+ "Peak memory usage in MB during the experiment (if available)",
806
+ },
807
+ complexity_impact: {
808
+ type: "string",
809
+ enum: ["simpler", "neutral", "complex"],
810
+ description:
811
+ "How this change affects code complexity: 'simpler' (removed code), 'neutral', or 'complex' (added code)",
812
+ },
813
+ notes: {
814
+ type: "string",
815
+ description:
816
+ "Additional observations — include complexity assessment and crash triage info",
817
+ },
818
+ },
819
+ required: ["description", "metric", "kept"],
820
+ },
821
+ },
822
+ execute: async (args) => {
823
+ loadExperiments();
824
+ const commit = gitHash();
825
+ const entry = {
826
+ id: experiments.length + 1,
827
+ timestamp: new Date().toISOString(),
828
+ commit,
829
+ description: args.description,
830
+ metric: args.metric,
831
+ metric_name: args.metric_name || "metric",
832
+ kept: args.kept,
833
+ status: args.status || (args.kept ? "keep" : "discard"),
834
+ peak_memory_mb: args.peak_memory_mb ?? null,
835
+ complexity_impact: args.complexity_impact || "neutral",
836
+ notes: args.notes || "",
837
+ };
838
+ experiments.push(entry);
839
+ saveExperiments();
840
+
841
+ const trend =
842
+ experiments.length >= 2
843
+ ? `Previous: ${experiments[experiments.length - 2].metric}, Current: ${args.metric}`
844
+ : "First experiment — baseline established";
845
+
846
+ return JSON.stringify({
847
+ status: "logged",
848
+ experiment_number: entry.id,
849
+ total_experiments: experiments.length,
850
+ kept_count: experiments.filter((e) => e.kept).length,
851
+ reverted_count: experiments.filter((e) => !e.kept).length,
852
+ crash_count: experiments.filter((e) => e.status === "crash").length,
853
+ trend,
854
+ });
855
+ },
856
+ },
857
+ {
858
+ type: "function",
859
+ function: {
860
+ name: "ar_revert",
861
+ description:
862
+ "Revert to the last checkpoint using git reset. " +
863
+ "Unlike git checkout, this moves the branch pointer back so only " +
864
+ "successful experiments remain in git history. " +
865
+ "Use this when an experiment made things worse or crashed.",
866
+ parameters: {
867
+ type: "object",
868
+ properties: {
869
+ reason: {
870
+ type: "string",
871
+ description:
872
+ "Why reverting (e.g. 'metric worsened from 2.3s to 4.1s')",
873
+ },
874
+ },
875
+ required: ["reason"],
876
+ },
877
+ },
878
+ execute: async (args) => {
879
+ try {
880
+ // Use git reset --hard HEAD~1 to remove the failed experiment commit
881
+ // and move the branch pointer back (clean history, only successes)
882
+ const currentHash = gitHash();
883
+
884
+ // Check if there's a commit to reset to
885
+ try {
886
+ execSync("git log --oneline -2", {
887
+ cwd: process.cwd(),
888
+ encoding: "utf-8",
889
+ stdio: ["pipe", "pipe", "pipe"],
890
+ });
891
+ } catch {
892
+ // Fallback: just clean working tree
893
+ execSync("git checkout -- .", {
894
+ cwd: process.cwd(),
895
+ stdio: "pipe",
896
+ });
897
+ execSync("git clean -fd", {
898
+ cwd: process.cwd(),
899
+ stdio: "pipe",
900
+ });
901
+ return JSON.stringify({
902
+ status: "reverted",
903
+ method: "checkout",
904
+ reason: args.reason,
905
+ });
906
+ }
907
+
908
+ // Reset to before the experiment commit
909
+ execSync("git reset --hard HEAD~1", {
910
+ cwd: process.cwd(),
911
+ stdio: "pipe",
912
+ });
913
+ // Also clean any untracked files
914
+ execSync("git clean -fd", {
915
+ cwd: process.cwd(),
916
+ stdio: "pipe",
917
+ });
918
+
919
+ const newHash = gitHash();
920
+
921
+ // Reset read guards — files changed after revert, agent needs fresh access
922
+ resetReadGuards();
923
+
924
+ return JSON.stringify({
925
+ status: "reverted",
926
+ method: "reset",
927
+ reverted_from: currentHash,
928
+ reverted_to: newHash,
929
+ reason: args.reason,
930
+ note: "Branch pointer moved back — failed experiment removed from history. Read guards reset — you can re-read files.",
931
+ });
932
+ } catch (err) {
933
+ // Fallback to checkout if reset fails
934
+ try {
935
+ execSync("git checkout -- .", {
936
+ cwd: process.cwd(),
937
+ stdio: "pipe",
938
+ });
939
+ execSync("git clean -fd", {
940
+ cwd: process.cwd(),
941
+ stdio: "pipe",
942
+ });
943
+ return JSON.stringify({
944
+ status: "reverted",
945
+ method: "checkout_fallback",
946
+ reason: args.reason,
947
+ note: "git reset failed, fell back to checkout. Commit may remain in history.",
948
+ });
949
+ } catch (fallbackErr) {
950
+ return JSON.stringify({
951
+ status: "revert_failed",
952
+ error: fallbackErr.message,
953
+ note: "Manual cleanup may be needed. Check git status.",
954
+ });
955
+ }
956
+ }
957
+ },
958
+ },
959
+ {
960
+ type: "function",
961
+ function: {
962
+ name: "ar_history",
963
+ description:
964
+ "Get the full experiment history as JSON for analysis. " +
965
+ "Use this to review past experiments and identify patterns.",
966
+ parameters: {
967
+ type: "object",
968
+ properties: {},
969
+ },
970
+ },
971
+ execute: async () => {
972
+ loadExperiments();
973
+ const kept = experiments.filter((e) => e.kept);
974
+ const reverted = experiments.filter((e) => !e.kept);
975
+ const crashes = experiments.filter((e) => e.status === "crash");
976
+
977
+ let bestMetric = null;
978
+ let worstMetric = null;
979
+ for (const e of experiments) {
980
+ if (e.metric != null && e.status !== "crash") {
981
+ if (bestMetric === null || e.metric < bestMetric)
982
+ bestMetric = e.metric;
983
+ if (worstMetric === null || e.metric > worstMetric)
984
+ worstMetric = e.metric;
985
+ }
986
+ }
987
+
988
+ return JSON.stringify({
989
+ total: experiments.length,
990
+ kept: kept.length,
991
+ reverted: reverted.length,
992
+ crashes: crashes.length,
993
+ best_metric: bestMetric,
994
+ worst_metric: worstMetric,
995
+ branch: gitBranch(),
996
+ experiments: experiments.slice(-20), // Last 20
997
+ });
998
+ },
999
+ },
1000
+ ],
1001
+ };