nex-code 0.4.37 → 0.4.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,7 +14,7 @@
14
14
  * - No iteration cap by default — runs until stopped
15
15
  */
16
16
 
17
- const { execSync } = require("child_process");
17
+ const { execSync, spawn } = require("child_process");
18
18
  const fs = require("fs");
19
19
  const path = require("path");
20
20
 
@@ -44,6 +44,14 @@ function getBenchmark() {
44
44
  // Track experiment history within the session
45
45
  let experiments = [];
46
46
  let loopActive = false;
47
+ let sessionBaselineScore = null; // set on first ar_run_benchmark call
48
+
49
+ // ─── Watch Mode state ───────────────────────────────────────────
50
+ let _watchProcess = null;
51
+ let _watchCallbacks = { onFailure: null };
52
+ let _watchTestCommand = null;
53
+ let _watchDebounceTimer = null;
54
+ const WATCH_DEBOUNCE_MS = 2000;
47
55
 
48
56
  function getLogPath() {
49
57
  const dir = path.join(process.cwd(), ".nex", "autoresearch");
@@ -263,11 +271,16 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
263
271
  "4. If score same or worse: ar_revert, go to Step A with a different hypothesis",
264
272
  "",
265
273
  "### HARD RULES",
274
+ "- EVERY response must end with a tool call — NEVER output text as your final message",
275
+ "- After ar_revert: call ar_checkpoint immediately in the same response as your next hypothesis",
276
+ "- After ar_run_benchmark: call ar_log_experiment immediately in the same response",
266
277
  "- NEVER read more than 2 files per experiment cycle",
267
278
  "- NEVER use bash to create branches or run benchmarks — use ar_ tools only",
268
279
  "- If you are blocked from reading a file, SKIP IT and make your edit based on what you already know",
269
280
  "- If 3 consecutive experiments fail to improve, change category focus",
270
281
  "- Simplicity criterion: prefer removing code over adding it — complexity cost must be justified by metric gain",
282
+ "- cli/tools/index.js uses single-quoted JS strings — when editing descriptions, use ONLY single quotes inside the text, or escape double quotes as \\\\\" — NEVER put a raw double quote inside a JS string literal or tests will fail with SyntaxError",
283
+ "- Before every Edit call, grep the EXACT old_text from the file first so it matches byte-for-byte",
271
284
  "",
272
285
  "### How the benchmark score works — READ THIS FIRST",
273
286
  "The benchmark sends nex-code's TOOL_DEFINITIONS (schemas) to external models and checks:",
@@ -359,6 +372,53 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
359
372
  console.log("Autoresearch history cleared.");
360
373
  },
361
374
  },
375
+ {
376
+ cmd: "/ar-watch",
377
+ desc: "Start/stop background file watcher that auto-runs tests on changes",
378
+ handler: (args) => {
379
+ // Check feature flag
380
+ let watchEnabled = false;
381
+ try {
382
+ const { feature } = require("../feature-flags");
383
+ watchEnabled = feature("WATCH_MODE");
384
+ } catch {
385
+ // feature-flags not available — check env
386
+ watchEnabled =
387
+ process.env.NEX_FEATURE_WATCH_MODE === "1" ||
388
+ process.env.NEX_FEATURE_WATCH_MODE === "true";
389
+ }
390
+ if (!watchEnabled) {
391
+ console.log(
392
+ "Watch mode is disabled. Enable with NEX_FEATURE_WATCH_MODE=1",
393
+ );
394
+ return;
395
+ }
396
+
397
+ const cmd = args.trim();
398
+ if (cmd === "stop" || cmd === "off") {
399
+ stopWatch();
400
+ console.log("Watch mode stopped.");
401
+ return;
402
+ }
403
+ if (_watchProcess) {
404
+ console.log(
405
+ "Watch mode is already running. Use /ar-watch stop to stop it.",
406
+ );
407
+ return;
408
+ }
409
+
410
+ // Parse: /ar-watch <test command> [--watch-path <glob>]
411
+ const testCommand = cmd || "npm test";
412
+ const watchPath = process.cwd();
413
+ _watchTestCommand = testCommand;
414
+
415
+ startWatch(watchPath, testCommand);
416
+ console.log(`Watch mode started. Monitoring ${watchPath} for changes.`);
417
+ console.log(`Test command: ${testCommand}`);
418
+ console.log("On test failure, the agent will auto-investigate.");
419
+ console.log("Use /ar-watch stop to stop.\n");
420
+ },
421
+ },
362
422
  ],
363
423
 
364
424
  tools: [
@@ -386,11 +446,12 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
386
446
  execute: async (args) => {
387
447
  // Strip any date-like suffix the model may have hallucinated, then
388
448
  // append today's real date so the branch name is always accurate.
389
- const baseTag = (args.tag || "self-improve")
390
- .replace(/[^a-zA-Z0-9_-]/g, "-")
391
- .replace(/-?\d{4,8}$/, "") // strip trailing YYYYMMDD / YYYYMM / etc.
392
- .replace(/-[a-z]{3}\d{1,2}$/i, "") // strip trailing mon## (e.g. apr15)
393
- .replace(/-+$/, "") || "self-improve";
449
+ const baseTag =
450
+ (args.tag || "self-improve")
451
+ .replace(/[^a-zA-Z0-9_-]/g, "-")
452
+ .replace(/-?\d{4,8}$/, "") // strip trailing YYYYMMDD / YYYYMM / etc.
453
+ .replace(/-[a-z]{3}\d{1,2}$/i, "") // strip trailing mon## (e.g. apr15)
454
+ .replace(/-+$/, "") || "self-improve";
394
455
  const now = new Date();
395
456
  const dateStr =
396
457
  now.toLocaleString("en", { month: "short" }).toLowerCase() +
@@ -399,13 +460,38 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
399
460
  const branchName = `autoresearch/${tag}`;
400
461
 
401
462
  try {
463
+ const currentBranch = gitBranch();
464
+ // If we're already on the target branch, nothing to do
465
+ if (currentBranch === branchName) {
466
+ return JSON.stringify({
467
+ status: "resumed",
468
+ branch: branchName,
469
+ note: "Already on autoresearch branch — continuing experiments.",
470
+ });
471
+ }
472
+
402
473
  // Check if branch already exists
474
+ let branchExists = false;
403
475
  try {
404
476
  execSync(`git rev-parse --verify ${branchName}`, {
405
477
  cwd: process.cwd(),
406
478
  stdio: ["pipe", "pipe", "pipe"],
407
479
  });
408
- // Branch exists — check it out
480
+ branchExists = true;
481
+ } catch {
482
+ // Branch doesn't exist
483
+ }
484
+
485
+ if (branchExists) {
486
+ // Stash any uncommitted changes before switching
487
+ try {
488
+ execSync(`git stash`, {
489
+ cwd: process.cwd(),
490
+ stdio: ["pipe", "pipe", "pipe"],
491
+ });
492
+ } catch {
493
+ // Ignore stash errors (nothing to stash)
494
+ }
409
495
  execSync(`git checkout ${branchName}`, {
410
496
  cwd: process.cwd(),
411
497
  stdio: ["pipe", "pipe", "pipe"],
@@ -415,8 +501,6 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
415
501
  branch: branchName,
416
502
  note: "Branch already existed — resuming experiments on it.",
417
503
  });
418
- } catch {
419
- // Branch doesn't exist — create it
420
504
  }
421
505
 
422
506
  const sourceBranch = gitBranch() || "unknown";
@@ -636,13 +720,12 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
636
720
  properties: {
637
721
  file: {
638
722
  type: "string",
639
- description:
640
- 'Path to the log file (e.g. "run.log")',
723
+ description: 'Path to the log file (e.g. "run.log")',
641
724
  },
642
725
  patterns: {
643
726
  type: "object",
644
727
  description:
645
- 'Map of metric name to regex pattern with one capture group. ' +
728
+ "Map of metric name to regex pattern with one capture group. " +
646
729
  'Example: {"val_bpb": "val_bpb:\\\\s*([\\\\d.]+)", "memory": "peak_vram_mb:\\\\s*([\\\\d.]+)"}',
647
730
  additionalProperties: { type: "string" },
648
731
  },
@@ -722,7 +805,8 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
722
805
  if (!benchmark) {
723
806
  return JSON.stringify({
724
807
  status: "unavailable",
725
- error: "Benchmark module not found. Make sure cli/benchmark.js exists.",
808
+ error:
809
+ "Benchmark module not found. Make sure cli/benchmark.js exists.",
726
810
  });
727
811
  }
728
812
 
@@ -758,6 +842,11 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
758
842
  ) / 10
759
843
  : 0;
760
844
 
845
+ // Record baseline on first run
846
+ if (sessionBaselineScore === null) {
847
+ sessionBaselineScore = avgScore;
848
+ }
849
+
761
850
  // Find weakest category across all models
762
851
  const categoryTotals = {};
763
852
  const categoryCounts = {};
@@ -774,8 +863,9 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
774
863
  }
775
864
 
776
865
  // Sort categories by score to find weakest
777
- const sortedCategories = Object.entries(categoryAvgs)
778
- .sort((a, b) => a[1] - b[1]);
866
+ const sortedCategories = Object.entries(categoryAvgs).sort(
867
+ (a, b) => a[1] - b[1],
868
+ );
779
869
 
780
870
  const weakestCategory =
781
871
  sortedCategories.length > 0 ? sortedCategories[0] : null;
@@ -796,8 +886,9 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
796
886
  return JSON.stringify({
797
887
  status: "benchmark_failed",
798
888
  error: err.message,
799
- elapsed_seconds:
800
- parseFloat(((Date.now() - start) / 1000).toFixed(1)),
889
+ elapsed_seconds: parseFloat(
890
+ ((Date.now() - start) / 1000).toFixed(1),
891
+ ),
801
892
  });
802
893
  }
803
894
  },
@@ -860,6 +951,21 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
860
951
  },
861
952
  execute: async (args) => {
862
953
  loadExperiments();
954
+
955
+ // Enforce keep/revert decision against session baseline
956
+ if (
957
+ sessionBaselineScore !== null &&
958
+ typeof args.metric === "number" &&
959
+ args.kept === true &&
960
+ args.metric < sessionBaselineScore
961
+ ) {
962
+ console.log(
963
+ `\x1b[31m ⚠ Score ${args.metric} < baseline ${sessionBaselineScore} — overriding kept=true to kept=false\x1b[0m`,
964
+ );
965
+ args.kept = false;
966
+ args.status = "discard";
967
+ }
968
+
863
969
  const commit = gitHash();
864
970
  const entry = {
865
971
  id: experiments.length + 1,
@@ -888,7 +994,9 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
888
994
 
889
995
  const keptCount = experiments.filter((e) => e.kept).length;
890
996
  const revertedCount = experiments.filter((e) => !e.kept).length;
891
- const statusIcon = args.kept ? "\x1b[32m✔ KEPT\x1b[0m" : "\x1b[31m✘ REVERTED\x1b[0m";
997
+ const statusIcon = args.kept
998
+ ? "\x1b[32m✔ KEPT\x1b[0m"
999
+ : "\x1b[31m✘ REVERTED\x1b[0m";
892
1000
  const delta =
893
1001
  prev != null && typeof args.metric === "number"
894
1002
  ? ` (${args.metric > prev ? "+" : ""}${(args.metric - prev).toFixed(1)} pts)`
@@ -1016,6 +1124,23 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
1016
1124
  }
1017
1125
  },
1018
1126
  },
1127
+ {
1128
+ type: "function",
1129
+ function: {
1130
+ name: "ar_watch_status",
1131
+ description:
1132
+ "Get the current status of the background file watcher (watch mode). " +
1133
+ "Returns whether watch mode is active, the test command, and recent failure count.",
1134
+ parameters: { type: "object", properties: {} },
1135
+ },
1136
+ execute: async () => {
1137
+ return JSON.stringify({
1138
+ active: !!_watchProcess,
1139
+ testCommand: _watchTestCommand,
1140
+ pid: _watchProcess ? _watchProcess.pid : null,
1141
+ });
1142
+ },
1143
+ },
1019
1144
  {
1020
1145
  type: "function",
1021
1146
  function: {
@@ -1059,3 +1184,112 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
1059
1184
  },
1060
1185
  ],
1061
1186
  };
1187
+
1188
+ // ─── Watch Mode Implementation ──────────────────────────────────
1189
+
1190
/**
 * Start a background file watcher using fs.watch (recursive).
 * On changes to source-like files, debounces for WATCH_DEBOUNCE_MS and then
 * runs the test command via _runWatchTest.
 *
 * Any watcher that is already running is stopped first. On failure to start,
 * the error is logged and _watchProcess is left null (nothing is thrown).
 *
 * @param {string} watchPath — directory to watch
 * @param {string} testCommand — shell command to run on changes
 */
function startWatch(watchPath, testCommand) {
  if (_watchProcess) stopWatch();

  const ignorePatterns = [
    /node_modules/,
    /\.git\//,
    /\.nex\//,
    /dist\//,
    /\.log$/,
    /\.tmp$/,
  ];
  const sourceFilePattern =
    /\.(js|ts|jsx|tsx|py|rb|go|rs|json|yaml|yml|toml|cfg|ini|sh|css|html)$/;

  try {
    const watcher = fs.watch(
      watchPath,
      { recursive: true },
      (_eventType, filename) => {
        if (!filename) return;
        // fs.watch reports platform-native separators; normalise so the
        // "/"-based ignore patterns also match on Windows.
        const relPath = String(filename).replace(/\\/g, "/");
        // Skip ignored paths
        if (ignorePatterns.some((p) => p.test(relPath))) return;
        // Skip non-source files
        if (!sourceFilePattern.test(relPath)) return;

        // Debounce: wait for changes to settle
        if (_watchDebounceTimer) clearTimeout(_watchDebounceTimer);
        _watchDebounceTimer = setTimeout(() => {
          _watchDebounceTimer = null;
          _runWatchTest(testCommand, filename);
        }, WATCH_DEBOUNCE_MS);
      },
    );

    // An FSWatcher "error" event with no listener is thrown as an uncaught
    // exception; log it and shut this watcher down instead.
    watcher.on("error", (err) => {
      console.error(`Watch mode error: ${err.message}`);
      if (_watchProcess === watcher) stopWatch();
    });

    _watchProcess = watcher;
    // stopWatch() clears the command, so restore it after a restart.
    _watchTestCommand = testCommand;

    // Clean up on process exit. Installed once per process: stopWatch is
    // idempotent, and re-registering on every start would leak listeners.
    // NOTE(review): a SIGINT listener replaces Node's default exit-on-SIGINT
    // behaviour — confirm another handler in the CLI still exits.
    if (!startWatch._exitHooksInstalled) {
      const cleanup = () => stopWatch();
      process.on("exit", cleanup);
      process.on("SIGINT", cleanup);
      startWatch._exitHooksInstalled = true;
    }
  } catch (err) {
    console.error(`Watch mode failed to start: ${err.message}`);
    _watchProcess = null;
    _watchTestCommand = null;
  }
}
1244
+
1245
/**
 * Tear down watch mode: cancel any pending debounced test run, close the
 * active watcher (if one exists) and forget the configured test command.
 * Safe to call when watch mode is not running.
 */
function stopWatch() {
  // Cancel a scheduled-but-not-yet-started test run.
  if (_watchDebounceTimer) {
    clearTimeout(_watchDebounceTimer);
    _watchDebounceTimer = null;
  }

  const activeWatcher = _watchProcess;
  if (activeWatcher) {
    try {
      activeWatcher.close();
    } catch {
      /* already closed */
    }
    _watchProcess = null;
  }

  _watchTestCommand = null;
}
1263
+
1264
/**
 * Run a shell command without blocking the event loop and collect its result.
 * Only the tail of each output stream is retained so memory stays bounded.
 * @param {string} testCommand — shell command to execute in process.cwd()
 * @returns {Promise<{passed: boolean, output: string}>} never rejects;
 *   `passed` is true only for exit code 0, `output` is stdout tail + stderr tail
 */
function _collectWatchTestResult(testCommand) {
  const TAIL_LIMIT = 4096;
  const keepTail = (buffer, chunk) => (buffer + chunk).slice(-TAIL_LIMIT);

  return new Promise((resolve) => {
    let stdoutTail = "";
    let stderrTail = "";
    let finished = false;
    // "error" and "close" can both fire for one child; resolve only once.
    const finish = (passed) => {
      if (finished) return;
      finished = true;
      resolve({ passed, output: stdoutTail + stderrTail });
    };

    let child;
    try {
      child = spawn(testCommand, {
        cwd: process.cwd(),
        shell: true,
        stdio: ["ignore", "pipe", "pipe"],
        timeout: 120000, // 2 minute timeout
      });
    } catch (err) {
      // Synchronous spawn failure (e.g. invalid arguments).
      stderrTail = String(err && err.message ? err.message : err);
      finish(false);
      return;
    }

    child.stdout.setEncoding("utf-8");
    child.stderr.setEncoding("utf-8");
    child.stdout.on("data", (chunk) => {
      stdoutTail = keepTail(stdoutTail, chunk);
    });
    child.stderr.on("data", (chunk) => {
      stderrTail = keepTail(stderrTail, chunk);
    });
    child.on("error", (err) => {
      stderrTail = keepTail(stderrTail, err.message);
      finish(false);
    });
    // A timeout kill yields code === null, which counts as a failure.
    child.on("close", (code) => finish(code === 0));
  });
}

/**
 * Tell the agent (or, failing that, the terminal) about a failed test run.
 * @param {string} changedFile — file that triggered the run
 * @param {string} output — captured test output
 */
function _reportWatchFailure(changedFile, output) {
  const truncatedOutput = output.slice(-500); // Last 500 chars of error
  const failureNote = `[WATCH MODE] Test failure detected after change to ${changedFile}:\n${truncatedOutput}`;

  // Try to inject a note into the agent conversation
  try {
    const { injectMidRunNote } = require("../agent");
    injectMidRunNote(failureNote);
  } catch {
    // Agent not in active conversation — just log
    process.stderr.write(
      `\n\x1b[33m⚠ Watch: tests failed after ${changedFile} changed\x1b[0m\n`,
    );
  }
}

/**
 * Run the test command and handle failures.
 *
 * The command runs asynchronously so the CLI stays responsive while tests
 * execute. Runs are serialised: a trigger that arrives while a run is in
 * flight is remembered (latest wins) and executed once the current run ends.
 *
 * @param {string} testCommand
 * @param {string} changedFile — file that triggered the watch
 * @returns {Promise<void>} settles when the run (and any queued follow-up)
 *   has finished; never rejects. Callers may ignore it.
 */
function _runWatchTest(testCommand, changedFile) {
  if (_runWatchTest._inFlight) {
    _runWatchTest._queued = { testCommand, changedFile };
    return _runWatchTest._inFlight;
  }

  // Always release the in-flight slot, even if reporting itself failed.
  const settle = () => {
    _runWatchTest._inFlight = null;
    const queued = _runWatchTest._queued;
    _runWatchTest._queued = null;
    return queued
      ? _runWatchTest(queued.testCommand, queued.changedFile)
      : undefined;
  };

  const run = _collectWatchTestResult(testCommand)
    .then(({ passed, output }) => {
      // Tests passed — no action needed
      if (!passed) _reportWatchFailure(changedFile, output);
    })
    .then(settle, settle);

  _runWatchTest._inFlight = run;
  return run;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "nex-code",
3
- "version": "0.4.37",
3
+ "version": "0.4.39",
4
4
  "description": "Run 400B+ open coding models on your codebase without the hardware bill. Ollama Cloud first — OpenAI, Anthropic, and Gemini when you need them.",
5
5
  "bin": {
6
6
  "nex-code": "./dist/nex-code.js"
@@ -16,8 +16,8 @@
16
16
  },
17
17
  "scripts": {
18
18
  "start": "node dist/nex-code.js",
19
- "build": "esbuild bin/nex-code.js --bundle --platform=node --target=node18 --outfile=dist/nex-code.js --minify --external:axios --external:dotenv --external:playwright && esbuild cli/benchmark.js --bundle --platform=node --target=node18 --outfile=dist/benchmark.js --minify --external:axios --external:dotenv --external:playwright && rm -rf dist/skills && cp -r cli/skills dist/skills",
20
- "dev": "esbuild bin/nex-code.js --bundle --platform=node --target=node18 --outfile=dist/nex-code.js --external:axios --external:dotenv --external:playwright --watch",
19
+ "build": "node scripts/build.js",
20
+ "dev": "node scripts/build.js --dev",
21
21
  "test": "jest --forceExit",
22
22
  "test:orchestrator": "jest tests/orchestrator.test.js --forceExit",
23
23
  "coverage": "jest --coverage --forceExit",
@@ -41,15 +41,12 @@
41
41
  "ollama-cloud",
42
42
  "openai",
43
43
  "anthropic",
44
- "claude",
45
44
  "gemini",
46
45
  "llm",
47
46
  "gpt",
48
47
  "agentic",
49
48
  "terminal",
50
49
  "coding-assistant",
51
- "claude-code-alternative",
52
- "gemini-cli-alternative",
53
50
  "open-source",
54
51
  "free",
55
52
  "qwen3",