nex-code 0.4.37 → 0.4.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -41
- package/dist/benchmark.js +397 -360
- package/dist/nex-code.js +664 -621
- package/dist/skills/autoresearch.js +252 -18
- package/package.json +3 -6
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
* - No iteration cap by default — runs until stopped
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
|
-
const { execSync } = require("child_process");
|
|
17
|
+
const { execSync, spawn } = require("child_process");
|
|
18
18
|
const fs = require("fs");
|
|
19
19
|
const path = require("path");
|
|
20
20
|
|
|
@@ -44,6 +44,14 @@ function getBenchmark() {
|
|
|
44
44
|
// Track experiment history within the session
|
|
45
45
|
let experiments = [];
|
|
46
46
|
let loopActive = false;
|
|
47
|
+
let sessionBaselineScore = null; // set on first ar_run_benchmark call
|
|
48
|
+
|
|
49
|
+
// ─── Watch Mode state ───────────────────────────────────────────
|
|
50
|
+
let _watchProcess = null;
|
|
51
|
+
let _watchCallbacks = { onFailure: null };
|
|
52
|
+
let _watchTestCommand = null;
|
|
53
|
+
let _watchDebounceTimer = null;
|
|
54
|
+
const WATCH_DEBOUNCE_MS = 2000;
|
|
47
55
|
|
|
48
56
|
function getLogPath() {
|
|
49
57
|
const dir = path.join(process.cwd(), ".nex", "autoresearch");
|
|
@@ -263,11 +271,16 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
263
271
|
"4. If score same or worse: ar_revert, go to Step A with a different hypothesis",
|
|
264
272
|
"",
|
|
265
273
|
"### HARD RULES",
|
|
274
|
+
"- EVERY response must end with a tool call — NEVER output text as your final message",
|
|
275
|
+
"- After ar_revert: call ar_checkpoint immediately in the same response as your next hypothesis",
|
|
276
|
+
"- After ar_run_benchmark: call ar_log_experiment immediately in the same response",
|
|
266
277
|
"- NEVER read more than 2 files per experiment cycle",
|
|
267
278
|
"- NEVER use bash to create branches or run benchmarks — use ar_ tools only",
|
|
268
279
|
"- If you are blocked from reading a file, SKIP IT and make your edit based on what you already know",
|
|
269
280
|
"- If 3 consecutive experiments fail to improve, change category focus",
|
|
270
281
|
"- Simplicity criterion: prefer removing code over adding it — complexity cost must be justified by metric gain",
|
|
282
|
+
"- cli/tools/index.js uses single-quoted JS strings — when editing descriptions, use ONLY single quotes inside the text, or escape double quotes as \\\\\" — NEVER put a raw double quote inside a JS string literal or tests will fail with SyntaxError",
|
|
283
|
+
"- Before every Edit call, grep the EXACT old_text from the file first so it matches byte-for-byte",
|
|
271
284
|
"",
|
|
272
285
|
"### How the benchmark score works — READ THIS FIRST",
|
|
273
286
|
"The benchmark sends nex-code's TOOL_DEFINITIONS (schemas) to external models and checks:",
|
|
@@ -359,6 +372,53 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
359
372
|
console.log("Autoresearch history cleared.");
|
|
360
373
|
},
|
|
361
374
|
},
|
|
375
|
+
{
|
|
376
|
+
cmd: "/ar-watch",
|
|
377
|
+
desc: "Start/stop background file watcher that auto-runs tests on changes",
|
|
378
|
+
handler: (args) => {
|
|
379
|
+
// Check feature flag
|
|
380
|
+
let watchEnabled = false;
|
|
381
|
+
try {
|
|
382
|
+
const { feature } = require("../feature-flags");
|
|
383
|
+
watchEnabled = feature("WATCH_MODE");
|
|
384
|
+
} catch {
|
|
385
|
+
// feature-flags not available — check env
|
|
386
|
+
watchEnabled =
|
|
387
|
+
process.env.NEX_FEATURE_WATCH_MODE === "1" ||
|
|
388
|
+
process.env.NEX_FEATURE_WATCH_MODE === "true";
|
|
389
|
+
}
|
|
390
|
+
if (!watchEnabled) {
|
|
391
|
+
console.log(
|
|
392
|
+
"Watch mode is disabled. Enable with NEX_FEATURE_WATCH_MODE=1",
|
|
393
|
+
);
|
|
394
|
+
return;
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
const cmd = args.trim();
|
|
398
|
+
if (cmd === "stop" || cmd === "off") {
|
|
399
|
+
stopWatch();
|
|
400
|
+
console.log("Watch mode stopped.");
|
|
401
|
+
return;
|
|
402
|
+
}
|
|
403
|
+
if (_watchProcess) {
|
|
404
|
+
console.log(
|
|
405
|
+
"Watch mode is already running. Use /ar-watch stop to stop it.",
|
|
406
|
+
);
|
|
407
|
+
return;
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
// Parse: /ar-watch <test command> [--watch-path <glob>]
|
|
411
|
+
const testCommand = cmd || "npm test";
|
|
412
|
+
const watchPath = process.cwd();
|
|
413
|
+
_watchTestCommand = testCommand;
|
|
414
|
+
|
|
415
|
+
startWatch(watchPath, testCommand);
|
|
416
|
+
console.log(`Watch mode started. Monitoring ${watchPath} for changes.`);
|
|
417
|
+
console.log(`Test command: ${testCommand}`);
|
|
418
|
+
console.log("On test failure, the agent will auto-investigate.");
|
|
419
|
+
console.log("Use /ar-watch stop to stop.\n");
|
|
420
|
+
},
|
|
421
|
+
},
|
|
362
422
|
],
|
|
363
423
|
|
|
364
424
|
tools: [
|
|
@@ -386,11 +446,12 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
386
446
|
execute: async (args) => {
|
|
387
447
|
// Strip any date-like suffix the model may have hallucinated, then
|
|
388
448
|
// append today's real date so the branch name is always accurate.
|
|
389
|
-
const baseTag =
|
|
390
|
-
.
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
449
|
+
const baseTag =
|
|
450
|
+
(args.tag || "self-improve")
|
|
451
|
+
.replace(/[^a-zA-Z0-9_-]/g, "-")
|
|
452
|
+
.replace(/-?\d{4,8}$/, "") // strip trailing YYYYMMDD / YYYYMM / etc.
|
|
453
|
+
.replace(/-[a-z]{3}\d{1,2}$/i, "") // strip trailing mon## (e.g. apr15)
|
|
454
|
+
.replace(/-+$/, "") || "self-improve";
|
|
394
455
|
const now = new Date();
|
|
395
456
|
const dateStr =
|
|
396
457
|
now.toLocaleString("en", { month: "short" }).toLowerCase() +
|
|
@@ -399,13 +460,38 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
399
460
|
const branchName = `autoresearch/${tag}`;
|
|
400
461
|
|
|
401
462
|
try {
|
|
463
|
+
const currentBranch = gitBranch();
|
|
464
|
+
// If we're already on the target branch, nothing to do
|
|
465
|
+
if (currentBranch === branchName) {
|
|
466
|
+
return JSON.stringify({
|
|
467
|
+
status: "resumed",
|
|
468
|
+
branch: branchName,
|
|
469
|
+
note: "Already on autoresearch branch — continuing experiments.",
|
|
470
|
+
});
|
|
471
|
+
}
|
|
472
|
+
|
|
402
473
|
// Check if branch already exists
|
|
474
|
+
let branchExists = false;
|
|
403
475
|
try {
|
|
404
476
|
execSync(`git rev-parse --verify ${branchName}`, {
|
|
405
477
|
cwd: process.cwd(),
|
|
406
478
|
stdio: ["pipe", "pipe", "pipe"],
|
|
407
479
|
});
|
|
408
|
-
|
|
480
|
+
branchExists = true;
|
|
481
|
+
} catch {
|
|
482
|
+
// Branch doesn't exist
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
if (branchExists) {
|
|
486
|
+
// Stash any uncommitted changes before switching
|
|
487
|
+
try {
|
|
488
|
+
execSync(`git stash`, {
|
|
489
|
+
cwd: process.cwd(),
|
|
490
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
491
|
+
});
|
|
492
|
+
} catch {
|
|
493
|
+
// Ignore stash errors (nothing to stash)
|
|
494
|
+
}
|
|
409
495
|
execSync(`git checkout ${branchName}`, {
|
|
410
496
|
cwd: process.cwd(),
|
|
411
497
|
stdio: ["pipe", "pipe", "pipe"],
|
|
@@ -415,8 +501,6 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
415
501
|
branch: branchName,
|
|
416
502
|
note: "Branch already existed — resuming experiments on it.",
|
|
417
503
|
});
|
|
418
|
-
} catch {
|
|
419
|
-
// Branch doesn't exist — create it
|
|
420
504
|
}
|
|
421
505
|
|
|
422
506
|
const sourceBranch = gitBranch() || "unknown";
|
|
@@ -636,13 +720,12 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
636
720
|
properties: {
|
|
637
721
|
file: {
|
|
638
722
|
type: "string",
|
|
639
|
-
description:
|
|
640
|
-
'Path to the log file (e.g. "run.log")',
|
|
723
|
+
description: 'Path to the log file (e.g. "run.log")',
|
|
641
724
|
},
|
|
642
725
|
patterns: {
|
|
643
726
|
type: "object",
|
|
644
727
|
description:
|
|
645
|
-
|
|
728
|
+
"Map of metric name to regex pattern with one capture group. " +
|
|
646
729
|
'Example: {"val_bpb": "val_bpb:\\\\s*([\\\\d.]+)", "memory": "peak_vram_mb:\\\\s*([\\\\d.]+)"}',
|
|
647
730
|
additionalProperties: { type: "string" },
|
|
648
731
|
},
|
|
@@ -722,7 +805,8 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
722
805
|
if (!benchmark) {
|
|
723
806
|
return JSON.stringify({
|
|
724
807
|
status: "unavailable",
|
|
725
|
-
error:
|
|
808
|
+
error:
|
|
809
|
+
"Benchmark module not found. Make sure cli/benchmark.js exists.",
|
|
726
810
|
});
|
|
727
811
|
}
|
|
728
812
|
|
|
@@ -758,6 +842,11 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
758
842
|
) / 10
|
|
759
843
|
: 0;
|
|
760
844
|
|
|
845
|
+
// Record baseline on first run
|
|
846
|
+
if (sessionBaselineScore === null) {
|
|
847
|
+
sessionBaselineScore = avgScore;
|
|
848
|
+
}
|
|
849
|
+
|
|
761
850
|
// Find weakest category across all models
|
|
762
851
|
const categoryTotals = {};
|
|
763
852
|
const categoryCounts = {};
|
|
@@ -774,8 +863,9 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
774
863
|
}
|
|
775
864
|
|
|
776
865
|
// Sort categories by score to find weakest
|
|
777
|
-
const sortedCategories = Object.entries(categoryAvgs)
|
|
778
|
-
|
|
866
|
+
const sortedCategories = Object.entries(categoryAvgs).sort(
|
|
867
|
+
(a, b) => a[1] - b[1],
|
|
868
|
+
);
|
|
779
869
|
|
|
780
870
|
const weakestCategory =
|
|
781
871
|
sortedCategories.length > 0 ? sortedCategories[0] : null;
|
|
@@ -796,8 +886,9 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
796
886
|
return JSON.stringify({
|
|
797
887
|
status: "benchmark_failed",
|
|
798
888
|
error: err.message,
|
|
799
|
-
elapsed_seconds:
|
|
800
|
-
|
|
889
|
+
elapsed_seconds: parseFloat(
|
|
890
|
+
((Date.now() - start) / 1000).toFixed(1),
|
|
891
|
+
),
|
|
801
892
|
});
|
|
802
893
|
}
|
|
803
894
|
},
|
|
@@ -860,6 +951,21 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
860
951
|
},
|
|
861
952
|
execute: async (args) => {
|
|
862
953
|
loadExperiments();
|
|
954
|
+
|
|
955
|
+
// Enforce keep/revert decision against session baseline
|
|
956
|
+
if (
|
|
957
|
+
sessionBaselineScore !== null &&
|
|
958
|
+
typeof args.metric === "number" &&
|
|
959
|
+
args.kept === true &&
|
|
960
|
+
args.metric < sessionBaselineScore
|
|
961
|
+
) {
|
|
962
|
+
console.log(
|
|
963
|
+
`\x1b[31m ⚠ Score ${args.metric} < baseline ${sessionBaselineScore} — overriding kept=true to kept=false\x1b[0m`,
|
|
964
|
+
);
|
|
965
|
+
args.kept = false;
|
|
966
|
+
args.status = "discard";
|
|
967
|
+
}
|
|
968
|
+
|
|
863
969
|
const commit = gitHash();
|
|
864
970
|
const entry = {
|
|
865
971
|
id: experiments.length + 1,
|
|
@@ -888,7 +994,9 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
888
994
|
|
|
889
995
|
const keptCount = experiments.filter((e) => e.kept).length;
|
|
890
996
|
const revertedCount = experiments.filter((e) => !e.kept).length;
|
|
891
|
-
const statusIcon = args.kept
|
|
997
|
+
const statusIcon = args.kept
|
|
998
|
+
? "\x1b[32m✔ KEPT\x1b[0m"
|
|
999
|
+
: "\x1b[31m✘ REVERTED\x1b[0m";
|
|
892
1000
|
const delta =
|
|
893
1001
|
prev != null && typeof args.metric === "number"
|
|
894
1002
|
? ` (${args.metric > prev ? "+" : ""}${(args.metric - prev).toFixed(1)} pts)`
|
|
@@ -1016,6 +1124,23 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
1016
1124
|
}
|
|
1017
1125
|
},
|
|
1018
1126
|
},
|
|
1127
|
+
{
|
|
1128
|
+
type: "function",
|
|
1129
|
+
function: {
|
|
1130
|
+
name: "ar_watch_status",
|
|
1131
|
+
description:
|
|
1132
|
+
"Get the current status of the background file watcher (watch mode). " +
|
|
1133
|
+
"Returns whether watch mode is active, the test command, and recent failure count.",
|
|
1134
|
+
parameters: { type: "object", properties: {} },
|
|
1135
|
+
},
|
|
1136
|
+
execute: async () => {
|
|
1137
|
+
return JSON.stringify({
|
|
1138
|
+
active: !!_watchProcess,
|
|
1139
|
+
testCommand: _watchTestCommand,
|
|
1140
|
+
pid: _watchProcess ? _watchProcess.pid : null,
|
|
1141
|
+
});
|
|
1142
|
+
},
|
|
1143
|
+
},
|
|
1019
1144
|
{
|
|
1020
1145
|
type: "function",
|
|
1021
1146
|
function: {
|
|
@@ -1059,3 +1184,112 @@ Use ar_run_experiment with output_file to redirect, then ar_extract_metric to re
|
|
|
1059
1184
|
},
|
|
1060
1185
|
],
|
|
1061
1186
|
};
|
|
1187
|
+
|
|
1188
|
+
// ─── Watch Mode Implementation ──────────────────────────────────
|
|
1189
|
+
|
|
1190
|
+
/**
 * Start a background file watcher using fs.watch (recursive).
 *
 * Change events for ignored directories and non-source files are dropped.
 * Remaining events are debounced by WATCH_DEBOUNCE_MS, after which the test
 * command is run through _runWatchTest with the file name of the last event.
 * Any watcher that is already running is stopped first. If the watcher
 * cannot be created, the failure is logged and watch mode stays inactive.
 *
 * @param {string} watchPath — directory to watch
 * @param {string} testCommand — shell command to run on changes
 */
function startWatch(watchPath, testCommand) {
  if (_watchProcess) stopWatch();

  const ignorePatterns = [
    /node_modules/,
    /\.git\//,
    /\.nex\//,
    /dist\//,
    /\.log$/,
    /\.tmp$/,
  ];
  const sourceFilePattern =
    /\.(js|ts|jsx|tsx|py|rb|go|rs|json|yaml|yml|toml|cfg|ini|sh|css|html)$/;

  try {
    const watcher = fs.watch(
      watchPath,
      { recursive: true },
      (eventType, filename) => {
        if (!filename) return;

        // The patterns above are written with "/" separators; normalise
        // backslash-separated paths so they are matched as well.
        const normalized = String(filename).replace(/\\/g, "/");
        if (ignorePatterns.some((pattern) => pattern.test(normalized))) return;
        if (!sourceFilePattern.test(normalized)) return;

        // Debounce: restart the timer so only the last change in a burst
        // triggers a test run.
        if (_watchDebounceTimer) clearTimeout(_watchDebounceTimer);
        _watchDebounceTimer = setTimeout(() => {
          _watchDebounceTimer = null;
          _runWatchTest(testCommand, filename);
        }, WATCH_DEBOUNCE_MS);
      },
    );

    // Without a listener, an "error" event emitted by the watcher would be
    // thrown as an unhandled exception. Report it and shut watch mode down.
    watcher.on("error", (err) => {
      console.error(`Watch mode error: ${err.message}`);
      stopWatch();
    });

    _watchProcess = watcher;

    // Register the process-level cleanup hooks only once, no matter how
    // many times watch mode is started and stopped.
    // NOTE(review): a SIGINT listener replaces Node's default exit-on-Ctrl+C
    // behaviour — confirm the CLI installs its own SIGINT handling.
    if (!process.listeners("exit").includes(stopWatch)) {
      process.on("exit", stopWatch);
    }
    if (!process.listeners("SIGINT").includes(stopWatch)) {
      process.on("SIGINT", stopWatch);
    }
  } catch (err) {
    console.error(`Watch mode failed to start: ${err.message}`);
    _watchProcess = null;
  }
}
|
|
1244
|
+
|
|
1245
|
+
/**
 * Shut watch mode down: close the active watcher (if any), cancel a pending
 * debounced test run, and forget the configured test command.
 * Calling it when nothing is running is harmless.
 */
function stopWatch() {
  const activeWatcher = _watchProcess;
  if (activeWatcher) {
    try {
      activeWatcher.close();
    } catch {
      /* already closed */
    }
    _watchProcess = null;
  }

  const pendingTimer = _watchDebounceTimer;
  if (pendingTimer) {
    clearTimeout(pendingTimer);
    _watchDebounceTimer = null;
  }

  _watchTestCommand = null;
}
|
|
1263
|
+
|
|
1264
|
+
/**
|
|
1265
|
+
* Run the test command and handle failures.
|
|
1266
|
+
* @param {string} testCommand
|
|
1267
|
+
* @param {string} changedFile — file that triggered the watch
|
|
1268
|
+
*/
|
|
1269
|
+
function _runWatchTest(testCommand, changedFile) {
|
|
1270
|
+
try {
|
|
1271
|
+
execSync(testCommand, {
|
|
1272
|
+
cwd: process.cwd(),
|
|
1273
|
+
stdio: "pipe",
|
|
1274
|
+
timeout: 120000, // 2 minute timeout
|
|
1275
|
+
encoding: "utf-8",
|
|
1276
|
+
});
|
|
1277
|
+
// Tests passed — no action needed
|
|
1278
|
+
} catch (err) {
|
|
1279
|
+
// Tests failed — notify the agent
|
|
1280
|
+
const output = (err.stdout || "") + (err.stderr || "");
|
|
1281
|
+
const truncatedOutput = output.slice(-500); // Last 500 chars of error
|
|
1282
|
+
const failureNote = `[WATCH MODE] Test failure detected after change to ${changedFile}:\n${truncatedOutput}`;
|
|
1283
|
+
|
|
1284
|
+
// Try to inject a note into the agent conversation
|
|
1285
|
+
try {
|
|
1286
|
+
const { injectMidRunNote } = require("../agent");
|
|
1287
|
+
injectMidRunNote(failureNote);
|
|
1288
|
+
} catch {
|
|
1289
|
+
// Agent not in active conversation — just log
|
|
1290
|
+
process.stderr.write(
|
|
1291
|
+
`\n\x1b[33m⚠ Watch: tests failed after ${changedFile} changed\x1b[0m\n`,
|
|
1292
|
+
);
|
|
1293
|
+
}
|
|
1294
|
+
}
|
|
1295
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "nex-code",
|
|
3
|
-
"version": "0.4.37",
|
|
3
|
+
"version": "0.4.39",
|
|
4
4
|
"description": "Run 400B+ open coding models on your codebase without the hardware bill. Ollama Cloud first — OpenAI, Anthropic, and Gemini when you need them.",
|
|
5
5
|
"bin": {
|
|
6
6
|
"nex-code": "./dist/nex-code.js"
|
|
@@ -16,8 +16,8 @@
|
|
|
16
16
|
},
|
|
17
17
|
"scripts": {
|
|
18
18
|
"start": "node dist/nex-code.js",
|
|
19
|
-
"build": "
|
|
20
|
-
"dev": "
|
|
19
|
+
"build": "node scripts/build.js",
|
|
20
|
+
"dev": "node scripts/build.js --dev",
|
|
21
21
|
"test": "jest --forceExit",
|
|
22
22
|
"test:orchestrator": "jest tests/orchestrator.test.js --forceExit",
|
|
23
23
|
"coverage": "jest --coverage --forceExit",
|
|
@@ -41,15 +41,12 @@
|
|
|
41
41
|
"ollama-cloud",
|
|
42
42
|
"openai",
|
|
43
43
|
"anthropic",
|
|
44
|
-
"claude",
|
|
45
44
|
"gemini",
|
|
46
45
|
"llm",
|
|
47
46
|
"gpt",
|
|
48
47
|
"agentic",
|
|
49
48
|
"terminal",
|
|
50
49
|
"coding-assistant",
|
|
51
|
-
"claude-code-alternative",
|
|
52
|
-
"gemini-cli-alternative",
|
|
53
50
|
"open-source",
|
|
54
51
|
"free",
|
|
55
52
|
"qwen3",
|