@kodax-ai/kodax-cli 0.7.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/CHANGELOG.md +1304 -0
  2. package/LICENSE +191 -0
  3. package/README.md +1167 -0
  4. package/README_CN.md +631 -0
  5. package/dist/builtin/code-review/SKILL.md +63 -0
  6. package/dist/builtin/git-workflow/SKILL.md +84 -0
  7. package/dist/builtin/skill-creator/SKILL.md +122 -0
  8. package/dist/builtin/skill-creator/agents/analyzer.md +12 -0
  9. package/dist/builtin/skill-creator/agents/comparator.md +13 -0
  10. package/dist/builtin/skill-creator/agents/grader.md +13 -0
  11. package/dist/builtin/skill-creator/references/schemas.md +227 -0
  12. package/dist/builtin/skill-creator/scripts/aggregate-benchmark.d.ts +46 -0
  13. package/dist/builtin/skill-creator/scripts/aggregate-benchmark.js +209 -0
  14. package/dist/builtin/skill-creator/scripts/analyze-benchmark.d.ts +46 -0
  15. package/dist/builtin/skill-creator/scripts/analyze-benchmark.js +289 -0
  16. package/dist/builtin/skill-creator/scripts/compare-runs.d.ts +62 -0
  17. package/dist/builtin/skill-creator/scripts/compare-runs.js +333 -0
  18. package/dist/builtin/skill-creator/scripts/generate-review.d.ts +33 -0
  19. package/dist/builtin/skill-creator/scripts/generate-review.js +415 -0
  20. package/dist/builtin/skill-creator/scripts/grade-evals.d.ts +73 -0
  21. package/dist/builtin/skill-creator/scripts/grade-evals.js +405 -0
  22. package/dist/builtin/skill-creator/scripts/improve-description.d.ts +23 -0
  23. package/dist/builtin/skill-creator/scripts/improve-description.js +161 -0
  24. package/dist/builtin/skill-creator/scripts/init-skill.d.ts +14 -0
  25. package/dist/builtin/skill-creator/scripts/init-skill.js +153 -0
  26. package/dist/builtin/skill-creator/scripts/install-skill.d.ts +29 -0
  27. package/dist/builtin/skill-creator/scripts/install-skill.js +176 -0
  28. package/dist/builtin/skill-creator/scripts/package-skill.d.ts +38 -0
  29. package/dist/builtin/skill-creator/scripts/package-skill.js +124 -0
  30. package/dist/builtin/skill-creator/scripts/quick-validate.d.ts +8 -0
  31. package/dist/builtin/skill-creator/scripts/quick-validate.js +166 -0
  32. package/dist/builtin/skill-creator/scripts/run-eval.d.ts +66 -0
  33. package/dist/builtin/skill-creator/scripts/run-eval.js +356 -0
  34. package/dist/builtin/skill-creator/scripts/run-loop.d.ts +49 -0
  35. package/dist/builtin/skill-creator/scripts/run-loop.js +243 -0
  36. package/dist/builtin/skill-creator/scripts/run-trigger-eval.d.ts +58 -0
  37. package/dist/builtin/skill-creator/scripts/run-trigger-eval.js +225 -0
  38. package/dist/builtin/skill-creator/scripts/utils.js +278 -0
  39. package/dist/builtin/tdd/SKILL.md +56 -0
  40. package/dist/index.js +1717 -0
  41. package/dist/kodax_cli.js +1870 -0
  42. package/package.json +122 -0
  43. package/scripts/kodax-bin.cjs +27 -0
  44. package/scripts/production-env.cjs +16 -0
@@ -0,0 +1,243 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { readFile, writeFile } from 'node:fs/promises';
4
+ import path from 'node:path';
5
+ import { fileURLToPath } from 'node:url';
6
+ import { loadSkill, pathExists, writeSkill } from './utils.js';
7
+ import { improveDescription } from './improve-description.js';
8
+ import { runTriggerEval } from './run-trigger-eval.js';
9
+
10
/**
 * Split an eval set into a train subset and a held-out test subset,
 * stratified by expected label.
 *
 * Items whose `should_trigger` is strictly `true` form the positive group and
 * every other item forms the negative group. Each group is shuffled
 * independently with a seeded linear congruential generator, so the same
 * `evals`, `holdout` and `seed` always produce the same split.
 *
 * @param {Array<object>} evals - Eval items; only `should_trigger` is read.
 * @param {number} [holdout=0.25] - Fraction of each group moved into `test`.
 *   A value `<= 0`, or fewer than four evals, disables the split.
 * @param {number} [seed=42] - Shuffle seed. Negative seeds are wrapped into
 *   the generator's range and non-finite seeds fall back to 42.
 * @returns {{ train: object[], test: object[] }} Fresh arrays; `evals` is not mutated.
 */
export function splitEvalSet(evals, holdout = 0.25, seed = 42) {
  if (holdout <= 0 || evals.length < 4) {
    return { train: [...evals], test: [] };
  }

  const MODULUS = 4294967296;

  // The remainder operator keeps the sign of the dividend, so a negative seed
  // used to yield negative "random" numbers and a NaN seed yielded NaN. Both
  // produce an invalid swap index that silently drops items from the shuffle.
  // Normalising once keeps the state inside [0, MODULUS) for every step, and
  // leaves the sequence for seeds already in that range untouched.
  let state = Number.isFinite(seed) ? seed % MODULUS : 42;
  if (state < 0) {
    state += MODULUS;
  }

  const nextRandom = () => {
    state = (state * 1664525 + 1013904223) % MODULUS;
    return state / MODULUS;
  };

  // Fisher-Yates shuffle on a copy.
  const shuffle = (items) => {
    const result = [...items];
    for (let index = result.length - 1; index > 0; index -= 1) {
      const swapIndex = Math.floor(nextRandom() * (index + 1));
      [result[index], result[swapIndex]] = [result[swapIndex], result[index]];
    }
    return result;
  };

  // Positives are shuffled first; the order matters because both groups draw
  // from the same generator.
  const shuffledPositives = shuffle(evals.filter((item) => item.should_trigger === true));
  const shuffledNegatives = shuffle(evals.filter((item) => item.should_trigger !== true));

  const pickCount = (items) => {
    if (items.length <= 1) {
      return 0;
    }
    const requested = Math.max(1, Math.floor(items.length * holdout));
    // Always leave one item per group for training; only a holdout >= 1 can
    // hit this cap, so ordinary fractions behave exactly as before.
    return Math.min(items.length - 1, requested);
  };

  const posCount = pickCount(shuffledPositives);
  const negCount = pickCount(shuffledNegatives);

  return {
    train: [
      ...shuffledPositives.slice(posCount),
      ...shuffledNegatives.slice(negCount),
    ],
    test: [
      ...shuffledPositives.slice(0, posCount),
      ...shuffledNegatives.slice(0, negCount),
    ],
  };
}
56
+
57
/**
 * Write an eval subset to `<workspaceDir>/<name>.json` in the
 * `{ skill_name, evals }` layout and return the path that was written.
 *
 * @param {string} workspaceDir - Directory the file is written into.
 * @param {string} name - File name without the `.json` extension.
 * @param {string} skillName - Stored under `skill_name`.
 * @param {Array<object>} evals - Stored under `evals`.
 * @returns {Promise<string>} Path of the written file.
 */
async function writeTempEvalFile(workspaceDir, name, skillName, evals) {
  const target = path.join(workspaceDir, `${name}.json`);
  const payload = { skill_name: skillName, evals };
  const serialized = JSON.stringify(payload, null, 2);
  await writeFile(target, serialized, 'utf8');
  return target;
}
62
+
63
/**
 * Run the evaluate-then-improve loop for a skill description.
 *
 * Every iteration evaluates the current description on the train split (and
 * on the held-out test split when there is one), appends a record to the
 * history, and persists that history to
 * `<workspaceDir>/description-history.json`. Unless every train case passed
 * or the iteration budget is used up, the train results are written to
 * `<workspaceDir>/iteration-<n>-train-results.json` and `improveDescriptionFn`
 * is asked for the next description.
 *
 * The "best" record is the earliest iteration with the highest number of
 * passed cases, counted on the test split when present and on the train
 * split otherwise.
 *
 * @param {object} options - Loop settings. Reads `skillPath`, `evalsPath`,
 *   `workspaceDir`, `holdout`, `seed`, `maxIterations` and `writeBest`; the
 *   whole object is also spread into the options given to both dependencies.
 * @param {object} [dependencies] - Optional overrides for `runTriggerEvalFn`
 *   and `improveDescriptionFn`; missing keys fall back to the real functions.
 * @returns {Promise<object>} Report with `skill_name`, `original_description`,
 *   `final_description`, `best_description`, `history`, `train_size` and
 *   `test_size`.
 */
export async function runDescriptionLoop(
  options,
  dependencies = {
    runTriggerEvalFn: runTriggerEval,
    improveDescriptionFn: improveDescription,
  }
) {
  const resolvedDependencies = {
    runTriggerEvalFn: runTriggerEval,
    improveDescriptionFn: improveDescription,
    ...dependencies,
  };
  const skill = await loadSkill(options.skillPath);
  const evalFile = JSON.parse(await readFile(options.evalsPath, 'utf8'));
  const allEvals = Array.isArray(evalFile.evals) ? evalFile.evals : [];
  const { train, test } = splitEvalSet(allEvals, options.holdout, options.seed);
  const historyPath = path.join(options.workspaceDir, 'description-history.json');

  // Captured before the loop: with `writeBest` the frontmatter description is
  // overwritten below, and the report must still show what the skill started with.
  const originalDescription = skill.frontmatter.description;
  let currentDescription = String(originalDescription ?? '');
  const history = [];
  let best = null;

  for (let iteration = 1; iteration <= options.maxIterations; iteration += 1) {
    const trainFile = await writeTempEvalFile(
      options.workspaceDir,
      `train-iteration-${iteration}`,
      skill.frontmatter.name,
      train
    );
    const trainResults = await resolvedDependencies.runTriggerEvalFn({
      ...options,
      evalsPath: trainFile,
      descriptionOverride: currentDescription,
    });

    let testResults = null;
    if (test.length > 0) {
      const testFile = await writeTempEvalFile(
        options.workspaceDir,
        `test-iteration-${iteration}`,
        skill.frontmatter.name,
        test
      );
      testResults = await resolvedDependencies.runTriggerEvalFn({
        ...options,
        evalsPath: testFile,
        descriptionOverride: currentDescription,
      });
    }

    const record = {
      iteration,
      description: currentDescription,
      score: `${trainResults.summary.passed}/${trainResults.summary.total}`,
      train: trainResults.summary,
      test: testResults?.summary ?? null,
      train_results: trainResults.results,
      test_results: testResults?.results ?? [],
    };
    history.push(record);

    // Persist on every iteration. Writing only when the file was missing left
    // later iterations (and reused workspaces) with a stale history for the
    // improver, and the last iteration's record was never saved at all.
    await writeFile(historyPath, JSON.stringify({ history }, null, 2), 'utf8');

    const currentComparable = testResults?.summary.passed ?? trainResults.summary.passed;
    const bestComparable = best?.comparable ?? -1;
    // Strict comparison keeps the earliest iteration on ties.
    if (currentComparable > bestComparable) {
      best = {
        comparable: currentComparable,
        record,
      };
    }

    if (trainResults.summary.failed === 0 || iteration === options.maxIterations) {
      break;
    }

    const improveInputPath = path.join(options.workspaceDir, `iteration-${iteration}-train-results.json`);
    await writeFile(improveInputPath, JSON.stringify(trainResults, null, 2), 'utf8');
    const improved = await resolvedDependencies.improveDescriptionFn({
      ...options,
      evalResultsPath: improveInputPath,
      historyPath,
    });
    currentDescription = improved.description;
  }

  const bestDescription = best?.record.description ?? currentDescription;

  if (options.writeBest) {
    skill.frontmatter.description = bestDescription;
    await writeSkill(options.skillPath, skill.frontmatter, skill.body);
  }

  return {
    skill_name: skill.frontmatter.name,
    original_description: originalDescription,
    final_description: currentDescription,
    best_description: bestDescription,
    history,
    train_size: train.length,
    test_size: test.length,
  };
}
162
+
163
/**
 * Parse the run-loop command line into an options object.
 *
 * Parsing starts at `argv[2]`. A value flag is honoured only when a non-empty
 * token follows it, and that token is consumed as its value. Unknown tokens
 * are ignored. Numeric flags keep their default when the value is not a
 * finite number, and a warning is printed to stderr, so a typo such as
 * `--max-iterations abc` cannot turn into NaN and silently skip the loop.
 *
 * @param {string[]} argv - Normally `process.argv`.
 * @returns {object} Options with defaults applied.
 */
function parseArgs(argv) {
  const args = {
    skillPath: '',
    evalsPath: '',
    workspaceDir: '',
    provider: 'anthropic',
    model: undefined,
    maxIterations: 3,
    runsPerQuery: 1,
    triggerThreshold: 0.5,
    holdout: 0.25,
    seed: 42,
    maxIter: 18,
    reasoningMode: 'off',
    output: undefined,
    writeBest: false,
  };

  const stringFlags = new Map([
    ['--skill-path', 'skillPath'],
    ['--evals', 'evalsPath'],
    ['--workspace', 'workspaceDir'],
    ['--provider', 'provider'],
    ['--model', 'model'],
    ['--reasoning', 'reasoningMode'],
    ['--output', 'output'],
  ]);
  const numberFlags = new Map([
    ['--max-iterations', 'maxIterations'],
    ['--runs-per-query', 'runsPerQuery'],
    ['--trigger-threshold', 'triggerThreshold'],
    ['--holdout', 'holdout'],
    ['--seed', 'seed'],
    ['--max-iter', 'maxIter'],
  ]);

  for (let index = 2; index < argv.length; index += 1) {
    const token = argv[index];

    if (token === '--write-best') {
      args.writeBest = true;
      continue;
    }

    const value = argv[index + 1];
    if (!value) {
      // Flag without a value (or an unknown trailing token): nothing to consume.
      continue;
    }

    if (stringFlags.has(token)) {
      args[stringFlags.get(token)] = value;
      index += 1;
    } else if (numberFlags.has(token)) {
      const parsed = Number(value);
      if (Number.isFinite(parsed)) {
        args[numberFlags.get(token)] = parsed;
      } else {
        console.error(`Ignoring invalid numeric value for ${token}: ${value}`);
      }
      index += 1;
    }
  }

  return args;
}
216
+
217
/**
 * CLI entry point: parse the arguments, run the description loop, and emit
 * the JSON report either to the `--output` file or to stdout.
 *
 * Exits with status 1 after printing the usage line when any of the three
 * required paths is missing.
 */
async function main() {
  const args = parseArgs(process.argv);

  const hasRequiredPaths = Boolean(args.skillPath && args.evalsPath && args.workspaceDir);
  if (!hasRequiredPaths) {
    console.error('Usage: node scripts/run-loop.js --skill-path <dir> --evals <evals.json> --workspace <dir> [--max-iterations 3]');
    process.exit(1);
  }

  const report = await runDescriptionLoop(args);
  const serialized = JSON.stringify(report, null, 2);
  const outputText = `${serialized}\n`;

  if (!args.output) {
    process.stdout.write(outputText);
    return;
  }

  await writeFile(args.output, outputText, 'utf8');
  console.log(`Wrote ${path.resolve(args.output)}`);
}
234
+
235
// True only when this module is the script Node was launched with, so that
// importing it (for example from tests) does not start the CLI.
const isDirectRun = process.argv[1]
  && path.resolve(process.argv[1]) === fileURLToPath(import.meta.url);

if (isDirectRun) {
  // Report the failure message only and signal it through the exit status.
  main().catch((error) => {
    const message = error instanceof Error ? error.message : String(error);
    console.error(message);
    process.exit(1);
  });
}
@@ -0,0 +1,58 @@
1
+ export interface TriggerEvalAttempt { // Parsed verdict from one model response for one query.
2
+ trigger: boolean; // True only when the response JSON had `trigger` strictly equal to `true`.
3
+ reason: string; // Trimmed `reason` from the response; '' when it was missing or not a string.
4
+ }
5
+
6
+ export interface TriggerEvalItemResult { // Aggregated outcome of every run for a single eval query.
7
+ query: string; // Trimmed query text, taken from the eval item's `query` or, failing that, `prompt`.
8
+ should_trigger: boolean; // Expected label: true only when the eval item's `should_trigger` is strictly `true`.
9
+ triggers: number; // Number of runs whose parsed decision was `trigger: true`.
10
+ runs: number; // Number of runs executed for this query.
11
+ trigger_rate: number; // `triggers / runs`.
12
+ predicted_trigger: boolean; // True when `trigger_rate` is at or above the trigger threshold.
13
+ pass: boolean; // True when `predicted_trigger` matches `should_trigger`.
14
+ attempts: TriggerEvalAttempt[]; // One parsed decision per run, in execution order.
15
+ }
16
+
17
+ export interface TriggerEvalSummary { // Totals and rates computed over a list of item results.
18
+ passed: number; // Count of results with a truthy `pass`.
19
+ failed: number; // Count of results without a truthy `pass`.
20
+ total: number; // Number of results summarised.
21
+ pass_rate: number; // `passed / total`; 0 when there are no results.
22
+ precision: number; // Correctly predicted triggers / all predicted triggers; 0 when nothing was predicted to trigger.
23
+ recall: number; // Correctly predicted triggers / all expected triggers; 0 when nothing was expected to trigger.
24
+ }
25
+
26
+ export interface TriggerEvalReport { // Full result object returned by `runTriggerEval`.
27
+ skill_name: string; // `name` from the loaded skill's frontmatter.
28
+ description: string; // Description that was evaluated: the override when given, else the skill's trimmed description.
29
+ results: TriggerEvalItemResult[]; // One entry per eval item that had a non-empty query.
30
+ summary: TriggerEvalSummary; // Aggregate computed from `results`.
31
+ meta: { // Settings the evaluation actually ran with.
32
+ provider?: string; // `provider` exactly as passed in the options.
33
+ model?: string | null; // `model` from the options, or null when none was given.
34
+ runs_per_query: number; // Effective runs per query after normalisation of the option.
35
+ trigger_threshold: number; // Effective threshold, clamped to [0, 1]; 0.5 when the option is not finite.
36
+ note: string; // Fixed explanatory text set by `runTriggerEval`.
37
+ };
38
+ }
39
+
40
+ export interface TriggerEvalOptions { // Options accepted by `runTriggerEval`; the same object is handed to the runner.
41
+ skillPath: string; // Passed to `loadSkill` to load the skill's frontmatter.
42
+ evalsPath: string; // JSON file to read; its `evals` array is used (anything else is treated as empty).
43
+ provider?: string; // Forwarded to the default runner and echoed in `meta.provider`.
44
+ model?: string; // Forwarded to the default runner and echoed in `meta.model`.
45
+ output?: string; // Used by the CLI `main` as the report file path; not read by `runTriggerEval` itself.
46
+ runsPerQuery?: number; // Floored when finite and positive; otherwise 1.
47
+ triggerThreshold?: number; // Clamped to [0, 1] when finite; otherwise 0.5.
48
+ maxIter?: number; // The default runner falls back to 12 when this is null or undefined.
49
+ reasoningMode?: string; // The default runner falls back to 'off'; any other non-empty value also enables `thinking`.
50
+ descriptionOverride?: string; // Evaluated instead of the skill's own description when not null or undefined.
51
+ }
52
+
53
+ export function parseTriggerDecision(text: string): TriggerEvalAttempt; // Extracts the decision JSON from a model response; throws an Error when no object can be parsed.
54
+ export function summarizeTriggerResults(results: TriggerEvalItemResult[]): TriggerEvalSummary; // Computes pass counts, pass rate, precision and recall.
55
+ export function runTriggerEval( // Evaluates every eval query against the skill description and returns the report.
56
+ options: TriggerEvalOptions,
57
+ runner?: (prompt: string, options: TriggerEvalOptions) => Promise<string> // Optional prompt runner returning the raw model text; a built-in KodaX SDK runner is used when omitted.
58
+ ): Promise<TriggerEvalReport>;
@@ -0,0 +1,225 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { readFile, writeFile } from 'node:fs/promises';
4
+ import path from 'node:path';
5
+ import { fileURLToPath } from 'node:url';
6
+ import {
7
+ extractJsonObject,
8
+ loadKodaXSDK,
9
+ loadSkill,
10
+ } from './utils.js';
11
+
12
/**
 * Build the prompt that asks the model whether a skill should trigger for a
 * query, given only the skill's name and description.
 *
 * @param {string} skillName - Skill name shown to the model.
 * @param {string} description - Skill description under evaluation.
 * @param {string} query - User request to judge.
 * @returns {string} Newline-separated prompt ending with the query line.
 */
function buildTriggerEvalPrompt(skillName, description, query) {
  const instructions = 'You are evaluating whether a KodaX skill description should trigger for a user request.\n'
    + 'Decide whether the skill should be used based only on the skill name, description, and query.\n'
    + 'Return JSON only with this exact shape:\n'
    + '{"trigger": true, "reason": "short explanation"}\n';

  // A blank line separates the fixed instructions from the per-query fields.
  return `${instructions}\nSkill name: ${skillName}\nSkill description: ${description}\nUser query: ${query}`;
}
24
+
25
/**
 * Turn a raw model response into a normalised trigger decision.
 *
 * @param {string} text - Raw response text, handed to `extractJsonObject`.
 * @returns {{ trigger: boolean, reason: string }} `trigger` is true only for
 *   a strict `true`; `reason` is trimmed, or '' when it is not a string.
 * @throws {Error} When `extractJsonObject` yields nothing usable as an object.
 */
export function parseTriggerDecision(text) {
  const decision = extractJsonObject(text);
  const isUsable = Boolean(decision) && typeof decision === 'object';
  if (!isUsable) {
    throw new Error('Could not parse trigger decision JSON');
  }

  let reason = '';
  if (typeof decision.reason === 'string') {
    reason = decision.reason.trim();
  }

  return { trigger: decision.trigger === true, reason };
}
36
+
37
/**
 * Default prompt runner: sends the prompt through the KodaX SDK and returns
 * the `lastText` field of the result.
 *
 * @param {string} prompt - Prompt text to run.
 * @param {object} options - Reads `provider`, `model`, `maxIter` (default 12)
 *   and `reasoningMode` (default 'off').
 * @returns {Promise<string>} `lastText` from the SDK result.
 */
async function defaultRunPrompt(prompt, options) {
  const { runKodaX } = await loadKodaXSDK();

  const mode = options.reasoningMode;
  // `thinking` is on for any non-empty mode other than 'off'.
  const thinking = mode ? mode !== 'off' : false;

  const runOptions = {
    provider: options.provider,
    maxIter: options.maxIter ?? 12,
    model: options.model,
    reasoningMode: mode ?? 'off',
    thinking,
  };

  const { lastText } = await runKodaX(runOptions, prompt);
  return lastText;
}
52
+
53
/**
 * Summarise trigger-eval item results in a single pass.
 *
 * @param {Array<{ should_trigger: boolean, predicted_trigger: boolean, pass: boolean }>} results
 *   Item results; only these three fields are read.
 * @returns {{ passed: number, failed: number, total: number, pass_rate: number, precision: number, recall: number }}
 *   `pass_rate` is passed/total, `precision` is true positives over predicted
 *   positives, and `recall` is true positives over expected positives. Each
 *   rate is 0 when its denominator is 0.
 */
export function summarizeTriggerResults(results) {
  let passed = 0;
  let expectedPositives = 0;
  let predictedPositives = 0;
  let truePositives = 0;

  for (const result of results) {
    if (result.pass) {
      passed += 1;
    }
    if (result.should_trigger) {
      expectedPositives += 1;
    }
    if (result.predicted_trigger) {
      predictedPositives += 1;
      if (result.should_trigger) {
        truePositives += 1;
      }
    }
  }

  const total = results.length;
  // Guard every division so an empty denominator reports 0 rather than NaN.
  const ratio = (numerator, denominator) => (denominator === 0 ? 0 : numerator / denominator);

  return {
    passed,
    failed: total - passed,
    total,
    pass_rate: ratio(passed, total),
    precision: ratio(truePositives, predictedPositives),
    recall: ratio(truePositives, expectedPositives),
  };
}
85
+
86
/**
 * Evaluate how well a skill description triggers on a set of queries.
 *
 * For every eval item with a non-empty `query` (or `prompt`), the runner is
 * called `runsPerQuery` times, one after another. The item is predicted to
 * trigger when the share of runs answering `trigger: true` reaches the
 * threshold, and it passes when that prediction equals the expected label.
 *
 * @param {object} options - Reads `skillPath`, `evalsPath`,
 *   `descriptionOverride`, `runsPerQuery`, `triggerThreshold`, `provider` and
 *   `model`; the whole object is passed to the runner unchanged.
 * @param {(prompt: string, options: object) => Promise<string>} [runner=defaultRunPrompt]
 *   Returns the raw model response for a prompt.
 * @returns {Promise<object>} Report with `skill_name`, `description`,
 *   `results`, `summary` and `meta`.
 * @throws {Error} When a response cannot be parsed by `parseTriggerDecision`;
 *   errors from loading the skill or reading the eval file propagate as well.
 */
export async function runTriggerEval(options, runner = defaultRunPrompt) {
  const skill = await loadSkill(options.skillPath);
  const currentDescription = options.descriptionOverride
    ?? String(skill.frontmatter.description ?? '').trim();
  const evalFile = JSON.parse(await readFile(options.evalsPath, 'utf8'));
  const evals = Array.isArray(evalFile?.evals) ? evalFile.evals : [];

  // Require at least one whole run. A value between 0 and 1 used to pass the
  // `> 0` check, floor to 0, execute nothing and leave trigger_rate as 0/0 = NaN.
  const runsPerQuery = Number.isFinite(options.runsPerQuery) && options.runsPerQuery >= 1
    ? Math.floor(options.runsPerQuery)
    : 1;
  const threshold = Number.isFinite(options.triggerThreshold)
    ? Math.min(Math.max(options.triggerThreshold, 0), 1)
    : 0.5;
  const results = [];

  for (const item of evals) {
    // Null or query-less entries are skipped rather than aborting the whole eval.
    const query = String(item?.query ?? item?.prompt ?? '').trim();
    if (!query) {
      continue;
    }

    const shouldTrigger = item.should_trigger === true;
    const attempts = [];

    // Runs are sequential on purpose: one model request in flight at a time.
    for (let runIndex = 0; runIndex < runsPerQuery; runIndex += 1) {
      const response = await runner(
        buildTriggerEvalPrompt(skill.frontmatter.name, currentDescription, query),
        options
      );
      attempts.push(parseTriggerDecision(response));
    }

    const triggers = attempts.filter((attempt) => attempt.trigger).length;
    const triggerRate = triggers / runsPerQuery;
    const predictedTrigger = triggerRate >= threshold;

    results.push({
      query,
      should_trigger: shouldTrigger,
      triggers,
      runs: runsPerQuery,
      trigger_rate: triggerRate,
      predicted_trigger: predictedTrigger,
      pass: predictedTrigger === shouldTrigger,
      attempts,
    });
  }

  return {
    skill_name: skill.frontmatter.name,
    description: currentDescription,
    results,
    summary: summarizeTriggerResults(results),
    meta: {
      provider: options.provider,
      model: options.model ?? null,
      runs_per_query: runsPerQuery,
      trigger_threshold: threshold,
      note: 'This is a KodaX-native description eval, not a Claude Code tool-trace replay.',
    },
  };
}
156
+
157
/**
 * Parse the run-trigger-eval command line into an options object.
 *
 * Parsing starts at `argv[2]`. A flag is honoured only when a non-empty token
 * follows it, and that token is consumed as its value. Unknown tokens are
 * ignored. Numeric flags keep their default when the value is not a finite
 * number, and a warning is printed to stderr instead of storing NaN.
 *
 * @param {string[]} argv - Normally `process.argv`.
 * @returns {object} Options with defaults applied.
 */
function parseArgs(argv) {
  const args = {
    skillPath: '',
    evalsPath: '',
    provider: 'anthropic',
    model: undefined,
    output: undefined,
    runsPerQuery: 1,
    triggerThreshold: 0.5,
    maxIter: 12,
    reasoningMode: 'off',
    descriptionOverride: undefined,
  };

  const stringFlags = new Map([
    ['--skill-path', 'skillPath'],
    ['--evals', 'evalsPath'],
    ['--provider', 'provider'],
    ['--model', 'model'],
    ['--output', 'output'],
    ['--reasoning', 'reasoningMode'],
    ['--description', 'descriptionOverride'],
  ]);
  const numberFlags = new Map([
    ['--runs-per-query', 'runsPerQuery'],
    ['--trigger-threshold', 'triggerThreshold'],
    ['--max-iter', 'maxIter'],
  ]);

  for (let index = 2; index < argv.length; index += 1) {
    const token = argv[index];
    const value = argv[index + 1];
    if (!value) {
      // Flag without a value (or an unknown trailing token): nothing to consume.
      continue;
    }

    if (stringFlags.has(token)) {
      args[stringFlags.get(token)] = value;
      index += 1;
    } else if (numberFlags.has(token)) {
      const parsed = Number(value);
      if (Number.isFinite(parsed)) {
        args[numberFlags.get(token)] = parsed;
      } else {
        console.error(`Ignoring invalid numeric value for ${token}: ${value}`);
      }
      index += 1;
    }
  }

  return args;
}
198
+
199
/**
 * CLI entry point: parse the arguments, run the trigger eval, and emit the
 * JSON report either to the `--output` file or to stdout.
 *
 * Exits with status 1 after printing the usage line when the skill path or
 * the evals path is missing.
 */
async function main() {
  const args = parseArgs(process.argv);

  const hasRequiredPaths = Boolean(args.skillPath && args.evalsPath);
  if (!hasRequiredPaths) {
    console.error('Usage: node scripts/run-trigger-eval.js --skill-path <dir> --evals <evals.json> [--provider anthropic] [--output results.json]');
    process.exit(1);
  }

  const report = await runTriggerEval(args);
  const serialized = JSON.stringify(report, null, 2);
  const outputText = `${serialized}\n`;

  if (!args.output) {
    process.stdout.write(outputText);
    return;
  }

  await writeFile(args.output, outputText, 'utf8');
  console.log(`Wrote ${path.resolve(args.output)}`);
}
216
+
217
// True only when this module is the script Node was launched with, so that
// importing it (for example from run-loop.js) does not start the CLI.
const isDirectRun = process.argv[1]
  && path.resolve(process.argv[1]) === fileURLToPath(import.meta.url);

if (isDirectRun) {
  // Report the failure message only and signal it through the exit status.
  main().catch((error) => {
    const message = error instanceof Error ? error.message : String(error);
    console.error(message);
    process.exit(1);
  });
}