@skillmark/cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-entry-point.d.ts +3 -0
- package/dist/cli-entry-point.d.ts.map +1 -0
- package/dist/cli-entry-point.js +207 -0
- package/dist/cli-entry-point.js.map +1 -0
- package/dist/commands/auth-setup-and-token-storage-command.d.ts +21 -0
- package/dist/commands/auth-setup-and-token-storage-command.d.ts.map +1 -0
- package/dist/commands/auth-setup-and-token-storage-command.js +166 -0
- package/dist/commands/auth-setup-and-token-storage-command.js.map +1 -0
- package/dist/commands/publish-results-command.d.ts +21 -0
- package/dist/commands/publish-results-command.d.ts.map +1 -0
- package/dist/commands/publish-results-command.js +256 -0
- package/dist/commands/publish-results-command.js.map +1 -0
- package/dist/commands/run-benchmark-command.d.ts +6 -0
- package/dist/commands/run-benchmark-command.d.ts.map +1 -0
- package/dist/commands/run-benchmark-command.js +331 -0
- package/dist/commands/run-benchmark-command.js.map +1 -0
- package/dist/commands/view-leaderboard-command.d.ts +8 -0
- package/dist/commands/view-leaderboard-command.d.ts.map +1 -0
- package/dist/commands/view-leaderboard-command.js +169 -0
- package/dist/commands/view-leaderboard-command.js.map +1 -0
- package/dist/config/api-key-config-reader.d.ts +14 -0
- package/dist/config/api-key-config-reader.d.ts.map +1 -0
- package/dist/config/api-key-config-reader.js +107 -0
- package/dist/config/api-key-config-reader.js.map +1 -0
- package/dist/config/api-key-config-reader.test.d.ts +2 -0
- package/dist/config/api-key-config-reader.test.d.ts.map +1 -0
- package/dist/config/api-key-config-reader.test.js +21 -0
- package/dist/config/api-key-config-reader.test.js.map +1 -0
- package/dist/engine/claude-cli-executor.d.ts +33 -0
- package/dist/engine/claude-cli-executor.d.ts.map +1 -0
- package/dist/engine/claude-cli-executor.js +251 -0
- package/dist/engine/claude-cli-executor.js.map +1 -0
- package/dist/engine/concept-accuracy-scorer.d.ts +24 -0
- package/dist/engine/concept-accuracy-scorer.d.ts.map +1 -0
- package/dist/engine/concept-accuracy-scorer.js +186 -0
- package/dist/engine/concept-accuracy-scorer.js.map +1 -0
- package/dist/engine/concept-accuracy-scorer.test.d.ts +2 -0
- package/dist/engine/concept-accuracy-scorer.test.d.ts.map +1 -0
- package/dist/engine/concept-accuracy-scorer.test.js +230 -0
- package/dist/engine/concept-accuracy-scorer.test.js.map +1 -0
- package/dist/engine/enhanced-test-prompt-builder.d.ts +30 -0
- package/dist/engine/enhanced-test-prompt-builder.d.ts.map +1 -0
- package/dist/engine/enhanced-test-prompt-builder.js +134 -0
- package/dist/engine/enhanced-test-prompt-builder.js.map +1 -0
- package/dist/engine/markdown-test-definition-parser.d.ts +18 -0
- package/dist/engine/markdown-test-definition-parser.d.ts.map +1 -0
- package/dist/engine/markdown-test-definition-parser.js +525 -0
- package/dist/engine/markdown-test-definition-parser.js.map +1 -0
- package/dist/engine/markdown-test-definition-parser.test.d.ts +2 -0
- package/dist/engine/markdown-test-definition-parser.test.d.ts.map +1 -0
- package/dist/engine/markdown-test-definition-parser.test.js +265 -0
- package/dist/engine/markdown-test-definition-parser.test.js.map +1 -0
- package/dist/engine/retry-with-degrade-utils.d.ts +58 -0
- package/dist/engine/retry-with-degrade-utils.d.ts.map +1 -0
- package/dist/engine/retry-with-degrade-utils.js +86 -0
- package/dist/engine/retry-with-degrade-utils.js.map +1 -0
- package/dist/engine/skill-content-collector.d.ts +53 -0
- package/dist/engine/skill-content-collector.d.ts.map +1 -0
- package/dist/engine/skill-content-collector.js +157 -0
- package/dist/engine/skill-content-collector.js.map +1 -0
- package/dist/engine/skill-creator-invoker.d.ts +36 -0
- package/dist/engine/skill-creator-invoker.d.ts.map +1 -0
- package/dist/engine/skill-creator-invoker.js +222 -0
- package/dist/engine/skill-creator-invoker.js.map +1 -0
- package/dist/engine/transcript-jsonl-parser.d.ts +28 -0
- package/dist/engine/transcript-jsonl-parser.d.ts.map +1 -0
- package/dist/engine/transcript-jsonl-parser.js +175 -0
- package/dist/engine/transcript-jsonl-parser.js.map +1 -0
- package/dist/sources/git-repository-skill-source-handler.d.ts +18 -0
- package/dist/sources/git-repository-skill-source-handler.d.ts.map +1 -0
- package/dist/sources/git-repository-skill-source-handler.js +119 -0
- package/dist/sources/git-repository-skill-source-handler.js.map +1 -0
- package/dist/sources/local-skill-source-handler.d.ts +21 -0
- package/dist/sources/local-skill-source-handler.d.ts.map +1 -0
- package/dist/sources/local-skill-source-handler.js +138 -0
- package/dist/sources/local-skill-source-handler.js.map +1 -0
- package/dist/sources/local-skill-source-handler.test.d.ts +2 -0
- package/dist/sources/local-skill-source-handler.test.d.ts.map +1 -0
- package/dist/sources/local-skill-source-handler.test.js +55 -0
- package/dist/sources/local-skill-source-handler.test.js.map +1 -0
- package/dist/sources/skillsh-registry-source-handler.d.ts +18 -0
- package/dist/sources/skillsh-registry-source-handler.d.ts.map +1 -0
- package/dist/sources/skillsh-registry-source-handler.js +130 -0
- package/dist/sources/skillsh-registry-source-handler.js.map +1 -0
- package/dist/sources/unified-skill-source-resolver.d.ts +20 -0
- package/dist/sources/unified-skill-source-resolver.d.ts.map +1 -0
- package/dist/sources/unified-skill-source-resolver.js +64 -0
- package/dist/sources/unified-skill-source-resolver.js.map +1 -0
- package/dist/sources/unified-skill-source-resolver.test.d.ts +2 -0
- package/dist/sources/unified-skill-source-resolver.test.d.ts.map +1 -0
- package/dist/sources/unified-skill-source-resolver.test.js +84 -0
- package/dist/sources/unified-skill-source-resolver.test.js.map +1 -0
- package/dist/types/benchmark-types.d.ts +142 -0
- package/dist/types/benchmark-types.d.ts.map +1 -0
- package/dist/types/benchmark-types.js +5 -0
- package/dist/types/benchmark-types.js.map +1 -0
- package/dist/types/index.d.ts +5 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +5 -0
- package/dist/types/index.js.map +1 -0
- package/package.json +38 -0
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core types for Skillmark benchmark system
|
|
3
|
+
*/
|
|
4
|
+
/** Test definition parsed from markdown frontmatter */
|
|
5
|
+
export interface TestDefinition {
|
|
6
|
+
/** Unique test identifier */
|
|
7
|
+
name: string;
|
|
8
|
+
/** Type of test: knowledge (Q&A) or task (execution) */
|
|
9
|
+
type: 'knowledge' | 'task';
|
|
10
|
+
/** Concepts to check in response */
|
|
11
|
+
concepts: string[];
|
|
12
|
+
/** Timeout in seconds */
|
|
13
|
+
timeout: number;
|
|
14
|
+
/** The prompt to send to Claude */
|
|
15
|
+
prompt: string;
|
|
16
|
+
/** Expected response patterns/criteria */
|
|
17
|
+
expected: string[];
|
|
18
|
+
/** Source file path */
|
|
19
|
+
sourcePath: string;
|
|
20
|
+
}
|
|
21
|
+
/** Metrics collected from a single benchmark run */
|
|
22
|
+
export interface BenchmarkMetrics {
|
|
23
|
+
/** Accuracy percentage (concepts matched / total) */
|
|
24
|
+
accuracy: number;
|
|
25
|
+
/** Total tokens consumed */
|
|
26
|
+
tokensTotal: number;
|
|
27
|
+
/** Input tokens */
|
|
28
|
+
tokensInput: number;
|
|
29
|
+
/** Output tokens */
|
|
30
|
+
tokensOutput: number;
|
|
31
|
+
/** Wall-clock time in milliseconds */
|
|
32
|
+
durationMs: number;
|
|
33
|
+
/** Number of tool calls made */
|
|
34
|
+
toolCount: number;
|
|
35
|
+
/** Estimated API cost in USD */
|
|
36
|
+
costUsd: number;
|
|
37
|
+
}
|
|
38
|
+
/** Result of a single test execution */
|
|
39
|
+
export interface TestResult {
|
|
40
|
+
/** Test definition */
|
|
41
|
+
test: TestDefinition;
|
|
42
|
+
/** Collected metrics */
|
|
43
|
+
metrics: BenchmarkMetrics;
|
|
44
|
+
/** Concepts that were matched */
|
|
45
|
+
matchedConcepts: string[];
|
|
46
|
+
/** Concepts that were missed */
|
|
47
|
+
missedConcepts: string[];
|
|
48
|
+
/** Claude's response */
|
|
49
|
+
response: string;
|
|
50
|
+
/** Run timestamp */
|
|
51
|
+
timestamp: string;
|
|
52
|
+
/** Whether the test passed accuracy threshold */
|
|
53
|
+
passed: boolean;
|
|
54
|
+
}
|
|
55
|
+
/** Aggregated results across multiple runs */
|
|
56
|
+
export interface BenchmarkResult {
|
|
57
|
+
/** Skill identifier */
|
|
58
|
+
skillId: string;
|
|
59
|
+
/** Skill name */
|
|
60
|
+
skillName: string;
|
|
61
|
+
/** Skill source (local path, git URL, or skill.sh path) */
|
|
62
|
+
skillSource: string;
|
|
63
|
+
/** Model used for benchmark */
|
|
64
|
+
model: 'haiku' | 'sonnet' | 'opus';
|
|
65
|
+
/** Number of runs performed */
|
|
66
|
+
runs: number;
|
|
67
|
+
/** Individual test results */
|
|
68
|
+
testResults: TestResult[];
|
|
69
|
+
/** Aggregated metrics (averaged across runs) */
|
|
70
|
+
aggregatedMetrics: BenchmarkMetrics;
|
|
71
|
+
/** Benchmark timestamp */
|
|
72
|
+
timestamp: string;
|
|
73
|
+
/** Skillmark CLI version */
|
|
74
|
+
version: string;
|
|
75
|
+
/** Hash for result verification */
|
|
76
|
+
hash?: string;
|
|
77
|
+
}
|
|
78
|
+
/** Skill source types */
|
|
79
|
+
export type SkillSourceType = 'local' | 'git' | 'skillsh';
|
|
80
|
+
/** Parsed skill source information */
|
|
81
|
+
export interface SkillSource {
|
|
82
|
+
/** Source type */
|
|
83
|
+
type: SkillSourceType;
|
|
84
|
+
/** Original input (path, URL, or skill.sh identifier) */
|
|
85
|
+
original: string;
|
|
86
|
+
/** Resolved local path (after clone if needed) */
|
|
87
|
+
localPath: string;
|
|
88
|
+
/** Skill name extracted from source */
|
|
89
|
+
name: string;
|
|
90
|
+
}
|
|
91
|
+
/** CLI run command options */
|
|
92
|
+
export interface RunOptions {
|
|
93
|
+
/** Path to test suite */
|
|
94
|
+
tests: string;
|
|
95
|
+
/** Model to use */
|
|
96
|
+
model: 'haiku' | 'sonnet' | 'opus';
|
|
97
|
+
/** Number of iterations */
|
|
98
|
+
runs: number;
|
|
99
|
+
/** Output directory */
|
|
100
|
+
output: string;
|
|
101
|
+
/** Show verbose progress output */
|
|
102
|
+
verbose?: boolean;
|
|
103
|
+
}
|
|
104
|
+
/** CLI publish command options */
|
|
105
|
+
export interface PublishOptions {
|
|
106
|
+
/** API key for authentication */
|
|
107
|
+
apiKey: string;
|
|
108
|
+
/** API endpoint URL */
|
|
109
|
+
endpoint?: string;
|
|
110
|
+
}
|
|
111
|
+
/** JSONL transcript entry from Claude CLI */
|
|
112
|
+
export interface TranscriptEntry {
|
|
113
|
+
type: 'system' | 'human' | 'assistant' | 'result';
|
|
114
|
+
message?: {
|
|
115
|
+
role: string;
|
|
116
|
+
content: string | Array<{
|
|
117
|
+
type: string;
|
|
118
|
+
text?: string;
|
|
119
|
+
tool_use_id?: string;
|
|
120
|
+
}>;
|
|
121
|
+
model?: string;
|
|
122
|
+
};
|
|
123
|
+
costUSD?: number;
|
|
124
|
+
durationMs?: number;
|
|
125
|
+
inputTokens?: number;
|
|
126
|
+
outputTokens?: number;
|
|
127
|
+
cacheCreationInputTokens?: number;
|
|
128
|
+
cacheReadInputTokens?: number;
|
|
129
|
+
}
|
|
130
|
+
/** Leaderboard entry from API */
|
|
131
|
+
export interface LeaderboardEntry {
|
|
132
|
+
skillId: string;
|
|
133
|
+
skillName: string;
|
|
134
|
+
source: string;
|
|
135
|
+
bestAccuracy: number;
|
|
136
|
+
bestModel: string;
|
|
137
|
+
avgTokens: number;
|
|
138
|
+
avgCost: number;
|
|
139
|
+
lastTested: string;
|
|
140
|
+
totalRuns: number;
|
|
141
|
+
}
|
|
142
|
+
//# sourceMappingURL=benchmark-types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"benchmark-types.d.ts","sourceRoot":"","sources":["../../src/types/benchmark-types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,uDAAuD;AACvD,MAAM,WAAW,cAAc;IAC7B,6BAA6B;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb,wDAAwD;IACxD,IAAI,EAAE,WAAW,GAAG,MAAM,CAAC;IAC3B,oCAAoC;IACpC,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,yBAAyB;IACzB,OAAO,EAAE,MAAM,CAAC;IAChB,mCAAmC;IACnC,MAAM,EAAE,MAAM,CAAC;IACf,0CAA0C;IAC1C,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,uBAAuB;IACvB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,oDAAoD;AACpD,MAAM,WAAW,gBAAgB;IAC/B,qDAAqD;IACrD,QAAQ,EAAE,MAAM,CAAC;IACjB,4BAA4B;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,mBAAmB;IACnB,WAAW,EAAE,MAAM,CAAC;IACpB,oBAAoB;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,sCAAsC;IACtC,UAAU,EAAE,MAAM,CAAC;IACnB,gCAAgC;IAChC,SAAS,EAAE,MAAM,CAAC;IAClB,gCAAgC;IAChC,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,wCAAwC;AACxC,MAAM,WAAW,UAAU;IACzB,sBAAsB;IACtB,IAAI,EAAE,cAAc,CAAC;IACrB,wBAAwB;IACxB,OAAO,EAAE,gBAAgB,CAAC;IAC1B,iCAAiC;IACjC,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,gCAAgC;IAChC,cAAc,EAAE,MAAM,EAAE,CAAC;IACzB,wBAAwB;IACxB,QAAQ,EAAE,MAAM,CAAC;IACjB,oBAAoB;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,iDAAiD;IACjD,MAAM,EAAE,OAAO,CAAC;CACjB;AAED,8CAA8C;AAC9C,MAAM,WAAW,eAAe;IAC9B,uBAAuB;IACvB,OAAO,EAAE,MAAM,CAAC;IAChB,iBAAiB;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,2DAA2D;IAC3D,WAAW,EAAE,MAAM,CAAC;IACpB,+BAA+B;IAC/B,KAAK,EAAE,OAAO,GAAG,QAAQ,GAAG,MAAM,CAAC;IACnC,+BAA+B;IAC/B,IAAI,EAAE,MAAM,CAAC;IACb,8BAA8B;IAC9B,WAAW,EAAE,UAAU,EAAE,CAAC;IAC1B,gDAAgD;IAChD,iBAAiB,EAAE,gBAAgB,CAAC;IACpC,0BAA0B;IAC1B,SAAS,EAAE,MAAM,CAAC;IAClB,4BAA4B;IAC5B,OAAO,EAAE,MAAM,CAAC;IAChB,mCAAmC;IACnC,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAED,yBAAyB;AACzB,MAAM,MAAM,eAAe,GAAG,OAAO,GAAG,KAAK,GAAG,SAAS,CAAC;AAE1D,sCAAsC;AACtC,MAAM,WAAW,WAAW;IAC1B,kBAAkB;IAClB,IAAI,EAAE,eAAe,CAAC;IACtB,yDAAyD;IACzD,QAAQ,EAAE,MAAM,CAAC;IACjB,kDAAkD;IAClD,SAAS,EAAE,MAAM,CAAC;IAClB,uCAAuC;IACvC,IAAI,EAAE,MAAM,CAAC;CACd;AAED,8BAA8B;AAC9B,MAAM,WAAW,UAAU;IACzB,yBAAyB;IACzB,KAAK,EAAE,MAAM,CAAC;IACd,mBAAmB;IACnB,KAAK,EAAE,OAAO,GAAG,QAAQ,GAAG,MAAM,CAAC;IACnC,2BAA2B;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,uBAAuB;IACvB,MAAM,EAAE,MAAM,CAAC;IACf,mCAAmC;IACnC,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB;AAED,kCAAkC;AAClC,MAAM,WAAW,cAAc;IAC7B,iCAAiC;IACjC,MAAM,EAAE,MAAM,CAAC;IACf,uBAAuB;IACvB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,6CAA6C;AAC7C,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,QAAQ,GAAG,OAAO,GAAG,WAAW,GAAG,QAAQ,CAAC;IAClD,OAAO,CAAC,EAAE;QACR,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,EAAE,MAAM,GAAG,KAAK,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,IAAI,CAAC,EAAE,MAAM,CAAC;YAAC,WAAW,CAAC,EAAE,MAAM,CAAA;SAAE,CAAC,CAAC;QAC/E,KAAK,CAAC,EAAE,MAAM,CAAC;KAChB,CAAC;IACF,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,wBAAwB,CAAC,EAAE,MAAM,CAAC;IAClC,oBAAoB,CAAC,EAAE,MAAM,CAAC;CAC/B;AAED,iCAAiC;AACjC,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;CACnB"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"benchmark-types.js","sourceRoot":"","sources":["../../src/types/benchmark-types.ts"],"names":[],"mappings":"AAAA;;GAEG"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/types/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,cAAc,sBAAsB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/types/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,cAAc,sBAAsB,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@skillmark/cli",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "CLI tool for benchmarking Claude agent skills",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"skillmark": "./dist/cli-entry-point.js"
|
|
8
|
+
},
|
|
9
|
+
"main": "./dist/cli-entry-point.js",
|
|
10
|
+
"files": [
|
|
11
|
+
"dist"
|
|
12
|
+
],
|
|
13
|
+
"scripts": {
|
|
14
|
+
"build": "tsc",
|
|
15
|
+
"dev": "tsc --watch",
|
|
16
|
+
"lint": "tsc --noEmit",
|
|
17
|
+
"test": "vitest run",
|
|
18
|
+
"test:watch": "vitest",
|
|
19
|
+
"test:coverage": "vitest run --coverage",
|
|
20
|
+
"clean": "rm -rf dist"
|
|
21
|
+
},
|
|
22
|
+
"dependencies": {
|
|
23
|
+
"chalk": "^5.3.0",
|
|
24
|
+
"commander": "^12.0.0",
|
|
25
|
+
"gray-matter": "^4.0.3",
|
|
26
|
+
"ora": "^8.0.1",
|
|
27
|
+
"simple-git": "^3.22.0"
|
|
28
|
+
},
|
|
29
|
+
"devDependencies": {
|
|
30
|
+
"@types/node": "^20.11.0",
|
|
31
|
+
"@vitest/coverage-v8": "^4.0.18",
|
|
32
|
+
"typescript": "^5.3.3",
|
|
33
|
+
"vitest": "^4.0.18"
|
|
34
|
+
},
|
|
35
|
+
"engines": {
|
|
36
|
+
"node": ">=18.0.0"
|
|
37
|
+
}
|
|
38
|
+
}
|