@jackwener/opencli 1.5.9 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/CHANGELOG.md +21 -0
  2. package/README.md +18 -0
  3. package/SKILL.md +59 -0
  4. package/autoresearch/baseline-browse.txt +1 -0
  5. package/autoresearch/baseline-skill.txt +1 -0
  6. package/autoresearch/browse-tasks.json +688 -0
  7. package/autoresearch/eval-browse.ts +185 -0
  8. package/autoresearch/eval-skill.ts +248 -0
  9. package/autoresearch/run-browse.sh +9 -0
  10. package/autoresearch/run-skill.sh +9 -0
  11. package/dist/browser/daemon-client.d.ts +20 -1
  12. package/dist/browser/daemon-client.js +37 -30
  13. package/dist/browser/daemon-client.test.d.ts +1 -0
  14. package/dist/browser/daemon-client.test.js +77 -0
  15. package/dist/browser/discover.js +8 -19
  16. package/dist/browser/page.d.ts +4 -0
  17. package/dist/browser/page.js +48 -1
  18. package/dist/cli.js +392 -0
  19. package/dist/clis/twitter/article.js +28 -1
  20. package/dist/clis/xiaohongshu/note.js +11 -0
  21. package/dist/clis/xiaohongshu/note.test.js +49 -0
  22. package/dist/commanderAdapter.js +1 -1
  23. package/dist/commanderAdapter.test.js +43 -0
  24. package/dist/commands/daemon.js +7 -46
  25. package/dist/commands/daemon.test.js +44 -69
  26. package/dist/discovery.js +27 -0
  27. package/dist/types.d.ts +8 -0
  28. package/docs/guide/getting-started.md +21 -0
  29. package/docs/superpowers/specs/2026-04-02-browse-skill-testing-design.md +144 -0
  30. package/docs/zh/guide/getting-started.md +21 -0
  31. package/extension/package-lock.json +2 -2
  32. package/extension/src/background.ts +51 -4
  33. package/extension/src/cdp.ts +77 -124
  34. package/extension/src/protocol.ts +5 -1
  35. package/package.json +1 -1
  36. package/skills/opencli-explorer/SKILL.md +6 -0
  37. package/skills/opencli-oneshot/SKILL.md +6 -0
  38. package/skills/opencli-operate/SKILL.md +213 -0
  39. package/skills/opencli-usage/SKILL.md +113 -32
  40. package/src/browser/daemon-client.test.ts +103 -0
  41. package/src/browser/daemon-client.ts +53 -30
  42. package/src/browser/discover.ts +8 -17
  43. package/src/browser/page.ts +48 -1
  44. package/src/cli.ts +392 -0
  45. package/src/clis/twitter/article.ts +31 -1
  46. package/src/clis/xiaohongshu/note.test.ts +51 -0
  47. package/src/clis/xiaohongshu/note.ts +18 -0
  48. package/src/commanderAdapter.test.ts +62 -0
  49. package/src/commanderAdapter.ts +1 -1
  50. package/src/commands/daemon.test.ts +49 -83
  51. package/src/commands/daemon.ts +7 -55
  52. package/src/discovery.ts +22 -0
  53. package/src/doctor.ts +1 -1
  54. package/src/types.ts +8 -0
  55. package/extension/dist/background.js +0 -681
@@ -0,0 +1,185 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * Layer 1: Deterministic Browse Command Testing
4
+ *
5
+ * Runs predefined opencli operate command sequences against real websites.
6
+ * No LLM involved — tests command reliability only.
7
+ *
8
+ * Usage:
9
+ * npx tsx autoresearch/eval-browse.ts # Run all tasks
10
+ * npx tsx autoresearch/eval-browse.ts --task hn-top5 # Run single task
11
+ */
12
+
13
+ import { execSync } from 'node:child_process';
14
+ import { readFileSync, writeFileSync, mkdirSync, readdirSync } from 'node:fs';
15
+ import { join, dirname } from 'node:path';
16
+ import { fileURLToPath } from 'node:url';
17
+
18
+ const __dirname = dirname(fileURLToPath(import.meta.url));
19
+ const TASKS_FILE = join(__dirname, 'browse-tasks.json');
20
+ const RESULTS_DIR = join(__dirname, 'results');
21
+ const BASELINE_FILE = join(__dirname, 'baseline-browse.txt');
22
+
23
/**
 * One deterministic browse scenario loaded from `browse-tasks.json`.
 */
interface BrowseTask {
  /** Identifier used for `--task <name>` filtering and in the report. */
  name: string;
  /** Shell commands executed in order; only the last command's output is judged. */
  steps: string[];
  /** Rule applied to the output of the final step. */
  judge: JudgeCriteria;
  /** When set to `'test'` the task is scored in the test set; otherwise it counts as train. */
  set?: 'test';
  /** Optional annotation; not read by the code visible in this script. */
  note?: string;
}
30
+
31
/**
 * Pass/fail rule for a task's final output, discriminated by `type`.
 */
type JudgeCriteria =
  /** Output contains `value` (compared case-insensitively). */
  | { type: 'contains'; value: string }
  /** Output parses as a JSON array with at least `minLength` elements. */
  | { type: 'arrayMinLength'; minLength: number }
  /** Trimmed output is not empty and is not the text `null` / `undefined`. */
  | { type: 'nonEmpty' }
  /** Output matches the regular expression source in `pattern`. */
  | { type: 'matchesPattern'; pattern: string };
36
+
37
/**
 * Outcome of running a single browse task.
 */
interface TaskResult {
  /** Name of the task that produced this result. */
  name: string;
  /** Whether the final output satisfied the task's judge criteria. */
  passed: boolean;
  /** Wall-clock time spent on the task, in milliseconds. */
  duration: number;
  /** Short diagnostic (truncated output or error message) for failed tasks. */
  error?: string;
  /** Scoring bucket the task belongs to. */
  set: 'train' | 'test';
}
44
+
45
/**
 * Decide whether a command's output satisfies the given criteria.
 *
 * Any exception raised while evaluating (malformed criteria, invalid regular
 * expression, ...) is treated as a failed check rather than propagated.
 *
 * @param criteria - Rule to apply, discriminated by `criteria.type`.
 * @param output - Text produced by the last step of a task.
 * @returns `true` when the output passes, `false` otherwise (including for
 *          unrecognised criteria types).
 */
function judge(criteria: JudgeCriteria, output: string): boolean {
  try {
    if (criteria.type === 'contains') {
      // Case-insensitive substring match.
      const haystack = output.toLowerCase();
      const needle = criteria.value.toLowerCase();
      return haystack.includes(needle);
    }

    if (criteria.type === 'arrayMinLength') {
      let parsed: unknown;
      try {
        parsed = JSON.parse(output);
      } catch {
        // Output is not JSON at all, so it cannot be an array.
        return false;
      }
      return Array.isArray(parsed) ? parsed.length >= criteria.minLength : false;
    }

    if (criteria.type === 'nonEmpty') {
      const trimmed = output.trim();
      return trimmed.length > 0 && trimmed !== 'null' && trimmed !== 'undefined';
    }

    if (criteria.type === 'matchesPattern') {
      return new RegExp(criteria.pattern).test(output);
    }

    // Unknown criteria type (e.g. from a hand-edited tasks file).
    return false;
  } catch {
    return false;
  }
}
68
+
69
/**
 * Run a shell command from the package root and return its trimmed stdout.
 *
 * A failing or timed-out command does not throw: whatever stdout was captured
 * on the error object is returned instead, or an empty string when none is
 * available.
 *
 * @param cmd - Command line passed to `execSync`.
 * @returns Trimmed stdout of the command.
 */
function runCommand(cmd: string): string {
  let stdout: string;
  try {
    stdout = execSync(cmd, {
      cwd: join(__dirname, '..'),
      timeout: 30000, // milliseconds
      encoding: 'utf-8',
      env: process.env,
      stdio: ['pipe', 'pipe', 'pipe'],
    });
  } catch (err: any) {
    return err.stdout?.trim() ?? '';
  }
  return stdout.trim();
}
82
+
83
/**
 * Execute every step of a task in order and judge the output of the last one.
 *
 * @param task - Task definition to run.
 * @returns The task's result; on an unexpected exception the result is a
 *          failure carrying the (truncated) error message.
 */
function runTask(task: BrowseTask): TaskResult {
  const startedAt = Date.now();
  // Tasks not explicitly marked as 'test' are scored in the train set.
  const bucket = task.set === 'test' ? 'test' : 'train';

  try {
    let finalOutput = '';
    for (const command of task.steps) {
      // Earlier outputs are discarded; only the final step is judged.
      finalOutput = runCommand(command);
    }

    const ok = judge(task.judge, finalOutput);
    const elapsed = Date.now() - startedAt;
    const diagnostic = ok ? undefined : `Output: ${finalOutput.slice(0, 100)}`;

    return {
      name: task.name,
      passed: ok,
      duration: elapsed,
      error: diagnostic,
      set: bucket,
    };
  } catch (err: any) {
    return {
      name: task.name,
      passed: false,
      duration: Date.now() - startedAt,
      error: err.message?.slice(0, 100),
      set: bucket,
    };
  }
}
111
+
112
/**
 * Entry point for the Layer 1 browse evaluation.
 *
 * Loads the task list, optionally narrows it to the task named by
 * `--task <name>`, runs each task (closing the browser between tasks), prints
 * a summary, and writes the results to `results/browse-NNN.json`.
 *
 * Exits with status 1 when `--task` is given without a name, when the named
 * task does not exist, or when there are no tasks to run.
 */
function main() {
  const args = process.argv.slice(2);

  // Resolve the optional `--task <name>` filter. A bare `--task` is rejected
  // explicitly; otherwise it would silently fall back to running every task.
  let singleTask: string | null = null;
  const taskFlagIndex = args.indexOf('--task');
  if (taskFlagIndex !== -1) {
    const requested = args[taskFlagIndex + 1];
    if (!requested || requested.startsWith('--')) {
      console.error('Missing task name after --task.');
      process.exit(1);
    }
    singleTask = requested;
  }

  const allTasks: BrowseTask[] = JSON.parse(readFileSync(TASKS_FILE, 'utf-8'));
  const tasks = singleTask !== null ? allTasks.filter(t => t.name === singleTask) : allTasks;

  if (tasks.length === 0) {
    console.error(
      singleTask !== null
        ? `Task "${singleTask}" not found.`
        : `No tasks defined in ${TASKS_FILE}.`,
    );
    process.exit(1);
  }

  console.log(`\n🔬 Layer 1: Browse Commands — ${tasks.length} tasks\n`);

  const results: TaskResult[] = [];

  for (let i = 0; i < tasks.length; i++) {
    const task = tasks[i];
    process.stdout.write(` [${i + 1}/${tasks.length}] ${task.name}...`);

    const result = runTask(task);
    results.push(result);

    const icon = result.passed ? '✓' : '✗';
    console.log(` ${icon} (${(result.duration / 1000).toFixed(1)}s)`);

    // Close browser between tasks for clean state
    if (i < tasks.length - 1) {
      try { runCommand('opencli operate close'); } catch { /* best effort */ }
    }
  }

  // Final close
  try { runCommand('opencli operate close'); } catch { /* best effort */ }

  // Summary
  const trainResults = results.filter(r => r.set === 'train');
  const testResults = results.filter(r => r.set === 'test');
  const totalPassed = results.filter(r => r.passed).length;
  const trainPassed = trainResults.filter(r => r.passed).length;
  const testPassed = testResults.filter(r => r.passed).length;
  const totalDuration = results.reduce((s, r) => s + r.duration, 0);

  console.log(`\n${'─'.repeat(50)}`);
  console.log(` Score: ${totalPassed}/${results.length} (train: ${trainPassed}/${trainResults.length}, test: ${testPassed}/${testResults.length})`);
  console.log(` Time: ${Math.round(totalDuration / 60000)}min`);

  const failures = results.filter(r => !r.passed);
  if (failures.length > 0) {
    console.log(`\n Failures:`);
    for (const f of failures) {
      console.log(` ✗ ${f.name}: ${f.error ?? 'unknown'}`);
    }
  }
  console.log('');

  // Save result
  mkdirSync(RESULTS_DIR, { recursive: true });
  // Number the round after the highest existing one rather than the file
  // count, so a deleted or stray file can never make us overwrite a result.
  const priorRounds = readdirSync(RESULTS_DIR)
    .map(f => /^browse-(\d+)\.json$/.exec(f))
    .map(m => (m ? Number(m[1]) : 0));
  const roundNum = String(Math.max(0, ...priorRounds) + 1).padStart(3, '0');
  const resultPath = join(RESULTS_DIR, `browse-${roundNum}.json`);
  writeFileSync(resultPath, JSON.stringify({
    timestamp: new Date().toISOString(),
    score: `${totalPassed}/${results.length}`,
    trainScore: `${trainPassed}/${trainResults.length}`,
    testScore: `${testPassed}/${testResults.length}`,
    duration: `${Math.round(totalDuration / 60000)}min`,
    tasks: results,
  }, null, 2), 'utf-8');
  console.log(` Results saved to: ${resultPath}`);
  console.log(`\nSCORE=${totalPassed}/${results.length}`);
}
184
+
185
+ main();
@@ -0,0 +1,248 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * Layer 2: Claude Code Skill E2E Testing (LLM Judge)
4
+ *
5
+ * Spawns Claude Code with the opencli-operate skill. Claude Code
6
+ * completes the task using browse commands AND judges its own result.
7
+ *
8
+ * Task format: inline TASKS entries, each with judge_context (multi-criteria, like Browser Use)
9
+ *
10
+ * Usage:
11
+ * npx tsx autoresearch/eval-skill.ts # Run all
12
+ * npx tsx autoresearch/eval-skill.ts --task hn-top5 # Run single
13
+ */
14
+
15
+ import { execSync } from 'node:child_process';
16
+ import { readFileSync, writeFileSync, mkdirSync, readdirSync } from 'node:fs';
17
+ import { join, dirname } from 'node:path';
18
+ import { fileURLToPath } from 'node:url';
19
+
20
+ const __dirname = dirname(fileURLToPath(import.meta.url));
21
+ const TASKS_FILE = join(__dirname, 'skill-tasks.yaml');
22
+ const RESULTS_DIR = join(__dirname, 'results');
23
+ const SKILL_PATH = join(__dirname, '..', 'skills', 'opencli-operate', 'SKILL.md');
24
+
25
+ // ── Types ──────────────────────────────────────────────────────────
26
+
27
/**
 * One end-to-end scenario handed to Claude Code.
 */
interface SkillTask {
  /** Identifier used for `--task <name>` filtering and in the report. */
  name: string;
  /** Natural-language instruction inserted into the prompt. */
  task: string;
  /** Optional starting page; appended to the prompt as "Start URL: ..." when present. */
  url?: string;
  /** Criteria the agent is asked to evaluate its own result against (numbered in the prompt). */
  judge_context: string[];
  /** Scales the run timeout (15 seconds per step); treated as 10 when omitted. */
  max_steps?: number;
}
34
+
35
/**
 * Outcome of running a single skill task.
 */
interface TaskResult {
  /** Name of the task that produced this result. */
  name: string;
  /** Verdict reported by the agent (false when the run crashed or no verdict was found). */
  passed: boolean;
  /** Wall-clock time spent on the task, in milliseconds. */
  duration: number;
  /** Cost in USD as reported by Claude Code's JSON output; 0 when unavailable. */
  cost: number;
  /** Verdict explanation, or a truncated error/output snippet on failure. */
  explanation: string;
}
42
+
43
+ // ── Task Definitions (inline, to avoid YAML dependency) ────────────
44
+
45
+ const TASKS: SkillTask[] = [
46
+ // Extract
47
+ { name: "extract-title-example", task: "Extract the main heading text from this page", url: "https://example.com", judge_context: ["Output must contain 'Example Domain'"] },
48
+ { name: "extract-paragraph-wiki", task: "Extract the first paragraph of the JavaScript article", url: "https://en.wikipedia.org/wiki/JavaScript", judge_context: ["Output must mention 'programming language'", "Output must contain actual paragraph text, not just the title"] },
49
+ { name: "extract-github-stars", task: "Find the number of stars on this repository", url: "https://github.com/browser-use/browser-use", judge_context: ["Output must contain a number (the star count)"] },
50
+ { name: "extract-npm-downloads", task: "Find the weekly download count for this package", url: "https://www.npmjs.com/package/zod", judge_context: ["Output must contain a number (weekly downloads)"] },
51
+
52
+ // List extraction
53
+ { name: "list-hn-top5", task: "Extract the top 5 stories with their titles", url: "https://news.ycombinator.com", judge_context: ["Output must contain 5 story titles", "Each title must be an actual HN story, not made up"] },
54
+ { name: "list-books-5", task: "Extract the first 5 books with their title and price", url: "https://books.toscrape.com", judge_context: ["Output must contain 5 books", "Each book must have a title and a price"] },
55
+ { name: "list-quotes-3", task: "Extract the first 3 quotes with their text and author", url: "https://quotes.toscrape.com", judge_context: ["Output must contain 3 quotes", "Each quote must have text and an author name"] },
56
+ { name: "list-github-trending", task: "Extract the top 3 trending repositories with name and description", url: "https://github.com/trending", judge_context: ["Output must contain 3 repositories", "Each must have a repo name"] },
57
+ { name: "list-jsonplaceholder", task: "Extract the first 5 posts with their title", url: "https://jsonplaceholder.typicode.com/posts", judge_context: ["Output must contain 5 posts", "Each post must have a title"] },
58
+
59
+ // Search
60
+ { name: "search-ddg", task: "Search for 'TypeScript tutorial' and extract the first 3 result titles", url: "https://duckduckgo.com", judge_context: ["The agent must type a search query", "Output must contain at least 3 search result titles"] },
61
+ { name: "search-npm", task: "Search for 'react' and extract the top 3 package names", url: "https://www.npmjs.com", judge_context: ["The agent must search for 'react'", "Output must contain at least 3 package names"] },
62
+ { name: "search-wiki", task: "Search for 'Rust programming language' and extract the first sentence of the article", url: "https://en.wikipedia.org", judge_context: ["The agent must search and navigate to the article", "Output must mention 'programming language'"] },
63
+
64
+ // Navigation
65
+ { name: "nav-click-link", task: "Click the 'More information...' link and extract the heading of the new page", url: "https://example.com", judge_context: ["The agent must click a link", "Output must contain 'IANA' or reference the new page"] },
66
+ { name: "nav-click-hn", task: "Click on the first story link and tell me the title of the page you land on", url: "https://news.ycombinator.com", judge_context: ["The agent must click a story link", "Output must contain the title of the destination page"] },
67
+ { name: "nav-go-back", task: "Click the 'More information...' link, then go back, and tell me the heading of the original page", url: "https://example.com", judge_context: ["The agent must click a link then go back", "Output must contain 'Example Domain'"] },
68
+ { name: "nav-multi-step", task: "Click the Next page link at the bottom, then extract the first quote from page 2", url: "https://quotes.toscrape.com", judge_context: ["The agent must navigate to page 2", "Output must contain a quote from page 2"] },
69
+
70
+ // Scroll
71
+ { name: "scroll-footer", task: "Scroll to the bottom and extract the footer text", url: "https://quotes.toscrape.com", judge_context: ["The agent must scroll down", "Output must contain footer or bottom-of-page content"] },
72
+ { name: "scroll-pagination", task: "Find the pagination info at the bottom of the page", url: "https://books.toscrape.com", judge_context: ["Output must contain page number or pagination info"] },
73
+
74
+ // Form
75
+ { name: "form-fill-basic", task: "Fill the Customer Name with 'OpenCLI' and Telephone with '555-0100'. Do not submit.", url: "https://httpbin.org/forms/post", judge_context: ["The agent must type 'OpenCLI' into a name field", "The agent must type '555-0100' into a phone field", "The form must NOT be submitted"] },
76
+ { name: "form-radio", task: "Select the 'Medium' pizza size option. Do not submit.", url: "https://httpbin.org/forms/post", judge_context: ["The agent must select a radio button for Medium size"] },
77
+ { name: "form-login", task: "Fill the username with 'testuser' and password with 'testpass'. Do not submit.", url: "https://the-internet.herokuapp.com/login", judge_context: ["The agent must fill the username field", "The agent must fill the password field", "The form must NOT be submitted"] },
78
+
79
+ // Complex
80
+ { name: "complex-wiki-toc", task: "Extract the table of contents headings", url: "https://en.wikipedia.org/wiki/JavaScript", judge_context: ["Output must contain at least 5 section headings from the table of contents"] },
81
+ { name: "complex-books-detail", task: "Click on the first book and extract its title and price from the detail page", url: "https://books.toscrape.com", judge_context: ["The agent must click on a book", "Output must contain the book title", "Output must contain a price"] },
82
+ { name: "complex-quotes-page2", task: "Navigate to page 2 and extract the first 3 quotes with their authors", url: "https://quotes.toscrape.com", judge_context: ["The agent must navigate to page 2", "Output must contain 3 quotes with authors"] },
83
+ { name: "complex-multi-extract", task: "Extract both the page title and the first paragraph text", url: "https://en.wikipedia.org/wiki/TypeScript", judge_context: ["Output must contain 'TypeScript'", "Output must contain actual paragraph text"] },
84
+
85
+ // Bench (harder, real-world)
86
+ { name: "bench-reddit", task: "Extract the titles of the top 5 posts", url: "https://old.reddit.com", judge_context: ["Output must contain 5 post titles", "Titles must be actual Reddit posts"] },
87
+ { name: "bench-imdb", task: "Find the year and rating of The Matrix", url: "https://www.imdb.com/title/tt0133093/", judge_context: ["Output must contain '1999'", "Output must contain a rating number"] },
88
+ { name: "bench-github-profile", task: "Extract the bio and number of public repositories", url: "https://github.com/torvalds", judge_context: ["Output must contain bio text or 'Linux'", "Output must contain a number for repos"] },
89
+ { name: "bench-httpbin", task: "Extract the User-Agent header shown on this page", url: "https://httpbin.org/headers", judge_context: ["Output must contain a User-Agent string"] },
90
+ { name: "bench-jsonapi-todo", task: "Extract the first 5 todo items with their title and completion status", url: "https://jsonplaceholder.typicode.com/todos", judge_context: ["Output must contain 5 todo items", "Each must have a title and completed status"] },
91
+
92
+ // Codex form (the real test)
93
+ { name: "codex-form-fill", task: "Fill the basic information using 'opencli' as the identity (first name=open, last name=cli, email=opencli@example.com, GitHub username=opencli). Do NOT submit the form.", url: "https://openai.com/form/codex-for-oss/", judge_context: ["The agent must fill the first name field", "The agent must fill the last name field", "The agent must fill the email field", "The form must NOT be submitted"], max_steps: 15 },
94
+ ];
95
+
96
+ // ── Run Task ───────────────────────────────────────────────────────
97
+
98
/**
 * Quote a single argument for the shell that `execSync` spawns.
 *
 * On POSIX shells the value is wrapped in single quotes so that backticks,
 * `$`, backslashes and newlines (all common in markdown/prompt text) are
 * passed through literally instead of being expanded or executed. On Windows
 * the previous JSON-style double quoting is kept, since cmd.exe does not
 * understand single quotes.
 */
function shellQuote(arg: string): string {
  if (process.platform === 'win32') return JSON.stringify(arg);
  // Close the quote, emit an escaped single quote, reopen the quote.
  return "'" + arg.replace(/'/g, "'\\''") + "'";
}

/**
 * Run one skill task through Claude Code and collect its self-reported verdict.
 *
 * The skill file is passed as the system prompt; the user prompt contains the
 * task, the optional start URL, the numbered judge criteria and the required
 * verdict format.
 *
 * @param task - Task definition to run.
 * @returns The task result. A non-zero exit, timeout or spawn failure yields a
 *          failed result whose explanation holds the captured stdout or, when
 *          that is empty, the error message (truncated to 200 characters).
 * @throws If the skill file at `SKILL_PATH` cannot be read.
 */
function runSkillTask(task: SkillTask): TaskResult {
  const start = Date.now();
  const skillContent = readFileSync(SKILL_PATH, 'utf-8');
  const urlPart = task.url ? ` Start URL: ${task.url}` : '';
  const criteria = task.judge_context.map((c, i) => `${i + 1}. ${c}`).join('\n');

  const prompt = `Complete this browser task using opencli operate commands:

TASK: ${task.task}${urlPart}

After completing the task, evaluate your own result against these criteria:
${criteria}

At the very end of your response, output a JSON verdict on its own line:
{"success": true/false, "explanation": "brief explanation"}

Always close the browser with 'opencli operate close' when done.`;

  try {
    const output = execSync(
      `claude -p --dangerously-skip-permissions --allowedTools "Bash(opencli:*)" --system-prompt ${shellQuote(skillContent)} --output-format json --no-session-persistence ${shellQuote(prompt)}`,
      {
        cwd: join(__dirname, '..'),
        // 15 seconds of budget per allowed step (10 steps by default).
        timeout: (task.max_steps ?? 10) * 15_000,
        encoding: 'utf-8',
        env: process.env,
        stdio: ['pipe', 'pipe', 'pipe'],
      }
    );

    const duration = Date.now() - start;

    // Parse Claude Code output; fall back to the raw text if it is not JSON.
    let resultText = '';
    let cost = 0;
    try {
      const parsed = JSON.parse(output);
      resultText = parsed.result ?? output;
      cost = parsed.total_cost_usd ?? 0;
    } catch {
      resultText = output;
    }

    // Extract verdict JSON from the result
    const verdict = extractVerdict(resultText);

    return {
      name: task.name,
      passed: verdict.success,
      duration,
      cost,
      explanation: verdict.explanation,
    };
  } catch (err: any) {
    // `||` rather than `??`: an empty stdout string (typical for a timeout)
    // must not hide the actual error message.
    const detail = err?.stdout || err?.message || 'timeout or crash';
    return {
      name: task.name,
      passed: false,
      duration: Date.now() - start,
      cost: 0,
      explanation: String(detail).slice(0, 200),
    };
  }
}
161
+
162
/**
 * Extract the agent's final verdict from its response text.
 *
 * Resolution order:
 *  1. The LAST well-formed `{"success": <bool>, "explanation": "<string>"}`
 *     object in the text (whitespace-tolerant; escaped characters such as
 *     `\"` inside the explanation are supported).
 *  2. Otherwise the LAST bare `"success": true|false` mention
 *     (case-insensitive). The prompt template's `true/false` placeholder is
 *     not counted as a verdict.
 *  3. Otherwise a failure stating that no verdict could be parsed.
 *
 * @param text - Full response text produced by the agent.
 * @returns The parsed verdict; never throws.
 */
function extractVerdict(text: string): { success: boolean; explanation: string } {
  // `(?:[^"\\]|\\.)*` accepts any JSON string body, including escaped quotes.
  const verdictPattern =
    /\{\s*"success"\s*:\s*(?:true|false)\s*,\s*"explanation"\s*:\s*"(?:[^"\\]|\\.)*"\s*\}/g;
  const jsonMatches = text.match(verdictPattern);
  if (jsonMatches) {
    const last = jsonMatches[jsonMatches.length - 1];
    try {
      return JSON.parse(last);
    } catch { /* e.g. raw control characters in the string; fall through */ }
  }

  // Fallback: use the last success flag mentioned, so a final "false" is not
  // overridden by an earlier "true" (or vice versa).
  const flags = text.match(/"success"\s*:\s*(?:true|false)(?![\w/])/gi);
  if (flags) {
    const lastFlag = flags[flags.length - 1].toLowerCase();
    return lastFlag.endsWith('true')
      ? { success: true, explanation: 'Parsed success from output' }
      : { success: false, explanation: 'Parsed failure from output' };
  }

  // Final fallback: assume failure if we can't parse
  return { success: false, explanation: 'Could not parse verdict from output' };
}
184
+
185
+ // ── Main ───────────────────────────────────────────────────────────
186
+
187
/**
 * Entry point for the Layer 2 skill evaluation.
 *
 * Optionally narrows the inline task list to the task named by
 * `--task <name>`, runs each task through Claude Code, prints a summary with
 * score, cost and time, and writes the results to `results/skill-NNN.json`.
 *
 * Exits with status 1 when `--task` is given without a name, when the named
 * task does not exist, or when there are no tasks to run.
 */
function main() {
  const args = process.argv.slice(2);

  // Resolve the optional `--task <name>` filter. A bare `--task` is rejected
  // explicitly; otherwise it would silently fall back to running every
  // (billable) task.
  let singleTask: string | null = null;
  const taskFlagIndex = args.indexOf('--task');
  if (taskFlagIndex !== -1) {
    const requested = args[taskFlagIndex + 1];
    if (!requested || requested.startsWith('--')) {
      console.error(`Missing task name after --task. Available: ${TASKS.map(t => t.name).join(', ')}`);
      process.exit(1);
    }
    singleTask = requested;
  }

  const tasks = singleTask !== null ? TASKS.filter(t => t.name === singleTask) : TASKS;

  if (tasks.length === 0) {
    console.error(
      singleTask !== null
        ? `Task "${singleTask}" not found. Available: ${TASKS.map(t => t.name).join(', ')}`
        : 'No tasks defined.',
    );
    process.exit(1);
  }

  console.log(`\n🔬 Layer 2: Skill E2E (LLM Judge) — ${tasks.length} tasks\n`);

  const results: TaskResult[] = [];

  for (let i = 0; i < tasks.length; i++) {
    const task = tasks[i];
    process.stdout.write(` [${i + 1}/${tasks.length}] ${task.name}...`);

    const result = runSkillTask(task);
    results.push(result);

    const icon = result.passed ? '✓' : '✗';
    const costStr = result.cost > 0 ? `, $${result.cost.toFixed(2)}` : '';
    console.log(` ${icon} (${Math.round(result.duration / 1000)}s${costStr})`);
  }

  // Summary
  const totalPassed = results.filter(r => r.passed).length;
  const totalCost = results.reduce((s, r) => s + r.cost, 0);
  const totalDuration = results.reduce((s, r) => s + r.duration, 0);

  console.log(`\n${'─'.repeat(50)}`);
  console.log(` Score: ${totalPassed}/${results.length} (${Math.round(totalPassed / results.length * 100)}%)`);
  console.log(` Cost: $${totalCost.toFixed(2)}`);
  console.log(` Time: ${Math.round(totalDuration / 60000)}min`);

  const failures = results.filter(r => !r.passed);
  if (failures.length > 0) {
    console.log(`\n Failures:`);
    for (const f of failures) {
      console.log(` ✗ ${f.name}: ${f.explanation}`);
    }
  }
  console.log('');

  // Save
  mkdirSync(RESULTS_DIR, { recursive: true });
  // Number the round after the highest existing one rather than the file
  // count, so a deleted or stray file can never make us overwrite a result.
  const priorRounds = readdirSync(RESULTS_DIR)
    .map(f => /^skill-(\d+)\.json$/.exec(f))
    .map(m => (m ? Number(m[1]) : 0));
  const roundNum = String(Math.max(0, ...priorRounds) + 1).padStart(3, '0');
  const resultPath = join(RESULTS_DIR, `skill-${roundNum}.json`);
  writeFileSync(resultPath, JSON.stringify({
    timestamp: new Date().toISOString(),
    score: `${totalPassed}/${results.length}`,
    totalCost,
    duration: `${Math.round(totalDuration / 60000)}min`,
    tasks: results,
  }, null, 2), 'utf-8');
  console.log(` Results saved to: ${resultPath}`);
  console.log(`\nSCORE=${totalPassed}/${results.length}`);
}
247
+
248
+ main();
@@ -0,0 +1,9 @@
1
#!/bin/bash
# Layer 1: Deterministic browse command testing
#
# Builds OpenCLI from the package root, then runs the browse evaluation
# script, forwarding all command-line arguments (e.g. --task <name>).
set -e
cd "$(dirname "$0")/.."
echo "Building OpenCLI..."
# Keep the build quiet on success, but show its output when it fails;
# with the output discarded, `set -e` would abort without any explanation.
if ! build_log="$(npm run build 2>&1)"; then
  echo "Build FAILED:" >&2
  printf '%s\n' "$build_log" >&2
  exit 1
fi
echo "Build OK"
echo ""
npx tsx autoresearch/eval-browse.ts "$@"
@@ -0,0 +1,9 @@
1
#!/bin/bash
# Layer 2: Claude Code skill E2E testing
#
# Builds OpenCLI from the package root, then runs the skill evaluation
# script, forwarding all command-line arguments (e.g. --task <name>).
set -e
cd "$(dirname "$0")/.."
echo "Building OpenCLI..."
# Keep the build quiet on success, but show its output when it fails;
# with the output discarded, `set -e` would abort without any explanation.
if ! build_log="$(npm run build 2>&1)"; then
  echo "Build FAILED:" >&2
  printf '%s\n' "$build_log" >&2
  exit 1
fi
echo "Build OK"
echo ""
npx tsx autoresearch/eval-skill.ts "$@"
@@ -6,7 +6,7 @@
6
6
  import type { BrowserSessionInfo } from '../types.js';
7
7
  export interface DaemonCommand {
8
8
  id: string;
9
- action: 'exec' | 'navigate' | 'tabs' | 'cookies' | 'screenshot' | 'close-window' | 'sessions' | 'set-file-input';
9
+ action: 'exec' | 'navigate' | 'tabs' | 'cookies' | 'screenshot' | 'close-window' | 'sessions' | 'set-file-input' | 'cdp';
10
10
  tabId?: number;
11
11
  code?: string;
12
12
  workspace?: string;
@@ -21,6 +21,8 @@ export interface DaemonCommand {
21
21
  files?: string[];
22
22
  /** CSS selector for file input element (set-file-input action) */
23
23
  selector?: string;
24
+ cdpMethod?: string;
25
+ cdpParams?: Record<string, unknown>;
24
26
  }
25
27
  export interface DaemonResult {
26
28
  id: string;
@@ -28,6 +30,23 @@ export interface DaemonResult {
28
30
  data?: unknown;
29
31
  error?: string;
30
32
  }
33
+ export interface DaemonStatus {
34
+ ok: boolean;
35
+ pid: number;
36
+ uptime: number;
37
+ extensionConnected: boolean;
38
+ extensionVersion?: string;
39
+ pending: number;
40
+ lastCliRequestTime: number;
41
+ memoryMB: number;
42
+ port: number;
43
+ }
44
+ export declare function fetchDaemonStatus(opts?: {
45
+ timeout?: number;
46
+ }): Promise<DaemonStatus | null>;
47
+ export declare function requestDaemonShutdown(opts?: {
48
+ timeout?: number;
49
+ }): Promise<boolean>;
31
50
  /**
32
51
  * Check if daemon is running.
33
52
  */
@@ -8,48 +8,58 @@ import { sleep } from '../utils.js';
8
8
  import { isTransientBrowserError } from './errors.js';
9
9
  const DAEMON_PORT = parseInt(process.env.OPENCLI_DAEMON_PORT ?? String(DEFAULT_DAEMON_PORT), 10);
10
10
  const DAEMON_URL = `http://127.0.0.1:${DAEMON_PORT}`;
11
+ const OPENCLI_HEADERS = { 'X-OpenCLI': '1' };
11
12
  let _idCounter = 0;
12
13
  function generateId() {
13
14
  return `cmd_${Date.now()}_${++_idCounter}`;
14
15
  }
15
- /**
16
- * Check if daemon is running.
17
- */
18
- export async function isDaemonRunning() {
16
+ async function requestDaemon(pathname, init) {
17
+ const { timeout = 2000, headers, ...rest } = init ?? {};
18
+ const controller = new AbortController();
19
+ const timer = setTimeout(() => controller.abort(), timeout);
19
20
  try {
20
- const controller = new AbortController();
21
- const timer = setTimeout(() => controller.abort(), 2000);
22
- const res = await fetch(`${DAEMON_URL}/status`, {
23
- headers: { 'X-OpenCLI': '1' },
21
+ return await fetch(`${DAEMON_URL}${pathname}`, {
22
+ ...rest,
23
+ headers: { ...OPENCLI_HEADERS, ...headers },
24
24
  signal: controller.signal,
25
25
  });
26
+ }
27
+ finally {
26
28
  clearTimeout(timer);
29
+ }
30
+ }
31
+ export async function fetchDaemonStatus(opts) {
32
+ try {
33
+ const res = await requestDaemon('/status', { timeout: opts?.timeout ?? 2000 });
34
+ if (!res.ok)
35
+ return null;
36
+ return await res.json();
37
+ }
38
+ catch {
39
+ return null;
40
+ }
41
+ }
42
+ export async function requestDaemonShutdown(opts) {
43
+ try {
44
+ const res = await requestDaemon('/shutdown', { method: 'POST', timeout: opts?.timeout ?? 5000 });
27
45
  return res.ok;
28
46
  }
29
47
  catch {
30
48
  return false;
31
49
  }
32
50
  }
51
+ /**
52
+ * Check if daemon is running.
53
+ */
54
+ export async function isDaemonRunning() {
55
+ return (await fetchDaemonStatus()) !== null;
56
+ }
33
57
  /**
34
58
  * Check if daemon is running AND the extension is connected.
35
59
  */
36
60
  export async function isExtensionConnected() {
37
- try {
38
- const controller = new AbortController();
39
- const timer = setTimeout(() => controller.abort(), 2000);
40
- const res = await fetch(`${DAEMON_URL}/status`, {
41
- headers: { 'X-OpenCLI': '1' },
42
- signal: controller.signal,
43
- });
44
- clearTimeout(timer);
45
- if (!res.ok)
46
- return false;
47
- const data = await res.json();
48
- return !!data.extensionConnected;
49
- }
50
- catch {
51
- return false;
52
- }
61
+ const status = await fetchDaemonStatus();
62
+ return !!status?.extensionConnected;
53
63
  }
54
64
  /**
55
65
  * Send a command to the daemon and wait for a result.
@@ -63,15 +73,12 @@ export async function sendCommand(action, params = {}) {
63
73
  const id = generateId();
64
74
  const command = { id, action, ...params };
65
75
  try {
66
- const controller = new AbortController();
67
- const timer = setTimeout(() => controller.abort(), 30000);
68
- const res = await fetch(`${DAEMON_URL}/command`, {
76
+ const res = await requestDaemon('/command', {
69
77
  method: 'POST',
70
- headers: { 'Content-Type': 'application/json', 'X-OpenCLI': '1' },
78
+ headers: { 'Content-Type': 'application/json' },
71
79
  body: JSON.stringify(command),
72
- signal: controller.signal,
80
+ timeout: 30000,
73
81
  });
74
- clearTimeout(timer);
75
82
  const result = (await res.json());
76
83
  if (!result.ok) {
77
84
  // Check if error is a transient extension issue worth retrying
@@ -0,0 +1 @@
1
+ export {};