@jackwener/opencli 1.6.1 → 1.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONTRIBUTING.md +1 -1
- package/README.md +27 -45
- package/README.zh-CN.md +32 -34
- package/autoresearch/browse-tasks.json +18 -20
- package/autoresearch/commands/debug.ts +163 -0
- package/autoresearch/commands/fix.ts +145 -0
- package/autoresearch/commands/plan.ts +88 -0
- package/autoresearch/commands/run.ts +138 -0
- package/autoresearch/config.ts +82 -0
- package/autoresearch/engine.ts +359 -0
- package/autoresearch/eval-all.ts +127 -0
- package/autoresearch/eval-browse.ts +1 -1
- package/autoresearch/eval-publish.ts +238 -0
- package/autoresearch/eval-save.ts +249 -0
- package/autoresearch/eval-skill.ts +14 -8
- package/autoresearch/eval-v2ex.ts +220 -0
- package/autoresearch/eval-zhihu.ts +230 -0
- package/autoresearch/logger.ts +69 -0
- package/autoresearch/presets/combined-reliability.ts +27 -0
- package/autoresearch/presets/index.ts +23 -0
- package/autoresearch/presets/operate-reliability.ts +24 -0
- package/autoresearch/presets/save-reliability.ts +26 -0
- package/autoresearch/presets/skill-quality.ts +20 -0
- package/autoresearch/presets/v2ex-reliability.ts +24 -0
- package/autoresearch/presets/zhihu-reliability.ts +25 -0
- package/autoresearch/publish-tasks.json +345 -0
- package/autoresearch/run-save.sh +11 -0
- package/autoresearch/save-adapters/xhs-explore-deep.ts +64 -0
- package/autoresearch/save-adapters/xhs-note-comments.ts +61 -0
- package/autoresearch/save-adapters/xhs-search-full.ts +62 -0
- package/autoresearch/save-adapters/zhihu-hot-detail.ts +52 -0
- package/autoresearch/save-adapters/zhihu-question-full.ts +57 -0
- package/autoresearch/save-adapters/zhihu-search-detail.ts +53 -0
- package/autoresearch/save-tasks.json +281 -0
- package/autoresearch/v2ex-tasks.json +899 -0
- package/autoresearch/zhihu-tasks.json +848 -0
- package/dist/browser/base-page.d.ts +4 -2
- package/dist/browser/base-page.js +37 -4
- package/dist/browser/bridge.js +10 -8
- package/dist/browser/cdp.js +2 -6
- package/dist/browser/daemon-client.d.ts +11 -1
- package/dist/browser/daemon-client.js +3 -0
- package/dist/browser/dom-helpers.d.ts +4 -2
- package/dist/browser/dom-helpers.js +42 -31
- package/dist/browser/dom-snapshot.js +23 -1
- package/dist/browser/page.d.ts +7 -2
- package/dist/browser/page.js +112 -30
- package/dist/browser.test.js +1 -1
- package/dist/build-manifest.d.ts +1 -0
- package/dist/build-manifest.js +1 -0
- package/dist/cli-manifest.json +1135 -184
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +48 -7
- package/dist/cli.test.d.ts +1 -0
- package/dist/cli.test.js +88 -0
- package/dist/clis/1688/item.d.ts +70 -0
- package/dist/clis/1688/item.js +187 -0
- package/dist/clis/1688/item.test.d.ts +1 -0
- package/dist/clis/1688/item.test.js +67 -0
- package/dist/clis/1688/search.d.ts +56 -0
- package/dist/clis/1688/search.js +309 -0
- package/dist/clis/1688/search.test.d.ts +1 -0
- package/dist/clis/1688/search.test.js +75 -0
- package/dist/clis/1688/shared.d.ts +112 -0
- package/dist/clis/1688/shared.js +514 -0
- package/dist/clis/1688/shared.test.d.ts +1 -0
- package/dist/clis/1688/shared.test.js +57 -0
- package/dist/clis/1688/store.d.ts +45 -0
- package/dist/clis/1688/store.js +226 -0
- package/dist/clis/1688/store.test.d.ts +1 -0
- package/dist/clis/1688/store.test.js +62 -0
- package/dist/clis/amazon/bestsellers.d.ts +0 -20
- package/dist/clis/amazon/bestsellers.js +6 -129
- package/dist/clis/amazon/bestsellers.test.js +12 -3
- package/dist/clis/amazon/movers-shakers.d.ts +1 -0
- package/dist/clis/amazon/movers-shakers.js +7 -0
- package/dist/clis/amazon/new-releases.d.ts +1 -0
- package/dist/clis/amazon/new-releases.js +7 -0
- package/dist/clis/amazon/rankings.d.ts +59 -0
- package/dist/clis/amazon/rankings.js +226 -0
- package/dist/clis/amazon/rankings.test.d.ts +1 -0
- package/dist/clis/amazon/rankings.test.js +41 -0
- package/dist/clis/amazon/shared.d.ts +11 -0
- package/dist/clis/amazon/shared.js +121 -11
- package/dist/clis/amazon/shared.test.js +11 -0
- package/dist/clis/bilibili/comments.js +2 -2
- package/dist/clis/bilibili/comments.test.js +3 -2
- package/dist/clis/bilibili/download.js +2 -1
- package/dist/clis/bilibili/subtitle.js +4 -3
- package/dist/clis/bilibili/subtitle.test.js +2 -1
- package/dist/clis/bilibili/utils.d.ts +5 -0
- package/dist/clis/bilibili/utils.js +30 -0
- package/dist/clis/bilibili/utils.test.d.ts +1 -0
- package/dist/clis/bilibili/utils.test.js +17 -0
- package/dist/clis/douban/marks.js +1 -1
- package/dist/clis/douban/subject.yaml +50 -19
- package/dist/clis/doubao/utils.js +32 -12
- package/dist/clis/douyin/_shared/browser-fetch.test.js +0 -1
- package/dist/clis/douyin/_shared/transcode.test.js +0 -2
- package/dist/clis/douyin/draft.test.js +0 -2
- package/dist/clis/facebook/search.test.js +0 -2
- package/dist/clis/gemini/ask.js +9 -3
- package/dist/clis/gemini/ask.test.d.ts +1 -0
- package/dist/clis/gemini/ask.test.js +100 -0
- package/dist/clis/gemini/reply-state.test.d.ts +1 -0
- package/dist/clis/gemini/reply-state.test.js +641 -0
- package/dist/clis/gemini/utils.d.ts +44 -1
- package/dist/clis/gemini/utils.js +528 -61
- package/dist/clis/gemini/utils.test.js +149 -2
- package/dist/clis/hupu/detail.d.ts +1 -0
- package/dist/clis/hupu/detail.js +72 -0
- package/dist/clis/hupu/hot.yaml +43 -0
- package/dist/clis/hupu/like.d.ts +1 -0
- package/dist/clis/hupu/like.js +75 -0
- package/dist/clis/hupu/reply.d.ts +1 -0
- package/dist/clis/hupu/reply.js +71 -0
- package/dist/clis/hupu/search.d.ts +1 -0
- package/dist/clis/hupu/search.js +59 -0
- package/dist/clis/hupu/unlike.d.ts +1 -0
- package/dist/clis/hupu/unlike.js +75 -0
- package/dist/clis/hupu/utils.d.ts +20 -0
- package/dist/clis/hupu/utils.js +319 -0
- package/dist/clis/instagram/_shared/private-publish.d.ts +138 -0
- package/dist/clis/instagram/_shared/private-publish.js +1030 -0
- package/dist/clis/instagram/_shared/private-publish.test.d.ts +1 -0
- package/dist/clis/instagram/_shared/private-publish.test.js +705 -0
- package/dist/clis/instagram/_shared/protocol-capture.d.ts +26 -0
- package/dist/clis/instagram/_shared/protocol-capture.js +282 -0
- package/dist/clis/instagram/_shared/protocol-capture.test.d.ts +1 -0
- package/dist/clis/instagram/_shared/protocol-capture.test.js +114 -0
- package/dist/clis/instagram/_shared/runtime-info.d.ts +9 -0
- package/dist/clis/instagram/_shared/runtime-info.js +81 -0
- package/dist/clis/instagram/note.d.ts +1 -0
- package/dist/clis/instagram/note.js +222 -0
- package/dist/clis/instagram/note.test.d.ts +1 -0
- package/dist/clis/instagram/note.test.js +81 -0
- package/dist/clis/instagram/post.d.ts +4 -0
- package/dist/clis/instagram/post.js +1496 -0
- package/dist/clis/instagram/post.test.d.ts +1 -0
- package/dist/clis/instagram/post.test.js +1647 -0
- package/dist/clis/instagram/reel.d.ts +1 -0
- package/dist/clis/instagram/reel.js +826 -0
- package/dist/clis/instagram/reel.test.d.ts +1 -0
- package/dist/clis/instagram/reel.test.js +167 -0
- package/dist/clis/instagram/story.d.ts +1 -0
- package/dist/clis/instagram/story.js +115 -0
- package/dist/clis/instagram/story.test.d.ts +1 -0
- package/dist/clis/instagram/story.test.js +167 -0
- package/dist/clis/sinafinance/stock-rank.d.ts +4 -0
- package/dist/clis/sinafinance/stock-rank.js +65 -0
- package/dist/clis/substack/utils.test.js +0 -2
- package/dist/clis/twitter/post.js +72 -45
- package/dist/clis/twitter/post.test.d.ts +1 -0
- package/dist/clis/twitter/post.test.js +116 -0
- package/dist/clis/twitter/reply.d.ts +12 -0
- package/dist/clis/twitter/reply.js +257 -35
- package/dist/clis/twitter/reply.test.d.ts +1 -0
- package/dist/clis/twitter/reply.test.js +151 -0
- package/dist/clis/xianyu/chat.d.ts +7 -0
- package/dist/clis/xianyu/chat.js +146 -0
- package/dist/clis/xianyu/chat.test.d.ts +1 -0
- package/dist/clis/xianyu/chat.test.js +15 -0
- package/dist/clis/xianyu/item.d.ts +7 -0
- package/dist/clis/xianyu/item.js +152 -0
- package/dist/clis/xianyu/item.test.d.ts +1 -0
- package/dist/clis/xianyu/item.test.js +56 -0
- package/dist/clis/xianyu/search.d.ts +10 -0
- package/dist/clis/xianyu/search.js +134 -0
- package/dist/clis/xianyu/search.test.d.ts +1 -0
- package/dist/clis/xianyu/search.test.js +17 -0
- package/dist/clis/xianyu/utils.d.ts +1 -0
- package/dist/clis/xianyu/utils.js +8 -0
- package/dist/clis/xiaoe/catalog.yaml +129 -0
- package/dist/clis/xiaoe/content.yaml +43 -0
- package/dist/clis/xiaoe/courses.yaml +73 -0
- package/dist/clis/xiaoe/detail.yaml +39 -0
- package/dist/clis/xiaoe/play-url.yaml +124 -0
- package/dist/clis/xiaohongshu/comments.test.js +0 -2
- package/dist/clis/xiaohongshu/creator-note-detail.test.js +0 -2
- package/dist/clis/xiaohongshu/creator-notes.test.js +0 -2
- package/dist/clis/xiaohongshu/download.test.js +0 -2
- package/dist/clis/xiaohongshu/note.test.js +0 -2
- package/dist/clis/xiaohongshu/publish.test.js +0 -2
- package/dist/clis/xiaohongshu/search.js +29 -20
- package/dist/clis/xiaohongshu/search.test.js +56 -48
- package/dist/clis/yuanbao/ask.d.ts +21 -0
- package/dist/clis/yuanbao/ask.js +427 -0
- package/dist/clis/yuanbao/ask.test.d.ts +1 -0
- package/dist/clis/yuanbao/ask.test.js +124 -0
- package/dist/clis/yuanbao/new.d.ts +1 -0
- package/dist/clis/yuanbao/new.js +70 -0
- package/dist/clis/yuanbao/new.test.d.ts +1 -0
- package/dist/clis/yuanbao/new.test.js +30 -0
- package/dist/clis/yuanbao/shared.d.ts +13 -0
- package/dist/clis/yuanbao/shared.js +49 -0
- package/dist/clis/zhihu/question.js +30 -19
- package/dist/clis/zhihu/question.test.js +34 -16
- package/dist/commanderAdapter.js +8 -4
- package/dist/commanderAdapter.test.js +42 -0
- package/dist/completion.js +3 -1
- package/dist/completion.test.d.ts +1 -0
- package/dist/completion.test.js +23 -0
- package/dist/doctor.js +1 -1
- package/dist/electron-apps.d.ts +2 -0
- package/dist/electron-apps.js +7 -1
- package/dist/errors.js +1 -1
- package/dist/execution.js +25 -35
- package/dist/explore.js +1 -1
- package/dist/launcher.d.ts +4 -0
- package/dist/launcher.js +64 -8
- package/dist/launcher.test.js +88 -7
- package/dist/output.d.ts +2 -0
- package/dist/output.js +10 -1
- package/dist/output.test.d.ts +0 -3
- package/dist/output.test.js +59 -92
- package/dist/pipeline/executor.test.js +0 -2
- package/dist/pipeline/steps/download.test.js +0 -2
- package/dist/registry.d.ts +2 -0
- package/dist/serialization.d.ts +1 -0
- package/dist/serialization.js +1 -0
- package/dist/types.d.ts +9 -2
- package/docs/.vitepress/config.mts +4 -0
- package/docs/adapters/browser/1688.md +52 -0
- package/docs/adapters/browser/36kr.md +2 -1
- package/docs/adapters/browser/doubao.md +5 -1
- package/docs/adapters/browser/hupu.md +53 -0
- package/docs/adapters/browser/sinafinance.md +32 -2
- package/docs/adapters/browser/weibo.md +6 -1
- package/docs/adapters/browser/wikipedia.md +2 -0
- package/docs/adapters/browser/xianyu.md +42 -0
- package/docs/adapters/browser/xiaoe.md +44 -0
- package/docs/adapters/browser/yuanbao.md +64 -0
- package/docs/adapters/index.md +14 -5
- package/docs/comparison.md +1 -1
- package/docs/developer/ai-workflow.md +2 -2
- package/docs/developer/contributing.md +1 -1
- package/docs/developer/testing.md +2 -0
- package/docs/guide/plugins.md +1 -0
- package/docs/guide/troubleshooting.md +11 -0
- package/docs/superpowers/specs/2026-04-03-v2ex-autoresearch-design.md +41 -0
- package/docs/zh/guide/plugins.md +1 -0
- package/extension/dist/background.js +1127 -0
- package/extension/src/background.test.ts +39 -0
- package/extension/src/background.ts +223 -34
- package/extension/src/cdp.ts +194 -4
- package/extension/src/protocol.ts +22 -1
- package/package.json +3 -2
- package/scripts/postinstall.js +1 -1
- package/skills/opencli-explorer/SKILL.md +1 -1
- package/skills/opencli-oneshot/SKILL.md +2 -2
- package/skills/opencli-operate/SKILL.md +120 -27
- package/skills/opencli-usage/SKILL.md +31 -20
- package/skills/opencli-usage/browser.md +114 -16
- package/skills/opencli-usage/public-api.md +32 -3
- package/skills/smart-search/SKILL.md +156 -0
- package/skills/smart-search/references/sources-ai.md +74 -0
- package/skills/smart-search/references/sources-info.md +43 -0
- package/skills/smart-search/references/sources-media.md +50 -0
- package/skills/smart-search/references/sources-other.md +42 -0
- package/skills/smart-search/references/sources-shopping.md +31 -0
- package/skills/smart-search/references/sources-social.md +51 -0
- package/skills/smart-search/references/sources-tech.md +42 -0
- package/skills/smart-search/references/sources-travel.md +20 -0
- package/src/browser/base-page.ts +41 -6
- package/src/browser/bridge.ts +11 -8
- package/src/browser/cdp.ts +1 -8
- package/src/browser/daemon-client.ts +11 -1
- package/src/browser/dom-helpers.ts +43 -31
- package/src/browser/dom-snapshot.ts +23 -1
- package/src/browser/page.ts +115 -31
- package/src/browser.test.ts +1 -1
- package/src/build-manifest.ts +2 -0
- package/src/cli.test.ts +133 -0
- package/src/cli.ts +73 -11
- package/src/clis/1688/item.test.ts +69 -0
- package/src/clis/1688/item.ts +282 -0
- package/src/clis/1688/search.test.ts +81 -0
- package/src/clis/1688/search.ts +402 -0
- package/src/clis/1688/shared.test.ts +75 -0
- package/src/clis/1688/shared.ts +623 -0
- package/src/clis/1688/store.test.ts +69 -0
- package/src/clis/1688/store.ts +300 -0
- package/src/clis/amazon/bestsellers.test.ts +12 -3
- package/src/clis/amazon/bestsellers.ts +6 -178
- package/src/clis/amazon/movers-shakers.ts +8 -0
- package/src/clis/amazon/new-releases.ts +8 -0
- package/src/clis/amazon/rankings.test.ts +47 -0
- package/src/clis/amazon/rankings.ts +312 -0
- package/src/clis/amazon/shared.test.ts +16 -0
- package/src/clis/amazon/shared.ts +134 -12
- package/src/clis/bilibili/comments.test.ts +4 -3
- package/src/clis/bilibili/comments.ts +2 -2
- package/src/clis/bilibili/download.ts +2 -1
- package/src/clis/bilibili/subtitle.test.ts +2 -1
- package/src/clis/bilibili/subtitle.ts +4 -3
- package/src/clis/bilibili/utils.test.ts +21 -0
- package/src/clis/bilibili/utils.ts +27 -0
- package/src/clis/douban/marks.ts +1 -1
- package/src/clis/douban/subject.yaml +50 -19
- package/src/clis/doubao/utils.ts +32 -12
- package/src/clis/douyin/_shared/browser-fetch.test.ts +0 -1
- package/src/clis/douyin/_shared/transcode.test.ts +0 -2
- package/src/clis/douyin/draft.test.ts +0 -2
- package/src/clis/facebook/search.test.ts +0 -2
- package/src/clis/gemini/ask.test.ts +116 -0
- package/src/clis/gemini/ask.ts +10 -3
- package/src/clis/gemini/reply-state.test.ts +708 -0
- package/src/clis/gemini/utils.test.ts +184 -2
- package/src/clis/gemini/utils.ts +588 -60
- package/src/clis/hupu/detail.ts +126 -0
- package/src/clis/hupu/hot.yaml +43 -0
- package/src/clis/hupu/like.ts +76 -0
- package/src/clis/hupu/reply.ts +76 -0
- package/src/clis/hupu/search.ts +95 -0
- package/src/clis/hupu/unlike.ts +76 -0
- package/src/clis/hupu/utils.ts +381 -0
- package/src/clis/instagram/_shared/private-publish.test.ts +827 -0
- package/src/clis/instagram/_shared/private-publish.ts +1303 -0
- package/src/clis/instagram/_shared/protocol-capture.test.ts +148 -0
- package/src/clis/instagram/_shared/protocol-capture.ts +321 -0
- package/src/clis/instagram/_shared/runtime-info.ts +91 -0
- package/src/clis/instagram/note.test.ts +96 -0
- package/src/clis/instagram/note.ts +254 -0
- package/src/clis/instagram/post.test.ts +1716 -0
- package/src/clis/instagram/post.ts +1620 -0
- package/src/clis/instagram/reel.test.ts +191 -0
- package/src/clis/instagram/reel.ts +886 -0
- package/src/clis/instagram/story.test.ts +191 -0
- package/src/clis/instagram/story.ts +151 -0
- package/src/clis/sinafinance/stock-rank.ts +68 -0
- package/src/clis/substack/utils.test.ts +0 -2
- package/src/clis/twitter/post.test.ts +157 -0
- package/src/clis/twitter/post.ts +82 -48
- package/src/clis/twitter/reply.test.ts +177 -0
- package/src/clis/twitter/reply.ts +285 -39
- package/src/clis/xianyu/chat.test.ts +20 -0
- package/src/clis/xianyu/chat.ts +175 -0
- package/src/clis/xianyu/item.test.ts +67 -0
- package/src/clis/xianyu/item.ts +172 -0
- package/src/clis/xianyu/search.test.ts +22 -0
- package/src/clis/xianyu/search.ts +151 -0
- package/src/clis/xianyu/utils.ts +9 -0
- package/src/clis/xiaoe/catalog.yaml +129 -0
- package/src/clis/xiaoe/content.yaml +43 -0
- package/src/clis/xiaoe/courses.yaml +73 -0
- package/src/clis/xiaoe/detail.yaml +39 -0
- package/src/clis/xiaoe/play-url.yaml +124 -0
- package/src/clis/xiaohongshu/comments.test.ts +0 -2
- package/src/clis/xiaohongshu/creator-note-detail.test.ts +0 -2
- package/src/clis/xiaohongshu/creator-notes.test.ts +0 -2
- package/src/clis/xiaohongshu/download.test.ts +0 -2
- package/src/clis/xiaohongshu/note.test.ts +0 -2
- package/src/clis/xiaohongshu/publish.test.ts +0 -2
- package/src/clis/xiaohongshu/search.test.ts +59 -48
- package/src/clis/xiaohongshu/search.ts +31 -21
- package/src/clis/yuanbao/ask.test.ts +156 -0
- package/src/clis/yuanbao/ask.ts +522 -0
- package/src/clis/yuanbao/new.test.ts +36 -0
- package/src/clis/yuanbao/new.ts +81 -0
- package/src/clis/yuanbao/shared.ts +57 -0
- package/src/clis/zhihu/question.test.ts +42 -17
- package/src/clis/zhihu/question.ts +31 -26
- package/src/commanderAdapter.test.ts +51 -0
- package/src/commanderAdapter.ts +8 -4
- package/src/completion.test.ts +30 -0
- package/src/completion.ts +3 -1
- package/src/doctor.ts +1 -1
- package/src/electron-apps.ts +9 -1
- package/src/errors.ts +1 -1
- package/src/execution.ts +26 -30
- package/src/explore.ts +1 -1
- package/src/launcher.test.ts +121 -7
- package/src/launcher.ts +87 -9
- package/src/output.test.ts +50 -90
- package/src/output.ts +10 -1
- package/src/pipeline/executor.test.ts +0 -2
- package/src/pipeline/steps/download.test.ts +0 -2
- package/src/registry.ts +2 -0
- package/src/serialization.ts +2 -0
- package/src/types.ts +9 -2
- package/tests/e2e/browser-auth.test.ts +9 -0
- package/CLI-EXPLORER.md +0 -724
- package/CLI-ONESHOT.md +0 -216
- package/SKILL.md +0 -59
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
#!/usr/bin/env npx tsx
|
|
2
|
+
/**
|
|
3
|
+
* V2EX Test Suite: Deterministic command testing against v2ex.com.
|
|
4
|
+
*
|
|
5
|
+
* 40 tasks across 5 difficulty layers:
|
|
6
|
+
* L1 Atomic (10) → L2 Single Page (10) → L3 Multi-Step (10)
|
|
7
|
+
* → L4 Write Ops (5) → L5 Complex Chain (5)
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* npx tsx autoresearch/eval-v2ex.ts # Run all tasks
|
|
11
|
+
* npx tsx autoresearch/eval-v2ex.ts --task v2ex-hot-topics # Run single task
|
|
12
|
+
* npx tsx autoresearch/eval-v2ex.ts --layer 1 # Run only Layer 1 (atomic)
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { execSync } from 'node:child_process';
|
|
16
|
+
import { readFileSync, writeFileSync, mkdirSync, readdirSync } from 'node:fs';
|
|
17
|
+
import { join, dirname } from 'node:path';
|
|
18
|
+
import { fileURLToPath } from 'node:url';
|
|
19
|
+
|
|
20
|
+
// Directory containing this script, derived from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Task definitions file, located next to this script.
const TASKS_FILE = join(__dirname, 'v2ex-tasks.json');
// Directory next to this script where main() writes its JSON result files.
const RESULTS_DIR = join(__dirname, 'results');
|
|
23
|
+
|
|
24
|
+
/** One task entry as read from the tasks JSON file. */
interface BrowseTask {
  /** Task identifier; used for `--task` selection and for layer classification. */
  name: string;
  /** Shell commands executed in order; only the last command's output is judged. */
  steps: string[];
  /** Criterion applied to the output of the final step. */
  judge: JudgeCriteria;
  // The three optional fields below are not read anywhere in this script.
  set?: 'test';
  note?: string;
  _comment?: string;
}
|
|
32
|
+
|
|
33
|
+
/** Pass/fail criterion for a task, discriminated by `type` (evaluated by judge()). */
type JudgeCriteria =
  // Output contains `value`, compared case-insensitively.
  | { type: 'contains'; value: string }
  // Output parses as a JSON array with at least `minLength` elements.
  | { type: 'arrayMinLength'; minLength: number }
  // Trimmed output is non-empty and is not the literal text 'null' or 'undefined'.
  | { type: 'nonEmpty' }
  // Output matches the regular expression built from `pattern`.
  | { type: 'matchesPattern'; pattern: string };
|
|
38
|
+
|
|
39
|
+
/** Outcome of running one task. */
interface TaskResult {
  name: string;
  passed: boolean;
  /** Elapsed time in milliseconds (difference of two Date.now() readings). */
  duration: number;
  /** Failure detail; left undefined when the task passed. */
  error?: string;
  /** Layer label returned by getLayer(), e.g. 'L1-atomic'. */
  layer: string;
}
|
|
46
|
+
|
|
47
|
+
// Layer classification by task name: prefix match for L1, L3, L4 and L5; exact match for L2
|
|
48
|
+
/**
 * Maps a V2EX task name to its layer label.
 *
 * Rules are checked from top to bottom and the first one that matches wins.
 * L2 requires the name to equal one of its keys exactly; every other layer
 * matches when the name starts with one of its keys.
 *
 * @param name Task name, e.g. 'v2ex-hot-topics'.
 * @returns The layer label ('L1-atomic' … 'L5-complex'), or 'unknown' when no rule matches.
 */
function getLayer(name: string): string {
  type LayerRule = { layer: string; exact: boolean; keys: string[] };

  const rules: LayerRule[] = [
    {
      layer: 'L1-atomic',
      exact: false,
      keys: [
        'v2ex-open-', 'v2ex-state-', 'v2ex-get-title', 'v2ex-click-tab', 'v2ex-scroll-down',
        'v2ex-get-first-', 'v2ex-eval-extract', 'v2ex-get-url', 'v2ex-back-nav', 'v2ex-wait-',
      ],
    },
    {
      layer: 'L2-single-page',
      exact: true,
      keys: [
        'v2ex-hot-topics', 'v2ex-node-list', 'v2ex-topic-meta', 'v2ex-node-topics',
        'v2ex-node-pagination', 'v2ex-tab-content', 'v2ex-topic-replies-extract',
        'v2ex-topic-reply-count', 'v2ex-member-info', 'v2ex-search-results',
      ],
    },
    {
      layer: 'L3-multi-step',
      exact: false,
      keys: [
        'v2ex-click-topic-read', 'v2ex-click-author', 'v2ex-navigate-node', 'v2ex-pagination-page2',
        'v2ex-topic-and-back', 'v2ex-tab-then-topic', 'v2ex-scroll-find-more',
        'v2ex-node-to-topic', 'v2ex-multi-tab-compare', 'v2ex-topic-reply-to-author',
      ],
    },
    {
      layer: 'L4-write',
      exact: false,
      keys: ['v2ex-reply-', 'v2ex-favorite-', 'v2ex-thank-', 'v2ex-create-'],
    },
    {
      layer: 'L5-complex',
      exact: false,
      keys: ['v2ex-collect-', 'v2ex-multi-node-', 'v2ex-topic-deep-', 'v2ex-cross-page-', 'v2ex-full-'],
    },
  ];

  for (const { layer, exact, keys } of rules) {
    const hit = exact ? keys.includes(name) : keys.some(key => name.startsWith(key));
    if (hit) return layer;
  }
  return 'unknown';
}
|
|
66
|
+
|
|
67
|
+
/**
 * Decides whether a command's output satisfies a judge criterion.
 *
 * - 'contains': output contains `value`, ignoring case.
 * - 'arrayMinLength': output parses as a JSON array with at least `minLength` elements.
 * - 'nonEmpty': trimmed output is non-empty and is not the text 'null' or 'undefined'.
 * - 'matchesPattern': output matches `new RegExp(pattern)`.
 *
 * Never throws: an unrecognised criterion type, or any error raised while
 * evaluating (for example an invalid regular expression), yields `false`.
 *
 * @param criteria Criterion to evaluate.
 * @param output Text to check.
 * @returns `true` when the output satisfies the criterion.
 */
function judge(criteria: JudgeCriteria, output: string): boolean {
  try {
    if (criteria.type === 'contains') {
      const haystack = output.toLowerCase();
      return haystack.includes(criteria.value.toLowerCase());
    }

    if (criteria.type === 'arrayMinLength') {
      let parsed: unknown;
      try {
        parsed = JSON.parse(output);
      } catch {
        // Output is not valid JSON, so it cannot be an array.
        return false;
      }
      return Array.isArray(parsed) && parsed.length >= criteria.minLength;
    }

    if (criteria.type === 'nonEmpty') {
      const text = output.trim();
      return text !== '' && text !== 'null' && text !== 'undefined';
    }

    if (criteria.type === 'matchesPattern') {
      return new RegExp(criteria.pattern).test(output);
    }

    return false;
  } catch {
    return false;
  }
}
|
|
90
|
+
|
|
91
|
+
/**
 * Runs a shell command synchronously and returns its trimmed stdout.
 *
 * The command runs with the parent of this script's directory as its working
 * directory and inherits the current environment. stdin, stdout and stderr
 * are all piped, so nothing is echoed to the terminal.
 *
 * If running the command throws (non-zero exit, timeout, …), the error is
 * swallowed and whatever stdout the error object carries is returned instead,
 * or '' when it carries none. stderr is discarded.
 *
 * @param cmd Command line to execute.
 * @param timeout Maximum run time in milliseconds (default 30000).
 * @returns Trimmed stdout, possibly empty.
 */
function runCommand(cmd: string, timeout = 30000): string {
  try {
    const stdout = execSync(cmd, {
      cwd: join(__dirname, '..'),
      env: process.env,
      encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
      timeout,
    });
    return stdout.trim();
  } catch (failure: unknown) {
    // Return whatever stdout the error object carries, if any.
    const partial = (failure as { stdout?: string | null }).stdout;
    return partial?.trim() ?? '';
  }
}
|
|
104
|
+
|
|
105
|
+
/**
 * Executes every step of a task in order and judges the output of the last one.
 *
 * Outputs of earlier steps are discarded; only the final step's output is
 * passed to judge(). When the task fails, `error` holds the first 150
 * characters of that output. If anything throws while running or judging,
 * the task is reported as failed with the first 100 characters of the
 * thrown value's `message`.
 *
 * @param task Task to run.
 * @returns Result record including pass/fail, elapsed milliseconds and layer.
 */
function runTask(task: BrowseTask): TaskResult {
  const startedAt = Date.now();

  // Builds the result record; elapsed time and layer are computed at reporting time.
  const report = (passed: boolean, error: string | undefined): TaskResult => ({
    name: task.name,
    passed,
    duration: Date.now() - startedAt,
    error,
    layer: getLayer(task.name),
  });

  try {
    let finalOutput = '';
    for (const command of task.steps) {
      finalOutput = runCommand(command);
    }

    const ok = judge(task.judge, finalOutput);
    return report(ok, ok ? undefined : `Output: ${finalOutput.slice(0, 150)}`);
  } catch (failure: unknown) {
    const message = (failure as { message?: string }).message;
    return report(false, message?.slice(0, 100));
  }
}
|
|
133
|
+
|
|
134
|
+
/**
 * CLI entry point: loads the V2EX task list, runs the selected tasks one at a
 * time, prints a per-layer summary and writes a JSON result file to RESULTS_DIR.
 *
 * Flags (read from process.argv):
 *   --task <name>   run only the task whose name equals <name>
 *   --layer <n>     run only tasks whose layer label starts with `L<n>`
 * `--task` takes precedence when both are given.
 *
 * Exits with code 1 when the selection matches no task. Task failures do not
 * change the exit code; they are reported in the summary and the result file.
 */
function main() {
  const args = process.argv.slice(2);
  const singleTask = args.includes('--task') ? args[args.indexOf('--task') + 1] : null;
  const layerFilter = args.includes('--layer') ? args[args.indexOf('--layer') + 1] : null;

  // The tasks file may contain comment-only entries; keep only entries that have both `name` and `steps`.
  const raw = JSON.parse(readFileSync(TASKS_FILE, 'utf-8')) as (BrowseTask | { _comment: string })[];
  const allTasks = raw.filter((t): t is BrowseTask => 'name' in t && 'steps' in t);

  let tasks = allTasks;
  if (singleTask) {
    tasks = allTasks.filter(t => t.name === singleTask);
  } else if (layerFilter) {
    const prefix = `L${layerFilter}`;
    tasks = allTasks.filter(t => getLayer(t.name).startsWith(prefix));
  }

  if (tasks.length === 0) {
    console.error(singleTask ? `Task "${singleTask}" not found.` : `No tasks for layer ${layerFilter}.`);
    process.exit(1);
  }

  console.log(`\n🔬 V2EX Test Suite — ${tasks.length} tasks\n`);

  const results: TaskResult[] = [];

  for (let i = 0; i < tasks.length; i++) {
    const task = tasks[i];
    // Progress prefix is written without a newline; the verdict is appended once the task finishes.
    process.stdout.write(` [${i + 1}/${tasks.length}] ${task.name}...`);

    const result = runTask(task);
    results.push(result);

    const icon = result.passed ? '✓' : '✗';
    console.log(` ${icon} (${(result.duration / 1000).toFixed(1)}s)`);

    // Close browser between tasks for clean state
    if (i < tasks.length - 1) {
      try { runCommand('opencli operate close'); } catch { /* ignore */ }
    }
  }

  // Final close
  try { runCommand('opencli operate close'); } catch { /* ignore */ }

  // Summary by layer
  const layers = [...new Set(results.map(r => r.layer))].sort();
  const totalPassed = results.filter(r => r.passed).length;
  const totalDuration = results.reduce((s, r) => s + r.duration, 0);

  console.log(`\n${'─'.repeat(50)}`);
  console.log(` Total: ${totalPassed}/${results.length}`);
  for (const layer of layers) {
    const layerResults = results.filter(r => r.layer === layer);
    const layerPassed = layerResults.filter(r => r.passed).length;
    console.log(` ${layer}: ${layerPassed}/${layerResults.length}`);
  }
  // Total task time, rounded to whole minutes.
  console.log(` Time: ${Math.round(totalDuration / 60000)}min`);

  const failures = results.filter(r => !r.passed);
  if (failures.length > 0) {
    console.log(`\n Failures:`);
    for (const f of failures) {
      console.log(` ✗ [${f.layer}] ${f.name}: ${f.error ?? 'unknown'}`);
    }
  }
  console.log('');

  // Save result
  mkdirSync(RESULTS_DIR, { recursive: true });
  // Number the new file one past the highest existing round. Counting the files
  // instead would reuse an existing number, and overwrite that result, as soon as
  // an earlier result file has been deleted or the numbering has a gap.
  const usedRounds = readdirSync(RESULTS_DIR)
    .map(f => /^v2ex-(\d+)\.json$/.exec(f))
    .filter((m): m is RegExpExecArray => m !== null)
    .map(m => Number(m[1]));
  const roundNum = String(Math.max(0, ...usedRounds) + 1).padStart(3, '0');
  const resultPath = join(RESULTS_DIR, `v2ex-${roundNum}.json`);
  writeFileSync(resultPath, JSON.stringify({
    timestamp: new Date().toISOString(),
    score: `${totalPassed}/${results.length}`,
    layers: Object.fromEntries(layers.map(l => {
      const lr = results.filter(r => r.layer === l);
      return [l, `${lr.filter(r => r.passed).length}/${lr.length}`];
    })),
    duration: `${Math.round(totalDuration / 60000)}min`,
    tasks: results,
  }, null, 2), 'utf-8');
  console.log(` Results saved to: ${resultPath}`);
  // Final summary line in SCORE=<passed>/<total> form.
  console.log(`\nSCORE=${totalPassed}/${results.length}`);
}
|
|
219
|
+
|
|
220
|
+
// Run the suite as soon as this script is loaded.
main();
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
#!/usr/bin/env npx tsx
|
|
2
|
+
/**
|
|
3
|
+
* Zhihu Test Suite: Deterministic command testing against zhihu.com.
|
|
4
|
+
*
|
|
5
|
+
* Tasks are classified into 8 layers by getLayer():
|
|
6
|
+
* L1 Atomic → L2 Feed → L3 Question → L4 Navigation
|
|
7
|
+
* → L5 Write → L6 Chain → L7 Search → L8 Complex
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* npx tsx autoresearch/eval-zhihu.ts # Run all tasks
|
|
11
|
+
* npx tsx autoresearch/eval-zhihu.ts --task zhihu-hot-list # Run single task
|
|
12
|
+
* npx tsx autoresearch/eval-zhihu.ts --layer 1 # Run only Layer 1 (atomic)
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { execSync } from 'node:child_process';
|
|
16
|
+
import { readFileSync, writeFileSync, mkdirSync, readdirSync } from 'node:fs';
|
|
17
|
+
import { join, dirname } from 'node:path';
|
|
18
|
+
import { fileURLToPath } from 'node:url';
|
|
19
|
+
|
|
20
|
+
// Directory containing this script, derived from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Task definitions file, located next to this script.
const TASKS_FILE = join(__dirname, 'zhihu-tasks.json');
// Path of a `results` directory next to this script.
const RESULTS_DIR = join(__dirname, 'results');
|
|
23
|
+
|
|
24
|
+
/** One task entry of the Zhihu task list. */
interface BrowseTask {
  /** Task identifier; getLayer() classifies tasks by this name. */
  name: string;
  /** Shell commands executed in order; only the last command's output is judged. */
  steps: string[];
  /** Criterion applied to the output of the final step. */
  judge: JudgeCriteria;
  // The three optional fields below are not read by the code visible in this section.
  set?: 'test';
  note?: string;
  _comment?: string;
}
|
|
32
|
+
|
|
33
|
+
/** Pass/fail criterion for a task, discriminated by `type` (evaluated by judge()). */
type JudgeCriteria =
  // Output contains `value`, compared case-insensitively.
  | { type: 'contains'; value: string }
  // Output parses as a JSON array with at least `minLength` elements.
  | { type: 'arrayMinLength'; minLength: number }
  // Trimmed output is non-empty and is not the literal text 'null' or 'undefined'.
  | { type: 'nonEmpty' }
  // Output matches the regular expression built from `pattern`.
  | { type: 'matchesPattern'; pattern: string };
|
|
38
|
+
|
|
39
|
+
/** Outcome of running one task. */
interface TaskResult {
  name: string;
  passed: boolean;
  /** Elapsed time in milliseconds (difference of two Date.now() readings). */
  duration: number;
  /** Failure detail; left undefined when the task passed. */
  error?: string;
  /** Layer label returned by getLayer(), e.g. 'L1-atomic'. */
  layer: string;
}
|
|
46
|
+
|
|
47
|
+
// Layer classification by task name
|
|
48
|
+
/**
 * Maps a Zhihu task name to its layer label.
 *
 * Each layer lists the exact task names that belong to it. The table is
 * searched from top to bottom and the first layer that lists the name wins.
 *
 * @param name Task name, e.g. 'zhihu-hot-list'.
 * @returns The layer label ('L1-atomic' … 'L8-complex'), or 'unknown' when the name is not listed.
 */
function getLayer(name: string): string {
  const layerTable: ReadonlyArray<readonly [string, readonly string[]]> = [
    ['L1-atomic', [
      'zhihu-open-home', 'zhihu-get-title', 'zhihu-state', 'zhihu-get-url', 'zhihu-scroll-down',
      'zhihu-click-tab-hot', 'zhihu-back-navigation', 'zhihu-wait-page-load', 'zhihu-keys-escape', 'zhihu-screenshot',
    ]],
    ['L2-feed', [
      'zhihu-feed-titles', 'zhihu-hot-list', 'zhihu-hot-metrics', 'zhihu-nav-tabs',
      'zhihu-feed-with-authors', 'zhihu-feed-types', 'zhihu-user-avatar', 'zhihu-search-input-exists',
    ]],
    ['L3-question', [
      'zhihu-question-title', 'zhihu-question-meta', 'zhihu-first-answer', 'zhihu-answer-votes',
      'zhihu-question-buttons', 'zhihu-multiple-answers', 'zhihu-question-description', 'zhihu-answer-count-number',
    ]],
    ['L4-navigation', [
      'zhihu-hot-to-question', 'zhihu-feed-to-question', 'zhihu-question-to-author',
      'zhihu-search-navigate', 'zhihu-topic-page', 'zhihu-user-profile', 'zhihu-question-and-back', 'zhihu-scroll-load-more',
    ]],
    ['L5-write', [
      'zhihu-upvote-button-find', 'zhihu-follow-question-find', 'zhihu-comment-button-find',
      'zhihu-bookmark-find', 'zhihu-write-answer-btn', 'zhihu-share-find',
    ]],
    ['L6-chain', [
      'zhihu-hot-read-answer-author', 'zhihu-hot-to-author-profile', 'zhihu-multi-hot-topics',
      'zhihu-search-then-read', 'zhihu-question-scroll-answers', 'zhihu-compare-tabs', 'zhihu-user-answers', 'zhihu-topic-questions',
    ]],
    ['L7-search', [
      'zhihu-search-basic', 'zhihu-search-people', 'zhihu-search-topic',
      'zhihu-search-click-result', 'zhihu-search-filter-answers', 'zhihu-search-and-back',
    ]],
    ['L8-complex', [
      'zhihu-full-browse-workflow', 'zhihu-deep-author-chain', 'zhihu-cross-question-compare',
      'zhihu-search-read-chain', 'zhihu-3-page-chain', 'zhihu-hot-scroll-deep-read',
    ]],
  ];

  const match = layerTable.find(([, names]) => names.includes(name));
  return match === undefined ? 'unknown' : match[0];
}
|
|
76
|
+
|
|
77
|
+
/**
 * Decides whether a command's output satisfies a judge criterion.
 *
 * - 'contains': output contains `value`, ignoring case.
 * - 'arrayMinLength': output parses as a JSON array with at least `minLength` elements.
 * - 'nonEmpty': trimmed output is non-empty and is not the text 'null' or 'undefined'.
 * - 'matchesPattern': output matches `new RegExp(pattern)`.
 *
 * Never throws: an unrecognised criterion type, or any error raised while
 * evaluating (for example an invalid regular expression), yields `false`.
 *
 * @param criteria Criterion to evaluate.
 * @param output Text to check.
 * @returns `true` when the output satisfies the criterion.
 */
function judge(criteria: JudgeCriteria, output: string): boolean {
  try {
    if (criteria.type === 'contains') {
      const needle = criteria.value;
      return output.toLowerCase().includes(needle.toLowerCase());
    }

    if (criteria.type === 'arrayMinLength') {
      let candidate: unknown;
      try {
        candidate = JSON.parse(output);
      } catch {
        // Output is not valid JSON, so it cannot be an array.
        return false;
      }
      if (!Array.isArray(candidate)) return false;
      return candidate.length >= criteria.minLength;
    }

    if (criteria.type === 'nonEmpty') {
      const trimmed = output.trim();
      return !(trimmed === '' || trimmed === 'null' || trimmed === 'undefined');
    }

    if (criteria.type === 'matchesPattern') {
      const matcher = new RegExp(criteria.pattern);
      return matcher.test(output);
    }

    return false;
  } catch {
    return false;
  }
}
|
|
100
|
+
|
|
101
|
+
/**
 * Run a shell command synchronously from the directory one level above this
 * file and return its trimmed stdout.
 *
 * This function does not surface command failures: when `execSync` throws
 * (non-zero exit, timeout, ...), whatever stdout is attached to the error is
 * returned trimmed, or an empty string when there is none.
 *
 * @param cmd - Shell command line to execute.
 * @param timeout - Maximum run time in milliseconds (defaults to 30000).
 * @returns Trimmed stdout of the command, possibly partial on failure.
 */
function runCommand(cmd: string, timeout = 30000): string {
  try {
    const stdout = execSync(cmd, {
      stdio: ['pipe', 'pipe', 'pipe'],
      env: process.env,
      encoding: 'utf-8',
      timeout,
      cwd: join(__dirname, '..'),
    });
    return stdout.trim();
  } catch (failure: any) {
    // execSync attaches the captured stdout to the error it throws.
    const partial = failure.stdout?.trim();
    return partial ?? '';
  }
}
|
|
114
|
+
|
|
115
|
+
/**
 * Execute every step of a task in order and judge the output of the last one.
 *
 * Only the final step's output is kept; outputs of earlier steps are
 * overwritten. When the judgement fails, the first 150 characters of that
 * output are recorded as the error. If anything throws along the way, the task
 * is reported as failed with the first 100 characters of the error message.
 *
 * @param task - Task definition providing `name`, `steps` and `judge`.
 * @returns The outcome, including elapsed milliseconds and the task's layer.
 */
function runTask(task: BrowseTask): TaskResult {
  const startedAt = Date.now();

  try {
    let finalOutput = '';
    for (const command of task.steps) {
      finalOutput = runCommand(command);
    }

    const ok = judge(task.judge, finalOutput);
    const failureNote = ok ? undefined : `Output: ${finalOutput.slice(0, 150)}`;

    return {
      name: task.name,
      passed: ok,
      duration: Date.now() - startedAt,
      error: failureNote,
      layer: getLayer(task.name),
    };
  } catch (failure: any) {
    const message = failure.message?.slice(0, 100);
    return {
      name: task.name,
      passed: false,
      duration: Date.now() - startedAt,
      error: message,
      layer: getLayer(task.name),
    };
  }
}
|
|
143
|
+
|
|
144
|
+
/**
 * CLI entry point for the Zhihu test suite.
 *
 * Flags:
 *   --task <name>   run only the task with exactly that name
 *   --layer <n>     run only tasks whose layer label starts with `L<n>`
 *                   (ignored when --task is also given)
 *
 * Runs the selected tasks one after another, closing the browser after each,
 * prints a per-layer summary plus failure details, writes the results to a
 * numbered `zhihu-NNN.json` file in RESULTS_DIR and finishes with a
 * `SCORE=passed/total` line as the last line of output.
 *
 * Exits with status 1 when a flag is given without a value or when no task
 * matches the selection.
 */
function main() {
  const args = process.argv.slice(2);

  // Returns the argument following `flag`, or null when the flag is absent.
  // A flag without a value aborts instead of silently running the whole suite.
  const flagValue = (flag: string): string | null => {
    const at = args.indexOf(flag);
    if (at === -1) return null;
    const value = args[at + 1];
    if (value === undefined || value.startsWith('--')) {
      console.error(`Missing value for ${flag}.`);
      process.exit(1);
    }
    return value;
  };
  const singleTask = flagValue('--task');
  const layerFilter = flagValue('--layer');

  const raw = JSON.parse(readFileSync(TASKS_FILE, 'utf-8')) as (BrowseTask | { _comment: string })[];
  // Entries lacking `name` or `steps` (such as `_comment` markers) are not tasks.
  const allTasks = raw.filter((t): t is BrowseTask => 'name' in t && 'steps' in t);

  let tasks = allTasks;
  if (singleTask) {
    tasks = allTasks.filter(t => t.name === singleTask);
  } else if (layerFilter) {
    const prefix = `L${layerFilter}`;
    tasks = allTasks.filter(t => getLayer(t.name).startsWith(prefix));
  }

  if (tasks.length === 0) {
    console.error(singleTask ? `Task "${singleTask}" not found.` : `No tasks for layer ${layerFilter}.`);
    process.exit(1);
  }

  console.log(`\n🔬 Zhihu Test Suite — ${tasks.length} tasks\n`);

  // Best-effort browser shutdown so each task starts from a clean state.
  const closeBrowser = (): void => {
    try { runCommand('opencli operate close'); } catch { /* ignore */ }
  };

  const results: TaskResult[] = [];

  for (let i = 0; i < tasks.length; i++) {
    const task = tasks[i];
    process.stdout.write(` [${i + 1}/${tasks.length}] ${task.name}...`);

    const result = runTask(task);
    results.push(result);

    const icon = result.passed ? '✓' : '✗';
    console.log(` ${icon} (${(result.duration / 1000).toFixed(1)}s)`);

    closeBrowser();
  }

  // Summary by layer, computed once and reused for console and JSON output.
  const layers = [...new Set(results.map(r => r.layer))].sort();
  const layerScores = layers.map((layer): [string, string] => {
    const layerResults = results.filter(r => r.layer === layer);
    const layerPassed = layerResults.filter(r => r.passed).length;
    return [layer, `${layerPassed}/${layerResults.length}`];
  });
  const totalPassed = results.filter(r => r.passed).length;
  const totalDuration = results.reduce((s, r) => s + r.duration, 0);

  console.log(`\n${'─'.repeat(50)}`);
  console.log(` Total: ${totalPassed}/${results.length}`);
  for (const [layer, score] of layerScores) {
    console.log(` ${layer}: ${score}`);
  }
  console.log(` Time: ${Math.round(totalDuration / 60000)}min`);

  const failures = results.filter(r => !r.passed);
  if (failures.length > 0) {
    console.log(`\n Failures:`);
    for (const f of failures) {
      console.log(` ✗ [${f.layer}] ${f.name}: ${f.error ?? 'unknown'}`);
    }
  }
  console.log('');

  // Save result. The round number is one past the highest existing number, not
  // the file count, so a gap in the sequence never overwrites an earlier file.
  mkdirSync(RESULTS_DIR, { recursive: true });
  const usedRounds = readdirSync(RESULTS_DIR).map(f => {
    const m = /^zhihu-(\d+)/.exec(f);
    return parseInt(m?.[1] ?? '0', 10);
  });
  const roundNum = String(Math.max(0, ...usedRounds) + 1).padStart(3, '0');
  const resultPath = join(RESULTS_DIR, `zhihu-${roundNum}.json`);
  writeFileSync(resultPath, JSON.stringify({
    timestamp: new Date().toISOString(),
    score: `${totalPassed}/${results.length}`,
    layers: Object.fromEntries(layerScores),
    duration: `${Math.round(totalDuration / 60000)}min`,
    tasks: results,
  }, null, 2), 'utf-8');
  console.log(` Results saved to: ${resultPath}`);
  console.log(`\nSCORE=${totalPassed}/${results.length}`);
}
|
|
229
|
+
|
|
230
|
+
main();
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AutoResearch TSV Logger — append-only results log with metadata header.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { writeFileSync, readFileSync, existsSync, appendFileSync } from 'node:fs';
|
|
6
|
+
import type { AutoResearchConfig, IterationResult } from './config.js';
|
|
7
|
+
|
|
8
|
+
/** Column order of every data row in the TSV log. */
const COLUMNS = ['iteration', 'commit', 'metric', 'delta', 'guard', 'status', 'description'];

/**
 * Append-only TSV log of AutoResearch iterations.
 *
 * The file starts with `#`-prefixed metadata lines and a column header line,
 * followed by one tab-separated row per iteration.
 */
export class Logger {
  /** @param path - Location of the TSV file to write and read. */
  constructor(private path: string) {}

  /** Collapse line breaks so a value can never spill onto a second log line. */
  private static oneLine(text: string): string {
    return String(text).replace(/[\r\n]+/g, ' ');
  }

  /**
   * Create (or overwrite) the TSV file with the metadata header and column row.
   *
   * @param config - Run configuration whose direction, goal, scope, verify and
   *   optional guard are recorded as `#` comment lines.
   */
  init(config: AutoResearchConfig): void {
    const header = [
      `# metric_direction: ${config.direction === 'higher' ? 'higher_is_better' : 'lower_is_better'}`,
      `# goal: ${Logger.oneLine(config.goal)}`,
      `# scope: ${Logger.oneLine(config.scope.join(', '))}`,
      `# verify: ${Logger.oneLine(config.verify)}`,
      config.guard ? `# guard: ${Logger.oneLine(config.guard)}` : null,
      COLUMNS.join('\t'),
    ].filter(Boolean).join('\n');
    writeFileSync(this.path, header + '\n', 'utf-8');
  }

  /**
   * Append one iteration result as a single TSV row.
   *
   * Non-negative deltas are written with an explicit `+` sign. Line breaks in
   * the description are replaced by spaces to keep the row on one line.
   */
  append(result: IterationResult): void {
    const row = [
      result.iteration,
      result.commit,
      result.metric,
      result.delta >= 0 ? `+${result.delta}` : result.delta,
      result.guard,
      result.status,
      Logger.oneLine(result.description),
    ].join('\t');
    appendFileSync(this.path, row + '\n', 'utf-8');
  }

  /**
   * Read the last `n` logged entries, oldest first.
   *
   * @param n - Maximum number of entries to return; values that are not
   *   positive yield an empty list.
   * @returns Parsed entries, or an empty list when the file does not exist.
   */
  readLast(n: number): IterationResult[] {
    // `slice(-0)` equals `slice(0)` and would return everything, so guard first.
    if (!(n > 0) || !existsSync(this.path)) return [];
    const lines = readFileSync(this.path, 'utf-8').split(/\r?\n/)
      .filter(l => l && !l.startsWith('#') && !l.startsWith('iteration'));
    return lines.slice(-n).map(line => {
      // The description is the last column and may itself contain tabs.
      const [iteration, commit, metric, delta, guard, status, ...desc] = line.split('\t');
      return {
        iteration: parseInt(iteration, 10),
        commit,
        metric: parseFloat(metric),
        delta: parseFloat(delta),
        guard: guard as 'pass' | 'fail' | '-',
        status: status as IterationResult['status'],
        description: desc.join('\t'),
      };
    });
  }

  /**
   * Count consecutive `discard` entries at the end of the log.
   *
   * Only the last 20 entries are inspected, so the result never exceeds 20.
   */
  consecutiveDiscards(): number {
    const entries = this.readLast(20);
    let count = 0;
    for (let i = entries.length - 1; i >= 0; i--) {
      if (entries[i].status === 'discard') count++;
      else break;
    }
    return count;
  }
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Preset: Combined Reliability (browse + V2EX + Zhihu)
|
|
3
|
+
*
|
|
4
|
+
* Optimizes across ALL test suites simultaneously.
|
|
5
|
+
* Current baseline: 57/59 + 60/60 + 60/60 = 177/179
|
|
6
|
+
* Target: 179/179 (100%)
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { AutoResearchConfig } from '../config.js';
|
|
10
|
+
|
|
11
|
+
/**
 * AutoResearch preset covering the browse, V2EX and Zhihu suites together.
 *
 * `verify` runs `autoresearch/eval-all.ts` and keeps only the last line of its
 * combined stdout/stderr; `guard` is the project build.
 */
export const combinedReliability: AutoResearchConfig = {
  goal: 'Fix all remaining test failures across browse + V2EX + Zhihu (177/179 → 179/179)',

  // Files listed as in scope for this preset.
  scope: [
    'src/browser/dom-snapshot.ts',
    'src/browser/dom-helpers.ts',
    'src/browser/base-page.ts',
    'src/browser/page.ts',
    'src/cli.ts',
    'autoresearch/browse-tasks.json',
  ],

  // A higher pass count is better.
  metric: 'pass_count',
  direction: 'higher',

  verify: 'npx tsx autoresearch/eval-all.ts 2>&1 | tail -1',
  guard: 'npm run build',

  iterations: 10,
  minDelta: 1,
};
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
export { operateReliability } from './operate-reliability.js';
|
|
2
|
+
export { skillQuality } from './skill-quality.js';
|
|
3
|
+
export { v2exReliability } from './v2ex-reliability.js';
|
|
4
|
+
export { zhihuReliability } from './zhihu-reliability.js';
|
|
5
|
+
export { combinedReliability } from './combined-reliability.js';
|
|
6
|
+
export { saveReliability } from './save-reliability.js';
|
|
7
|
+
|
|
8
|
+
import type { AutoResearchConfig } from '../config.js';
|
|
9
|
+
import { operateReliability } from './operate-reliability.js';
|
|
10
|
+
import { skillQuality } from './skill-quality.js';
|
|
11
|
+
import { v2exReliability } from './v2ex-reliability.js';
|
|
12
|
+
import { zhihuReliability } from './zhihu-reliability.js';
|
|
13
|
+
import { combinedReliability } from './combined-reliability.js';
|
|
14
|
+
import { saveReliability } from './save-reliability.js';
|
|
15
|
+
|
|
16
|
+
/**
 * Lookup table from preset name to its AutoResearch configuration.
 *
 * Note that the combined preset is registered under the short key `combined`,
 * unlike the other reliability presets, which keep their `-reliability` suffix.
 */
export const PRESETS: Record<string, AutoResearchConfig> = {
  // Single-suite presets
  'operate-reliability': operateReliability,
  'skill-quality': skillQuality,
  'v2ex-reliability': v2exReliability,
  'zhihu-reliability': zhihuReliability,

  // All suites at once
  'combined': combinedReliability,

  // "Save as CLI" pipeline
  'save-reliability': saveReliability,
};
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Preset: Operate Command Reliability
|
|
3
|
+
*
|
|
4
|
+
* Optimizes opencli operate commands against the Layer 1 deterministic test suite.
|
|
5
|
+
* Metric: number of passing browse-tasks (out of 59).
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { AutoResearchConfig } from '../config.js';
|
|
9
|
+
|
|
10
|
+
/**
 * AutoResearch preset for the operate-command browse suite.
 *
 * `verify` runs `autoresearch/eval-browse.ts` and keeps only the last line of
 * its combined stdout/stderr; `guard` is the project build.
 */
export const operateReliability: AutoResearchConfig = {
  goal: 'Increase operate command pass rate to 59/59 (100%)',

  // Files listed as in scope for this preset.
  scope: [
    'src/browser/dom-snapshot.ts',
    'src/browser/dom-helpers.ts',
    'src/browser/base-page.ts',
    'src/browser/page.ts',
    'src/cli.ts',
  ],

  // A higher pass count is better.
  metric: 'pass_count',
  direction: 'higher',

  verify: 'npx tsx autoresearch/eval-browse.ts 2>&1 | tail -1',
  guard: 'npm run build',

  minDelta: 1,
};
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Preset: Save as CLI Reliability
|
|
3
|
+
*
|
|
4
|
+
* Optimizes the "Save as CLI" pipeline: operate init → write adapter → run.
|
|
5
|
+
* Covers PUBLIC (no auth) and COOKIE (browser session) strategies.
|
|
6
|
+
* Metric: number of passing save-tasks.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { AutoResearchConfig } from '../config.js';
|
|
10
|
+
|
|
11
|
+
/**
 * AutoResearch preset for the "Save as CLI" pipeline.
 *
 * `verify` runs `autoresearch/eval-save.ts` and keeps only the last line of
 * its combined stdout/stderr; `guard` is the project build.
 */
export const saveReliability: AutoResearchConfig = {
  goal: 'Increase "Save as CLI" pipeline pass rate to 100%. The flow is: operate init creates a scaffold, user writes adapter code, opencli discovers and runs it. Covers both PUBLIC (fetch API) and COOKIE (browser session) strategies. Focus on: init template correctness, user CLI discovery, adapter loading, verify command robustness, and browser session handling.',

  // Files listed as in scope: CLI sources, the operate skill, and the save fixtures.
  scope: [
    'src/cli.ts',
    'src/discovery.ts',
    'src/registry.ts',
    'skills/opencli-operate/SKILL.md',
    'autoresearch/save-tasks.json',
    'autoresearch/save-adapters/*.ts',
  ],

  // A higher pass count is better.
  metric: 'pass_count',
  direction: 'higher',

  verify: 'npx tsx autoresearch/eval-save.ts 2>&1 | tail -1',
  guard: 'npm run build',

  minDelta: 1,
};
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Preset: Skill E2E Quality
|
|
3
|
+
*
|
|
4
|
+
* Optimizes the opencli-operate SKILL.md against the Layer 2 LLM E2E test suite.
|
|
5
|
+
* Metric: number of passing skill-tasks (out of 35).
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { AutoResearchConfig } from '../config.js';
|
|
9
|
+
|
|
10
|
+
/**
 * AutoResearch preset for the skill E2E suite.
 *
 * The only file in scope is the opencli-operate SKILL.md. `verify` runs
 * `autoresearch/eval-skill.ts` and keeps only the last line of its combined
 * stdout/stderr; `guard` is the project build.
 */
export const skillQuality: AutoResearchConfig = {
  goal: 'Increase skill E2E pass rate to 35/35 (100%)',

  scope: ['skills/opencli-operate/SKILL.md'],

  // A higher pass count is better.
  metric: 'pass_count',
  direction: 'higher',

  verify: 'npx tsx autoresearch/eval-skill.ts 2>&1 | tail -1',
  guard: 'npm run build',

  iterations: 20,
};
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Preset: V2EX Command Reliability
|
|
3
|
+
*
|
|
4
|
+
* Optimizes opencli operate commands against the V2EX-specific test suite.
|
|
5
|
+
* 40 tasks across 5 difficulty layers (atomic → complex chain).
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { AutoResearchConfig } from '../config.js';
|
|
9
|
+
|
|
10
|
+
/**
 * AutoResearch preset for the V2EX operate-command suite.
 *
 * `verify` runs `autoresearch/eval-v2ex.ts` and keeps only the last line of
 * its combined stdout/stderr; `guard` is the project build.
 */
export const v2exReliability: AutoResearchConfig = {
  goal: 'Increase V2EX operate command pass rate to 40/40 (100%)',

  // Files listed as in scope for this preset.
  scope: [
    'src/browser/dom-snapshot.ts',
    'src/browser/dom-helpers.ts',
    'src/browser/base-page.ts',
    'src/browser/page.ts',
    'src/cli.ts',
  ],

  // A higher pass count is better.
  metric: 'pass_count',
  direction: 'higher',

  verify: 'npx tsx autoresearch/eval-v2ex.ts 2>&1 | tail -1',
  guard: 'npm run build',

  minDelta: 1,
};
|