@seanyao/roll 2026.515.1 → 2026.516.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -4,6 +4,7 @@
4
4
 
5
5
  - **New**: `roll brief` / `roll dream` 生成文档后自动提交推送 — 每次晨报和夜检不再需要手动 commit `[loop]`
6
6
  - **New**: 双语 FAQ 指南 — 10 个自治交付常见场景(loop 卡住、PR 冲突、agent 切换、权限问题等),每条含原因和原理,EN + ZH 对照 `[docs]`
7
+ - **Fixed**: loop 孤儿状态自愈 — cycle 启动时检测 state.yaml 残留 running,若无活跃进程则自动重置为 idle,防止 loop 因中断永久卡死 `[loop]`
7
8
  - **New**: 可选的事件驱动 PR 评审模板 — `cp templates/workflows/pr-review-event.yml .github/workflows/`,PR 开即触发 AI 评审,不装也行(loop 每轮兜底) `[pr]`
8
9
  - **New**: loop PR inbox 从"分类但空转"升级到"分类+执行" — eligible PR 自动调 AI 评审,stale PR 自动 rebase,fork 和冲突写 ALERT;bot 已评审的 PR 自动让步 `[loop]`
9
10
  - **New**: `roll review-pr <number>` — agent-agnostic AI 代码评审,任意 agent(Claude/Kimi/DeepSeek 等)均可评审任意 git 平台的 PR;PR body 加 `[skip-ai-review]` 可跳过 `[pr]`
package/README.md CHANGED
@@ -11,6 +11,7 @@
11
11
 
12
12
  **[中文版 README](README_CN.md)**
13
13
 
14
+ [![Website](https://img.shields.io/badge/Website-seanyao.github.io%2FRoll-blue)](https://seanyao.github.io/Roll/)
14
15
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
15
16
  [![npm version](https://img.shields.io/npm/v/@seanyao/roll.svg)](https://www.npmjs.com/package/@seanyao/roll)
16
17
  [![CI](https://github.com/seanyao/roll/actions/workflows/ci.yml/badge.svg)](https://github.com/seanyao/roll/actions/workflows/ci.yml)
package/bin/roll CHANGED
@@ -4,7 +4,7 @@ set -euo pipefail
4
4
  # Roll — AI Agent Convention Manager
5
5
  # Single source of truth for how all AI coding agents behave.
6
6
 
7
- VERSION="2026.515.1"
7
+ VERSION="2026.516.1"
8
8
  ROLL_HOME="${ROLL_HOME:-${HOME}/.roll}"
9
9
  ROLL_CONFIG="${ROLL_HOME}/config.yaml"
10
10
  ROLL_GLOBAL="${ROLL_HOME}/conventions/global"
@@ -1131,60 +1131,6 @@ _ensure_features_dir() {
1131
1131
  _ROLL_MERGE_SUMMARY+=("created|docs/features/")
1132
1132
  }
1133
1133
 
1134
- # ─── Helper: write starter .gitignore (no-op if exists) ──────────────────────
1135
- _write_gitignore() {
1136
- [[ -f "$1" ]] && return
1137
- cat > "$1" << 'EOF'
1138
- node_modules/
1139
- dist/
1140
- build/
1141
- .env
1142
- *.local
1143
- .DS_Store
1144
- *.log
1145
- EOF
1146
- }
1147
-
1148
- # ─── Helper: write starter .env.example (no-op if exists) ────────────────────
1149
- _write_env_example() {
1150
- [[ -f "$1" ]] && return
1151
- cat > "$1" << 'EOF'
1152
- # Environment Variables — copy to .env and fill in values
1153
-
1154
- # Application
1155
- # NODE_ENV=development
1156
- # PORT=3000
1157
-
1158
- # Database
1159
- # DATABASE_URL=postgresql://user:pass@localhost:5432/db
1160
-
1161
- # Auth
1162
- # JWT_SECRET=your-secret-key
1163
- EOF
1164
- }
1165
-
1166
- # ─── Helper: detect project type from existing AGENTS.md ─────────────────────
1167
- detect_project_type() {
1168
- local agents_file="$1/AGENTS.md"
1169
- [[ -f "$agents_file" ]] || { echo "unknown"; return; }
1170
-
1171
- local content
1172
- content="$(cat "$agents_file")"
1173
-
1174
- if echo "$content" | grep -qi "Fullstack Web"; then
1175
- echo "fullstack"
1176
- elif echo "$content" | grep -qi "Backend Service"; then
1177
- echo "backend-service"
1178
- elif echo "$content" | grep -qi "Frontend Only"; then
1179
- echo "frontend-only"
1180
- elif echo "$content" | grep -qi "CLI Tool"; then
1181
- echo "cli"
1182
- else
1183
- # AGENTS.md exists but has no type marker — fall back to file-based scan
1184
- scan_project_type_from_files "$1"
1185
- fi
1186
- }
1187
-
1188
1134
  # ═══════════════════════════════════════════════════════════════════════════════
1189
1135
  # COMMAND: status
1190
1136
  # Show current state of conventions
@@ -1478,7 +1424,7 @@ _peer_auto_attach() {
1478
1424
  [ "$(uname)" = "Darwin" ] || return 0
1479
1425
  [ -f "$_LOOP_MUTE_FILE" ] && return 0
1480
1426
  local terminal_pref
1481
- terminal_pref=$(_config_read_string "loop_attach_terminal" "")
1427
+ terminal_pref=$(config_get "loop_attach_terminal" "")
1482
1428
  if [[ -z "$terminal_pref" ]]; then
1483
1429
  case "${TERM_PROGRAM:-}" in
1484
1430
  ghostty) terminal_pref="ghostty" ;;
@@ -2111,6 +2057,14 @@ _LAUNCHD_DIR="${HOME}/Library/LaunchAgents"
2111
2057
  # hash of the full path, ensuring uniqueness across sibling dirs with same name.
2112
2058
  _project_slug() {
2113
2059
  local path="$1"
2060
+ # FIX-034: when inside a git worktree, git-common-dir returns the main tree's
2061
+ # absolute .git path; resolve to the main tree so worktree and main-tree runs
2062
+ # produce the same slug.
2063
+ local _common
2064
+ _common=$(git -C "$path" rev-parse --git-common-dir 2>/dev/null)
2065
+ if [[ -n "$_common" && "$_common" == *"/.git" ]]; then
2066
+ path="${_common%/.git}"
2067
+ fi
2114
2068
  local base; base=$(basename "$path")
2115
2069
  local hash
2116
2070
  if command -v md5 &>/dev/null; then
@@ -2125,17 +2079,10 @@ _project_slug() {
2125
2079
  _config_read_int() {
2126
2080
  local key="$1" default="$2"
2127
2081
  local val
2128
- val=$(grep "^${key}:" "$ROLL_CONFIG" 2>/dev/null | awk '{print $2}' | tr -d '"' | head -1)
2082
+ val=$(config_get "$key" "")
2129
2083
  if [[ "$val" =~ ^[0-9]+$ ]]; then echo "$val"; else echo "$default"; fi
2130
2084
  }
2131
2085
 
2132
- _config_read_string() {
2133
- local key="$1" default="$2"
2134
- local val
2135
- val=$(grep "^${key}:" "$ROLL_CONFIG" 2>/dev/null | awk '{print $2}' | tr -d '"' | head -1)
2136
- if [[ -n "$val" ]]; then echo "$val"; else echo "$default"; fi
2137
- }
2138
-
2139
2086
  # Derive a minute in [1,55] from project path hash + offset so different projects
2140
2087
  # and different services within a project don't fire at the same time.
2141
2088
  # Offsets used: loop=0, dream=2, brief=4 → always three distinct values (2<55).
@@ -2248,7 +2195,15 @@ if [ -f "\$INNER_LOCK" ]; then
2248
2195
  rm -f "\$INNER_LOCK"
2249
2196
  fi
2250
2197
  printf '%s:%s\n' "\$\$" "\$(date -u +%s)" > "\$INNER_LOCK"
2251
- trap 'rm -f "\$INNER_LOCK"' EXIT
2198
+ # FIX-038: background heartbeat writer — outer script uses this as primary liveness signal
2199
+ # to detect stale execution without relying on PID reuse heuristics.
2200
+ HEARTBEAT_FILE="${HOME}/.shared/roll/loop/.heartbeat-${slug}"
2201
+ _heartbeat_writer() {
2202
+ while true; do echo "\$(date -u +%s)" > "\$HEARTBEAT_FILE"; sleep 60; done
2203
+ }
2204
+ _heartbeat_writer &
2205
+ _HEARTBEAT_PID=\$!
2206
+ trap 'kill "\${_HEARTBEAT_PID}" 2>/dev/null; rm -f "\$INNER_LOCK" "\$HEARTBEAT_FILE"' EXIT
2252
2207
 
2253
2208
  # US-AUTO-037: pull in worktree helpers (US-AUTO-036). Sourcing bin/roll is
2254
2209
  # safe — its main() only runs when invoked directly (BASH_SOURCE == \$0).
@@ -2306,26 +2261,34 @@ _claude_cleanup_stale_worktrees "${project_path}" || true
2306
2261
  # When \`gh\` is unavailable, fall back to the legacy ff-merge path.
2307
2262
  if [ "\$_USE_WORKTREE" = "1" ]; then
2308
2263
  if [ "\$_exit" -eq 0 ]; then
2309
- if ( cd "\$WT" && _loop_is_doc_only_change ); then
2310
- ( cd "\$WT" && _loop_publish_doc_pr "\$BRANCH" "doc: loop cycle \${CYCLE_ID}" )
2311
- else
2312
- ( cd "\$WT" && _loop_publish_pr "\$BRANCH" "loop cycle \${CYCLE_ID}" )
2313
- fi
2314
- _publish_status=\$?
2315
- if [ "\$_publish_status" -eq 0 ]; then
2264
+ # Idle cycle no commits ahead of origin/main means nothing was built;
2265
+ # skip publish and reclaim the worktree immediately.
2266
+ _cycle_commits=\$(cd "\$WT" && git rev-list --count origin/main..HEAD 2>/dev/null || echo 0)
2267
+ if [ "\$_cycle_commits" -eq 0 ]; then
2316
2268
  _worktree_cleanup "\$WT" "\$BRANCH"
2317
- echo "[loop] cycle \${CYCLE_ID}: published; worktree cleaned"
2318
- elif [ "\$_publish_status" -eq 2 ]; then
2319
- if ( cd "${project_path}" && _worktree_merge_back "\$BRANCH" ); then
2269
+ echo "[loop] cycle \${CYCLE_ID}: idle (no new commits); worktree cleaned"
2270
+ else
2271
+ if ( cd "\$WT" && _loop_is_doc_only_change ); then
2272
+ ( cd "\$WT" && _loop_publish_doc_pr "\$BRANCH" "doc: loop cycle \${CYCLE_ID}" )
2273
+ else
2274
+ ( cd "\$WT" && _loop_publish_pr "\$BRANCH" "loop cycle \${CYCLE_ID}" )
2275
+ fi
2276
+ _publish_status=\$?
2277
+ if [ "\$_publish_status" -eq 0 ]; then
2320
2278
  _worktree_cleanup "\$WT" "\$BRANCH"
2321
- echo "[loop] cycle \${CYCLE_ID}: gh unavailable; merged via ff and cleaned up"
2279
+ echo "[loop] cycle \${CYCLE_ID}: published; worktree cleaned"
2280
+ elif [ "\$_publish_status" -eq 2 ]; then
2281
+ if ( cd "${project_path}" && _worktree_merge_back "\$BRANCH" ); then
2282
+ _worktree_cleanup "\$WT" "\$BRANCH"
2283
+ echo "[loop] cycle \${CYCLE_ID}: gh unavailable; merged via ff and cleaned up"
2284
+ else
2285
+ _worktree_alert "cycle \${CYCLE_ID}: gh unavailable AND merge_back failed; worktree preserved at \$WT"
2286
+ echo "[loop] cycle \${CYCLE_ID}: gh+merge_back both failed; worktree preserved at \$WT"
2287
+ fi
2322
2288
  else
2323
- _worktree_alert "cycle \${CYCLE_ID}: gh unavailable AND merge_back failed; worktree preserved at \$WT"
2324
- echo "[loop] cycle \${CYCLE_ID}: gh+merge_back both failed; worktree preserved at \$WT"
2289
+ _worktree_alert "cycle \${CYCLE_ID}: PR publish failed; worktree preserved at \$WT (branch \$BRANCH)"
2290
+ echo "[loop] cycle \${CYCLE_ID}: PR publish failed; worktree preserved at \$WT"
2325
2291
  fi
2326
- else
2327
- _worktree_alert "cycle \${CYCLE_ID}: PR publish failed; worktree preserved at \$WT (branch \$BRANCH)"
2328
- echo "[loop] cycle \${CYCLE_ID}: PR publish failed; worktree preserved at \$WT"
2329
2292
  fi
2330
2293
  else
2331
2294
  _worktree_alert "cycle \${CYCLE_ID}: claude exited \$_exit; worktree preserved at \$WT (branch \$BRANCH)"
@@ -2340,6 +2303,8 @@ INNER
2340
2303
 
2341
2304
  cat > "$script_path" << SCRIPT
2342
2305
  #!/bin/bash -l
2306
+ # caffeinate: prevent idle sleep from killing claude during cycles
2307
+ caffeinate -i -w \$\$ &
2343
2308
  # Active-window check — skipped when ROLL_LOOP_FORCE is set (manual 'roll loop now')
2344
2309
  if [ -z "\$ROLL_LOOP_FORCE" ]; then
2345
2310
  h=\$(printf '%d' "\$(date +%H)")
@@ -2348,6 +2313,50 @@ fi
2348
2313
  # Pause check — 'roll loop pause' creates this marker to suspend scheduling
2349
2314
  PAUSE="\$HOME/.shared/roll/loop/PAUSE-${slug}"
2350
2315
  if [ -z "\$ROLL_LOOP_FORCE" ] && [ -f "\$PAUSE" ]; then exit 0; fi
2316
+ # FIX-037: orphan state detection & self-heal — if state.yaml says running
2317
+ # but no LOCK process or tmux session exists, the previous cycle was killed
2318
+ # (e.g. SIGKILL / sleep / terminal close). Heal state to idle so the next
2319
+ # cycle can proceed normally; write ALERT for transparency.
2320
+ # FIX-038: heartbeat is the primary liveness signal (avoids PID reuse race);
2321
+ # LOCK pid check is secondary fallback for backward compatibility.
2322
+ HEARTBEAT_TIMEOUT="\${ROLL_HEARTBEAT_TIMEOUT:-1800}"
2323
+ STATE_FILE="${HOME}/.shared/roll/loop/state.yaml"
2324
+ if [ -f "\$STATE_FILE" ]; then
2325
+ _state=\$(grep '^status:' "\$STATE_FILE" | awk '{print \$2}' 2>/dev/null || echo "")
2326
+ if [ "\$_state" = "running" ]; then
2327
+ _still_active=false
2328
+ # FIX-038: heartbeat is primary signal
2329
+ _heartbeat_file="${HOME}/.shared/roll/loop/.heartbeat-${slug}"
2330
+ if [ -f "\$_heartbeat_file" ]; then
2331
+ _hb_ts=\$(cat "\$_heartbeat_file" 2>/dev/null || echo "0")
2332
+ _now=\$(date -u +%s)
2333
+ _hb_age=\$(( _now - _hb_ts ))
2334
+ if [ "\$_hb_age" -lt "\$HEARTBEAT_TIMEOUT" ]; then
2335
+ _still_active=true
2336
+ fi
2337
+ fi
2338
+ # Fallback: LOCK pid check (for cycles without heartbeat, e.g. pre-FIX-038)
2339
+ if [ "\$_still_active" = false ]; then
2340
+ _lock_file="\$(dirname "\$0")/.LOCK-\$(basename "\$0" .sh | sed 's/^run-//')"
2341
+ if [ -f "\$_lock_file" ]; then
2342
+ _lock_pid=\$(head -1 "\$_lock_file" 2>/dev/null || echo "")
2343
+ [ -n "\$_lock_pid" ] && kill -0 "\$_lock_pid" 2>/dev/null && _still_active=true
2344
+ fi
2345
+ fi
2346
+ # Final: tmux session check
2347
+ if [ "\$_still_active" = false ]; then
2348
+ command -v tmux >/dev/null 2>&1 && tmux has-session -t "roll-loop-\$(basename "\$0" .sh | sed 's/^run-//')" 2>/dev/null && _still_active=true
2349
+ fi
2350
+ if [ "\$_still_active" = false ]; then
2351
+ echo "[\$(date '+%Y-%m-%dT%H:%M:%S%z')] FIX-037: orphan state detected (status=running, heartbeat stale or missing) — healing to idle" >> "\$LOG"
2352
+ echo "status: idle" > "\${STATE_FILE}.tmp" && mv "\${STATE_FILE}.tmp" "\$STATE_FILE"
2353
+ rm -f "\$_lock_file" 2>/dev/null || true
2354
+ _alert_file="\$(dirname "\$0")/ALERT.md"
2355
+ echo "\$(date '+%Y-%m-%dT%H:%M:%S%z') | FIX-037 auto-heal | Orphan state detected and cleared (status=running → idle)" >> "\$_alert_file" 2>/dev/null || true
2356
+ echo "[\$(date '+%Y-%m-%dT%H:%M:%S%z')] FIX-037: healed to idle, ALERT written" >> "\$LOG"
2357
+ fi
2358
+ fi
2359
+ fi
2351
2360
  LOCK="\$(dirname "\$0")/.LOCK-\$(basename "\$0" .sh | sed 's/^run-//')"
2352
2361
  SESSION="roll-loop-\$(basename "\$0" .sh | sed 's/^run-//')"
2353
2362
  INNER_SCRIPT="${inner_path}"
@@ -2449,7 +2458,7 @@ _install_launchd_plists() {
2449
2458
 
2450
2459
  # Terminal preference: config wins, then TERM_PROGRAM env, then "Terminal"
2451
2460
  local terminal_pref
2452
- terminal_pref=$(_config_read_string "loop_attach_terminal" "")
2461
+ terminal_pref=$(config_get "loop_attach_terminal" "")
2453
2462
  if [[ -z "$terminal_pref" ]]; then
2454
2463
  case "${TERM_PROGRAM:-}" in
2455
2464
  ghostty) terminal_pref="ghostty" ;;
@@ -2679,7 +2688,7 @@ _loop_test() {
2679
2688
 
2680
2689
  # Detect terminal pref same way _install_launchd_plists does
2681
2690
  local terminal_pref
2682
- terminal_pref=$(_config_read_string "loop_attach_terminal" "")
2691
+ terminal_pref=$(config_get "loop_attach_terminal" "")
2683
2692
  if [[ -z "$terminal_pref" ]]; then
2684
2693
  case "${TERM_PROGRAM:-}" in
2685
2694
  ghostty) terminal_pref="ghostty" ;;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@seanyao/roll",
3
- "version": "2026.515.1",
3
+ "version": "2026.516.1",
4
4
  "description": "Roll — Roll out features with AI agents",
5
5
  "scripts": {
6
6
  "test": "bash tests/run.sh"
@@ -27,7 +27,6 @@
27
27
  "conventions/",
28
28
  "lib/",
29
29
  "skills/",
30
- "tools/",
31
30
  "template/",
32
31
  "README.md",
33
32
  "CHANGELOG.md"
@@ -1,182 +0,0 @@
1
- ---
2
- hidden: true
3
- name: roll-fetch
4
- description: Web page fetching and crawling for AI agents. Extract content from URLs for research, documentation, and competitive analysis.
5
- ---
6
-
7
- # Roll Fetch - Web Content Extraction
8
-
9
- Extract content from web pages for research and analysis.
10
-
11
- ## When to Use
12
-
13
- - Product research (competitor analysis)
14
- - Technical documentation gathering
15
- - Code examples and best practices
16
- - Full site crawling for backup/analysis
17
-
18
- ## Environment Setup
19
-
20
- Configure API keys per machine:
21
-
22
- ```bash
23
- # Required for Tavily
24
- export TAVILY_API_KEY=tvly-dev-...
25
-
26
- # Optional for cloud browser fallback
27
- export BROWSER_USE_API_KEY=bu-...
28
- ```
29
-
30
- Or create `.env` file in project root:
31
- ```
32
- TAVILY_API_KEY=tvly-dev-...
33
- BROWSER_USE_API_KEY=bu-...
34
- ```
35
-
36
- ## Methods
37
-
38
- ### 1. Tavily API (Recommended)
39
-
40
- Best quality extraction, requires `TAVILY_API_KEY`.
41
-
42
- ```bash
43
- # Using Tavily CLI or API
44
- curl -X POST https://api.tavily.com/extract \
45
- -H "Content-Type: application/json" \
46
- -d '{
47
- "urls": ["https://example.com"],
48
- "api_key": "your_tavily_api_key"
49
- }'
50
- ```
51
-
52
- **Pros**: AI-optimized extraction, handles complex layouts
53
- **Cons**: Requires API key, rate limited
54
-
55
- ### 2. LLM Native Fetch (Default)
56
-
57
- Use your built-in URL fetching capability directly.
58
-
59
- **When to use**: When Tavily is unavailable or for quick checks.
60
-
61
- **Note**: Most modern AI agents (Kimi, Codex, Claude) have native URL fetching. Use `FetchURL` tool or equivalent.
62
-
63
- ### 3. Browser Automation (Fallback)
64
-
65
- Local browser automation for stubborn pages using **[browser-use](https://github.com/browser-use/browser-use)**.
66
-
67
- **How to Choose:**
68
-
69
- | If | Then Use | Why |
70
- |----|---------|-----|
71
- | `BROWSER_USE_API_KEY` in env | **Cloud** | Managed browsers, less setup |
72
- | No API key, but `browser-use` installed | **Local** | Free, no external dependency |
73
- | Neither | Skip to manual extraction | Tell user "Need browser automation setup" |
74
-
75
- **Option A: Local (Free, No API Key)**
76
- ```python
77
- from browser_use import Agent, Browser, BrowserConfig
78
- import asyncio
79
-
80
- async def fetch_page(url):
81
- # Pure local, no API key needed
82
- browser = Browser(config=BrowserConfig(headless=True))
83
- await browser.start()
84
- page = await browser.get_current_page()
85
- await page.goto(url)
86
- content = await page.content()
87
- await browser.stop()
88
- return content
89
-
90
- # Run
91
- content = asyncio.run(fetch_page("https://example.com"))
92
- ```
93
-
94
- **Option B: Cloud API**
95
- ```python
96
- from browser_use import Agent
97
-
98
- agent = Agent(
99
- task=f"Extract the main content from {url} and return as markdown",
100
- llm="moonshot" # or openai, anthropic
101
- )
102
- result = await agent.run()
103
- ```
104
-
105
- **Setup** (Local):
106
- ```bash
107
- pip install browser-use
108
- playwright install chromium
109
- ```
110
-
111
- ## Usage
112
-
113
- ### CLI Usage (via smart-web-fetch.js)
114
-
115
- ```bash
116
- # Auto mode (Tavily → Native → Browser)
117
- node smart-web-fetch.js fetch https://example.com
118
-
119
- # Explicit method
120
- node smart-web-fetch.js fetch https://example.com tavily
121
- node smart-web-fetch.js fetch https://example.com native
122
- node smart-web-fetch.js fetch https://example.com browser
123
-
124
- # Search
125
- node smart-web-fetch.js search "Python async" 5
126
- ```
127
-
128
- ### Programmatic Usage
129
-
130
- ```javascript
131
- const { smartFetch, smartSearch } = require('./smart-web-fetch.js');
132
-
133
- // Fetch a page
134
- const result = await smartFetch('https://example.com');
135
- console.log(result.content);
136
-
137
- // Search
138
- const searchResult = await smartSearch('OpenAI GPT-5', 5);
139
- console.log(searchResult.results);
140
- ```
141
-
142
- ### Single Page Fetch
143
-
144
- ```
145
- User: "Fetch https://docs.example.com/api"
146
- → Use smart-web-fetch.js with auto mode
147
- → Return clean markdown content
148
- ```
149
-
150
- ### Full Site Crawl
151
-
152
- ```
153
- User: "Crawl https://docs.example.com"
154
- → Use smart-web-fetch.js recursively
155
- → Extract all internal links
156
- → Recursively fetch up to max depth (default: 2)
157
- → Save each page as separate markdown file
158
- ```
159
-
160
- ## Output Format
161
-
162
- Always return clean Markdown:
163
- - Extract main content only (remove nav, ads, footers)
164
- - Preserve code blocks and tables
165
- - Include source URL as header
166
-
167
- ## Quality Check
168
-
169
- Validate extracted content:
170
- - Min length: 500 chars (reject if shorter)
171
- - Check for captcha/error messages
172
- - Verify main content structure (headings, paragraphs)
173
-
174
- ## Examples
175
-
176
- | Task | Method | Command |
177
- |------|--------|---------|
178
- | Quick article | Auto | `node smart-web-fetch.js fetch https://blog.example.com` |
179
- | API docs | Tavily | `node smart-web-fetch.js fetch https://docs.example.com tavily` |
180
- | SPA site | Browser | `node smart-web-fetch.js fetch https://spa.example.com browser` |
181
- | Search | Tavily | `node smart-web-fetch.js search "Python async" 5` |
182
- | Fallback test | Native | `node smart-web-fetch.js fetch https://example.com native` |
@@ -1,15 +0,0 @@
1
- {
2
- "name": "smart-web-fetch",
3
- "version": "1.0.0",
4
- "description": "Intelligent web fetching with automatic Tavily → Scrapling fallback",
5
- "main": "smart-web-fetch.js",
6
- "bin": {
7
- "smart-web-fetch": "./smart-web-fetch.js"
8
- },
9
- "scripts": {
10
- "test": "node smart-web-fetch.js fetch https://example.com"
11
- },
12
- "keywords": ["web-scraping", "tavily", "scrapling", "fallback"],
13
- "author": "R0_lobster",
14
- "license": "MIT"
15
- }
@@ -1,558 +0,0 @@
1
- #!/usr/bin/env node
2
- /**
3
- * Smart Web Fetch Skill - Simplified 3-Layer Strategy
4
- * 三层策略: Tavily → LLM Native → Browser
5
- * 移除 mcporter, 直接 HTTP 调用, Key 从环境变量获取
6
- */
7
-
8
- const { execSync } = require('child_process');
9
- const https = require('https');
10
-
11
- // Configuration
12
- const TAVILY_TIMEOUT = 30000;
13
- const BROWSER_TIMEOUT = 90000;
14
- const MIN_CONTENT_LENGTH = 200;
15
- const MAX_RETRIES = 2;
16
-
17
- // Blocked content keywords
18
- const BLOCKED_KEYWORDS = [
19
- // Chinese
20
- '验证', 'captcha', '请登录', '环境异常', '登录后', '需要验证',
21
- '请完成验证', '安全检查', '访问受限', 'blocked', 'access denied',
22
- '拖动滑块', '完成拼图', '点击验证', '继续访问', '登录查看',
23
- '验证后即可', '异常访问', '安全验证', '人机验证',
24
- // English
25
- 'verify', 'verification', 'complete the verification', 'captcha required',
26
- 'please log in', 'sign in to', 'access denied', 'blocked',
27
- 'security check', 'human verification', 'prove you\'re human'
28
- ];
29
-
30
- /**
31
- * Check if content is blocked or low quality
32
- */
33
- function isBlockedOrLowQuality(content, source = 'unknown') {
34
- if (!content || content.length < MIN_CONTENT_LENGTH) {
35
- return { blocked: true, reason: 'Content too short or empty', severity: 'high' };
36
- }
37
-
38
- const lowerContent = content.toLowerCase();
39
- const foundKeywords = [];
40
-
41
- for (const keyword of BLOCKED_KEYWORDS) {
42
- if (lowerContent.includes(keyword.toLowerCase())) {
43
- foundKeywords.push(keyword);
44
- }
45
- }
46
-
47
- if (foundKeywords.length > 0) {
48
- const isLikelyBlocked = foundKeywords.some(k =>
49
- ['验证', 'captcha', '环境异常', '请登录', '拖动滑块'].includes(k)
50
- );
51
- return {
52
- blocked: isLikelyBlocked,
53
- reason: `Detected keywords: ${foundKeywords.slice(0, 3).join(', ')}`,
54
- severity: isLikelyBlocked ? 'high' : 'medium'
55
- };
56
- }
57
-
58
- return { blocked: false };
59
- }
60
-
61
- /**
62
- * Calculate content quality score
63
- */
64
- function calculateQualityScore(content) {
65
- if (!content) return 0;
66
-
67
- let score = 0;
68
- const length = content.length;
69
-
70
- // Length score (0-30)
71
- score += Math.min(Math.log10(length) * 10, 30);
72
-
73
- // Content density (0-25)
74
- const wordCount = content.split(/\s+/).length;
75
- const avgWordLength = content.length / wordCount;
76
- if (avgWordLength > 3 && avgWordLength < 15) score += 15;
77
- if (content.match(/[。\.]/g)?.length > 5) score += 10;
78
-
79
- // Structure indicators (0-25)
80
- if (content.includes('#') || content.includes('##')) score += 8;
81
- if (content.includes('###')) score += 5;
82
- if (content.includes('- ') || content.includes('* ')) score += 6;
83
- if (content.includes('```')) score += 6;
84
-
85
- // Rich content indicators (0-20)
86
- if (content.match(/\[.*?\]\(.*?\)/)) score += 5;
87
- if (content.match(/\!\[.*?\]\(.*?\)/)) score += 5;
88
- if (content.match(/\*\*.*?\*\*/)) score += 5;
89
- if (content.match(/`.*?`/)) score += 5;
90
-
91
- return Math.min(score / 100, 1.0);
92
- }
93
-
94
- /**
95
- * Level 1: Tavily API (HTTP direct call)
96
- */
97
- function tryTavily(url, retries = 0) {
98
- console.error(`[SmartFetch] Level 1: Trying Tavily for: ${url}`);
99
-
100
- const apiKey = process.env.TAVILY_API_KEY;
101
- if (!apiKey) {
102
- return {
103
- success: false,
104
- tool: 'tavily',
105
- error: 'TAVILY_API_KEY not set in environment',
106
- needs_fallback: true
107
- };
108
- }
109
-
110
- return new Promise((resolve) => {
111
- const postData = JSON.stringify({
112
- urls: [url],
113
- api_key: apiKey,
114
- extract_depth: 'advanced',
115
- include_images: false
116
- });
117
-
118
- const options = {
119
- hostname: 'api.tavily.com',
120
- path: '/extract',
121
- method: 'POST',
122
- headers: {
123
- 'Content-Type': 'application/json',
124
- 'Content-Length': Buffer.byteLength(postData)
125
- },
126
- timeout: TAVILY_TIMEOUT
127
- };
128
-
129
- const req = https.request(options, (res) => {
130
- let data = '';
131
-
132
- res.on('data', chunk => data += chunk);
133
-
134
- res.on('end', () => {
135
- try {
136
- const response = JSON.parse(data);
137
-
138
- if (response.results && response.results[0]) {
139
- const result = response.results[0];
140
- const content = result.raw_content || result.content || '';
141
-
142
- if (content.length > MIN_CONTENT_LENGTH) {
143
- resolve({
144
- success: true,
145
- tool: 'tavily',
146
- content: content,
147
- title: result.title || '',
148
- url: result.url || url
149
- });
150
- } else {
151
- resolve({
152
- success: false,
153
- tool: 'tavily',
154
- error: 'Content too short',
155
- needs_fallback: true
156
- });
157
- }
158
- } else if (response.error) {
159
- resolve({
160
- success: false,
161
- tool: 'tavily',
162
- error: response.error,
163
- needs_fallback: true
164
- });
165
- } else {
166
- resolve({
167
- success: false,
168
- tool: 'tavily',
169
- error: 'No results',
170
- needs_fallback: true
171
- });
172
- }
173
- } catch (e) {
174
- resolve({
175
- success: false,
176
- tool: 'tavily',
177
- error: `Parse error: ${e.message}`,
178
- needs_fallback: true
179
- });
180
- }
181
- });
182
- });
183
-
184
- req.on('error', (err) => {
185
- if (retries < MAX_RETRIES) {
186
- console.error(`[SmartFetch] Tavily error, retrying... (${retries + 1}/${MAX_RETRIES})`);
187
- resolve(tryTavily(url, retries + 1));
188
- } else {
189
- resolve({
190
- success: false,
191
- tool: 'tavily',
192
- error: err.message,
193
- needs_fallback: true
194
- });
195
- }
196
- });
197
-
198
- req.on('timeout', () => {
199
- req.destroy();
200
- if (retries < MAX_RETRIES) {
201
- console.error(`[SmartFetch] Tavily timeout, retrying... (${retries + 1}/${MAX_RETRIES})`);
202
- resolve(tryTavily(url, retries + 1));
203
- } else {
204
- resolve({
205
- success: false,
206
- tool: 'tavily',
207
- error: 'Timeout',
208
- needs_fallback: true
209
- });
210
- }
211
- });
212
-
213
- req.write(postData);
214
- req.end();
215
- });
216
- }
217
-
218
- /**
219
- * Level 2: LLM Native Fetch (return instruction for caller)
220
- */
221
- function tryLLMNative(url) {
222
- console.error(`[SmartFetch] Level 2: LLM Native Fetch for: ${url}`);
223
-
224
- return {
225
- success: false,
226
- tool: 'llm_native',
227
- error: 'LLM Native fetch requires caller to use FetchURL tool',
228
- instruction: `Use FetchURL tool to fetch "${url}" and return the content`,
229
- needs_fallback: true,
230
- native_fetch: true,
231
- url: url
232
- };
233
- }
234
-
235
- /**
236
- * Check if browser-use is installed locally
237
- */
238
- function isBrowserUseInstalled() {
239
- try {
240
- execSync('/opt/homebrew/bin/python3.11 -c "import browser_use"', {
241
- encoding: 'utf-8',
242
- timeout: 5000,
243
- stdio: 'pipe'
244
- });
245
- return true;
246
- } catch (e) {
247
- return false;
248
- }
249
- }
250
-
251
- /**
252
- * Level 3: Browser Automation (Local first, then Cloud)
253
- */
254
- async function tryBrowser(url) {
255
- console.error(`[SmartFetch] Level 3: Trying Browser automation for: ${url}`);
256
-
257
- // Try local browser-use first
258
- if (isBrowserUseInstalled()) {
259
- console.error('[SmartFetch] Using local browser-use...');
260
-
261
- try {
262
- const result = execSync(
263
- `/opt/homebrew/bin/python3.11 -c "
264
- import asyncio
265
- import sys
266
- from browser_use import Browser, BrowserConfig
267
-
268
- async def fetch():
269
- browser = Browser(config=BrowserConfig(headless=True))
270
- await browser.start()
271
- try:
272
- page = await browser.get_current_page()
273
- await page.goto('${url}', wait_until='networkidle')
274
- content = await page.content()
275
- title = await page.title()
276
- print(f'TITLE:{title}')
277
- print('---CONTENT---')
278
- print(content)
279
- finally:
280
- await browser.stop()
281
-
282
- asyncio.run(fetch())
283
- "`,
284
- {
285
- encoding: 'utf-8',
286
- timeout: BROWSER_TIMEOUT,
287
- stdio: ['pipe', 'pipe', 'pipe']
288
- }
289
- );
290
-
291
- // Parse output
292
- const lines = result.split('\n');
293
- let title = '';
294
- let content = result;
295
-
296
- for (const line of lines) {
297
- if (line.startsWith('TITLE:')) {
298
- title = line.substring(6);
299
- } else if (line === '---CONTENT---') {
300
- const idx = lines.indexOf(line);
301
- content = lines.slice(idx + 1).join('\n');
302
- break;
303
- }
304
- }
305
-
306
- // Convert HTML to text (simple)
307
- const textContent = content
308
- .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
309
- .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
310
- .replace(/<[^>]+>/g, ' ')
311
- .replace(/\s+/g, ' ')
312
- .replace(/&lt;/g, '<')
313
- .replace(/&gt;/g, '>')
314
- .replace(/&amp;/g, '&')
315
- .replace(/&quot;/g, '"')
316
- .trim();
317
-
318
- if (textContent.length > MIN_CONTENT_LENGTH) {
319
- return {
320
- success: true,
321
- tool: 'browser_local',
322
- content: textContent,
323
- title: title,
324
- url: url
325
- };
326
- }
327
-
328
- return {
329
- success: false,
330
- tool: 'browser_local',
331
- error: 'Content too short'
332
- };
333
-
334
- } catch (error) {
335
- console.error(`[SmartFetch] Local browser failed: ${error.message.split('\n')[0]}`);
336
- // Fall through to cloud
337
- }
338
- } else {
339
- console.error('[SmartFetch] Local browser-use not installed, trying cloud...');
340
- }
341
-
342
- // Try Cloud browser-use (if API key available)
343
- const cloudApiKey = process.env.BROWSER_USE_API_KEY;
344
- if (!cloudApiKey) {
345
- return {
346
- success: false,
347
- tool: 'browser',
348
- error: 'Browser automation failed. Local browser-use not installed, and BROWSER_USE_API_KEY not set for cloud.'
349
- };
350
- }
351
-
352
- // Cloud browser-use would be implemented here
353
- // For now, return error with setup instructions
354
- return {
355
- success: false,
356
- tool: 'browser_cloud',
357
- error: 'Cloud browser-use not yet implemented. Please install local browser-use: pip install browser-use && playwright install chromium'
358
- };
359
- }
360
-
361
- /**
362
- * Smart fetch with 3-layer fallback
363
- * Strategy: Tavily → LLM Native → Browser
364
- */
365
- async function smartFetch(url, options = {}) {
366
- const method = options.method || 'auto';
367
- const skipQualityCheck = options.skipQualityCheck || false;
368
-
369
- console.error(`[SmartFetch] Starting fetch for: ${url} (method: ${method})`);
370
-
371
- // Explicit method selection
372
- if (method !== 'auto') {
373
- switch (method) {
374
- case 'tavily': return await tryTavily(url);
375
- case 'native': return tryLLMNative(url);
376
- case 'browser': return await tryBrowser(url);
377
- default: return { success: false, error: 'Unknown method' };
378
- }
379
- }
380
-
381
- // Auto mode: 3-layer cascade
382
-
383
- // Level 1: Tavily
384
- const tavilyResult = await tryTavily(url);
385
-
386
- if (tavilyResult.success) {
387
- const quality = isBlockedOrLowQuality(tavilyResult.content, 'tavily');
388
-
389
- if (skipQualityCheck || !quality.blocked) {
390
- const score = calculateQualityScore(tavilyResult.content);
391
- console.error(`[SmartFetch] ✓ Tavily succeeded (quality: ${score.toFixed(2)})`);
392
-
393
- return {
394
- ...tavilyResult,
395
- fallback_used: false,
396
- quality_score: score,
397
- quality_check: 'passed'
398
- };
399
- }
400
-
401
- console.error(`[SmartFetch] Tavily content blocked: ${quality.reason}`);
402
- } else {
403
- console.error(`[SmartFetch] Tavily failed: ${tavilyResult.error}`);
404
-
405
- // If Tavily key not set, skip to next level
406
- if (!tavilyResult.needs_fallback) {
407
- return tavilyResult;
408
- }
409
- }
410
-
411
- // Level 2: LLM Native Fetch
412
- console.error('[SmartFetch] Falling back to LLM Native...');
413
- const nativeResult = tryLLMNative(url);
414
-
415
- // Return instruction for caller to handle
416
- return {
417
- ...nativeResult,
418
- fallback_used: true,
419
- fallback_chain: ['tavily']
420
- };
421
-
422
- // Note: Browser (Level 3) is called by the agent if native fetch fails
423
- }
424
-
425
- /**
426
- * Smart search with Tavily (HTTP direct)
427
- */
428
- function smartSearch(query, maxResults = 5) {
429
- console.error(`[SmartFetch] Searching: ${query}`);
430
-
431
- const apiKey = process.env.TAVILY_API_KEY;
432
- if (!apiKey) {
433
- return {
434
- success: false,
435
- error: 'TAVILY_API_KEY not set in environment'
436
- };
437
- }
438
-
439
- return new Promise((resolve) => {
440
- const postData = JSON.stringify({
441
- query: query,
442
- api_key: apiKey,
443
- max_results: maxResults,
444
- search_depth: 'advanced',
445
- include_answer: true
446
- });
447
-
448
- const options = {
449
- hostname: 'api.tavily.com',
450
- path: '/search',
451
- method: 'POST',
452
- headers: {
453
- 'Content-Type': 'application/json',
454
- 'Content-Length': Buffer.byteLength(postData)
455
- },
456
- timeout: TAVILY_TIMEOUT
457
- };
458
-
459
- const req = https.request(options, (res) => {
460
- let data = '';
461
- res.on('data', chunk => data += chunk);
462
- res.on('end', () => {
463
- try {
464
- const response = JSON.parse(data);
465
- resolve({
466
- success: true,
467
- query: query,
468
- results: response.results || [],
469
- answer: response.answer || ''
470
- });
471
- } catch (e) {
472
- resolve({ success: false, error: `Parse error: ${e.message}` });
473
- }
474
- });
475
- });
476
-
477
- req.on('error', (err) => {
478
- resolve({ success: false, error: err.message });
479
- });
480
-
481
- req.on('timeout', () => {
482
- req.destroy();
483
- resolve({ success: false, error: 'Timeout' });
484
- });
485
-
486
- req.write(postData);
487
- req.end();
488
- });
489
- }
490
-
491
- // CLI interface
492
- if (require.main === module) {
493
- const args = process.argv.slice(2);
494
- const command = args[0];
495
-
496
- if (command === 'fetch') {
497
- const url = args[1];
498
- const method = args[2] || 'auto';
499
-
500
- if (!url) {
501
- console.log(JSON.stringify({ error: 'URL required' }));
502
- process.exit(1);
503
- }
504
-
505
- smartFetch(url, { method }).then(result => {
506
- console.log(JSON.stringify(result, null, 2));
507
- }).catch(err => {
508
- console.log(JSON.stringify({ error: err.message }));
509
- process.exit(1);
510
- });
511
- }
512
-
513
- else if (command === 'search') {
514
- const query = args[1];
515
- const maxResults = parseInt(args[2]) || 5;
516
-
517
- if (!query) {
518
- console.log(JSON.stringify({ error: 'Query required' }));
519
- process.exit(1);
520
- }
521
-
522
- smartSearch(query, maxResults).then(result => {
523
- console.log(JSON.stringify(result, null, 2));
524
- });
525
- }
526
-
527
- else {
528
- console.log(`
529
- Smart Web Fetch Skill - 3-Layer Strategy
530
-
531
- Usage:
532
- smart-web-fetch fetch <url> [method]
533
- smart-web-fetch search <query> [max_results]
534
-
535
- Methods: auto (default), tavily, native, browser
536
-
537
- 3-Layer Strategy:
538
- 1. Tavily - AI extraction, best quality (needs TAVILY_API_KEY)
539
- 2. LLM Native - Use FetchURL tool (for agents with native capability)
540
- 3. Browser - Local browser-use (fallback for stubborn pages)
541
-
542
- Environment Variables:
543
- TAVILY_API_KEY - Required for Tavily API
544
- BROWSER_USE_API_KEY - Optional for cloud browser (local preferred)
545
-
546
- Examples:
547
- smart-web-fetch fetch https://example.com
548
- smart-web-fetch fetch https://example.com tavily
549
- smart-web-fetch search "OpenAI GPT-5" 10
550
-
551
- Install local browser:
552
- pip install browser-use
553
- playwright install chromium
554
- `);
555
- }
556
- }
557
-
558
- module.exports = { smartFetch, smartSearch, tryTavily, tryLLMNative, tryBrowser };