@seanyao/roll 2026.515.1 → 2026.516.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +1 -0
- package/README.md +1 -0
- package/bin/roll +92 -83
- package/package.json +1 -2
- package/tools/roll-fetch/SKILL.md +0 -182
- package/tools/roll-fetch/package.json +0 -15
- package/tools/roll-fetch/smart-web-fetch.js +0 -558
package/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
- **New**: `roll brief` / `roll dream` 生成文档后自动提交推送 — 每次晨报和夜检不再需要手动 commit `[loop]`
|
|
6
6
|
- **New**: 双语 FAQ 指南 — 10 个自治交付常见场景(loop 卡住、PR 冲突、agent 切换、权限问题等),每条含原因和原理,EN + ZH 对照 `[docs]`
|
|
7
|
+
- **Fixed**: loop 孤儿状态自愈 — cycle 启动时检测 state.yaml 残留 running,若无活跃进程则自动重置为 idle,防止 loop 因中断永久卡死 `[loop]`
|
|
7
8
|
- **New**: 可选的事件驱动 PR 评审模板 — `cp templates/workflows/pr-review-event.yml .github/workflows/`,PR 开即触发 AI 评审,不装也行(loop 每轮兜底) `[pr]`
|
|
8
9
|
- **New**: loop PR inbox 从"分类但空转"升级到"分类+执行" — eligible PR 自动调 AI 评审,stale PR 自动 rebase,fork 和冲突写 ALERT;bot 已评审的 PR 自动让步 `[loop]`
|
|
9
10
|
- **New**: `roll review-pr <number>` — agent-agnostic AI 代码评审,任意 agent(Claude/Kimi/DeepSeek 等)均可评审任意 git 平台的 PR;PR body 加 `[skip-ai-review]` 可跳过 `[pr]`
|
package/README.md
CHANGED
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
|
|
12
12
|
**[中文版 README](README_CN.md)**
|
|
13
13
|
|
|
14
|
+
[](https://seanyao.github.io/Roll/)
|
|
14
15
|
[](LICENSE)
|
|
15
16
|
[](https://www.npmjs.com/package/@seanyao/roll)
|
|
16
17
|
[](https://github.com/seanyao/roll/actions/workflows/ci.yml)
|
package/bin/roll
CHANGED
|
@@ -4,7 +4,7 @@ set -euo pipefail
|
|
|
4
4
|
# Roll — AI Agent Convention Manager
|
|
5
5
|
# Single source of truth for how all AI coding agents behave.
|
|
6
6
|
|
|
7
|
-
VERSION="2026.515.1"
|
|
7
|
+
VERSION="2026.516.1"
|
|
8
8
|
ROLL_HOME="${ROLL_HOME:-${HOME}/.roll}"
|
|
9
9
|
ROLL_CONFIG="${ROLL_HOME}/config.yaml"
|
|
10
10
|
ROLL_GLOBAL="${ROLL_HOME}/conventions/global"
|
|
@@ -1131,60 +1131,6 @@ _ensure_features_dir() {
|
|
|
1131
1131
|
_ROLL_MERGE_SUMMARY+=("created|docs/features/")
|
|
1132
1132
|
}
|
|
1133
1133
|
|
|
1134
|
-
# ─── Helper: write starter .gitignore (no-op if exists) ──────────────────────
|
|
1135
|
-
_write_gitignore() {
|
|
1136
|
-
[[ -f "$1" ]] && return
|
|
1137
|
-
cat > "$1" << 'EOF'
|
|
1138
|
-
node_modules/
|
|
1139
|
-
dist/
|
|
1140
|
-
build/
|
|
1141
|
-
.env
|
|
1142
|
-
*.local
|
|
1143
|
-
.DS_Store
|
|
1144
|
-
*.log
|
|
1145
|
-
EOF
|
|
1146
|
-
}
|
|
1147
|
-
|
|
1148
|
-
# ─── Helper: write starter .env.example (no-op if exists) ────────────────────
|
|
1149
|
-
_write_env_example() {
|
|
1150
|
-
[[ -f "$1" ]] && return
|
|
1151
|
-
cat > "$1" << 'EOF'
|
|
1152
|
-
# Environment Variables — copy to .env and fill in values
|
|
1153
|
-
|
|
1154
|
-
# Application
|
|
1155
|
-
# NODE_ENV=development
|
|
1156
|
-
# PORT=3000
|
|
1157
|
-
|
|
1158
|
-
# Database
|
|
1159
|
-
# DATABASE_URL=postgresql://user:pass@localhost:5432/db
|
|
1160
|
-
|
|
1161
|
-
# Auth
|
|
1162
|
-
# JWT_SECRET=your-secret-key
|
|
1163
|
-
EOF
|
|
1164
|
-
}
|
|
1165
|
-
|
|
1166
|
-
# ─── Helper: detect project type from existing AGENTS.md ─────────────────────
|
|
1167
|
-
detect_project_type() {
|
|
1168
|
-
local agents_file="$1/AGENTS.md"
|
|
1169
|
-
[[ -f "$agents_file" ]] || { echo "unknown"; return; }
|
|
1170
|
-
|
|
1171
|
-
local content
|
|
1172
|
-
content="$(cat "$agents_file")"
|
|
1173
|
-
|
|
1174
|
-
if echo "$content" | grep -qi "Fullstack Web"; then
|
|
1175
|
-
echo "fullstack"
|
|
1176
|
-
elif echo "$content" | grep -qi "Backend Service"; then
|
|
1177
|
-
echo "backend-service"
|
|
1178
|
-
elif echo "$content" | grep -qi "Frontend Only"; then
|
|
1179
|
-
echo "frontend-only"
|
|
1180
|
-
elif echo "$content" | grep -qi "CLI Tool"; then
|
|
1181
|
-
echo "cli"
|
|
1182
|
-
else
|
|
1183
|
-
# AGENTS.md exists but has no type marker — fall back to file-based scan
|
|
1184
|
-
scan_project_type_from_files "$1"
|
|
1185
|
-
fi
|
|
1186
|
-
}
|
|
1187
|
-
|
|
1188
1134
|
# ═══════════════════════════════════════════════════════════════════════════════
|
|
1189
1135
|
# COMMAND: status
|
|
1190
1136
|
# Show current state of conventions
|
|
@@ -1478,7 +1424,7 @@ _peer_auto_attach() {
|
|
|
1478
1424
|
[ "$(uname)" = "Darwin" ] || return 0
|
|
1479
1425
|
[ -f "$_LOOP_MUTE_FILE" ] && return 0
|
|
1480
1426
|
local terminal_pref
|
|
1481
|
-
terminal_pref=$(
|
|
1427
|
+
terminal_pref=$(config_get "loop_attach_terminal" "")
|
|
1482
1428
|
if [[ -z "$terminal_pref" ]]; then
|
|
1483
1429
|
case "${TERM_PROGRAM:-}" in
|
|
1484
1430
|
ghostty) terminal_pref="ghostty" ;;
|
|
@@ -2111,6 +2057,14 @@ _LAUNCHD_DIR="${HOME}/Library/LaunchAgents"
|
|
|
2111
2057
|
# hash of the full path, ensuring uniqueness across sibling dirs with same name.
|
|
2112
2058
|
_project_slug() {
|
|
2113
2059
|
local path="$1"
|
|
2060
|
+
# FIX-034: when inside a git worktree, git-common-dir returns the main tree's
|
|
2061
|
+
# absolute .git path; resolve to the main tree so worktree and main-tree runs
|
|
2062
|
+
# produce the same slug.
|
|
2063
|
+
local _common
|
|
2064
|
+
_common=$(git -C "$path" rev-parse --git-common-dir 2>/dev/null)
|
|
2065
|
+
if [[ -n "$_common" && "$_common" == *"/.git" ]]; then
|
|
2066
|
+
path="${_common%/.git}"
|
|
2067
|
+
fi
|
|
2114
2068
|
local base; base=$(basename "$path")
|
|
2115
2069
|
local hash
|
|
2116
2070
|
if command -v md5 &>/dev/null; then
|
|
@@ -2125,17 +2079,10 @@ _project_slug() {
|
|
|
2125
2079
|
_config_read_int() {
|
|
2126
2080
|
local key="$1" default="$2"
|
|
2127
2081
|
local val
|
|
2128
|
-
val=$(
|
|
2082
|
+
val=$(config_get "$key" "")
|
|
2129
2083
|
if [[ "$val" =~ ^[0-9]+$ ]]; then echo "$val"; else echo "$default"; fi
|
|
2130
2084
|
}
|
|
2131
2085
|
|
|
2132
|
-
_config_read_string() {
|
|
2133
|
-
local key="$1" default="$2"
|
|
2134
|
-
local val
|
|
2135
|
-
val=$(grep "^${key}:" "$ROLL_CONFIG" 2>/dev/null | awk '{print $2}' | tr -d '"' | head -1)
|
|
2136
|
-
if [[ -n "$val" ]]; then echo "$val"; else echo "$default"; fi
|
|
2137
|
-
}
|
|
2138
|
-
|
|
2139
2086
|
# Derive a minute in [1,55] from project path hash + offset so different projects
|
|
2140
2087
|
# and different services within a project don't fire at the same time.
|
|
2141
2088
|
# Offsets used: loop=0, dream=2, brief=4 → always three distinct values (2<55).
|
|
@@ -2248,7 +2195,15 @@ if [ -f "\$INNER_LOCK" ]; then
|
|
|
2248
2195
|
rm -f "\$INNER_LOCK"
|
|
2249
2196
|
fi
|
|
2250
2197
|
printf '%s:%s\n' "\$\$" "\$(date -u +%s)" > "\$INNER_LOCK"
|
|
2251
|
-
|
|
2198
|
+
# FIX-038: background heartbeat writer — outer script uses this as primary liveness signal
|
|
2199
|
+
# to detect stale execution without relying on PID reuse heuristics.
|
|
2200
|
+
HEARTBEAT_FILE="${HOME}/.shared/roll/loop/.heartbeat-${slug}"
|
|
2201
|
+
_heartbeat_writer() {
|
|
2202
|
+
while true; do echo "\$(date -u +%s)" > "\$HEARTBEAT_FILE"; sleep 60; done
|
|
2203
|
+
}
|
|
2204
|
+
_heartbeat_writer &
|
|
2205
|
+
_HEARTBEAT_PID=\$!
|
|
2206
|
+
trap 'kill "\${_HEARTBEAT_PID}" 2>/dev/null; rm -f "\$INNER_LOCK" "\$HEARTBEAT_FILE"' EXIT
|
|
2252
2207
|
|
|
2253
2208
|
# US-AUTO-037: pull in worktree helpers (US-AUTO-036). Sourcing bin/roll is
|
|
2254
2209
|
# safe — its main() only runs when invoked directly (BASH_SOURCE == \$0).
|
|
@@ -2306,26 +2261,34 @@ _claude_cleanup_stale_worktrees "${project_path}" || true
|
|
|
2306
2261
|
# When \`gh\` is unavailable, fall back to the legacy ff-merge path.
|
|
2307
2262
|
if [ "\$_USE_WORKTREE" = "1" ]; then
|
|
2308
2263
|
if [ "\$_exit" -eq 0 ]; then
|
|
2309
|
-
|
|
2310
|
-
|
|
2311
|
-
|
|
2312
|
-
|
|
2313
|
-
fi
|
|
2314
|
-
_publish_status=\$?
|
|
2315
|
-
if [ "\$_publish_status" -eq 0 ]; then
|
|
2264
|
+
# Idle cycle — no commits ahead of origin/main means nothing was built;
|
|
2265
|
+
# skip publish and reclaim the worktree immediately.
|
|
2266
|
+
_cycle_commits=\$(cd "\$WT" && git rev-list --count origin/main..HEAD 2>/dev/null || echo 0)
|
|
2267
|
+
if [ "\$_cycle_commits" -eq 0 ]; then
|
|
2316
2268
|
_worktree_cleanup "\$WT" "\$BRANCH"
|
|
2317
|
-
echo "[loop] cycle \${CYCLE_ID}:
|
|
2318
|
-
|
|
2319
|
-
if ( cd "
|
|
2269
|
+
echo "[loop] cycle \${CYCLE_ID}: idle (no new commits); worktree cleaned"
|
|
2270
|
+
else
|
|
2271
|
+
if ( cd "\$WT" && _loop_is_doc_only_change ); then
|
|
2272
|
+
( cd "\$WT" && _loop_publish_doc_pr "\$BRANCH" "doc: loop cycle \${CYCLE_ID}" )
|
|
2273
|
+
else
|
|
2274
|
+
( cd "\$WT" && _loop_publish_pr "\$BRANCH" "loop cycle \${CYCLE_ID}" )
|
|
2275
|
+
fi
|
|
2276
|
+
_publish_status=\$?
|
|
2277
|
+
if [ "\$_publish_status" -eq 0 ]; then
|
|
2320
2278
|
_worktree_cleanup "\$WT" "\$BRANCH"
|
|
2321
|
-
echo "[loop] cycle \${CYCLE_ID}:
|
|
2279
|
+
echo "[loop] cycle \${CYCLE_ID}: published; worktree cleaned"
|
|
2280
|
+
elif [ "\$_publish_status" -eq 2 ]; then
|
|
2281
|
+
if ( cd "${project_path}" && _worktree_merge_back "\$BRANCH" ); then
|
|
2282
|
+
_worktree_cleanup "\$WT" "\$BRANCH"
|
|
2283
|
+
echo "[loop] cycle \${CYCLE_ID}: gh unavailable; merged via ff and cleaned up"
|
|
2284
|
+
else
|
|
2285
|
+
_worktree_alert "cycle \${CYCLE_ID}: gh unavailable AND merge_back failed; worktree preserved at \$WT"
|
|
2286
|
+
echo "[loop] cycle \${CYCLE_ID}: gh+merge_back both failed; worktree preserved at \$WT"
|
|
2287
|
+
fi
|
|
2322
2288
|
else
|
|
2323
|
-
_worktree_alert "cycle \${CYCLE_ID}:
|
|
2324
|
-
echo "[loop] cycle \${CYCLE_ID}:
|
|
2289
|
+
_worktree_alert "cycle \${CYCLE_ID}: PR publish failed; worktree preserved at \$WT (branch \$BRANCH)"
|
|
2290
|
+
echo "[loop] cycle \${CYCLE_ID}: PR publish failed; worktree preserved at \$WT"
|
|
2325
2291
|
fi
|
|
2326
|
-
else
|
|
2327
|
-
_worktree_alert "cycle \${CYCLE_ID}: PR publish failed; worktree preserved at \$WT (branch \$BRANCH)"
|
|
2328
|
-
echo "[loop] cycle \${CYCLE_ID}: PR publish failed; worktree preserved at \$WT"
|
|
2329
2292
|
fi
|
|
2330
2293
|
else
|
|
2331
2294
|
_worktree_alert "cycle \${CYCLE_ID}: claude exited \$_exit; worktree preserved at \$WT (branch \$BRANCH)"
|
|
@@ -2340,6 +2303,8 @@ INNER
|
|
|
2340
2303
|
|
|
2341
2304
|
cat > "$script_path" << SCRIPT
|
|
2342
2305
|
#!/bin/bash -l
|
|
2306
|
+
# caffeinate: prevent idle sleep from killing claude during cycles
|
|
2307
|
+
caffeinate -i -w \$\$ &
|
|
2343
2308
|
# Active-window check — skipped when ROLL_LOOP_FORCE is set (manual 'roll loop now')
|
|
2344
2309
|
if [ -z "\$ROLL_LOOP_FORCE" ]; then
|
|
2345
2310
|
h=\$(printf '%d' "\$(date +%H)")
|
|
@@ -2348,6 +2313,50 @@ fi
|
|
|
2348
2313
|
# Pause check — 'roll loop pause' creates this marker to suspend scheduling
|
|
2349
2314
|
PAUSE="\$HOME/.shared/roll/loop/PAUSE-${slug}"
|
|
2350
2315
|
if [ -z "\$ROLL_LOOP_FORCE" ] && [ -f "\$PAUSE" ]; then exit 0; fi
|
|
2316
|
+
# FIX-037: orphan state detection & self-heal — if state.yaml says running
|
|
2317
|
+
# but no LOCK process or tmux session exists, the previous cycle was killed
|
|
2318
|
+
# (e.g. SIGKILL / sleep / terminal close). Heal state to idle so the next
|
|
2319
|
+
# cycle can proceed normally; write ALERT for transparency.
|
|
2320
|
+
# FIX-038: heartbeat is the primary liveness signal (avoids PID reuse race);
|
|
2321
|
+
# LOCK pid check is secondary fallback for backward compatibility.
|
|
2322
|
+
HEARTBEAT_TIMEOUT="\${ROLL_HEARTBEAT_TIMEOUT:-1800}"
|
|
2323
|
+
STATE_FILE="${HOME}/.shared/roll/loop/state.yaml"
|
|
2324
|
+
if [ -f "\$STATE_FILE" ]; then
|
|
2325
|
+
_state=\$(grep '^status:' "\$STATE_FILE" | awk '{print \$2}' 2>/dev/null || echo "")
|
|
2326
|
+
if [ "\$_state" = "running" ]; then
|
|
2327
|
+
_still_active=false
|
|
2328
|
+
# FIX-038: heartbeat is primary signal
|
|
2329
|
+
_heartbeat_file="${HOME}/.shared/roll/loop/.heartbeat-${slug}"
|
|
2330
|
+
if [ -f "\$_heartbeat_file" ]; then
|
|
2331
|
+
_hb_ts=\$(cat "\$_heartbeat_file" 2>/dev/null || echo "0")
|
|
2332
|
+
_now=\$(date -u +%s)
|
|
2333
|
+
_hb_age=\$(( _now - _hb_ts ))
|
|
2334
|
+
if [ "\$_hb_age" -lt "\$HEARTBEAT_TIMEOUT" ]; then
|
|
2335
|
+
_still_active=true
|
|
2336
|
+
fi
|
|
2337
|
+
fi
|
|
2338
|
+
# Fallback: LOCK pid check (for cycles without heartbeat, e.g. pre-FIX-038)
|
|
2339
|
+
if [ "\$_still_active" = false ]; then
|
|
2340
|
+
_lock_file="\$(dirname "\$0")/.LOCK-\$(basename "\$0" .sh | sed 's/^run-//')"
|
|
2341
|
+
if [ -f "\$_lock_file" ]; then
|
|
2342
|
+
_lock_pid=\$(head -1 "\$_lock_file" 2>/dev/null || echo "")
|
|
2343
|
+
[ -n "\$_lock_pid" ] && kill -0 "\$_lock_pid" 2>/dev/null && _still_active=true
|
|
2344
|
+
fi
|
|
2345
|
+
fi
|
|
2346
|
+
# Final: tmux session check
|
|
2347
|
+
if [ "\$_still_active" = false ]; then
|
|
2348
|
+
command -v tmux >/dev/null 2>&1 && tmux has-session -t "roll-loop-\$(basename "\$0" .sh | sed 's/^run-//')" 2>/dev/null && _still_active=true
|
|
2349
|
+
fi
|
|
2350
|
+
if [ "\$_still_active" = false ]; then
|
|
2351
|
+
echo "[\$(date '+%Y-%m-%dT%H:%M:%S%z')] FIX-037: orphan state detected (status=running, heartbeat stale or missing) — healing to idle" >> "\$LOG"
|
|
2352
|
+
echo "status: idle" > "\${STATE_FILE}.tmp" && mv "\${STATE_FILE}.tmp" "\$STATE_FILE"
|
|
2353
|
+
rm -f "\$_lock_file" 2>/dev/null || true
|
|
2354
|
+
_alert_file="\$(dirname "\$0")/ALERT.md"
|
|
2355
|
+
echo "\$(date '+%Y-%m-%dT%H:%M:%S%z') | FIX-037 auto-heal | Orphan state detected and cleared (status=running → idle)" >> "\$_alert_file" 2>/dev/null || true
|
|
2356
|
+
echo "[\$(date '+%Y-%m-%dT%H:%M:%S%z')] FIX-037: healed to idle, ALERT written" >> "\$LOG"
|
|
2357
|
+
fi
|
|
2358
|
+
fi
|
|
2359
|
+
fi
|
|
2351
2360
|
LOCK="\$(dirname "\$0")/.LOCK-\$(basename "\$0" .sh | sed 's/^run-//')"
|
|
2352
2361
|
SESSION="roll-loop-\$(basename "\$0" .sh | sed 's/^run-//')"
|
|
2353
2362
|
INNER_SCRIPT="${inner_path}"
|
|
@@ -2449,7 +2458,7 @@ _install_launchd_plists() {
|
|
|
2449
2458
|
|
|
2450
2459
|
# Terminal preference: config wins, then TERM_PROGRAM env, then "Terminal"
|
|
2451
2460
|
local terminal_pref
|
|
2452
|
-
terminal_pref=$(
|
|
2461
|
+
terminal_pref=$(config_get "loop_attach_terminal" "")
|
|
2453
2462
|
if [[ -z "$terminal_pref" ]]; then
|
|
2454
2463
|
case "${TERM_PROGRAM:-}" in
|
|
2455
2464
|
ghostty) terminal_pref="ghostty" ;;
|
|
@@ -2679,7 +2688,7 @@ _loop_test() {
|
|
|
2679
2688
|
|
|
2680
2689
|
# Detect terminal pref same way _install_launchd_plists does
|
|
2681
2690
|
local terminal_pref
|
|
2682
|
-
terminal_pref=$(
|
|
2691
|
+
terminal_pref=$(config_get "loop_attach_terminal" "")
|
|
2683
2692
|
if [[ -z "$terminal_pref" ]]; then
|
|
2684
2693
|
case "${TERM_PROGRAM:-}" in
|
|
2685
2694
|
ghostty) terminal_pref="ghostty" ;;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@seanyao/roll",
|
|
3
|
-
"version": "2026.515.1",
|
|
3
|
+
"version": "2026.516.1",
|
|
4
4
|
"description": "Roll — Roll out features with AI agents",
|
|
5
5
|
"scripts": {
|
|
6
6
|
"test": "bash tests/run.sh"
|
|
@@ -27,7 +27,6 @@
|
|
|
27
27
|
"conventions/",
|
|
28
28
|
"lib/",
|
|
29
29
|
"skills/",
|
|
30
|
-
"tools/",
|
|
31
30
|
"template/",
|
|
32
31
|
"README.md",
|
|
33
32
|
"CHANGELOG.md"
|
|
@@ -1,182 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
hidden: true
|
|
3
|
-
name: roll-fetch
|
|
4
|
-
description: Web page fetching and crawling for AI agents. Extract content from URLs for research, documentation, and competitive analysis.
|
|
5
|
-
---
|
|
6
|
-
|
|
7
|
-
# Roll Fetch - Web Content Extraction
|
|
8
|
-
|
|
9
|
-
Extract content from web pages for research and analysis.
|
|
10
|
-
|
|
11
|
-
## When to Use
|
|
12
|
-
|
|
13
|
-
- Product research (competitor analysis)
|
|
14
|
-
- Technical documentation gathering
|
|
15
|
-
- Code examples and best practices
|
|
16
|
-
- Full site crawling for backup/analysis
|
|
17
|
-
|
|
18
|
-
## Environment Setup
|
|
19
|
-
|
|
20
|
-
Configure API keys per machine:
|
|
21
|
-
|
|
22
|
-
```bash
|
|
23
|
-
# Required for Tavily
|
|
24
|
-
export TAVILY_API_KEY=tvly-dev-...
|
|
25
|
-
|
|
26
|
-
# Optional for cloud browser fallback
|
|
27
|
-
export BROWSER_USE_API_KEY=bu-...
|
|
28
|
-
```
|
|
29
|
-
|
|
30
|
-
Or create `.env` file in project root:
|
|
31
|
-
```
|
|
32
|
-
TAVILY_API_KEY=tvly-dev-...
|
|
33
|
-
BROWSER_USE_API_KEY=bu-...
|
|
34
|
-
```
|
|
35
|
-
|
|
36
|
-
## Methods
|
|
37
|
-
|
|
38
|
-
### 1. Tavily API (Recommended)
|
|
39
|
-
|
|
40
|
-
Best quality extraction, requires `TAVILY_API_KEY`.
|
|
41
|
-
|
|
42
|
-
```bash
|
|
43
|
-
# Using Tavily CLI or API
|
|
44
|
-
curl -X POST https://api.tavily.com/extract \
|
|
45
|
-
-H "Content-Type: application/json" \
|
|
46
|
-
-d '{
|
|
47
|
-
"urls": ["https://example.com"],
|
|
48
|
-
"api_key": "your_tavily_api_key"
|
|
49
|
-
}'
|
|
50
|
-
```
|
|
51
|
-
|
|
52
|
-
**Pros**: AI-optimized extraction, handles complex layouts
|
|
53
|
-
**Cons**: Requires API key, rate limited
|
|
54
|
-
|
|
55
|
-
### 2. LLM Native Fetch (Default)
|
|
56
|
-
|
|
57
|
-
Use your built-in URL fetching capability directly.
|
|
58
|
-
|
|
59
|
-
**When to use**: When Tavily is unavailable or for quick checks.
|
|
60
|
-
|
|
61
|
-
**Note**: Most modern AI agents (Kimi, Codex, Claude) have native URL fetching. Use `FetchURL` tool or equivalent.
|
|
62
|
-
|
|
63
|
-
### 3. Browser Automation (Fallback)
|
|
64
|
-
|
|
65
|
-
Local browser automation for stubborn pages using **[browser-use](https://github.com/browser-use/browser-use)**.
|
|
66
|
-
|
|
67
|
-
**How to Choose:**
|
|
68
|
-
|
|
69
|
-
| If | Then Use | Why |
|
|
70
|
-
|----|---------|-----|
|
|
71
|
-
| `BROWSER_USE_API_KEY` in env | **Cloud** | Managed browsers, less setup |
|
|
72
|
-
| No API key, but `browser-use` installed | **Local** | Free, no external dependency |
|
|
73
|
-
| Neither | Skip to manual extraction | Tell user "Need browser automation setup" |
|
|
74
|
-
|
|
75
|
-
**Option A: Local (Free, No API Key)**
|
|
76
|
-
```python
|
|
77
|
-
from browser_use import Agent, Browser, BrowserConfig
|
|
78
|
-
import asyncio
|
|
79
|
-
|
|
80
|
-
async def fetch_page(url):
|
|
81
|
-
# Pure local, no API key needed
|
|
82
|
-
browser = Browser(config=BrowserConfig(headless=True))
|
|
83
|
-
await browser.start()
|
|
84
|
-
page = await browser.get_current_page()
|
|
85
|
-
await page.goto(url)
|
|
86
|
-
content = await page.content()
|
|
87
|
-
await browser.stop()
|
|
88
|
-
return content
|
|
89
|
-
|
|
90
|
-
# Run
|
|
91
|
-
content = asyncio.run(fetch_page("https://example.com"))
|
|
92
|
-
```
|
|
93
|
-
|
|
94
|
-
**Option B: Cloud API**
|
|
95
|
-
```python
|
|
96
|
-
from browser_use import Agent
|
|
97
|
-
|
|
98
|
-
agent = Agent(
|
|
99
|
-
task=f"Extract the main content from {url} and return as markdown",
|
|
100
|
-
llm="moonshot" # or openai, anthropic
|
|
101
|
-
)
|
|
102
|
-
result = await agent.run()
|
|
103
|
-
```
|
|
104
|
-
|
|
105
|
-
**Setup** (Local):
|
|
106
|
-
```bash
|
|
107
|
-
pip install browser-use
|
|
108
|
-
playwright install chromium
|
|
109
|
-
```
|
|
110
|
-
|
|
111
|
-
## Usage
|
|
112
|
-
|
|
113
|
-
### CLI Usage (via smart-web-fetch.js)
|
|
114
|
-
|
|
115
|
-
```bash
|
|
116
|
-
# Auto mode (Tavily → Native → Browser)
|
|
117
|
-
node smart-web-fetch.js fetch https://example.com
|
|
118
|
-
|
|
119
|
-
# Explicit method
|
|
120
|
-
node smart-web-fetch.js fetch https://example.com tavily
|
|
121
|
-
node smart-web-fetch.js fetch https://example.com native
|
|
122
|
-
node smart-web-fetch.js fetch https://example.com browser
|
|
123
|
-
|
|
124
|
-
# Search
|
|
125
|
-
node smart-web-fetch.js search "Python async" 5
|
|
126
|
-
```
|
|
127
|
-
|
|
128
|
-
### Programmatic Usage
|
|
129
|
-
|
|
130
|
-
```javascript
|
|
131
|
-
const { smartFetch, smartSearch } = require('./smart-web-fetch.js');
|
|
132
|
-
|
|
133
|
-
// Fetch a page
|
|
134
|
-
const result = await smartFetch('https://example.com');
|
|
135
|
-
console.log(result.content);
|
|
136
|
-
|
|
137
|
-
// Search
|
|
138
|
-
const searchResult = await smartSearch('OpenAI GPT-5', 5);
|
|
139
|
-
console.log(searchResult.results);
|
|
140
|
-
```
|
|
141
|
-
|
|
142
|
-
### Single Page Fetch
|
|
143
|
-
|
|
144
|
-
```
|
|
145
|
-
User: "Fetch https://docs.example.com/api"
|
|
146
|
-
→ Use smart-web-fetch.js with auto mode
|
|
147
|
-
→ Return clean markdown content
|
|
148
|
-
```
|
|
149
|
-
|
|
150
|
-
### Full Site Crawl
|
|
151
|
-
|
|
152
|
-
```
|
|
153
|
-
User: "Crawl https://docs.example.com"
|
|
154
|
-
→ Use smart-web-fetch.js recursively
|
|
155
|
-
→ Extract all internal links
|
|
156
|
-
→ Recursively fetch up to max depth (default: 2)
|
|
157
|
-
→ Save each page as separate markdown file
|
|
158
|
-
```
|
|
159
|
-
|
|
160
|
-
## Output Format
|
|
161
|
-
|
|
162
|
-
Always return clean Markdown:
|
|
163
|
-
- Extract main content only (remove nav, ads, footers)
|
|
164
|
-
- Preserve code blocks and tables
|
|
165
|
-
- Include source URL as header
|
|
166
|
-
|
|
167
|
-
## Quality Check
|
|
168
|
-
|
|
169
|
-
Validate extracted content:
|
|
170
|
-
- Min length: 500 chars (reject if shorter)
|
|
171
|
-
- Check for captcha/error messages
|
|
172
|
-
- Verify main content structure (headings, paragraphs)
|
|
173
|
-
|
|
174
|
-
## Examples
|
|
175
|
-
|
|
176
|
-
| Task | Method | Command |
|
|
177
|
-
|------|--------|---------|
|
|
178
|
-
| Quick article | Auto | `node smart-web-fetch.js fetch https://blog.example.com` |
|
|
179
|
-
| API docs | Tavily | `node smart-web-fetch.js fetch https://docs.example.com tavily` |
|
|
180
|
-
| SPA site | Browser | `node smart-web-fetch.js fetch https://spa.example.com browser` |
|
|
181
|
-
| Search | Tavily | `node smart-web-fetch.js search "Python async" 5` |
|
|
182
|
-
| Fallback test | Native | `node smart-web-fetch.js fetch https://example.com native` |
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"name": "smart-web-fetch",
|
|
3
|
-
"version": "1.0.0",
|
|
4
|
-
"description": "Intelligent web fetching with automatic Tavily → Scrapling fallback",
|
|
5
|
-
"main": "smart-web-fetch.js",
|
|
6
|
-
"bin": {
|
|
7
|
-
"smart-web-fetch": "./smart-web-fetch.js"
|
|
8
|
-
},
|
|
9
|
-
"scripts": {
|
|
10
|
-
"test": "node smart-web-fetch.js fetch https://example.com"
|
|
11
|
-
},
|
|
12
|
-
"keywords": ["web-scraping", "tavily", "scrapling", "fallback"],
|
|
13
|
-
"author": "R0_lobster",
|
|
14
|
-
"license": "MIT"
|
|
15
|
-
}
|
|
@@ -1,558 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
/**
|
|
3
|
-
* Smart Web Fetch Skill - Simplified 3-Layer Strategy
|
|
4
|
-
* 三层策略: Tavily → LLM Native → Browser
|
|
5
|
-
* 移除 mcporter, 直接 HTTP 调用, Key 从环境变量获取
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
const { execSync } = require('child_process');
|
|
9
|
-
const https = require('https');
|
|
10
|
-
|
|
11
|
-
// Configuration
|
|
12
|
-
const TAVILY_TIMEOUT = 30000;
|
|
13
|
-
const BROWSER_TIMEOUT = 90000;
|
|
14
|
-
const MIN_CONTENT_LENGTH = 200;
|
|
15
|
-
const MAX_RETRIES = 2;
|
|
16
|
-
|
|
17
|
-
// Blocked content keywords
|
|
18
|
-
const BLOCKED_KEYWORDS = [
|
|
19
|
-
// Chinese
|
|
20
|
-
'验证', 'captcha', '请登录', '环境异常', '登录后', '需要验证',
|
|
21
|
-
'请完成验证', '安全检查', '访问受限', 'blocked', 'access denied',
|
|
22
|
-
'拖动滑块', '完成拼图', '点击验证', '继续访问', '登录查看',
|
|
23
|
-
'验证后即可', '异常访问', '安全验证', '人机验证',
|
|
24
|
-
// English
|
|
25
|
-
'verify', 'verification', 'complete the verification', 'captcha required',
|
|
26
|
-
'please log in', 'sign in to', 'access denied', 'blocked',
|
|
27
|
-
'security check', 'human verification', 'prove you\'re human'
|
|
28
|
-
];
|
|
29
|
-
|
|
30
|
-
/**
|
|
31
|
-
* Check if content is blocked or low quality
|
|
32
|
-
*/
|
|
33
|
-
function isBlockedOrLowQuality(content, source = 'unknown') {
|
|
34
|
-
if (!content || content.length < MIN_CONTENT_LENGTH) {
|
|
35
|
-
return { blocked: true, reason: 'Content too short or empty', severity: 'high' };
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
const lowerContent = content.toLowerCase();
|
|
39
|
-
const foundKeywords = [];
|
|
40
|
-
|
|
41
|
-
for (const keyword of BLOCKED_KEYWORDS) {
|
|
42
|
-
if (lowerContent.includes(keyword.toLowerCase())) {
|
|
43
|
-
foundKeywords.push(keyword);
|
|
44
|
-
}
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
if (foundKeywords.length > 0) {
|
|
48
|
-
const isLikelyBlocked = foundKeywords.some(k =>
|
|
49
|
-
['验证', 'captcha', '环境异常', '请登录', '拖动滑块'].includes(k)
|
|
50
|
-
);
|
|
51
|
-
return {
|
|
52
|
-
blocked: isLikelyBlocked,
|
|
53
|
-
reason: `Detected keywords: ${foundKeywords.slice(0, 3).join(', ')}`,
|
|
54
|
-
severity: isLikelyBlocked ? 'high' : 'medium'
|
|
55
|
-
};
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
return { blocked: false };
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
/**
|
|
62
|
-
* Calculate content quality score
|
|
63
|
-
*/
|
|
64
|
-
function calculateQualityScore(content) {
|
|
65
|
-
if (!content) return 0;
|
|
66
|
-
|
|
67
|
-
let score = 0;
|
|
68
|
-
const length = content.length;
|
|
69
|
-
|
|
70
|
-
// Length score (0-30)
|
|
71
|
-
score += Math.min(Math.log10(length) * 10, 30);
|
|
72
|
-
|
|
73
|
-
// Content density (0-25)
|
|
74
|
-
const wordCount = content.split(/\s+/).length;
|
|
75
|
-
const avgWordLength = content.length / wordCount;
|
|
76
|
-
if (avgWordLength > 3 && avgWordLength < 15) score += 15;
|
|
77
|
-
if (content.match(/[。\.]/g)?.length > 5) score += 10;
|
|
78
|
-
|
|
79
|
-
// Structure indicators (0-25)
|
|
80
|
-
if (content.includes('#') || content.includes('##')) score += 8;
|
|
81
|
-
if (content.includes('###')) score += 5;
|
|
82
|
-
if (content.includes('- ') || content.includes('* ')) score += 6;
|
|
83
|
-
if (content.includes('```')) score += 6;
|
|
84
|
-
|
|
85
|
-
// Rich content indicators (0-20)
|
|
86
|
-
if (content.match(/\[.*?\]\(.*?\)/)) score += 5;
|
|
87
|
-
if (content.match(/\!\[.*?\]\(.*?\)/)) score += 5;
|
|
88
|
-
if (content.match(/\*\*.*?\*\*/)) score += 5;
|
|
89
|
-
if (content.match(/`.*?`/)) score += 5;
|
|
90
|
-
|
|
91
|
-
return Math.min(score / 100, 1.0);
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
/**
|
|
95
|
-
* Level 1: Tavily API (HTTP direct call)
|
|
96
|
-
*/
|
|
97
|
-
function tryTavily(url, retries = 0) {
|
|
98
|
-
console.error(`[SmartFetch] Level 1: Trying Tavily for: ${url}`);
|
|
99
|
-
|
|
100
|
-
const apiKey = process.env.TAVILY_API_KEY;
|
|
101
|
-
if (!apiKey) {
|
|
102
|
-
return {
|
|
103
|
-
success: false,
|
|
104
|
-
tool: 'tavily',
|
|
105
|
-
error: 'TAVILY_API_KEY not set in environment',
|
|
106
|
-
needs_fallback: true
|
|
107
|
-
};
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
return new Promise((resolve) => {
|
|
111
|
-
const postData = JSON.stringify({
|
|
112
|
-
urls: [url],
|
|
113
|
-
api_key: apiKey,
|
|
114
|
-
extract_depth: 'advanced',
|
|
115
|
-
include_images: false
|
|
116
|
-
});
|
|
117
|
-
|
|
118
|
-
const options = {
|
|
119
|
-
hostname: 'api.tavily.com',
|
|
120
|
-
path: '/extract',
|
|
121
|
-
method: 'POST',
|
|
122
|
-
headers: {
|
|
123
|
-
'Content-Type': 'application/json',
|
|
124
|
-
'Content-Length': Buffer.byteLength(postData)
|
|
125
|
-
},
|
|
126
|
-
timeout: TAVILY_TIMEOUT
|
|
127
|
-
};
|
|
128
|
-
|
|
129
|
-
const req = https.request(options, (res) => {
|
|
130
|
-
let data = '';
|
|
131
|
-
|
|
132
|
-
res.on('data', chunk => data += chunk);
|
|
133
|
-
|
|
134
|
-
res.on('end', () => {
|
|
135
|
-
try {
|
|
136
|
-
const response = JSON.parse(data);
|
|
137
|
-
|
|
138
|
-
if (response.results && response.results[0]) {
|
|
139
|
-
const result = response.results[0];
|
|
140
|
-
const content = result.raw_content || result.content || '';
|
|
141
|
-
|
|
142
|
-
if (content.length > MIN_CONTENT_LENGTH) {
|
|
143
|
-
resolve({
|
|
144
|
-
success: true,
|
|
145
|
-
tool: 'tavily',
|
|
146
|
-
content: content,
|
|
147
|
-
title: result.title || '',
|
|
148
|
-
url: result.url || url
|
|
149
|
-
});
|
|
150
|
-
} else {
|
|
151
|
-
resolve({
|
|
152
|
-
success: false,
|
|
153
|
-
tool: 'tavily',
|
|
154
|
-
error: 'Content too short',
|
|
155
|
-
needs_fallback: true
|
|
156
|
-
});
|
|
157
|
-
}
|
|
158
|
-
} else if (response.error) {
|
|
159
|
-
resolve({
|
|
160
|
-
success: false,
|
|
161
|
-
tool: 'tavily',
|
|
162
|
-
error: response.error,
|
|
163
|
-
needs_fallback: true
|
|
164
|
-
});
|
|
165
|
-
} else {
|
|
166
|
-
resolve({
|
|
167
|
-
success: false,
|
|
168
|
-
tool: 'tavily',
|
|
169
|
-
error: 'No results',
|
|
170
|
-
needs_fallback: true
|
|
171
|
-
});
|
|
172
|
-
}
|
|
173
|
-
} catch (e) {
|
|
174
|
-
resolve({
|
|
175
|
-
success: false,
|
|
176
|
-
tool: 'tavily',
|
|
177
|
-
error: `Parse error: ${e.message}`,
|
|
178
|
-
needs_fallback: true
|
|
179
|
-
});
|
|
180
|
-
}
|
|
181
|
-
});
|
|
182
|
-
});
|
|
183
|
-
|
|
184
|
-
req.on('error', (err) => {
|
|
185
|
-
if (retries < MAX_RETRIES) {
|
|
186
|
-
console.error(`[SmartFetch] Tavily error, retrying... (${retries + 1}/${MAX_RETRIES})`);
|
|
187
|
-
resolve(tryTavily(url, retries + 1));
|
|
188
|
-
} else {
|
|
189
|
-
resolve({
|
|
190
|
-
success: false,
|
|
191
|
-
tool: 'tavily',
|
|
192
|
-
error: err.message,
|
|
193
|
-
needs_fallback: true
|
|
194
|
-
});
|
|
195
|
-
}
|
|
196
|
-
});
|
|
197
|
-
|
|
198
|
-
req.on('timeout', () => {
|
|
199
|
-
req.destroy();
|
|
200
|
-
if (retries < MAX_RETRIES) {
|
|
201
|
-
console.error(`[SmartFetch] Tavily timeout, retrying... (${retries + 1}/${MAX_RETRIES})`);
|
|
202
|
-
resolve(tryTavily(url, retries + 1));
|
|
203
|
-
} else {
|
|
204
|
-
resolve({
|
|
205
|
-
success: false,
|
|
206
|
-
tool: 'tavily',
|
|
207
|
-
error: 'Timeout',
|
|
208
|
-
needs_fallback: true
|
|
209
|
-
});
|
|
210
|
-
}
|
|
211
|
-
});
|
|
212
|
-
|
|
213
|
-
req.write(postData);
|
|
214
|
-
req.end();
|
|
215
|
-
});
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
/**
|
|
219
|
-
* Level 2: LLM Native Fetch (return instruction for caller)
|
|
220
|
-
*/
|
|
221
|
-
function tryLLMNative(url) {
|
|
222
|
-
console.error(`[SmartFetch] Level 2: LLM Native Fetch for: ${url}`);
|
|
223
|
-
|
|
224
|
-
return {
|
|
225
|
-
success: false,
|
|
226
|
-
tool: 'llm_native',
|
|
227
|
-
error: 'LLM Native fetch requires caller to use FetchURL tool',
|
|
228
|
-
instruction: `Use FetchURL tool to fetch "${url}" and return the content`,
|
|
229
|
-
needs_fallback: true,
|
|
230
|
-
native_fetch: true,
|
|
231
|
-
url: url
|
|
232
|
-
};
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
/**
|
|
236
|
-
* Check if browser-use is installed locally
|
|
237
|
-
*/
|
|
238
|
-
function isBrowserUseInstalled() {
|
|
239
|
-
try {
|
|
240
|
-
execSync('/opt/homebrew/bin/python3.11 -c "import browser_use"', {
|
|
241
|
-
encoding: 'utf-8',
|
|
242
|
-
timeout: 5000,
|
|
243
|
-
stdio: 'pipe'
|
|
244
|
-
});
|
|
245
|
-
return true;
|
|
246
|
-
} catch (e) {
|
|
247
|
-
return false;
|
|
248
|
-
}
|
|
249
|
-
}
|
|
250
|
-
|
|
251
|
-
/**
|
|
252
|
-
* Level 3: Browser Automation (Local first, then Cloud)
|
|
253
|
-
*/
|
|
254
|
-
async function tryBrowser(url) {
|
|
255
|
-
console.error(`[SmartFetch] Level 3: Trying Browser automation for: ${url}`);
|
|
256
|
-
|
|
257
|
-
// Try local browser-use first
|
|
258
|
-
if (isBrowserUseInstalled()) {
|
|
259
|
-
console.error('[SmartFetch] Using local browser-use...');
|
|
260
|
-
|
|
261
|
-
try {
|
|
262
|
-
const result = execSync(
|
|
263
|
-
`/opt/homebrew/bin/python3.11 -c "
|
|
264
|
-
import asyncio
|
|
265
|
-
import sys
|
|
266
|
-
from browser_use import Browser, BrowserConfig
|
|
267
|
-
|
|
268
|
-
async def fetch():
|
|
269
|
-
browser = Browser(config=BrowserConfig(headless=True))
|
|
270
|
-
await browser.start()
|
|
271
|
-
try:
|
|
272
|
-
page = await browser.get_current_page()
|
|
273
|
-
await page.goto('${url}', wait_until='networkidle')
|
|
274
|
-
content = await page.content()
|
|
275
|
-
title = await page.title()
|
|
276
|
-
print(f'TITLE:{title}')
|
|
277
|
-
print('---CONTENT---')
|
|
278
|
-
print(content)
|
|
279
|
-
finally:
|
|
280
|
-
await browser.stop()
|
|
281
|
-
|
|
282
|
-
asyncio.run(fetch())
|
|
283
|
-
"`,
|
|
284
|
-
{
|
|
285
|
-
encoding: 'utf-8',
|
|
286
|
-
timeout: BROWSER_TIMEOUT,
|
|
287
|
-
stdio: ['pipe', 'pipe', 'pipe']
|
|
288
|
-
}
|
|
289
|
-
);
|
|
290
|
-
|
|
291
|
-
// Parse output
|
|
292
|
-
const lines = result.split('\n');
|
|
293
|
-
let title = '';
|
|
294
|
-
let content = result;
|
|
295
|
-
|
|
296
|
-
for (const line of lines) {
|
|
297
|
-
if (line.startsWith('TITLE:')) {
|
|
298
|
-
title = line.substring(6);
|
|
299
|
-
} else if (line === '---CONTENT---') {
|
|
300
|
-
const idx = lines.indexOf(line);
|
|
301
|
-
content = lines.slice(idx + 1).join('\n');
|
|
302
|
-
break;
|
|
303
|
-
}
|
|
304
|
-
}
|
|
305
|
-
|
|
306
|
-
// Convert HTML to text (simple)
|
|
307
|
-
const textContent = content
|
|
308
|
-
.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
|
|
309
|
-
.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
|
|
310
|
-
.replace(/<[^>]+>/g, ' ')
|
|
311
|
-
.replace(/\s+/g, ' ')
|
|
312
|
-
.replace(/</g, '<')
|
|
313
|
-
.replace(/>/g, '>')
|
|
314
|
-
.replace(/&/g, '&')
|
|
315
|
-
.replace(/"/g, '"')
|
|
316
|
-
.trim();
|
|
317
|
-
|
|
318
|
-
if (textContent.length > MIN_CONTENT_LENGTH) {
|
|
319
|
-
return {
|
|
320
|
-
success: true,
|
|
321
|
-
tool: 'browser_local',
|
|
322
|
-
content: textContent,
|
|
323
|
-
title: title,
|
|
324
|
-
url: url
|
|
325
|
-
};
|
|
326
|
-
}
|
|
327
|
-
|
|
328
|
-
return {
|
|
329
|
-
success: false,
|
|
330
|
-
tool: 'browser_local',
|
|
331
|
-
error: 'Content too short'
|
|
332
|
-
};
|
|
333
|
-
|
|
334
|
-
} catch (error) {
|
|
335
|
-
console.error(`[SmartFetch] Local browser failed: ${error.message.split('\n')[0]}`);
|
|
336
|
-
// Fall through to cloud
|
|
337
|
-
}
|
|
338
|
-
} else {
|
|
339
|
-
console.error('[SmartFetch] Local browser-use not installed, trying cloud...');
|
|
340
|
-
}
|
|
341
|
-
|
|
342
|
-
// Try Cloud browser-use (if API key available)
|
|
343
|
-
const cloudApiKey = process.env.BROWSER_USE_API_KEY;
|
|
344
|
-
if (!cloudApiKey) {
|
|
345
|
-
return {
|
|
346
|
-
success: false,
|
|
347
|
-
tool: 'browser',
|
|
348
|
-
error: 'Browser automation failed. Local browser-use not installed, and BROWSER_USE_API_KEY not set for cloud.'
|
|
349
|
-
};
|
|
350
|
-
}
|
|
351
|
-
|
|
352
|
-
// Cloud browser-use would be implemented here
|
|
353
|
-
// For now, return error with setup instructions
|
|
354
|
-
return {
|
|
355
|
-
success: false,
|
|
356
|
-
tool: 'browser_cloud',
|
|
357
|
-
error: 'Cloud browser-use not yet implemented. Please install local browser-use: pip install browser-use && playwright install chromium'
|
|
358
|
-
};
|
|
359
|
-
}
|
|
360
|
-
|
|
361
|
-
/**
|
|
362
|
-
* Smart fetch with 3-layer fallback
|
|
363
|
-
* Strategy: Tavily → LLM Native → Browser
|
|
364
|
-
*/
|
|
365
|
-
async function smartFetch(url, options = {}) {
|
|
366
|
-
const method = options.method || 'auto';
|
|
367
|
-
const skipQualityCheck = options.skipQualityCheck || false;
|
|
368
|
-
|
|
369
|
-
console.error(`[SmartFetch] Starting fetch for: ${url} (method: ${method})`);
|
|
370
|
-
|
|
371
|
-
// Explicit method selection
|
|
372
|
-
if (method !== 'auto') {
|
|
373
|
-
switch (method) {
|
|
374
|
-
case 'tavily': return await tryTavily(url);
|
|
375
|
-
case 'native': return tryLLMNative(url);
|
|
376
|
-
case 'browser': return await tryBrowser(url);
|
|
377
|
-
default: return { success: false, error: 'Unknown method' };
|
|
378
|
-
}
|
|
379
|
-
}
|
|
380
|
-
|
|
381
|
-
// Auto mode: 3-layer cascade
|
|
382
|
-
|
|
383
|
-
// Level 1: Tavily
|
|
384
|
-
const tavilyResult = await tryTavily(url);
|
|
385
|
-
|
|
386
|
-
if (tavilyResult.success) {
|
|
387
|
-
const quality = isBlockedOrLowQuality(tavilyResult.content, 'tavily');
|
|
388
|
-
|
|
389
|
-
if (skipQualityCheck || !quality.blocked) {
|
|
390
|
-
const score = calculateQualityScore(tavilyResult.content);
|
|
391
|
-
console.error(`[SmartFetch] ✓ Tavily succeeded (quality: ${score.toFixed(2)})`);
|
|
392
|
-
|
|
393
|
-
return {
|
|
394
|
-
...tavilyResult,
|
|
395
|
-
fallback_used: false,
|
|
396
|
-
quality_score: score,
|
|
397
|
-
quality_check: 'passed'
|
|
398
|
-
};
|
|
399
|
-
}
|
|
400
|
-
|
|
401
|
-
console.error(`[SmartFetch] Tavily content blocked: ${quality.reason}`);
|
|
402
|
-
} else {
|
|
403
|
-
console.error(`[SmartFetch] Tavily failed: ${tavilyResult.error}`);
|
|
404
|
-
|
|
405
|
-
// If Tavily key not set, skip to next level
|
|
406
|
-
if (!tavilyResult.needs_fallback) {
|
|
407
|
-
return tavilyResult;
|
|
408
|
-
}
|
|
409
|
-
}
|
|
410
|
-
|
|
411
|
-
// Level 2: LLM Native Fetch
|
|
412
|
-
console.error('[SmartFetch] Falling back to LLM Native...');
|
|
413
|
-
const nativeResult = tryLLMNative(url);
|
|
414
|
-
|
|
415
|
-
// Return instruction for caller to handle
|
|
416
|
-
return {
|
|
417
|
-
...nativeResult,
|
|
418
|
-
fallback_used: true,
|
|
419
|
-
fallback_chain: ['tavily']
|
|
420
|
-
};
|
|
421
|
-
|
|
422
|
-
// Note: Browser (Level 3) is called by the agent if native fetch fails
|
|
423
|
-
}
|
|
424
|
-
|
|
425
|
-
/**
|
|
426
|
-
* Smart search with Tavily (HTTP direct)
|
|
427
|
-
*/
|
|
428
|
-
function smartSearch(query, maxResults = 5) {
|
|
429
|
-
console.error(`[SmartFetch] Searching: ${query}`);
|
|
430
|
-
|
|
431
|
-
const apiKey = process.env.TAVILY_API_KEY;
|
|
432
|
-
if (!apiKey) {
|
|
433
|
-
return {
|
|
434
|
-
success: false,
|
|
435
|
-
error: 'TAVILY_API_KEY not set in environment'
|
|
436
|
-
};
|
|
437
|
-
}
|
|
438
|
-
|
|
439
|
-
return new Promise((resolve) => {
|
|
440
|
-
const postData = JSON.stringify({
|
|
441
|
-
query: query,
|
|
442
|
-
api_key: apiKey,
|
|
443
|
-
max_results: maxResults,
|
|
444
|
-
search_depth: 'advanced',
|
|
445
|
-
include_answer: true
|
|
446
|
-
});
|
|
447
|
-
|
|
448
|
-
const options = {
|
|
449
|
-
hostname: 'api.tavily.com',
|
|
450
|
-
path: '/search',
|
|
451
|
-
method: 'POST',
|
|
452
|
-
headers: {
|
|
453
|
-
'Content-Type': 'application/json',
|
|
454
|
-
'Content-Length': Buffer.byteLength(postData)
|
|
455
|
-
},
|
|
456
|
-
timeout: TAVILY_TIMEOUT
|
|
457
|
-
};
|
|
458
|
-
|
|
459
|
-
const req = https.request(options, (res) => {
|
|
460
|
-
let data = '';
|
|
461
|
-
res.on('data', chunk => data += chunk);
|
|
462
|
-
res.on('end', () => {
|
|
463
|
-
try {
|
|
464
|
-
const response = JSON.parse(data);
|
|
465
|
-
resolve({
|
|
466
|
-
success: true,
|
|
467
|
-
query: query,
|
|
468
|
-
results: response.results || [],
|
|
469
|
-
answer: response.answer || ''
|
|
470
|
-
});
|
|
471
|
-
} catch (e) {
|
|
472
|
-
resolve({ success: false, error: `Parse error: ${e.message}` });
|
|
473
|
-
}
|
|
474
|
-
});
|
|
475
|
-
});
|
|
476
|
-
|
|
477
|
-
req.on('error', (err) => {
|
|
478
|
-
resolve({ success: false, error: err.message });
|
|
479
|
-
});
|
|
480
|
-
|
|
481
|
-
req.on('timeout', () => {
|
|
482
|
-
req.destroy();
|
|
483
|
-
resolve({ success: false, error: 'Timeout' });
|
|
484
|
-
});
|
|
485
|
-
|
|
486
|
-
req.write(postData);
|
|
487
|
-
req.end();
|
|
488
|
-
});
|
|
489
|
-
}
|
|
490
|
-
|
|
491
|
-
// CLI interface
|
|
492
|
-
if (require.main === module) {
|
|
493
|
-
const args = process.argv.slice(2);
|
|
494
|
-
const command = args[0];
|
|
495
|
-
|
|
496
|
-
if (command === 'fetch') {
|
|
497
|
-
const url = args[1];
|
|
498
|
-
const method = args[2] || 'auto';
|
|
499
|
-
|
|
500
|
-
if (!url) {
|
|
501
|
-
console.log(JSON.stringify({ error: 'URL required' }));
|
|
502
|
-
process.exit(1);
|
|
503
|
-
}
|
|
504
|
-
|
|
505
|
-
smartFetch(url, { method }).then(result => {
|
|
506
|
-
console.log(JSON.stringify(result, null, 2));
|
|
507
|
-
}).catch(err => {
|
|
508
|
-
console.log(JSON.stringify({ error: err.message }));
|
|
509
|
-
process.exit(1);
|
|
510
|
-
});
|
|
511
|
-
}
|
|
512
|
-
|
|
513
|
-
else if (command === 'search') {
|
|
514
|
-
const query = args[1];
|
|
515
|
-
const maxResults = parseInt(args[2]) || 5;
|
|
516
|
-
|
|
517
|
-
if (!query) {
|
|
518
|
-
console.log(JSON.stringify({ error: 'Query required' }));
|
|
519
|
-
process.exit(1);
|
|
520
|
-
}
|
|
521
|
-
|
|
522
|
-
smartSearch(query, maxResults).then(result => {
|
|
523
|
-
console.log(JSON.stringify(result, null, 2));
|
|
524
|
-
});
|
|
525
|
-
}
|
|
526
|
-
|
|
527
|
-
else {
|
|
528
|
-
console.log(`
|
|
529
|
-
Smart Web Fetch Skill - 3-Layer Strategy
|
|
530
|
-
|
|
531
|
-
Usage:
|
|
532
|
-
smart-web-fetch fetch <url> [method]
|
|
533
|
-
smart-web-fetch search <query> [max_results]
|
|
534
|
-
|
|
535
|
-
Methods: auto (default), tavily, native, browser
|
|
536
|
-
|
|
537
|
-
3-Layer Strategy:
|
|
538
|
-
1. Tavily - AI extraction, best quality (needs TAVILY_API_KEY)
|
|
539
|
-
2. LLM Native - Use FetchURL tool (for agents with native capability)
|
|
540
|
-
3. Browser - Local browser-use (fallback for stubborn pages)
|
|
541
|
-
|
|
542
|
-
Environment Variables:
|
|
543
|
-
TAVILY_API_KEY - Required for Tavily API
|
|
544
|
-
BROWSER_USE_API_KEY - Optional for cloud browser (local preferred)
|
|
545
|
-
|
|
546
|
-
Examples:
|
|
547
|
-
smart-web-fetch fetch https://example.com
|
|
548
|
-
smart-web-fetch fetch https://example.com tavily
|
|
549
|
-
smart-web-fetch search "OpenAI GPT-5" 10
|
|
550
|
-
|
|
551
|
-
Install local browser:
|
|
552
|
-
pip install browser-use
|
|
553
|
-
playwright install chromium
|
|
554
|
-
`);
|
|
555
|
-
}
|
|
556
|
-
}
|
|
557
|
-
|
|
558
|
-
module.exports = { smartFetch, smartSearch, tryTavily, tryLLMNative, tryBrowser };
|