fr-cli 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fr_cli/README.md +148 -0
- fr_cli/WEAPON.MD +186 -0
- fr_cli/__init__.py +4 -0
- fr_cli/addon/plugin.py +69 -0
- fr_cli/agent/__init__.py +9 -0
- fr_cli/agent/builtins/__init__.py +4 -0
- fr_cli/agent/builtins/_utils.py +48 -0
- fr_cli/agent/builtins/db.py +269 -0
- fr_cli/agent/builtins/local.py +105 -0
- fr_cli/agent/builtins/rag.py +652 -0
- fr_cli/agent/builtins/rag_watcher_daemon.py +156 -0
- fr_cli/agent/builtins/remote.py +214 -0
- fr_cli/agent/builtins/spider.py +247 -0
- fr_cli/agent/client.py +164 -0
- fr_cli/agent/executor.py +86 -0
- fr_cli/agent/generator.py +104 -0
- fr_cli/agent/manager.py +193 -0
- fr_cli/agent/master.py +604 -0
- fr_cli/agent/master_prompt.py +118 -0
- fr_cli/agent/remote.py +70 -0
- fr_cli/agent/server.py +279 -0
- fr_cli/agent/workflow.py +164 -0
- fr_cli/breakthrough/update.py +154 -0
- fr_cli/command/__init__.py +4 -0
- fr_cli/command/executor.py +276 -0
- fr_cli/command/registry.py +1034 -0
- fr_cli/command/security.py +30 -0
- fr_cli/conf/config.py +126 -0
- fr_cli/conf/wizard.py +172 -0
- fr_cli/core/chat.py +280 -0
- fr_cli/core/core.py +111 -0
- fr_cli/core/intent.py +129 -0
- fr_cli/core/recommender.py +71 -0
- fr_cli/core/stream.py +83 -0
- fr_cli/core/sysmon.py +117 -0
- fr_cli/core/thinking.py +215 -0
- fr_cli/gatekeeper/__init__.py +7 -0
- fr_cli/gatekeeper/daemon.py +216 -0
- fr_cli/gatekeeper/manager.py +218 -0
- fr_cli/lang/i18n.py +827 -0
- fr_cli/main.py +329 -0
- fr_cli/memory/context.py +119 -0
- fr_cli/memory/history.py +96 -0
- fr_cli/memory/session.py +134 -0
- fr_cli/repl/__init__.py +0 -0
- fr_cli/repl/commands.py +1098 -0
- fr_cli/security/security.py +46 -0
- fr_cli/ui/ui.py +116 -0
- fr_cli/weapon/cron.py +217 -0
- fr_cli/weapon/dataframe.py +97 -0
- fr_cli/weapon/disk.py +141 -0
- fr_cli/weapon/fs.py +206 -0
- fr_cli/weapon/launcher.py +249 -0
- fr_cli/weapon/loader.py +98 -0
- fr_cli/weapon/mail.py +227 -0
- fr_cli/weapon/mcp.py +204 -0
- fr_cli/weapon/vision.py +74 -0
- fr_cli/weapon/web.py +88 -0
- fr_cli-2.1.0.dist-info/METADATA +227 -0
- fr_cli-2.1.0.dist-info/RECORD +64 -0
- fr_cli-2.1.0.dist-info/WHEEL +5 -0
- fr_cli-2.1.0.dist-info/entry_points.txt +2 -0
- fr_cli-2.1.0.dist-info/licenses/LICENSE +21 -0
- fr_cli-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RAG 知识库守护进程 —— 后台藏经阁守护
|
|
3
|
+
负责在主进程退出后继续监控知识库目录,新文件自动向量化入库。
|
|
4
|
+
|
|
5
|
+
启动方式(不应由用户直接调用):
|
|
6
|
+
python -m fr_cli.agent.builtins.rag_watcher_daemon --kb_dir <目录> [--db_path <路径>] [--interval <秒>]
|
|
7
|
+
|
|
8
|
+
停止方式:
|
|
9
|
+
创建 ~/.fr_cli_rag_watcher.stop 标记文件,守护进程检测到后自行退出。
|
|
10
|
+
"""
|
|
11
|
+
import os
|
|
12
|
+
import sys
|
|
13
|
+
import time
|
|
14
|
+
import signal
|
|
15
|
+
import atexit
|
|
16
|
+
import argparse
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from datetime import datetime
|
|
19
|
+
|
|
20
|
+
# Put the directory four levels above this file on sys.path so that the
# absolute ``fr_cli`` import performed in run_daemon resolves when this module
# is launched as a standalone process.
_project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)

# Runtime files; all of them live directly in the user's home directory.
PID_FILE = Path.home() / ".fr_cli_rag_watcher.pid"  # written by _write_pid, removed by _cleanup
STOP_FILE = Path.home() / ".fr_cli_rag_watcher.stop"  # its presence asks the daemon to exit
LOG_FILE = Path.home() / ".fr_cli_rag_watcher.log"  # appended to by _log
DB_PATH = Path.home() / ".fr_cli_rag_db"  # default value of --db_path

# Default number of seconds between two directory scans (--interval).
DEFAULT_INTERVAL = 30
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _log(msg):
|
|
34
|
+
"""写入日志文件并打印到 stderr(便于调试)"""
|
|
35
|
+
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
36
|
+
line = f"[{ts}] {msg}"
|
|
37
|
+
try:
|
|
38
|
+
with open(LOG_FILE, "a", encoding="utf-8") as f:
|
|
39
|
+
f.write(line + "\n")
|
|
40
|
+
except Exception:
|
|
41
|
+
pass
|
|
42
|
+
sys.stderr.write(line + "\n")
|
|
43
|
+
sys.stderr.flush()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _write_pid(pid):
    """Store *pid* as text in PID_FILE; a failure is logged, never raised."""
    try:
        PID_FILE.write_text(str(pid), encoding="utf-8")
    except Exception as exc:
        _log(f"写入 PID 文件失败: {exc}")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _clear_stop_marker():
    """Delete the stop-marker file when it exists; deletion errors are ignored."""
    if not STOP_FILE.exists():
        return
    try:
        STOP_FILE.unlink()
    except Exception:
        pass
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _cleanup():
    """Remove the stop marker and the PID file, then log the shutdown once.

    The function is registered with atexit *and* called explicitly on every
    exit path, so it is made idempotent: only the first call in a process
    does any work, which avoids duplicate "exited" log lines.
    """
    if getattr(_cleanup, "_done", False):
        return
    _cleanup._done = True

    _clear_stop_marker()
    try:
        PID_FILE.unlink()
    except FileNotFoundError:
        # Nothing to remove (the PID file was never written or is already gone).
        pass
    except OSError as e:
        # Best-effort: report the stale PID file instead of hiding the failure.
        _log(f"删除 PID 文件失败: {e}")
    _log("守护进程已清理并退出。")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _setup_signal_handlers():
|
|
73
|
+
def _sigterm_handler(signum, frame):
|
|
74
|
+
_log(f"收到信号 {signum},准备退出...")
|
|
75
|
+
_cleanup()
|
|
76
|
+
sys.exit(0)
|
|
77
|
+
|
|
78
|
+
signal.signal(signal.SIGTERM, _sigterm_handler)
|
|
79
|
+
if hasattr(signal, "SIGINT"):
|
|
80
|
+
signal.signal(signal.SIGINT, _sigterm_handler)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _parse_args():
    """Parse the daemon's command line and return the argparse namespace.

    Options: --kb_dir (required), --db_path (defaults to DB_PATH) and
    --interval (integer seconds, defaults to DEFAULT_INTERVAL).
    """
    cli = argparse.ArgumentParser(description="RAG Knowledge Base Watcher Daemon")
    option_table = (
        ("--kb_dir", {"required": True, "help": "知识库目录路径"}),
        ("--db_path", {"default": str(DB_PATH), "help": "ChromaDB 持久化路径"}),
        ("--interval", {"type": int, "default": DEFAULT_INTERVAL, "help": "扫描间隔(秒,默认30)"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _wait_for_next_scan(interval):
    """Wait up to *interval* seconds; return True as soon as STOP_FILE exists."""
    deadline = time.monotonic() + interval
    while True:
        if STOP_FILE.exists():
            return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        # Sleep in short slices so a stop request is honoured within ~1 second.
        time.sleep(min(1.0, remaining))


def run_daemon():
    """Main loop of the knowledge-base watcher daemon.

    Parses the command line, records the PID, installs the exit hooks, runs
    one full synchronisation and then re-synchronises the knowledge-base
    directory every ``interval`` seconds (minimum 5) until the stop marker
    file appears or a termination signal arrives.

    Exits with status 1 when the knowledge-base directory is missing or the
    RAG manager cannot be imported or constructed.
    """
    args = _parse_args()
    kb_dir = Path(args.kb_dir)
    db_path = Path(args.db_path)
    interval = max(5, args.interval)
    # Heartbeat about once per hour whatever the interval is (120 scans at 30 s).
    heartbeat_every = max(1, 3600 // interval)

    _clear_stop_marker()
    _write_pid(os.getpid())
    atexit.register(_cleanup)
    _setup_signal_handlers()

    _log("=" * 50)
    _log("RAG 知识库守护进程启动")
    _log(f" 知识库目录: {kb_dir}")
    _log(f" 向量数据库: {db_path}")
    _log(f" 扫描间隔: {interval} 秒")
    _log("=" * 50)

    # A regular file at kb_dir is as unusable as a missing path.
    if not kb_dir.is_dir():
        _log(f"错误: 知识库目录不存在: {kb_dir}")
        _cleanup()
        sys.exit(1)

    # Imported lazily to keep start-up cheap.
    try:
        from fr_cli.agent.builtins.rag import RAGManager
    except Exception as e:
        _log(f"导入 RAGManager 失败: {e}")
        _cleanup()
        sys.exit(1)

    try:
        mgr = RAGManager(kb_dir=str(kb_dir), db_path=str(db_path))
    except Exception as e:
        # Without this guard a constructor failure would leave no trace in the log.
        _log(f"初始化 RAGManager 失败: {e}")
        _cleanup()
        sys.exit(1)

    # Initial full synchronisation.
    _log("开始首次全量同步...")
    try:
        ok, msg = mgr.sync_directory()
        _log(f"首次同步结果: {msg}")
    except Exception as e:
        _log(f"首次同步异常: {e}")

    # Periodic scans until a stop is requested.
    scan_count = 0
    while not _wait_for_next_scan(interval):
        scan_count += 1
        try:
            ok, msg = mgr.sync_directory()
        except Exception as e:
            _log(f"扫描 #{scan_count} 异常: {e}")
            continue
        if not ok:
            # Failures are reported immediately instead of waiting for a heartbeat.
            _log(f"扫描 #{scan_count} 失败: {msg}")
        elif "已是最新状态" not in msg:
            _log(f"扫描 #{scan_count}: {msg}")
        elif scan_count % heartbeat_every == 0:
            _log(f"心跳 #{scan_count}: 知识库监控正常,{msg}")

    _log("检测到停止标记,准备退出...")
    _cleanup()
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# Script entry point: start the daemon only when this module is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    run_daemon()
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@remote 内置 Agent —— 远程 SSH 操作助手
|
|
3
|
+
支持多机配置、配置向导、AI 生成远程命令。
|
|
4
|
+
"""
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
# JSON file in the user's home directory that holds the configured hosts.
REMOTE_CFG_PATH = Path.home() / ".fr_cli_remotes.json"

# System prompt used when asking the model for a remote command. It instructs
# the model to answer with the bare command only, to answer with a line
# starting with "COMMENT:" for dangerous requests, to prefer standard
# Linux/Unix commands, to join several commands into one line and to avoid
# interactive programs. ``{os_name}`` is filled in via str.format.
REMOTE_SYS_PROMPT = """你是一个远程系统命令专家。请根据目标主机的操作系统类型和用户需求,生成最合适的远程命令。

规则:
1. 只输出命令本身,不要任何解释、不要 markdown 代码块、不要多余文字
2. 如果需求涉及危险操作,输出 COMMENT: 开头的注释警告
3. 优先使用标准 Linux/Unix 命令(目标多为服务器)
4. 如果需要多条命令,用 && 或 ; 连接成一行
5. 避免交互式命令(如 vim、top),使用非交互式替代

目标主机 OS: {os_name}
"""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _load_hosts():
    """Return whatever load_json_config reads from REMOTE_CFG_PATH."""
    from fr_cli.agent.builtins import _utils

    return _utils.load_json_config(REMOTE_CFG_PATH)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _save_hosts(hosts):
    """Persist *hosts* to REMOTE_CFG_PATH through save_json_config."""
    from fr_cli.agent.builtins import _utils

    _utils.save_json_config(REMOTE_CFG_PATH, hosts)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def list_hosts():
    """Return the stored host configuration exactly as _load_hosts yields it."""
    configured = _load_hosts()
    return configured
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def save_host(alias, ip, port, user, auth_type, auth_value):
    """Create or overwrite the entry *alias* in the host configuration.

    *port* is converted with int(); a falsy port falls back to 22.
    *auth_type* is expected to be "password" or "key".
    """
    hosts = _load_hosts()
    port_number = int(port) if port else 22
    entry = dict(
        ip=ip,
        port=port_number,
        user=user,
        auth_type=auth_type,
        auth_value=auth_value,
    )
    hosts[alias] = entry
    _save_hosts(hosts)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def delete_host(alias):
    """Remove *alias* from the stored hosts.

    Returns True when an entry was removed and saved, False when the alias
    was not configured.
    """
    hosts = _load_hosts()
    if alias not in hosts:
        return False
    hosts.pop(alias)
    _save_hosts(hosts)
    return True
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _exec_ssh(host_cfg, command):
|
|
58
|
+
"""通过 ssh 命令执行远程操作(使用 paramiko 避免命令注入)"""
|
|
59
|
+
ip = host_cfg["ip"]
|
|
60
|
+
port = host_cfg.get("port", 22)
|
|
61
|
+
user = host_cfg["user"]
|
|
62
|
+
auth_type = host_cfg.get("auth_type", "password")
|
|
63
|
+
auth_value = host_cfg.get("auth_value", "")
|
|
64
|
+
|
|
65
|
+
try:
|
|
66
|
+
import paramiko
|
|
67
|
+
except ImportError:
|
|
68
|
+
return None, "缺少 paramiko (pip install paramiko)"
|
|
69
|
+
|
|
70
|
+
client = paramiko.SSHClient()
|
|
71
|
+
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
|
72
|
+
try:
|
|
73
|
+
connect_kwargs = {
|
|
74
|
+
"hostname": ip,
|
|
75
|
+
"port": int(port),
|
|
76
|
+
"username": user,
|
|
77
|
+
"timeout": 30,
|
|
78
|
+
"look_for_keys": False,
|
|
79
|
+
}
|
|
80
|
+
if auth_type == "key":
|
|
81
|
+
connect_kwargs["key_filename"] = auth_value
|
|
82
|
+
else:
|
|
83
|
+
connect_kwargs["password"] = auth_value
|
|
84
|
+
|
|
85
|
+
client.connect(**connect_kwargs)
|
|
86
|
+
stdin, stdout, stderr = client.exec_command(command)
|
|
87
|
+
out = stdout.read().decode("utf-8", errors="ignore")
|
|
88
|
+
err = stderr.read().decode("utf-8", errors="ignore")
|
|
89
|
+
client.close()
|
|
90
|
+
return out + err, None
|
|
91
|
+
except Exception as e:
|
|
92
|
+
return None, str(e)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _detect_os(host_cfg):
    """Probe the remote operating system by running ``uname -s``.

    Returns ``("Unknown", error)`` when the command could not be executed,
    otherwise ``(name, None)`` where name is the stripped command output or
    "Linux" when that output is empty.
    """
    output, failure = _exec_ssh(host_cfg, "uname -s")
    if failure:
        return "Unknown", failure
    os_name = output.strip()
    if not os_name:
        os_name = "Linux"
    return os_name, None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _match_host(hosts, token):
    """Find *token* in *hosts* by exact alias, then case-insensitive alias or IP.

    Returns ``(alias, host_cfg)`` for the first match, else ``(None, None)``.
    """
    cfg = hosts.get(token)
    if cfg:
        return token, cfg
    lowered = token.lower()
    for name, candidate in hosts.items():
        if name.lower() == lowered or candidate.get("ip") == token:
            return name, candidate
    return None, None


def handle_remote(user_input, state):
    """Handle a request that starts with the ``@remote`` prefix.

    Flow: make sure at least one host is configured (running the wizard if
    not), resolve the target host and the requirement text, detect the remote
    OS, ask the model for a command, show it, and execute it over SSH only
    after the user confirmed.

    Args:
        user_input: Full input line, beginning with "@remote".
        state: Session object; this function reads ``state.lang``,
            ``state.client`` and ``state.model_name``.
    """
    from fr_cli.core.stream import stream_cnt
    from fr_cli.ui.ui import CYAN, GREEN, RED, YELLOW, DIM, RESET

    hosts = list_hosts()
    if not hosts:
        print(f"{YELLOW}未配置远程主机。正在启动配置向导...{RESET}")
        _setup_wizard(state.lang)
        hosts = list_hosts()
        if not hosts:
            print(f"{RED}配置取消,无法执行远程操作。{RESET}")
            return

    # Input format: @remote [alias/ip] requirement
    text = user_input[len("@remote"):].strip()
    parts = text.split(None, 1)

    if len(hosts) == 1:
        # A single configured host is the implicit target.
        requested, host_cfg = next(iter(hosts.items()))
        alias = requested
        requirement = text
        # If the user still addressed the host explicitly (documented usage),
        # drop that token so it is not sent to the model as part of the task.
        if parts and _match_host(hosts, parts[0])[1] is not None:
            requirement = parts[1] if len(parts) > 1 else ""
    else:
        if len(parts) < 2:
            print(f"{YELLOW}用法: @remote <别名/IP> <需求描述>{RESET}")
            print(f"{DIM}已配置主机: {', '.join(hosts.keys())}{RESET}")
            return
        requested = parts[0]
        requirement = parts[1]
        alias, host_cfg = _match_host(hosts, requested)

    if not host_cfg:
        print(f"{RED}未找到主机 [{requested}]。已配置: {', '.join(hosts.keys())}{RESET}")
        return

    if not requirement:
        # Nothing to ask the model for: show the usage instead of sending an
        # empty prompt.
        print(f"{YELLOW}用法: @remote <别名/IP> <需求描述>{RESET}")
        print(f"{DIM}已配置主机: {', '.join(hosts.keys())}{RESET}")
        return

    # Detect the remote OS; this doubles as a connectivity check.
    os_name, err = _detect_os(host_cfg)
    if err:
        print(f"{RED}无法连接主机 [{alias}]: {err}{RESET}")
        return

    prompt = REMOTE_SYS_PROMPT.format(os_name=os_name)
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": requirement},
    ]

    print(f"{CYAN}🧙 正在为 [{alias}]({os_name}) 生成远程命令...{RESET}")
    cmd_text, _, _ = stream_cnt(state.client, state.model_name, messages, state.lang, custom_prefix="", max_tokens=1024)
    cmd_text = cmd_text.strip()

    from fr_cli.agent.builtins._utils import strip_code_blocks
    cmd_text = strip_code_blocks(cmd_text)

    if not cmd_text:
        print(f"{RED}未能生成有效命令。{RESET}")
        return

    # The prompt asks the model to answer "COMMENT: ..." for dangerous
    # requests; such an answer is shown and never executed.
    if cmd_text.startswith("COMMENT:"):
        print(f"{YELLOW}{cmd_text}{RESET}")
        return

    print(f"\n{DIM}建议命令 ({alias}):{RESET}\n{CYAN}{cmd_text}{RESET}")
    from fr_cli.agent.builtins._utils import confirm_execute
    if not confirm_execute():
        print(f"{DIM}已取消。{RESET}")
        return

    out, err = _exec_ssh(host_cfg, cmd_text)
    if err:
        print(f"{RED}❌ 执行失败: {err}{RESET}")
    elif out.strip():
        # Only the first 3000 characters of the output are displayed.
        print(f"\n{GREEN}{out.strip()[:3000]}{RESET}")
    else:
        print(f"{GREEN}✅ 命令执行完成(无输出){RESET}")
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _setup_wizard(lang="zh"):
    """Interactive wizard that collects one remote host and saves it.

    Prompts for alias, IP, port, user name, authentication type and secret.
    The wizard stops without saving when a mandatory answer is empty, the
    port is not an integer in 1-65535, or the authentication type is neither
    "password" nor "key".

    Args:
        lang: Accepted for interface compatibility; not used by the wizard.
    """
    import getpass
    from fr_cli.ui.ui import CYAN, GREEN, YELLOW, DIM, RESET

    print(f"{CYAN}═══ 远程主机配置向导 ═══{RESET}")
    alias = input(f"{DIM}别名 (如: myserver): {RESET}").strip()
    if not alias:
        print(f"{YELLOW}别名不能为空。{RESET}")
        return
    ip = input(f"{DIM}IP 地址: {RESET}").strip()
    if not ip:
        print(f"{YELLOW}IP 不能为空。{RESET}")
        return

    port = input(f"{DIM}端口 [22]: {RESET}").strip() or "22"
    # Validate here: save_host would otherwise crash in int(port).
    try:
        port_number = int(port)
    except ValueError:
        port_number = 0
    if not 1 <= port_number <= 65535:
        print(f"{YELLOW}端口无效(应为 1-65535 之间的整数)。{RESET}")
        return

    user = input(f"{DIM}用户名: {RESET}").strip()
    if not user:
        print(f"{YELLOW}用户名不能为空。{RESET}")
        return

    auth = (input(f"{DIM}认证方式 (password/key) [password]: {RESET}").strip() or "password").lower()
    if auth not in ("password", "key"):
        print(f"{YELLOW}认证方式无效(仅支持 password 或 key)。{RESET}")
        return
    if auth == "key":
        auth_value = input(f"{DIM}私钥文件路径 (如: ~/.ssh/id_rsa): {RESET}").strip()
    else:
        # getpass keeps the password from being echoed to the terminal.
        auth_value = getpass.getpass(f"{DIM}密码: {RESET}").strip()

    save_host(alias, ip, port_number, user, auth, auth_value)
    print(f"{GREEN}✅ 主机 [{alias}] ({ip}) 已保存。{RESET}")
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
"""
|
|
2
|
+
@spider 内置 Agent —— 智能网页爬虫助手
|
|
3
|
+
模拟真人浏览行为,支持反爬自适应,使用 requests → selenium 降级策略。
|
|
4
|
+
"""
|
|
5
|
+
import random
|
|
6
|
+
import re
|
|
7
|
+
import time
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from urllib.parse import urljoin, urlparse
|
|
11
|
+
|
|
12
|
+
# 尝试导入可选依赖
|
|
13
|
+
_requests = None
|
|
14
|
+
_selenium = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _get_requests():
|
|
18
|
+
global _requests
|
|
19
|
+
if _requests is None:
|
|
20
|
+
try:
|
|
21
|
+
import requests as r
|
|
22
|
+
_requests = r
|
|
23
|
+
except ImportError:
|
|
24
|
+
pass
|
|
25
|
+
return _requests
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _get_selenium():
    """Return the selenium entry points, importing them on first use.

    The result is a dict with the keys "webdriver", "By" and "ActionChains",
    or None when selenium is not installed.
    """
    global _selenium
    if _selenium is not None:
        return _selenium
    try:
        from selenium import webdriver
        from selenium.webdriver.common.by import By
        from selenium.webdriver.common.action_chains import ActionChains
    except ImportError:
        return None
    _selenium = dict(webdriver=webdriver, By=By, ActionChains=ActionChains)
    return _selenium
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Browser User-Agent strings; one is picked at random for every fetch.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _sanitize_filename(url):
|
|
49
|
+
"""将 URL 转换为安全的文件名"""
|
|
50
|
+
parsed = urlparse(url)
|
|
51
|
+
path = parsed.path.strip("/").replace("/", "_")
|
|
52
|
+
if not path:
|
|
53
|
+
path = "index"
|
|
54
|
+
name = f"{parsed.netloc}_{path}"
|
|
55
|
+
name = re.sub(r'[^\w\-_]', '_', name)[:100]
|
|
56
|
+
return name + ".html"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _fetch_with_requests(url):
    """Download *url* with requests.

    Returns:
        ``(html, None)`` on success or ``(None, reason)`` when requests is
        not installed, the request fails, or the response looks like an
        anti-bot page (shorter than 500 characters, or containing "captcha"
        or "access denied").
    """
    requests = _get_requests()
    if not requests:
        return None, "requests 未安装 (pip install requests)"
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    try:
        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()
        # Without a charset in Content-Type, requests decodes text as
        # ISO-8859-1, which garbles UTF-8/GBK pages; use the encoding
        # detected from the body instead.
        if "charset" not in resp.headers.get("Content-Type", "").lower():
            resp.encoding = resp.apparent_encoding
        text = resp.text
    except Exception as e:
        return None, str(e)

    # Simple anti-bot heuristic.
    lowered = text.lower()
    if len(text) < 500 or "captcha" in lowered or "access denied" in lowered:
        return None, "可能触发反爬机制"
    return text, None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _fetch_with_selenium(url):
    """Load *url* in headless Chrome while imitating a human visitor.

    After the page is opened the function pauses, scrolls down a random
    number of times, hovers over one of the first ten links (best-effort),
    scrolls back to the top and returns the page source.

    Returns:
        ``(html, None)`` on success or ``(None, reason)`` when selenium is
        missing or the browser session fails. The driver is always quit.
    """
    sel = _get_selenium()
    if not sel:
        return None, "selenium 未安装 (pip install selenium)"

    webdriver, By, ActionChains = sel["webdriver"], sel["By"], sel["ActionChains"]

    options = webdriver.ChromeOptions()
    browser_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        f"--user-agent={random.choice(USER_AGENTS)}",
        "--window-size=1920,1080",
    )
    for flag in browser_flags:
        options.add_argument(flag)

    browser = None
    try:
        browser = webdriver.Chrome(options=options)
        browser.get(url)

        # Human-like pause before interacting with the page.
        time.sleep(random.uniform(1.5, 3.0))

        # A few random downward scrolls.
        scroll_steps = random.randint(2, 5)
        for _ in range(scroll_steps):
            offset = random.randint(100, 800)
            browser.execute_script(f"window.scrollBy(0, {offset});")
            time.sleep(random.uniform(0.5, 1.5))

        # Hover over a random link; any failure here is irrelevant.
        try:
            anchors = browser.find_elements(By.TAG_NAME, "a")
            if anchors:
                target = random.choice(anchors[:10])
                hover = ActionChains(browser).move_to_element(target)
                hover.pause(random.uniform(0.3, 0.8)).perform()
        except Exception:
            pass

        # Back to the top before reading the page source.
        browser.execute_script("window.scrollTo(0, 0);")
        time.sleep(random.uniform(0.5, 1.0))

        return browser.page_source, None
    except Exception as exc:
        return None, str(exc)
    finally:
        if browser:
            browser.quit()
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _extract_links(html, base_url):
|
|
131
|
+
"""从 HTML 中提取同域名链接"""
|
|
132
|
+
links = set()
|
|
133
|
+
base_domain = urlparse(base_url).netloc
|
|
134
|
+
pattern = re.compile(r'href=["\']([^"\']+)["\']', re.I)
|
|
135
|
+
for m in pattern.finditer(html):
|
|
136
|
+
href = m.group(1)
|
|
137
|
+
full = urljoin(base_url, href)
|
|
138
|
+
if urlparse(full).netloc == base_domain:
|
|
139
|
+
# 去重和过滤
|
|
140
|
+
if not full.startswith("javascript:") and not full.startswith("mailto:"):
|
|
141
|
+
links.add(full)
|
|
142
|
+
return list(links)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _save_page(url, html, output_dir):
    """Write *html* as UTF-8 into *output_dir* and return the file path.

    The directory is created when missing; the file name is derived from
    *url* by _sanitize_filename.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    destination = target_dir / _sanitize_filename(url)
    with open(destination, "w", encoding="utf-8") as handle:
        handle.write(html)
    return str(destination)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def crawl(url, depth=1, output_base=None, lang="zh"):
    """Crawl *url* breadth-first and save every fetched page.

    Each page is fetched with requests first and with selenium as fallback.
    Up to 20 links per page are followed while the depth limit allows it.

    Args:
        url: Start URL.
        depth: Number of levels to crawl; clamped to the range 1-3.
        output_base: Target directory; defaults to ``web_<YYYYMMDD>`` in the
            current working directory.
        lang: Accepted for interface compatibility; not used here.

    Returns:
        ``(saved_files, errors)`` - paths of the saved pages and one
        "<url>: <reason>" string per page that could not be fetched or saved.
    """
    from collections import deque
    from fr_cli.ui.ui import CYAN, GREEN, RED, DIM, YELLOW, RESET

    depth = min(max(depth, 1), 3)  # the maximum depth is limited to 3

    if output_base is None:
        output_base = Path.cwd() / f"web_{datetime.now().strftime('%Y%m%d')}"
    else:
        output_base = Path(output_base)

    saved = []
    errors = []
    visited = set()
    queued = {url}  # everything ever enqueued, so no URL is queued twice
    to_crawl = deque([(url, 0)])  # (url, current_depth)

    while to_crawl:
        current_url, current_depth = to_crawl.popleft()
        if current_url in visited or current_depth >= depth:
            continue
        visited.add(current_url)

        print(f"{CYAN}🕷️ 爬取 [{current_depth+1}/{depth}]: {current_url[:80]}...{RESET}")

        # First choice: requests; fallback: selenium.
        html, err = _fetch_with_requests(current_url)
        if err or not html:
            print(f"{YELLOW} requests 失败: {err or '内容为空'},尝试 selenium...{RESET}")
            html, err2 = _fetch_with_selenium(current_url)
            if err2 or not html:
                errors.append(f"{current_url}: {err2 or err}")
                print(f"{RED} ❌ 爬取失败: {err2 or err}{RESET}")
                continue

        # Save the page; one unwritable file must not abort the whole crawl.
        try:
            filepath = _save_page(current_url, html, output_base)
        except OSError as e:
            errors.append(f"{current_url}: {e}")
            print(f"{RED} ❌ 保存失败: {e}{RESET}")
            continue
        saved.append(filepath)
        print(f"{GREEN} ✅ 已保存: {filepath}{RESET}")

        # Follow links while the depth limit allows it.
        if current_depth + 1 < depth:
            links = _extract_links(html, current_url)
            added = 0
            for link in links[:20]:  # at most 20 links per page
                if link not in queued:
                    queued.add(link)
                    to_crawl.append((link, current_depth + 1))
                    added += 1
            if links:
                print(f"{DIM} 发现 {len(links)} 个链接,加入 {added} 个待爬取{RESET}")

        # Politeness delay, only when another request will actually follow.
        if to_crawl:
            time.sleep(random.uniform(1.0, 2.5))

    return saved, errors
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def handle_spider(user_input, state):
    """Handle a request that starts with the ``@spider`` prefix.

    Expected input: ``@spider <URL> [depth]``. A URL without an http:// or
    https:// scheme gets "https://" prepended; a missing or non-numeric
    depth means 1. Pages are saved below ``state.vfs.cwd`` when available,
    otherwise crawl() picks its default directory. A summary is printed.

    Args:
        user_input: Full input line, beginning with "@spider".
        state: Session object; this function reads ``state.vfs`` (optional)
            and ``state.lang``.
    """
    from fr_cli.ui.ui import CYAN, GREEN, RED, YELLOW, DIM, RESET

    text = user_input[len("@spider"):].strip()
    if not text:
        print(f"{RED}用法: @spider <URL> [深度]{RESET}")
        return

    parts = text.split()
    url = parts[0]
    depth = 1
    if len(parts) > 1 and parts[1].isdigit():
        try:
            depth = int(parts[1])
        except ValueError:
            # isdigit() accepts characters such as "²" that int() rejects.
            depth = 1

    # A bare startswith("http") would wrongly treat hosts such as
    # "httpbin.org" as already carrying a scheme.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    # Check the dependency.
    if not _get_requests():
        print(f"{RED}缺少依赖: pip install requests selenium{RESET}")
        return

    output_dir = None
    vfs = getattr(state, "vfs", None)
    if vfs and vfs.cwd:
        output_dir = vfs.cwd

    print(f"{CYAN}🕷️ 开始爬取: {url} | 深度: {depth}{RESET}")
    saved, errors = crawl(url, depth, output_dir, state.lang)

    print(f"\n{GREEN}═══ 爬取完成 ═══{RESET}")
    print(f"{GREEN} 成功: {len(saved)} 个页面{RESET}")
    if errors:
        print(f"{RED} 失败: {len(errors)} 个页面{RESET}")
        for e in errors[:5]:  # show at most five failures
            print(f"{RED} - {e}{RESET}")
    if saved:
        print(f"{DIM} 保存目录: {Path(saved[0]).parent}{RESET}")
|