abelworkflow 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitignore +13 -0
- package/.skill-lock.json +29 -0
- package/AGENTS.md +45 -0
- package/README.md +147 -0
- package/bin/abelworkflow.mjs +2 -0
- package/commands/oc/diagnose.md +63 -0
- package/commands/oc/implementation.md +157 -0
- package/commands/oc/init.md +27 -0
- package/commands/oc/plan.md +88 -0
- package/commands/oc/research.md +126 -0
- package/lib/cli.mjs +222 -0
- package/package.json +23 -0
- package/skills/confidence-check/SKILL.md +124 -0
- package/skills/confidence-check/confidence.ts +335 -0
- package/skills/context7-auto-research/.env +4 -0
- package/skills/context7-auto-research/.env.example +4 -0
- package/skills/context7-auto-research/SKILL.md +83 -0
- package/skills/context7-auto-research/context7-api.js +283 -0
- package/skills/dev-browser/SKILL.md +225 -0
- package/skills/dev-browser/bun.lock +443 -0
- package/skills/dev-browser/package-lock.json +2988 -0
- package/skills/dev-browser/package.json +31 -0
- package/skills/dev-browser/references/scraping.md +155 -0
- package/skills/dev-browser/resolve-skill-dir.sh +35 -0
- package/skills/dev-browser/scripts/start-relay.ts +32 -0
- package/skills/dev-browser/scripts/start-server.ts +117 -0
- package/skills/dev-browser/server.sh +24 -0
- package/skills/dev-browser/src/client.ts +474 -0
- package/skills/dev-browser/src/index.ts +287 -0
- package/skills/dev-browser/src/relay.ts +731 -0
- package/skills/dev-browser/src/snapshot/browser-script.ts +877 -0
- package/skills/dev-browser/src/snapshot/index.ts +14 -0
- package/skills/dev-browser/src/snapshot/inject.ts +13 -0
- package/skills/dev-browser/src/types.ts +34 -0
- package/skills/dev-browser/tsconfig.json +36 -0
- package/skills/dev-browser/vitest.config.ts +12 -0
- package/skills/git-commit/SKILL.md +124 -0
- package/skills/grok-search/.env.example +24 -0
- package/skills/grok-search/SKILL.md +114 -0
- package/skills/grok-search/requirements.txt +2 -0
- package/skills/grok-search/scripts/groksearch_cli.py +1214 -0
- package/skills/grok-search/scripts/groksearch_entry.py +116 -0
- package/skills/prompt-enhancer/ADVANCED.md +74 -0
- package/skills/prompt-enhancer/SKILL.md +71 -0
- package/skills/prompt-enhancer/TEMPLATE.md +91 -0
- package/skills/prompt-enhancer/scripts/enhance.py +142 -0
- package/skills/sequential-think/SKILL.md +198 -0
- package/skills/sequential-think/scripts/.env.example +5 -0
- package/skills/sequential-think/scripts/sequential_think_cli.py +253 -0
- package/skills/time/SKILL.md +116 -0
- package/skills/time/scripts/time_cli.py +104 -0
|
@@ -0,0 +1,1214 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""GrokSearch CLI - 独立的网页搜索/获取/映射工具 (Grok/Tavily/Firecrawl)。"""
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import asyncio
|
|
6
|
+
import ipaddress
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
import sys
|
|
11
|
+
import time
|
|
12
|
+
from datetime import datetime, timezone
|
|
13
|
+
from email.utils import parsedate_to_datetime
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Optional
|
|
16
|
+
from urllib.parse import urlsplit, urlunsplit
|
|
17
|
+
|
|
18
|
+
# Fail fast with an actionable message when the two third-party runtime
# dependencies (httpx for HTTP, tenacity for retries) are missing.
try:
    import httpx
    from tenacity import AsyncRetrying, retry_if_exception, stop_after_attempt, wait_random_exponential
    from tenacity.wait import wait_base
except ImportError:
    print("Error: 所需包未安装。请运行: python scripts/groksearch_entry.py --help 或 pip install httpx tenacity", file=sys.stderr)
    sys.exit(1)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ============================================================================
|
|
28
|
+
# .env 文件支持
|
|
29
|
+
# ============================================================================
|
|
30
|
+
|
|
31
|
+
def load_dotenv() -> bool:
    """Load KEY=VALUE pairs from the project-root .env into os.environ.

    Values already present in the environment are never overwritten.
    Returns True when the file existed and was read, False otherwise.
    """
    env_file = Path(__file__).parent.parent / ".env"
    if not env_file.exists():
        return False
    try:
        raw_lines = env_file.read_text(encoding='utf-8').splitlines()
    except IOError:
        return False
    for raw in raw_lines:
        entry = raw.strip()
        # Skip blanks, comments, and lines without an assignment.
        if not entry or entry.startswith('#') or '=' not in entry:
            continue
        name, _, val = entry.partition('=')
        name = name.strip()
        val = val.strip()
        # Peel one matching pair of surrounding quotes, if present.
        if val and val[0] == val[-1] and val[0] in ('"', "'"):
            val = val[1:-1]
        # Real environment wins over the .env file.
        if name and name not in os.environ:
            os.environ[name] = val
    return True
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Load .env once at import time so env-backed Config properties see the values.
load_dotenv()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# ============================================================================
|
|
63
|
+
# 配置
|
|
64
|
+
# ============================================================================
|
|
65
|
+
|
|
66
|
+
class Config:
    """Singleton holding Grok/Tavily runtime settings sourced from env vars.

    CLI-level overrides (API URL, debug flag) installed via set_overrides()
    take precedence over the environment.
    """

    _instance = None
    _DEFAULT_MODEL = "grok-4-fast"

    def __new__(cls):
        # Classic lazy singleton; the override slots live on the one instance.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._override_url = None
            cls._instance._override_debug = None
        return cls._instance

    def set_overrides(self, api_url: Optional[str], debug: Optional[bool] = None):
        """Install CLI overrides; pass None to fall back to the environment."""
        self._override_url = api_url
        self._override_debug = debug

    @staticmethod
    def _env_num(name: str, default, minimum, cast):
        """Read env var *name* via *cast*, clamped to *minimum*.

        Missing or unparsable values fall back to *default*. (This replaces
        the same try/except pattern previously copy-pasted across the three
        retry properties.)
        """
        try:
            value = cast(os.getenv(name, str(default)))
        except ValueError:
            value = default
        return max(minimum, value)

    @property
    def debug_enabled(self) -> bool:
        """Debug logging flag; a CLI override wins over GROK_DEBUG."""
        if self._override_debug is not None:
            return self._override_debug
        return os.getenv("GROK_DEBUG", "false").lower() in ("true", "1", "yes")

    @property
    def retry_max_attempts(self) -> int:
        """Total request attempts (always >= 1)."""
        return self._env_num("GROK_RETRY_MAX_ATTEMPTS", 3, 1, int)

    @property
    def retry_multiplier(self) -> float:
        """Backoff multiplier in seconds (always >= 0.1)."""
        return self._env_num("GROK_RETRY_MULTIPLIER", 1.0, 0.1, float)

    @property
    def retry_max_wait(self) -> int:
        """Upper bound on a single backoff wait, in seconds (always >= 1)."""
        return self._env_num("GROK_RETRY_MAX_WAIT", 10, 1, int)

    @property
    def grok_api_url(self) -> str:
        """Base URL of the chat-completions API; raises ValueError when unset."""
        if self._override_url:
            return self._override_url
        url = os.getenv("GROK_API_URL")
        if not url:
            raise ValueError("GROK_API_URL 未配置。请设置环境变量或使用 --api-url")
        return url.rstrip('/')

    @property
    def grok_api_key(self) -> str:
        """API key for the Grok endpoint; raises ValueError when unset."""
        key = os.getenv("GROK_API_KEY")
        if not key:
            raise ValueError("GROK_API_KEY 未配置。请设置环境变量或 .env 文件")
        return key

    @property
    def tavily_enabled(self) -> bool:
        """Tavily integration toggle; defaults to enabled."""
        return os.getenv("TAVILY_ENABLED", "true").lower() in ("true", "1", "yes")

    @property
    def tavily_api_url(self) -> str:
        """Tavily API base; endpoint suffixes in the env value are stripped."""
        raw = os.getenv("TAVILY_API_URL", "https://api.tavily.com")
        return _normalize_tavily_base_url(raw) or "https://api.tavily.com"

    @property
    def tavily_api_key(self) -> Optional[str]:
        """Tavily key, or None when unset/blank."""
        return os.getenv("TAVILY_API_KEY") or None

    def _apply_model_suffix(self, model: str) -> str:
        """Append ':online' on OpenRouter endpoints so web search is enabled."""
        try:
            url = self.grok_api_url
        except ValueError:
            return model
        if "openrouter" in url and ":online" not in model:
            return f"{model}:online"
        return model

    @property
    def grok_model(self) -> str:
        """Effective model name (env override, default, endpoint suffix)."""
        model = os.getenv("GROK_MODEL") or self._DEFAULT_MODEL
        return self._apply_model_suffix(model)

    @staticmethod
    def _mask_api_key(key: str) -> str:
        """Mask all but the first/last 4 chars; short keys become '***'."""
        if not key or len(key) <= 8:
            return "***"
        return f"{key[:4]}{'*' * (len(key) - 8)}{key[-4:]}"

    def get_config_info(self) -> dict:
        """Snapshot of the effective configuration with API keys masked."""
        try:
            api_url = self.grok_api_url
            api_key_masked = self._mask_api_key(self.grok_api_key)
            config_status = "✅ 配置完成"
        except ValueError as e:
            api_url = "未配置"
            api_key_masked = "未配置"
            config_status = f"❌ 错误: {str(e)}"

        return {
            "GROK_API_URL": api_url,
            "GROK_API_KEY": api_key_masked,
            "GROK_MODEL": self.grok_model,
            "GROK_DEBUG": self.debug_enabled,
            "GROK_RETRY_MAX_ATTEMPTS": self.retry_max_attempts,
            "GROK_RETRY_MULTIPLIER": self.retry_multiplier,
            "GROK_RETRY_MAX_WAIT": self.retry_max_wait,
            "TAVILY_ENABLED": self.tavily_enabled,
            "TAVILY_API_URL": self.tavily_api_url,
            "TAVILY_API_KEY": self._mask_api_key(self.tavily_api_key) if self.tavily_api_key else "未配置",
            "config_status": config_status,
        }
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
config = Config()
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# ============================================================================
|
|
190
|
+
# 提示词
|
|
191
|
+
# ============================================================================
|
|
192
|
+
|
|
193
|
+
SEARCH_PROMPT = """# 角色: 搜索助手
|
|
194
|
+
|
|
195
|
+
以 JSON 数组形式返回搜索结果。每个结果必须包含以下字段:
|
|
196
|
+
- "title": 字符串,结果标题
|
|
197
|
+
- "url": 字符串,有效的 URL
|
|
198
|
+
- "description": 字符串,20-50 字摘要
|
|
199
|
+
|
|
200
|
+
仅输出有效的 JSON 数组,不包含 markdown,不包含解释。
|
|
201
|
+
|
|
202
|
+
示例:
|
|
203
|
+
[
|
|
204
|
+
{"title": "示例", "url": "https://example.com", "description": "简要描述"}
|
|
205
|
+
]
|
|
206
|
+
"""
|
|
207
|
+
|
|
208
|
+
FETCH_PROMPT = """# 角色: 网页内容获取器
|
|
209
|
+
|
|
210
|
+
获取网页内容并转换为结构化 Markdown:
|
|
211
|
+
- 保留所有标题、段落、列表、表格、代码块
|
|
212
|
+
- 包含元数据头部:源 URL、标题、获取时间戳
|
|
213
|
+
- 不要摘要 - 返回完整内容
|
|
214
|
+
- 使用 UTF-8 编码
|
|
215
|
+
"""
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _get_tavily_status() -> tuple[bool, Optional[str]]:
    """Return (usable, reason): whether Tavily can be called, else why not."""
    reason = None
    if not config.tavily_enabled:
        reason = "Tavily 已禁用 (TAVILY_ENABLED=false)"
    elif not config.tavily_api_key:
        reason = "TAVILY_API_KEY 未配置"
    return reason is None, reason
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _emit_tavily_warning(message: str) -> None:
|
|
227
|
+
print(f"Tavily warning: {message}", file=sys.stderr)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# ============================================================================
|
|
231
|
+
# 重试策略
|
|
232
|
+
# ============================================================================
|
|
233
|
+
|
|
234
|
+
RETRYABLE_STATUS_CODES = {408, 429, 500, 502, 503, 504}
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _is_retryable_exception(exc) -> bool:
    """True for transient failures (network trouble or a retryable HTTP status)."""
    transient = (httpx.TimeoutException, httpx.NetworkError, httpx.ConnectError, httpx.RemoteProtocolError)
    if isinstance(exc, transient):
        return True
    return isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code in RETRYABLE_STATUS_CODES
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class _WaitWithRetryAfter(wait_base):
    """Tenacity wait strategy: jittered exponential backoff, Retry-After aware.

    On HTTP 429 a server-provided Retry-After header (delay-seconds or
    HTTP-date) takes precedence over the computed backoff; a
    RemoteProtocolError gets a fixed extra delay on top of the backoff to
    let a broken connection settle.
    """

    def __init__(self, multiplier: float, max_wait: int):
        # Base jittered exponential backoff used for all other failures.
        self._base_wait = wait_random_exponential(multiplier=multiplier, max=max_wait)
        # Extra seconds added when the server closed the connection mid-protocol.
        self._protocol_error_base = 3.0

    def __call__(self, retry_state):
        # Inspect the failed attempt (if any) to pick a wait strategy.
        if retry_state.outcome and retry_state.outcome.failed:
            exc = retry_state.outcome.exception()
            if isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code == 429:
                retry_after = self._parse_retry_after(exc.response)
                if retry_after is not None:
                    return retry_after
            if isinstance(exc, httpx.RemoteProtocolError):
                return self._base_wait(retry_state) + self._protocol_error_base
        return self._base_wait(retry_state)

    def _parse_retry_after(self, response: httpx.Response) -> Optional[float]:
        """Parse Retry-After as delay-seconds or an HTTP-date; None when absent/invalid."""
        header = response.headers.get("Retry-After")
        if not header:
            return None
        header = header.strip()
        if header.isdigit():
            return float(header)
        try:
            retry_dt = parsedate_to_datetime(header)
            if retry_dt.tzinfo is None:
                # RFC 7231 dates are GMT; treat a naive parse result as UTC.
                retry_dt = retry_dt.replace(tzinfo=timezone.utc)
            delay = (retry_dt - datetime.now(timezone.utc)).total_seconds()
            # A date in the past means "retry now", not a negative sleep.
            return max(0.0, delay)
        except (TypeError, ValueError):
            return None
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
# ============================================================================
|
|
279
|
+
# URL 规范化
|
|
280
|
+
# ============================================================================
|
|
281
|
+
|
|
282
|
+
_URL_WRAPPERS: tuple[tuple[str, str], ...] = (("<", ">"), ("(", ")"), ("[", "]"), ("{", "}"), ('"', '"'), ("'", "'"))
|
|
283
|
+
_TRAILING_URL_PUNCT = ".,;:!?\u3002\uff0c\uff1b\uff1a\uff01\uff1f"
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _strip_url_wrappers(text: str) -> str:
|
|
287
|
+
s = (text or "").strip()
|
|
288
|
+
changed = True
|
|
289
|
+
while changed and s:
|
|
290
|
+
changed = False
|
|
291
|
+
for left, right in _URL_WRAPPERS:
|
|
292
|
+
if s.startswith(left) and s.endswith(right) and len(s) >= 2:
|
|
293
|
+
s = s[1:-1].strip()
|
|
294
|
+
changed = True
|
|
295
|
+
return s
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _strip_trailing_url_punct(text: str) -> str:
|
|
299
|
+
s = (text or "").strip()
|
|
300
|
+
while s and s[-1] in _TRAILING_URL_PUNCT:
|
|
301
|
+
s = s[:-1].rstrip()
|
|
302
|
+
return s
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def _extract_host_from_authority(authority: str) -> str:
    """Pull the bare host out of an authority component (userinfo@host:port)."""
    hostport = (authority or "").strip()
    if not hostport:
        return ""
    # Discard userinfo (user:pass@host) if present.
    if "@" in hostport:
        hostport = hostport.rsplit("@", 1)[-1]
    # Bracketed IPv6 literal: [::1]:8080 -> ::1
    if hostport.startswith("[") and "]" in hostport:
        return hostport[1:hostport.index("]")].strip()
    # Invalid-but-seen form: unbracketed IPv6 with a port, e.g. ::1:8080
    parsed = _split_unbracketed_ipv6_hostport(hostport)
    if parsed is not None:
        return parsed[0].strip()
    # Bare IP literal (possibly with a %zone suffix) and no port.
    bare = hostport.split("%", 1)[0]
    try:
        ipaddress.ip_address(bare)
        return bare
    except ValueError:
        pass
    # Fallback: plain host[:port].
    return hostport.partition(":")[0].strip()
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def _is_local_host(host: str) -> bool:
|
|
330
|
+
h = (host or "").strip().lower()
|
|
331
|
+
if not h:
|
|
332
|
+
return False
|
|
333
|
+
if h in ("localhost",):
|
|
334
|
+
return True
|
|
335
|
+
if h.endswith(".localhost"):
|
|
336
|
+
return True
|
|
337
|
+
|
|
338
|
+
candidate = h.split("%", 1)[0]
|
|
339
|
+
try:
|
|
340
|
+
ip = ipaddress.ip_address(candidate)
|
|
341
|
+
except ValueError:
|
|
342
|
+
return False
|
|
343
|
+
return bool(ip.is_loopback or ip.is_private or ip.is_link_local or ip.is_unspecified)
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _split_authority_and_remainder(raw: str) -> tuple[str, str]:
|
|
347
|
+
s = (raw or "").lstrip()
|
|
348
|
+
if s.startswith("//"):
|
|
349
|
+
s = s[2:]
|
|
350
|
+
|
|
351
|
+
min_index = None
|
|
352
|
+
for sep in ("/", "?", "#"):
|
|
353
|
+
idx = s.find(sep)
|
|
354
|
+
if idx != -1 and (min_index is None or idx < min_index):
|
|
355
|
+
min_index = idx
|
|
356
|
+
|
|
357
|
+
if min_index is None:
|
|
358
|
+
return s, ""
|
|
359
|
+
return s[:min_index], s[min_index:]
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def _is_ipv6_literal(host: str) -> bool:
|
|
363
|
+
candidate = (host or "").strip()
|
|
364
|
+
if not candidate:
|
|
365
|
+
return False
|
|
366
|
+
candidate = candidate.split("%", 1)[0]
|
|
367
|
+
try:
|
|
368
|
+
return ipaddress.ip_address(candidate).version == 6
|
|
369
|
+
except ValueError:
|
|
370
|
+
return False
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def _split_unbracketed_ipv6_hostport(hostport: str) -> Optional[tuple[str, str]]:
|
|
374
|
+
s = (hostport or "").strip()
|
|
375
|
+
if not s or s.startswith("["):
|
|
376
|
+
return None
|
|
377
|
+
head, sep, tail = s.rpartition(":")
|
|
378
|
+
if not sep or not tail.isdigit() or not head or ":" not in head:
|
|
379
|
+
return None
|
|
380
|
+
try:
|
|
381
|
+
port = int(tail)
|
|
382
|
+
except ValueError:
|
|
383
|
+
return None
|
|
384
|
+
if not (0 <= port <= 65535):
|
|
385
|
+
return None
|
|
386
|
+
candidate = head.split("%", 1)[0]
|
|
387
|
+
try:
|
|
388
|
+
return (head, tail) if ipaddress.ip_address(candidate).version == 6 else None
|
|
389
|
+
except ValueError:
|
|
390
|
+
return None
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def _bracket_ipv6_authority(authority: str) -> str:
    """Add the brackets an IPv6 literal inside an authority is required to have."""
    s = (authority or "").strip()
    if not s:
        return s

    # Preserve userinfo while fixing only the host[:port] part.
    userinfo, at_sign, hostport = s.rpartition("@")
    prefix = f"{userinfo}@" if at_sign else ""
    target = hostport if at_sign else s
    if target.startswith("["):
        # Already bracketed — nothing to fix.
        return s

    hostport_pair = _split_unbracketed_ipv6_hostport(target)
    if hostport_pair is not None:
        host, port = hostport_pair
        return f"{prefix}[{host}]:{port}"

    if _is_ipv6_literal(target):
        return f"{prefix}[{target}]"
    return s
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def normalize_url(url: str) -> str:
    """Clean up *url* and return an absolute http(s) URL.

    Strips wrapper brackets/quotes and trailing punctuation, rejects
    non-http(s) schemes and embedded whitespace, brackets unbracketed IPv6
    hosts, and infers a scheme (http for local/private hosts, https
    otherwise) when the URL has none.

    Raises ValueError for empty input, embedded whitespace, unsupported
    schemes, or a missing hostname.
    """
    s = (url or "").strip()
    # Alternate wrapper-stripping and trailing-punctuation-stripping until
    # stable. Punctuation is removed only while a wrapper is still open, so
    # a bare trailing '.' on an unwrapped URL is preserved.
    while True:
        unwrapped = _strip_url_wrappers(s)
        if unwrapped != s:
            s = unwrapped
            continue
        if s and any(s.startswith(left) for left, _ in _URL_WRAPPERS):
            stripped = _strip_trailing_url_punct(s)
            if stripped != s:
                s = stripped
                continue
        break
    if not s:
        raise ValueError("URL 为空")
    if re.search(r"\s", s):
        raise ValueError(f"URL 不应包含空白字符: {url!r}")

    scheme_match = re.match(r"^([a-zA-Z][a-zA-Z0-9+.-]*)://", s)
    if scheme_match:
        scheme = scheme_match.group(1).lower()
        if scheme not in ("http", "https"):
            raise ValueError(f"仅支持 http/https URL: {s}")
        parts = urlsplit(s)
        if parts.netloc:
            fixed_netloc = _bracket_ipv6_authority(parts.netloc)
            if fixed_netloc != parts.netloc:
                return urlunsplit((parts.scheme, fixed_netloc, parts.path, parts.query, parts.fragment))
        # Fix common invalid form like: http://::1 (IPv6 without brackets),
        # which urlsplit mis-parses as an empty netloc plus a path.
        if not parts.netloc and parts.path and ":" in parts.path and not parts.path.startswith("/"):
            rest = s[len(scheme_match.group(0)) :]
            authority, remainder = _split_authority_and_remainder(rest)
            if authority:
                fixed_authority = _bracket_ipv6_authority(authority)
                if fixed_authority != authority:
                    return f"{scheme}://{fixed_authority}{remainder}"
        return s

    # No scheme. This covers protocol-relative '//host/...' too, because
    # _attach_inferred_scheme's authority split drops a leading '//' itself;
    # the two cases previously duplicated this logic verbatim.
    return _attach_inferred_scheme(s, url)


def _attach_inferred_scheme(s: str, original: str) -> str:
    """Prepend http/https to a scheme-less URL after validating its host."""
    authority, remainder = _split_authority_and_remainder(s)
    if not authority:
        raise ValueError(f"URL 缺少主机名: {original!r}")
    host = _extract_host_from_authority(authority)
    if not host:
        raise ValueError(f"URL 缺少主机名: {original!r}")
    # Local/private hosts rarely serve TLS; default those to http.
    scheme = "http" if _is_local_host(host) else "https"
    authority = _bracket_ipv6_authority(authority)
    return f"{scheme}://{authority}{remainder}"
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def _materialize_docsify_markdown_url(url: str) -> Optional[str]:
|
|
477
|
+
parts = urlsplit(url)
|
|
478
|
+
fragment = (parts.fragment or "").strip()
|
|
479
|
+
if not fragment.startswith("/"):
|
|
480
|
+
return None
|
|
481
|
+
|
|
482
|
+
route, _, _route_query = fragment[1:].partition("?")
|
|
483
|
+
route = route.strip()
|
|
484
|
+
while route.startswith("./"):
|
|
485
|
+
route = route[2:].lstrip("/")
|
|
486
|
+
if not route:
|
|
487
|
+
return None
|
|
488
|
+
if route.startswith("../") or "/../" in route:
|
|
489
|
+
return None
|
|
490
|
+
|
|
491
|
+
base_path = parts.path or "/"
|
|
492
|
+
if not base_path.endswith("/"):
|
|
493
|
+
base_path = base_path.rsplit("/", 1)[0] + "/"
|
|
494
|
+
path = f"{base_path}{route}"
|
|
495
|
+
if not path.lower().endswith((".md", ".markdown")):
|
|
496
|
+
path = f"{path}.md"
|
|
497
|
+
return urlunsplit((parts.scheme, parts.netloc, path, "", ""))
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def _normalize_tavily_base_url(raw: str) -> str:
|
|
501
|
+
value = (raw or "").strip()
|
|
502
|
+
if not value:
|
|
503
|
+
return value
|
|
504
|
+
parts = urlsplit(value)
|
|
505
|
+
path = parts.path or ""
|
|
506
|
+
path = path.rstrip("/")
|
|
507
|
+
lowered = path.lower()
|
|
508
|
+
for suffix in ("/search", "/extract", "/map", "/crawl", "/research"):
|
|
509
|
+
if lowered.endswith(suffix):
|
|
510
|
+
path = path[: -len(suffix)]
|
|
511
|
+
break
|
|
512
|
+
base = urlunsplit((parts.scheme, parts.netloc, path, "", ""))
|
|
513
|
+
return base.rstrip("/")
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
# ============================================================================
|
|
517
|
+
# 连接池
|
|
518
|
+
# ============================================================================
|
|
519
|
+
|
|
520
|
+
# Shared AsyncClient reused across requests; lazily created by get_http_client().
_http_client: Optional[httpx.AsyncClient] = None
# Generous read timeout for long LLM responses; no cap on waiting for a pool slot.
_DEFAULT_TIMEOUT = httpx.Timeout(connect=6.0, read=60.0, write=10.0, pool=None)
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
async def get_http_client() -> httpx.AsyncClient:
    """Return the module-wide AsyncClient, (re)creating it when absent or closed."""
    global _http_client
    if _http_client is not None and not _http_client.is_closed:
        return _http_client
    _http_client = httpx.AsyncClient(
        timeout=_DEFAULT_TIMEOUT,
        follow_redirects=True,
        limits=httpx.Limits(max_connections=10, max_keepalive_connections=5),
    )
    return _http_client
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
async def close_http_client():
    """Close the shared AsyncClient when it exists and is still open."""
    global _http_client
    client = _http_client
    if client is None or client.is_closed:
        return
    await client.aclose()
    _http_client = None
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
# ============================================================================
|
|
543
|
+
# Grok 提供者
|
|
544
|
+
# ============================================================================
|
|
545
|
+
|
|
546
|
+
def _get_local_time_info() -> str:
|
|
547
|
+
try:
|
|
548
|
+
local_tz = datetime.now().astimezone().tzinfo
|
|
549
|
+
local_now = datetime.now(local_tz)
|
|
550
|
+
except Exception:
|
|
551
|
+
local_now = datetime.now(timezone.utc)
|
|
552
|
+
|
|
553
|
+
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
|
|
554
|
+
return (
|
|
555
|
+
f"[当前时间上下文]\n"
|
|
556
|
+
f"- 日期: {local_now.strftime('%Y-%m-%d')} ({weekdays[local_now.weekday()]})\n"
|
|
557
|
+
f"- 时间: {local_now.strftime('%H:%M:%S')}\n"
|
|
558
|
+
)
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
def _needs_time_context(query: str) -> bool:
|
|
562
|
+
keywords = [
|
|
563
|
+
"current", "now", "today", "tomorrow", "yesterday",
|
|
564
|
+
"this week", "last week", "next week",
|
|
565
|
+
"latest", "recent", "recently", "up-to-date",
|
|
566
|
+
"当前", "现在", "今天", "最新", "最近"
|
|
567
|
+
]
|
|
568
|
+
query_lower = query.lower()
|
|
569
|
+
return any(kw in query_lower or kw in query for kw in keywords)
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
class GrokSearchProvider:
    """Search/fetch provider backed by an OpenAI-compatible chat-completions API."""

    def __init__(self, api_url: str, api_key: str, model: str):
        self.api_url = api_url.rstrip('/')
        self.api_key = api_key
        self.model = model
        self._headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

    async def search(self, query: str, platform: str = "", min_results: int = 3, max_results: int = 10) -> str:
        """Ask the model to web-search *query*; returns the raw model text (expected: a JSON array)."""
        platform_prompt = f"\n\n专注于平台: {platform}" if platform else ""
        return_prompt = f"\n\n以 JSON 数组形式返回 {min_results}-{max_results} 个结果。"
        # Prepend the local-time header only for time-sensitive queries.
        time_context = _get_local_time_info() + "\n" if _needs_time_context(query) else ""

        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": SEARCH_PROMPT},
                {"role": "user", "content": time_context + query + platform_prompt + return_prompt},
            ],
        }
        return await self._execute(payload)

    async def fetch(self, url: str) -> str:
        """Ask the model to fetch *url* and return its content as structured Markdown."""
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": FETCH_PROMPT},
                {"role": "user", "content": f"{url}\n\n获取并返回结构化 Markdown。"},
            ],
        }
        return await self._execute(payload)

    async def _execute(self, payload: dict) -> str:
        """Execute the request: try non-streaming first, fall back to streaming on failure."""
        try:
            return await self._execute_non_stream(payload)
        except (httpx.HTTPStatusError, json.JSONDecodeError) as e:
            if config.debug_enabled:
                print(f"[DEBUG] 非流式失败: {e},回退到流式", file=sys.stderr)
            return await self._execute_stream(payload)

    async def _execute_non_stream(self, payload: dict) -> str:
        """Non-streaming request (preferred; faster for short responses)."""
        payload_copy = {**payload, "stream": False}
        client = await get_http_client()

        # Retry transient failures with Retry-After-aware exponential backoff.
        async for attempt in AsyncRetrying(
            stop=stop_after_attempt(config.retry_max_attempts),
            wait=_WaitWithRetryAfter(config.retry_multiplier, config.retry_max_wait),
            retry=retry_if_exception(_is_retryable_exception),
            reraise=True,
        ):
            with attempt:
                response = await client.post(
                    f"{self.api_url}/chat/completions",
                    headers=self._headers,
                    json=payload_copy,
                )
                response.raise_for_status()
                data = response.json()
                choices = data.get("choices", [])
                if choices:
                    return choices[0].get("message", {}).get("content", "")
                return ""

    async def _execute_stream(self, payload: dict) -> str:
        """Streaming request (fallback for large responses)."""
        payload_copy = {**payload, "stream": True}
        client = await get_http_client()

        async for attempt in AsyncRetrying(
            stop=stop_after_attempt(config.retry_max_attempts),
            wait=_WaitWithRetryAfter(config.retry_multiplier, config.retry_max_wait),
            retry=retry_if_exception(_is_retryable_exception),
            reraise=True,
        ):
            with attempt:
                async with client.stream(
                    "POST",
                    f"{self.api_url}/chat/completions",
                    headers=self._headers,
                    json=payload_copy,
                ) as response:
                    response.raise_for_status()
                    return await self._parse_streaming_response(response)

    async def _parse_streaming_response(self, response) -> str:
        """Accumulate SSE 'data:' deltas into one content string.

        If no delta content arrives, fall back to parsing the buffered body
        as a single non-streaming JSON response (some gateways reply with
        plain JSON even when stream=True was requested).
        """
        content = ""
        full_body_buffer = []

        async for line in response.aiter_lines():
            line = line.strip()
            if not line:
                continue
            full_body_buffer.append(line)

            if line.startswith("data:"):
                if line in ("data: [DONE]", "data:[DONE]"):
                    continue
                try:
                    json_str = line[5:].lstrip()
                    data = json.loads(json_str)
                    choices = data.get("choices", [])
                    if choices:
                        delta = choices[0].get("delta", {})
                        if "content" in delta:
                            content += delta["content"]
                except (json.JSONDecodeError, IndexError):
                    # Malformed chunk: skip it rather than abort the stream.
                    continue

        if not content and full_body_buffer:
            # Fallback: reinterpret the whole body as one JSON object.
            try:
                full_text = "".join(full_body_buffer)
                data = json.loads(full_text)
                if "choices" in data and data["choices"]:
                    message = data["choices"][0].get("message", {})
                    content = message.get("content", "")
            except json.JSONDecodeError:
                pass

        return content
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
# ============================================================================
|
|
698
|
+
# Tavily
|
|
699
|
+
# ============================================================================
|
|
700
|
+
|
|
701
|
+
async def _call_tavily_extract(url: str) -> tuple[Optional[str], Optional[str]]:
    """Extract page content via Tavily's /extract endpoint.

    Returns (markdown_content, None) on success, or (None, reason) on any
    failure: Tavily disabled/unconfigured, HTTP/timeout error, API-level
    error, or an empty payload.
    """
    has_tavily, reason = _get_tavily_status()
    if not has_tavily:
        return None, reason
    api_key = config.tavily_api_key

    endpoint = f"{config.tavily_api_url.rstrip('/')}/extract"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    body = {"urls": [url], "format": "markdown"}
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(endpoint, headers=headers, json=body)
            response.raise_for_status()
            data = response.json()
    except httpx.TimeoutException:
        return None, "Tavily extract 超时"
    except httpx.HTTPStatusError as e:
        return None, f"Tavily extract HTTP {e.response.status_code}"
    except Exception as e:
        return None, f"Tavily extract 错误: {str(e)}"

    def _first_non_empty_str(*values) -> str:
        # First argument that is a non-blank string, else "".
        for value in values:
            if isinstance(value, str) and value.strip():
                return value
        return ""

    # Tavily response shapes vary across versions/proxies; probe tolerantly:
    # top-level content fields first, then a results list under several keys.
    results = None
    if isinstance(data, dict):
        err = _first_non_empty_str(data.get("error"), data.get("message"))
        if err:
            return None, f"Tavily extract 错误: {err}"
        top_level_content = _first_non_empty_str(
            data.get("raw_content"),
            data.get("content"),
            data.get("markdown"),
            data.get("text"),
        )
        if top_level_content:
            return top_level_content, None
        results = data.get("results")
        if results is None:
            results = data.get("result") or data.get("data")
    elif isinstance(data, list):
        results = data

    # Normalize the results container to a list.
    if isinstance(results, dict):
        results_list = [results]
    elif isinstance(results, list):
        results_list = results
    else:
        results_list = []

    if results_list:
        # Only one URL was requested, so only the first result matters.
        first = results_list[0]
        if isinstance(first, dict):
            content = _first_non_empty_str(
                first.get("raw_content"),
                first.get("content"),
                first.get("markdown"),
                first.get("text"),
            )
        else:
            content = _first_non_empty_str(first)
        if content:
            return content, None
        return None, "Tavily extract 返回空内容"
    return None, "Tavily extract 未返回结果"
|
|
769
|
+
|
|
770
|
+
|
|
771
|
+
async def _call_tavily_search(query: str, max_results: int = 6) -> tuple[list[dict], Optional[str]]:
    """Run a Tavily web search and normalize the results.

    Returns ``(results, warning)``: ``results`` is a list of
    ``{"title", "url", "description"}`` dicts, and ``warning`` is a
    human-readable failure reason (or ``None``). Every failure path
    returns an empty list together with the reason instead of raising.
    """
    available, reason = _get_tavily_status()
    if not available:
        return [], reason
    api_key = config.tavily_api_key

    endpoint = f"{config.tavily_api_url.rstrip('/')}/search"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload = {
        "query": query,
        "max_results": max_results,
        "search_depth": "advanced",
        "include_raw_content": False,
        "include_answer": False,
    }
    try:
        async with httpx.AsyncClient(timeout=90.0) as client:
            resp = await client.post(endpoint, headers=headers, json=payload)
            resp.raise_for_status()
            data = resp.json()
    except httpx.TimeoutException:
        return [], "Tavily search 超时"
    except httpx.HTTPStatusError as e:
        return [], f"Tavily search HTTP {e.response.status_code}"
    except Exception as e:
        return [], f"Tavily search 错误: {str(e)}"

    raw_results = (data or {}).get("results") or []
    if not isinstance(raw_results, list) or not raw_results:
        return [], None

    # Defensive normalization: tolerate None entries and missing/None fields.
    normalized = [
        {
            "title": (entry or {}).get("title", "") or "",
            "url": (entry or {}).get("url", "") or "",
            "description": (entry or {}).get("content", "") or "",
        }
        for entry in raw_results
    ]
    return normalized, None
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
async def _call_tavily_map(
    url: str,
    instructions: str = "",
    max_depth: int = 1,
    max_breadth: int = 20,
    limit: int = 50,
    timeout: int = 150,
) -> dict:
    """Map a site's link structure via the Tavily ``/map`` endpoint.

    Returns a dict with ``base_url``/``results``/``response_time`` on
    success, or a dict containing an ``"error"`` key on any failure.
    """
    api_key = config.tavily_api_key
    if not api_key or not config.tavily_enabled:
        return {"error": "配置错误: TAVILY_API_KEY 未配置或 Tavily 已禁用 (TAVILY_ENABLED=false)"}

    endpoint = f"{config.tavily_api_url.rstrip('/')}/map"
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    payload: dict = {
        "url": url,
        "max_depth": max_depth,
        "max_breadth": max_breadth,
        "limit": limit,
        "timeout": timeout,
    }
    if instructions:
        payload["instructions"] = instructions

    try:
        # Client timeout is padded past the server-side mapping timeout so the
        # remote limit fires first and we get a proper API response.
        async with httpx.AsyncClient(timeout=float(timeout + 10)) as client:
            resp = await client.post(endpoint, headers=headers, json=payload)
            resp.raise_for_status()
            data = resp.json()
            return {
                "base_url": data.get("base_url", ""),
                "results": data.get("results", []),
                "response_time": data.get("response_time", 0),
            }
    except httpx.TimeoutException:
        return {"error": f"映射超时: 请求超过{timeout}秒"}
    except httpx.HTTPStatusError as e:
        return {"error": f"HTTP错误: {e.response.status_code}", "status_code": e.response.status_code, "detail": e.response.text[:200]}
    except Exception as e:
        return {"error": f"映射错误: {str(e)}"}
|
|
851
|
+
|
|
852
|
+
|
|
853
|
+
# ============================================================================
|
|
854
|
+
# JSON 提取
|
|
855
|
+
# ============================================================================
|
|
856
|
+
|
|
857
|
+
def extract_json(text: str) -> str:
    """Extract JSON from *text*, handling markdown code fences and mixed text+JSON.

    Returns a pretty-printed JSON string. List payloads are normalized to
    ``{"title", "url", "description"}`` records; unparseable input yields a
    JSON error object carrying the first 500 chars of the raw text.
    """
    # Prefer a fenced ```json ... ``` block; otherwise fish an array of
    # objects (or, failing that, a single object) out of surrounding prose.
    fenced = re.search(r'```(?:json)?\s*\n?([\s\S]*?)\n?```', text)
    if fenced:
        text = fenced.group(1).strip()
    else:
        candidate = re.search(r'\[\s*\{[\s\S]*?\}\s*\]', text) or re.search(r'\{[\s\S]*?\}', text)
        if candidate:
            text = candidate.group(0)

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return json.dumps({"error": "解析 JSON 失败", "raw": text[:500]}, ensure_ascii=False, indent=2)

    if isinstance(data, list):
        # Standardize field names; non-dict entries are dropped.
        normalized = [
            {
                "title": item.get("title", ""),
                "url": item.get("url", item.get("link", "")),
                "description": item.get("description", item.get("content", item.get("snippet", item.get("summary", "")))),
            }
            for item in data
            if isinstance(item, dict)
        ]
        return json.dumps(normalized, ensure_ascii=False, indent=2)
    return json.dumps(data, ensure_ascii=False, indent=2)
|
|
892
|
+
|
|
893
|
+
|
|
894
|
+
# ============================================================================
|
|
895
|
+
# 命令
|
|
896
|
+
# ============================================================================
|
|
897
|
+
|
|
898
|
+
async def cmd_web_search(args):
    """CLI entry: run a Grok web search, optionally merging extra Tavily results."""
    try:
        # Per-request model override (with config suffix rules applied), else the configured default.
        effective_model = config._apply_model_suffix(args.model) if args.model else config.grok_model
        provider = GrokSearchProvider(config.grok_api_url, config.grok_api_key, effective_model)
        result = await provider.search(args.query, args.platform, args.min_results, args.max_results)
        if args.raw:
            # --raw: print the provider response verbatim, no JSON extraction.
            print(result)
        else:
            parsed = json.loads(extract_json(result))
            if not isinstance(parsed, list):
                # Non-list payload (e.g. an error object) is printed as-is and we stop.
                print(json.dumps(parsed, ensure_ascii=False, indent=2))
                return

            merged: list[dict] = parsed

            extra_sources = int(args.extra_sources or 0)
            if extra_sources < 0:
                raise ValueError("--extra-sources 必须大于等于 0")
            has_tavily = bool(config.tavily_api_key) and config.tavily_enabled
            if extra_sources > 0:
                if not has_tavily:
                    # Tavily requested but unavailable: warn and fall back to Grok-only results.
                    _, tavily_reason = _get_tavily_status()
                    if tavily_reason:
                        _emit_tavily_warning(f"已请求 --extra-sources {extra_sources},但 {tavily_reason};仅返回 Grok 搜索结果")
                else:
                    extras, tavily_warning = await _call_tavily_search(args.query, extra_sources)
                    if tavily_warning:
                        _emit_tavily_warning(f"{tavily_warning};仅附加 Grok 搜索结果")

                    # Merge, deduplicating by URL: Grok results keep priority and
                    # ordering; Tavily extras are appended only for new http(s) URLs.
                    seen: set[str] = set()
                    out: list[dict] = []
                    for item in merged:
                        url = (item or {}).get("url", "")
                        if isinstance(url, str) and url:
                            seen.add(url)
                        out.append(item)

                    for item in extras:
                        url = (item or {}).get("url", "")
                        if not isinstance(url, str) or not url.startswith(("http://", "https://")):
                            continue
                        if url in seen:
                            continue
                        seen.add(url)
                        out.append(
                            {
                                "title": (item or {}).get("title", "") or "",
                                "url": url,
                                "description": (item or {}).get("description", "") or "",
                            }
                        )

                    merged = out

            print(json.dumps(merged, ensure_ascii=False, indent=2))
    except ValueError as e:
        # Validation failures (e.g. negative --extra-sources) exit non-zero with a JSON error.
        print(json.dumps({"error": str(e)}, ensure_ascii=False), file=sys.stderr)
        sys.exit(1)
    except httpx.HTTPStatusError as e:
        print(json.dumps({"error": f"API错误: {e.response.status_code}"}, ensure_ascii=False), file=sys.stderr)
        sys.exit(1)
|
|
959
|
+
|
|
960
|
+
|
|
961
|
+
async def cmd_web_fetch(args):
    """CLI entry: fetch page content via Tavily extract, with optional Grok fallback."""
    try:
        url = normalize_url(args.url)
    except ValueError as e:
        print(f"错误: {e}", file=sys.stderr)
        sys.exit(1)

    has_tavily, tavily_reason = _get_tavily_status()
    # For docsify-style hash-routed pages, derive a direct markdown URL candidate
    # (may be falsy when the URL is not hash-routed — TODO confirm against helper).
    docsify_markdown_url = _materialize_docsify_markdown_url(url)

    result = None
    tavily_error = None
    if has_tavily:
        result, tavily_error = await _call_tavily_extract(url)
        if not result and docsify_markdown_url and docsify_markdown_url != url:
            # First attempt failed: retry extraction against the derived markdown URL.
            result2, tavily_error2 = await _call_tavily_extract(docsify_markdown_url)
            if result2:
                result = result2
                tavily_error = None
            else:
                tavily_error = tavily_error2 or tavily_error

    # Grok is used either on explicit request or when Tavily is unavailable.
    use_grok_fallback = bool(args.fallback_grok) or (not has_tavily)
    if not has_tavily and tavily_reason:
        _emit_tavily_warning(f"{tavily_reason};web_fetch 将改用 Grok")
    if tavily_error and use_grok_fallback:
        _emit_tavily_warning(f"{tavily_error};web_fetch 将改用 Grok")
    if not result and use_grok_fallback:
        try:
            provider = GrokSearchProvider(config.grok_api_url, config.grok_api_key, config.grok_model)
            # Prefer the markdown URL when one was derived.
            result = await provider.fetch(docsify_markdown_url or url)
        except ValueError as e:
            print(f"错误: {e}", file=sys.stderr)
            sys.exit(1)
        except httpx.HTTPStatusError as e:
            print(f"API错误: {e.response.status_code}", file=sys.stderr)
            sys.exit(1)

    # Tavily failed and no fallback allowed: report the error (with a hint when
    # a hash-routed alternative URL exists) and exit non-zero.
    if not result and tavily_error and not use_grok_fallback:
        if docsify_markdown_url:
            print(f"错误: {tavily_error}(检测到 hash 路由,可尝试: {docsify_markdown_url} 或加 --fallback-grok)", file=sys.stderr)
        else:
            print(f"错误: {tavily_error}", file=sys.stderr)
        sys.exit(1)
    if not result or not str(result).strip():
        print("错误: 获取内容失败", file=sys.stderr)
        sys.exit(1)

    # Deliver content either to --out or stdout.
    if args.out:
        Path(args.out).write_text(result, encoding="utf-8")
        print(f"内容已保存到 {args.out}")
    else:
        print(result)
|
|
1014
|
+
|
|
1015
|
+
|
|
1016
|
+
async def cmd_web_map(args):
    """CLI entry: map a site's structure via Tavily and print the JSON result."""
    try:
        target = normalize_url(args.url)
    except ValueError as exc:
        print(json.dumps({"error": str(exc)}, ensure_ascii=False), file=sys.stderr)
        sys.exit(1)

    mapping = await _call_tavily_map(
        target,
        args.instructions,
        args.max_depth,
        args.max_breadth,
        args.limit,
        args.timeout,
    )
    # The result (success or error dict) is always printed; errors exit non-zero.
    print(json.dumps(mapping, ensure_ascii=False, indent=2))
    if (mapping or {}).get("error"):
        sys.exit(1)
|
|
1034
|
+
|
|
1035
|
+
|
|
1036
|
+
async def cmd_get_config_info(args):
    """CLI entry: print the effective configuration as JSON.

    Unless ``--no-test`` is given, also probes the Grok ``/models`` endpoint
    and attaches a ``connection_test`` section (status, latency, and the
    model list when the response body is JSON).
    """
    config_info = config.get_config_info()

    if not args.no_test:
        test_result = {"status": "未测试", "message": "", "response_time_ms": 0}
        try:
            api_url = config.grok_api_url
            api_key = config.grok_api_key
            # rstrip('/') keeps the endpoint construction consistent with the
            # other API callers and avoids a double slash when the configured
            # URL has a trailing '/'.
            models_url = f"{api_url.rstrip('/')}/models"

            start_time = time.time()
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.get(
                    models_url,
                    headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
                )
            response_time = (time.time() - start_time) * 1000

            if response.status_code == 200:
                test_result["status"] = "✅ 连接成功"
                test_result["response_time_ms"] = round(response_time, 2)
                try:
                    models_data = response.json()
                    if "data" in models_data:
                        model_count = len(models_data["data"])
                        test_result["message"] = f"已获取 {model_count} 个模型"
                        test_result["available_models"] = [m.get("id") for m in models_data["data"] if isinstance(m, dict)]
                # Was a bare `except:` — narrowed to Exception so SystemExit/
                # KeyboardInterrupt are not swallowed. Enrichment is best-effort:
                # a non-JSON body still counts as a successful connection.
                except Exception:
                    pass
            else:
                test_result["status"] = "⚠️ 连接问题"
                test_result["message"] = f"HTTP {response.status_code}"

        except httpx.TimeoutException:
            test_result["status"] = "❌ 连接超时"
            test_result["message"] = "请求超时 (10秒)"
        except Exception as e:
            test_result["status"] = "❌ 连接失败"
            test_result["message"] = str(e)

        config_info["connection_test"] = test_result

    print(json.dumps(config_info, ensure_ascii=False, indent=2))
|
|
1079
|
+
|
|
1080
|
+
|
|
1081
|
+
async def cmd_toggle_builtin_tools(args):
    """CLI entry: toggle the built-in WebFetch/WebSearch tools via the deny list.

    ``--action on/enable`` adds the tools to ``permissions.deny`` in
    ``<root>/.agent/settings.json`` (blocking the built-ins), ``off/disable``
    removes them, and any other action just reports the current status
    without writing anything.
    """
    # Resolve the project root: explicit --root, or walk up to the nearest .git.
    if args.root:
        root = Path(args.root)
        if not root.exists():
            print(json.dumps({"error": f"指定的根目录不存在: {args.root}"}, ensure_ascii=False), file=sys.stderr)
            sys.exit(1)
    else:
        root = Path.cwd()
        while root != root.parent and not (root / ".git").exists():
            root = root.parent
        if not (root / ".git").exists():
            print(json.dumps({
                "error": "未找到 .git 目录。使用 --root 指定项目根目录。",
                "hint": "从 git 仓库中运行此命令,或指定 --root PATH"
            }, ensure_ascii=False), file=sys.stderr)
            sys.exit(1)

    settings_path = root / ".agent" / "settings.json"
    tools = ["WebFetch", "WebSearch"]

    # Load existing settings or start with an empty deny list.
    if settings_path.exists():
        with open(settings_path, 'r', encoding='utf-8') as f:
            settings = json.load(f)
    else:
        settings = {"permissions": {"deny": []}}

    deny = settings.setdefault("permissions", {}).setdefault("deny", [])
    blocked = all(t in deny for t in tools)

    def _save() -> None:
        # Persist settings, creating .agent/ on first write. (Was duplicated
        # verbatim in both the on and off branches.)
        settings_path.parent.mkdir(parents=True, exist_ok=True)
        with open(settings_path, 'w', encoding='utf-8') as f:
            json.dump(settings, f, ensure_ascii=False, indent=2)

    action = args.action.lower()
    if action in ["on", "enable"]:
        for t in tools:
            if t not in deny:
                deny.append(t)
        _save()
        msg = "内置工具已禁用"
        blocked = True
    elif action in ["off", "disable"]:
        deny[:] = [t for t in deny if t not in tools]
        _save()
        msg = "内置工具已启用"
        blocked = False
    else:
        # Any other action (including the default "status") only reports.
        msg = f"内置工具当前{'已禁用' if blocked else '已启用'}"

    print(json.dumps({
        "blocked": blocked,
        "deny_list": deny,
        "file": str(settings_path),
        "message": msg
    }, ensure_ascii=False, indent=2))
|
|
1139
|
+
|
|
1140
|
+
|
|
1141
|
+
# ============================================================================
|
|
1142
|
+
# 主程序
|
|
1143
|
+
# ============================================================================
|
|
1144
|
+
|
|
1145
|
+
async def _run_command(args):
    """Dispatch to the selected subcommand, always closing the shared HTTP client."""
    dispatch = {
        "web_search": cmd_web_search,
        "web_fetch": cmd_web_fetch,
        "web_map": cmd_web_map,
        "get_config_info": cmd_get_config_info,
        "toggle_builtin_tools": cmd_toggle_builtin_tools,
    }
    try:
        # Lookup stays inside try so cleanup runs even on an unknown command.
        await dispatch[args.command](args)
    finally:
        await close_http_client()
|
|
1158
|
+
|
|
1159
|
+
|
|
1160
|
+
def main():
    """Build the argument parser, apply global overrides, and run the command."""
    parser = argparse.ArgumentParser(
        prog="groksearch_cli",
        description="GrokSearch CLI - 通过 Grok/Tavily 进行独立的网页搜索/获取/映射"
    )
    parser.add_argument("--api-url", help="覆盖 GROK_API_URL")
    parser.add_argument("--debug", action="store_true", help="启用调试输出")

    subparsers = parser.add_subparsers(dest="command", required=True)

    # web_search: query Grok, optionally augmented with Tavily extras.
    search_parser = subparsers.add_parser("web_search", help="执行网页搜索")
    search_parser.add_argument("--query", "-q", required=True, help="搜索查询")
    search_parser.add_argument("--platform", "-p", default="", help="专注平台 (例如 'GitHub,Reddit')")
    search_parser.add_argument("--min-results", type=int, default=3, help="最少结果数")
    search_parser.add_argument("--max-results", type=int, default=10, help="最多结果数")
    search_parser.add_argument("--model", default="", help="仅为此请求覆盖模型")
    search_parser.add_argument("--extra-sources", type=int, default=0, help="来自 Tavily 的额外结果 (可选)")
    search_parser.add_argument("--raw", action="store_true", help="输出原始响应,不进行 JSON 解析")

    # web_fetch: extract a page's content.
    fetch_parser = subparsers.add_parser("web_fetch", help="获取网页内容")
    fetch_parser.add_argument("--url", "-u", required=True, help="要获取的 URL")
    fetch_parser.add_argument("--out", "-o", help="输出文件路径")
    fetch_parser.add_argument("--fallback-grok", action="store_true", help="当 Tavily 失败或未配置时回退到 Grok")

    # web_map: crawl a site's link structure via Tavily.
    map_parser = subparsers.add_parser("web_map", help="映射网站结构 (Tavily)")
    map_parser.add_argument("--url", "-u", required=True, help="要映射的根 URL")
    map_parser.add_argument("--instructions", default="", help="自然语言过滤指令")
    map_parser.add_argument("--max-depth", type=int, default=1, help="最大深度 (1-5)")
    map_parser.add_argument("--max-breadth", type=int, default=20, help="每页最大广度 (1-500)")
    map_parser.add_argument("--limit", type=int, default=50, help="总链接限制 (1-500)")
    map_parser.add_argument("--timeout", type=int, default=150, help="超时秒数 (10-150)")

    # get_config_info: dump configuration and probe the API.
    config_parser = subparsers.add_parser("get_config_info", help="显示配置并测试连接")
    config_parser.add_argument("--no-test", action="store_true", help="跳过连接测试")

    # toggle_builtin_tools: manage the built-in tool deny list.
    toggle_parser = subparsers.add_parser("toggle_builtin_tools", help="切换内置 WebSearch/WebFetch")
    toggle_parser.add_argument("--action", "-a", default="status", help="操作: on/off/status")
    toggle_parser.add_argument("--root", "-r", help="项目根路径 (默认: 通过 .git 自动检测)")

    args = parser.parse_args()

    # Global overrides are applied to the shared config before dispatch.
    if args.api_url or args.debug:
        config.set_overrides(args.api_url, debug=True if args.debug else None)

    asyncio.run(_run_command(args))
|
|
1211
|
+
|
|
1212
|
+
|
|
1213
|
+
if __name__ == "__main__":
|
|
1214
|
+
main()
|