browser-ai-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- browser_ai_cli-0.1.0.data/data/browser_ai_examples/ai_sites.example.json +188 -0
- browser_ai_cli-0.1.0.data/data/browser_ai_examples/search_routes.example.json +111 -0
- browser_ai_cli-0.1.0.dist-info/METADATA +315 -0
- browser_ai_cli-0.1.0.dist-info/RECORD +12 -0
- browser_ai_cli-0.1.0.dist-info/WHEEL +5 -0
- browser_ai_cli-0.1.0.dist-info/entry_points.txt +2 -0
- browser_ai_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- browser_ai_cli-0.1.0.dist-info/top_level.txt +1 -0
- scripts/__init__.py +8 -0
- scripts/browser_ai.py +793 -0
- scripts/import_firefox_login.py +502 -0
- scripts/pre-commit-check.py +182 -0
scripts/browser_ai.py
ADDED
|
@@ -0,0 +1,793 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Browser-AI Toolkit — Multi-Site AI Aggregator
|
|
5
|
+
浏览器AI工具 - 多站点AI聚合器
|
|
6
|
+
|
|
7
|
+
A configuration-driven Playwright toolkit that orchestrates multiple AI
|
|
8
|
+
chatbots and search engines through a unified CLI. Designed for engineers
|
|
9
|
+
who want to fan-out a single query to several AI services, route tasks to
|
|
10
|
+
the right site by intent, and persist browser sessions locally.
|
|
11
|
+
|
|
12
|
+
一个由配置驱动的 Playwright 工具集,通过统一 CLI 编排多个 AI 对话机器人和
|
|
13
|
+
搜索引擎。适合需要把一个查询同时发给多个 AI 服务、按意图路由到合适站点、
|
|
14
|
+
并在本地持久化浏览器会话的工程师使用。
|
|
15
|
+
|
|
16
|
+
Features / 功能:
|
|
17
|
+
- Dual engines / 双引擎:
|
|
18
|
+
* Chromium + persistent profile (sites that require login state)
|
|
19
|
+
* Camoufox (anti-detect Firefox) for scraping/search-heavy sites
|
|
20
|
+
- JSON-driven site config (add new sites without editing code)
|
|
21
|
+
- Intent-based routing via search_routes.json
|
|
22
|
+
- Quality scoring + deep-dive recommendations
|
|
23
|
+
|
|
24
|
+
Usage / 用法:
|
|
25
|
+
python browser_ai.py login [site_name] # Login to AI sites (Chromium visible)
|
|
26
|
+
python browser_ai.py list # List configured sites
|
|
27
|
+
python browser_ai.py add-site # Interactive wizard to add a new site
|
|
28
|
+
python browser_ai.py search "keyword" # Smart search (route + probe + score)
|
|
29
|
+
python browser_ai.py probe "keyword" # Probe-only search
|
|
30
|
+
python browser_ai.py open <site_name> # Open a site directly
|
|
31
|
+
python browser_ai.py weixin "keyword" # WeChat article search (sogou + baidu + yuanbao)
|
|
32
|
+
|
|
33
|
+
Options / 选项:
|
|
34
|
+
--headed Force visible browser window
|
|
35
|
+
--headless Force silent mode (no window)
|
|
36
|
+
--engine chromium|camoufox Force a specific engine
|
|
37
|
+
|
|
38
|
+
SECURITY NOTE / 安全提示:
|
|
39
|
+
- The `config/profiles/` directory contains browser session data (cookies,
|
|
40
|
+
localStorage, IndexedDB) and is gitignored by default.
|
|
41
|
+
- DO NOT commit your own session data; it would expose your logins.
|
|
42
|
+
- Never run `git add .` blindly. Always review what is staged.
|
|
43
|
+
"""
|
|
44
|
+
import asyncio
|
|
45
|
+
import json
|
|
46
|
+
import re
|
|
47
|
+
import sys
|
|
48
|
+
import tempfile
|
|
49
|
+
from pathlib import Path
|
|
50
|
+
from typing import Any, Optional, Union
|
|
51
|
+
|
|
52
|
+
try:
|
|
53
|
+
from playwright.async_api import async_playwright, BrowserContext, Page, Playwright
|
|
54
|
+
except ImportError:
|
|
55
|
+
print("Error: playwright is not installed. Run: pip install playwright && playwright install chromium")
|
|
56
|
+
sys.exit(1)
|
|
57
|
+
|
|
58
|
+
try:
|
|
59
|
+
from camoufox.async_api import AsyncCamoufox
|
|
60
|
+
from camoufox.addons import DefaultAddons
|
|
61
|
+
CAMOUFOX_AVAILABLE = True
|
|
62
|
+
except ImportError:
|
|
63
|
+
CAMOUFOX_AVAILABLE = False
|
|
64
|
+
|
|
65
|
+
SCRIPT_DIR = Path(__file__).parent.resolve()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _is_installed_via_pip() -> bool:
    """Return True if this script is running from a pip-installed location.

    After `pip install browser-ai-cli`, __file__ lives under a
    ``site-packages`` directory (``dist-packages`` on Debian-style systems).
    A local checkout normally has neither directory in its path.

    Whole path components are compared rather than substrings, so a checkout
    stored under e.g. ``~/site-packages-backup/`` is not mistaken for an
    installed copy.
    """
    install_dirs = {"site-packages", "dist-packages"}
    return not install_dirs.isdisjoint(Path(__file__).resolve().parts)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _examples_source_dir() -> Path:
    """Return the directory that holds the bundled example config files.

    A local checkout keeps them in ``<repo_root>/config/``. For a pip install
    the candidates are ``<site.getuserbase()>/browser_ai_examples`` and
    ``<sys.prefix>/browser_ai_examples``, tried in that order. The first one
    that exists is returned; when neither exists the user-base candidate is
    returned anyway.
    """
    if not _is_installed_via_pip():
        return SCRIPT_DIR.parent / "config"

    import site as _site
    import sys as _sys

    install_roots = (_site.getuserbase(), _sys.prefix)
    locations = [Path(root) / "browser_ai_examples" for root in install_roots]
    return next((loc for loc in locations if loc.exists()), locations[0])
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _user_config_base() -> Path:
    """Return the base directory for user-writable config and profiles.

    A local checkout uses the repository root (the parent of ``scripts/``).
    A pip install uses ``~/.config/browser-ai/``, which is created on demand.
    """
    if not _is_installed_via_pip():
        return SCRIPT_DIR.parent

    user_dir = Path.home().joinpath(".config", "browser-ai")
    user_dir.mkdir(parents=True, exist_ok=True)
    return user_dir
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
CONFIG_BASE = _user_config_base()
|
|
115
|
+
CONFIG_DIR = CONFIG_BASE / "config"
|
|
116
|
+
SITES_FILE = CONFIG_DIR / "ai_sites.json"
|
|
117
|
+
ROUTES_FILE = CONFIG_DIR / "search_routes.json"
|
|
118
|
+
PROFILES_DIR = CONFIG_DIR / "profiles"
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def ensure_config_layout() -> None:
    """Perform first-run setup when running from a pip install.

    Creates the config and profiles directories, then seeds the sites and
    routes files from the bundled ``*.example.json`` templates. Existing
    target files are never overwritten. Does nothing for a local checkout
    or when the template directory cannot be found.
    """
    if not _is_installed_via_pip():
        return

    for directory in (CONFIG_DIR, PROFILES_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    template_dir = _examples_source_dir()
    if not template_dir.exists():
        return

    import shutil

    seeds = {
        "ai_sites.example.json": SITES_FILE,
        "search_routes.example.json": ROUTES_FILE,
    }
    for template_name, destination in seeds.items():
        template = template_dir / template_name
        # Keep whatever the user already has; only fill in missing files.
        if destination.exists() or not template.exists():
            continue
        shutil.copy(template, destination)
        print(f"[browser-ai] Created {destination} from bundled template")
|
|
146
|
+
|
|
147
|
+
DEFAULT_HEADLESS: bool = True
|
|
148
|
+
FORCE_ENGINE: Optional[str] = None
|
|
149
|
+
|
|
150
|
+
SiteDict = dict[str, Any]
|
|
151
|
+
CookieDict = dict[str, Any]
|
|
152
|
+
ChromiumHandle = tuple[Playwright, BrowserContext, Page]
|
|
153
|
+
CamoufoxHandle = tuple[Any, Any, Page]
|
|
154
|
+
EngineHandle = Union[ChromiumHandle, CamoufoxHandle]
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def load_json(path: Path) -> dict[str, Any]:
    """Parse the UTF-8 JSON file at *path*; return ``{}`` if it does not exist."""
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return {}
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def save_json(path: Path, data: dict[str, Any]) -> None:
    """Write *data* to *path* as indented UTF-8 JSON.

    The parent directory is created when missing. The document is first
    written to a sibling ``<name>.tmp`` file and then moved over *path*, so
    a failed or interrupted write (for example unserialisable data) leaves
    the existing file untouched instead of truncated.

    Raises:
        TypeError / ValueError: If *data* cannot be serialised.
        OSError: If the file cannot be written or replaced.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        tmp_path.replace(path)
    except BaseException:
        # Do not leave a half-written temp file behind.
        tmp_path.unlink(missing_ok=True)
        raise
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def get_sites() -> list[SiteDict]:
    """Return the ``sites`` list from the sites config file, or ``[]`` if absent."""
    config = load_json(SITES_FILE)
    if "sites" in config:
        return config["sites"]
    return []
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def get_site(name: str) -> Optional[SiteDict]:
    """Return the first configured site whose ``name`` equals *name*, else None."""
    matches = (entry for entry in get_sites() if entry["name"] == name)
    return next(matches, None)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def get_profile_path(site_name: str) -> str:
    """Return the profile directory for *site_name* as a string, creating it if needed."""
    profile_dir = PROFILES_DIR.joinpath(site_name)
    profile_dir.mkdir(parents=True, exist_ok=True)
    return str(profile_dir)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def get_login_lock_path(site_name: str) -> Path:
    """Return the path of the login lock file for *site_name* in the system temp dir."""
    lock_name = "browser_ai_{}.lock".format(site_name)
    return Path(tempfile.gettempdir()).joinpath(lock_name)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
async def launch_chromium(site_name: str, headless: Optional[bool] = None) -> ChromiumHandle:
    """Start Playwright and open a persistent Chromium context for a site.

    Args:
        site_name: Site ID; selects the profile directory returned by
            ``get_profile_path`` so the session persists between runs.
        headless: Run without a window. ``None`` means ``DEFAULT_HEADLESS``.

    Returns:
        ``(playwright, context, page)``. ``page`` is the context's first
        existing page, or a newly created one if the context has none.

    Raises:
        Exception: Whatever Playwright raises during launch or setup. The
            Playwright driver (and any half-open context) is shut down
            first, so a failed launch does not leak a background process.
    """
    if headless is None:
        headless = DEFAULT_HEADLESS
    # Resolve the profile before starting the driver so a filesystem error
    # cannot leave a started driver behind.
    profile_path = get_profile_path(site_name)
    p = await async_playwright().start()
    context = None
    try:
        # NOTE(review): '--no-sandbox' / '--disable-setuid-sandbox' turn off
        # Chromium's sandbox; confirm this is still wanted on desktop hosts.
        context = await p.chromium.launch_persistent_context(
            user_data_dir=profile_path,
            headless=headless,
            args=[
                '--no-sandbox',
                '--disable-blink-features=AutomationControlled',
                '--disable-infobars',
                '--disable-dev-shm-usage',
                '--disable-setuid-sandbox',
                '--window-size=1280,800',
                '--start-maximized',
            ],
            viewport={"width": 1280, "height": 800},
            locale='zh-CN',
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            bypass_csp=True,
            java_script_enabled=True,
        )
        # Runs in every page before site scripts: overrides the navigator
        # properties and globals that automation checks commonly read.
        await context.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
            Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
            Object.defineProperty(navigator, 'languages', {get: () => ['zh-CN', 'zh', 'en']});
            window.chrome = { runtime: {} };
            Object.defineProperty(window, 'CDP', {get: () => undefined});
        """)
        pages = context.pages
        page = pages[0] if pages else await context.new_page()
    except BaseException:
        # Best-effort teardown; the original error is what gets re-raised.
        if context is not None:
            try:
                await context.close()
            except Exception:
                pass
        try:
            await p.stop()
        except Exception:
            pass
        raise
    return p, context, page
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
async def launch_camoufox(headless: Optional[bool] = None) -> CamoufoxHandle:
    """Start a Camoufox browser and open one page.

    Args:
        headless: Run without a window. ``None`` means ``DEFAULT_HEADLESS``.

    Returns:
        ``(fox, browser, page)`` where ``fox`` is the entered
        ``AsyncCamoufox`` manager. Pass ``fox`` and ``browser`` to
        ``close_camoufox`` when done.

    Raises:
        RuntimeError: If the camoufox package could not be imported.
        Exception: Whatever Camoufox raises; if opening the page fails the
            already-entered manager is exited before re-raising.
    """
    if not CAMOUFOX_AVAILABLE:
        raise RuntimeError("Camoufox is not installed. Run: pip install camoufox")
    if headless is None:
        headless = DEFAULT_HEADLESS
    fox = AsyncCamoufox(
        headless=headless,
        exclude_addons=[DefaultAddons.UBO],
    )
    # The manager is entered by hand because the browser must outlive this
    # function; close_camoufox() performs the matching __aexit__.
    browser = await fox.__aenter__()
    try:
        page = await browser.new_page()
    except BaseException:
        try:
            await fox.__aexit__(None, None, None)
        except Exception:
            pass
        raise
    return fox, browser, page
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
async def close_camoufox(fox: Any, browser: Any) -> None:
    """Best-effort shutdown of a Camoufox session.

    Closes *browser*, then exits the *fox* manager. Each step runs even if
    the previous one failed, and any ``Exception`` is deliberately ignored.
    """
    shutdown_steps = (
        lambda: browser.close(),
        lambda: fox.__aexit__(None, None, None),
    )
    for step in shutdown_steps:
        try:
            await step()
        except Exception:
            pass
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
async def close_browser(p: Playwright, context: BrowserContext) -> None:
    """Best-effort shutdown of a Chromium session.

    Closes *context*, then stops the Playwright driver *p*. Each step runs
    even if the previous one failed, and any ``Exception`` is deliberately
    ignored.
    """
    shutdown_steps = (
        lambda: context.close(),
        lambda: p.stop(),
    )
    for step in shutdown_steps:
        try:
            await step()
        except Exception:
            pass
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
async def launch_browser(site_name: str, headless: Optional[bool] = None, engine: Optional[str] = None) -> tuple[str, EngineHandle]:
    """Launch the browser engine chosen for *site_name*.

    The engine is the first non-None value of: the *engine* argument,
    ``FORCE_ENGINE``, the site's ``preferred_engine`` (when the site is
    configured), and finally ``"chromium"``. Only ``"camoufox"`` selects
    Camoufox; every other value launches Chromium.

    Returns:
        ``(engine, handle)`` where *handle* is the 3-tuple produced by
        ``launch_camoufox`` or ``launch_chromium``.
    """
    site = get_site(site_name)
    site_choice = site.get("preferred_engine", "chromium") if site else None
    candidates = (engine, FORCE_ENGINE, site_choice, "chromium")
    chosen = next(option for option in candidates if option is not None)

    if chosen == "camoufox":
        fox, browser, page = await launch_camoufox(headless)
        return chosen, (fox, browser, page)

    p, context, page = await launch_chromium(site_name, headless)
    return chosen, (p, context, page)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
async def close_browser_by_engine(engine: str, handle: EngineHandle) -> None:
    """Close a handle returned by ``launch_browser`` using the matching closer.

    ``"camoufox"`` handles go to ``close_camoufox``; anything else is
    treated as a Chromium handle and goes to ``close_browser``.
    """
    owner, session, _page = handle
    closer = close_camoufox if engine == "camoufox" else close_browser
    await closer(owner, session)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
async def cmd_login(site_name: Optional[str] = None) -> None:
    """Log in to one site, or let the user pick from a menu.

    With *site_name*, that site is looked up and opened directly. Without
    it, the enabled sites are listed with their profile status and the user
    chooses a number; ``0`` logs in to every enabled site in turn.
    """
    enabled_sites = [s for s in get_sites() if s.get("enabled", True)]
    if not enabled_sites:
        print("No sites configured. Run: python browser_ai.py add-site")
        return

    if site_name:
        chosen = get_site(site_name)
        if not chosen:
            print(f"Site not found: {site_name}")
            print(f"Available: {', '.join(s['name'] for s in enabled_sites)}")
            return
        await login_one(chosen)
        return

    print("\n=== Configured AI Sites ===")
    for number, entry in enumerate(enabled_sites, 1):
        caps = ", ".join(entry.get("capabilities", []))
        profile_path = PROFILES_DIR / entry["name"]
        # A non-empty profile directory is taken as "has logged in before".
        if profile_path.exists() and any(profile_path.iterdir()):
            has_profile = "logged-in"
        else:
            has_profile = "not-logged-in"
        print(f" {number}. {entry['display_name']} ({entry['name']}) [{caps}] {has_profile}")
    print(f" 0. Login all sites")

    answer = input("\nSelect a site number: ").strip()
    if not answer:
        return
    try:
        selection = int(answer)
    except ValueError:
        print("Invalid input")
        return

    if selection == 0:
        for entry in enabled_sites:
            await login_one(entry)
        return
    if not 1 <= selection <= len(enabled_sites):
        print("Invalid number")
        return
    await login_one(enabled_sites[selection - 1])
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
async def login_one(site, timeout=120):
    """Open a visible Chromium window so the user can log in to *site*.

    The window uses the site's persistent profile, so the session is kept
    when the context is closed. The function waits until either the browser
    window/context is closed by the user or the lock file created in the
    temp directory is deleted, then shuts the browser down. The lock file
    is removed and the browser closed even if waiting is interrupted.

    Args:
        site: Site entry (uses ``name``, ``display_name``, ``login_url`` and
            the optional ``login_hint``).
        timeout: NOTE(review): accepted but not enforced. Applying it would
            cut off slow manual logins; confirm the intended semantics
            before wiring it in.
    """
    name = site["name"]
    print(f"\n--- Login to {site['display_name']} ---")
    print(f"Hint: {site.get('login_hint', 'Please log in manually in the opened window')}")
    print(f"URL: {site['login_url']}")
    print("Opening Chromium browser (visible window)...\n")

    p, context, page = await launch_chromium(name, headless=False)
    lock_path = get_login_lock_path(name)

    # Playwright emits "close" on the context when it is closed or when the
    # browser application goes away, e.g. the user closed the window.
    window_closed = asyncio.Event()
    context.on("close", lambda *_: window_closed.set())

    try:
        try:
            await page.goto(site["login_url"], wait_until="domcontentloaded", timeout=30000)
        except Exception as e:
            # Keep the window open anyway so the user can navigate by hand.
            print(f"Failed to open page: {e}")

        print(">>> Browser is open. Please complete login in the window. <<<")
        print(">>> Close the browser window when you are done. <<<")

        lock_path.write_text("waiting", encoding="utf-8")
        print(f">>> A lock file was created at {lock_path}.")
        print(">>> Login is saved automatically when you delete that lock file.")
        try:
            # Stop waiting when the lock file is gone, the context reported
            # "close", or no page is left open.
            while lock_path.exists() and not window_closed.is_set() and context.pages:
                await asyncio.sleep(2)
        except OSError as e:
            print(f"Stopped waiting for login: {e}")
    finally:
        lock_path.unlink(missing_ok=True)
        await close_browser(p, context)

    print(f"[OK] {site['display_name']} session saved to Chromium profile")
    print(f" Path: {get_profile_path(name)}")
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def cmd_list() -> None:
    """Print a table of all configured sites with engine, capabilities and status."""
    sites = get_sites()
    if not sites:
        print("No sites configured")
        return

    row_format = "{:<20} {:<20} {:<12} {:<25} {} {}"
    print("\n=== AI Sites ===")
    print(f"{'Name':<20} {'Display':<20} {'Engine':<12} {'Capabilities':<25} {'Status'}")
    print("-" * 95)
    for entry in sites:
        caps = ", ".join(entry.get("capabilities", []))
        engine = entry.get("preferred_engine", "chromium")
        enabled = "enabled" if entry.get("enabled", True) else "disabled"
        profile_dir = PROFILES_DIR / entry["name"]
        # A non-empty profile directory is taken as "has logged in before".
        if profile_dir.exists() and any(profile_dir.iterdir()):
            has_profile = "logged-in"
        else:
            has_profile = "not-logged-in"
        print(row_format.format(entry['name'], entry['display_name'], engine, caps, enabled, has_profile))

    print(f"\nConfig file: {SITES_FILE}")
    print(f"Profiles directory: {PROFILES_DIR}")
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def _prompt_new_site() -> Optional[SiteDict]:
    """Ask for every site field and return the new entry.

    Returns None (after printing the reason) when a required answer is
    missing or invalid. ``KeyboardInterrupt`` / ``EOFError`` raised by
    ``input`` are left for the caller to handle.
    """
    name = input("Site ID (English, e.g. my_ai): ").strip()
    if not name:
        print("Name is required")
        return None
    # The ID becomes a directory name under the profiles directory and part
    # of the lock-file name, so it must be a single plain path component.
    if name in (".", "..") or Path(name).name != name:
        print("Site ID must not contain path separators")
        return None

    if get_site(name):
        print(f"Site {name} already exists. Will overwrite.")

    display_name = input("Display name (e.g. My AI): ").strip() or name
    url = input("Site URL (e.g. https://my-ai.com/): ").strip()
    if not url:
        print("URL is required")
        return None

    login_url = input(f"Login URL (default {url}): ").strip() or url
    login_hint = input("Login hint (e.g. login with phone number): ").strip() or "Please log in manually in the opened page"

    print("\nCapabilities (comma-separated, e.g. search,chat,summarize):")
    print(" search=search chat=chat summarize=summarize file_upload=file_upload video_ai=video AI")
    caps_input = input("Capabilities: ").strip()
    capabilities = [c.strip() for c in caps_input.split(",") if c.strip()] if caps_input else ["chat"]

    print("\nEngine:")
    print(" chromium - sites that need login state (recommended for AI chat)")
    print(" camoufox - search/scraping sites with stronger anti-detect")
    engine = input("Preferred engine (default chromium): ").strip() or "chromium"
    if engine not in ("chromium", "camoufox"):
        # launch_browser() runs Chromium for anything but "camoufox", so
        # store what will actually be used instead of a typo.
        print(f"Unknown engine '{engine}', using chromium")
        engine = "chromium"

    print("\nCSS selectors (leave blank for auto-detection):")
    input_sel = input("Input selector (default: textarea): ").strip() or "textarea"
    submit_sel = input("Submit selector (default: button[type='submit']): ").strip() or "button[type='submit']"
    response_sel = input("Response selector (default: div.answer): ").strip() or "div.answer"

    return {
        "name": name,
        "display_name": display_name,
        "url": url,
        "login_url": login_url,
        "login_hint": login_hint,
        "capabilities": capabilities,
        "preferred_engine": engine,
        "selectors": {
            "input": input_sel,
            "submit": submit_sel,
            "response": response_sel,
        },
        "special": None,
        "enabled": True,
    }


def cmd_add_site():
    """Interactive wizard that adds (or overwrites) a site in the sites config.

    Ctrl+C or end-of-input at any prompt cancels without writing anything.
    An existing entry with the same ``name`` is replaced in place; otherwise
    the new entry is appended.
    """
    print("\n=== Add New AI Site ===")
    print("Press Ctrl+C to cancel. Press Enter to accept defaults.\n")

    try:
        new_site = _prompt_new_site()
    except (KeyboardInterrupt, EOFError):
        print("\nCancelled.")
        return
    if new_site is None:
        return

    name = new_site["name"]
    data = load_json(SITES_FILE)
    sites = [new_site if s["name"] == name else s for s in data.get("sites", [])]
    if not any(s["name"] == name for s in sites):
        sites.append(new_site)
    data["sites"] = sites
    save_json(SITES_FILE, data)

    print(f"\n[OK] Site {new_site['display_name']} saved to {SITES_FILE}")
    print(f" Run 'python browser_ai.py login {name}' to log in.")
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
async def cmd_open(site_name):
    """Open a configured site in a visible browser and wait for Enter.

    The engine is ``FORCE_ENGINE`` (the ``--engine`` option) when set,
    otherwise the site's ``preferred_engine`` (default ``"chromium"``).
    The browser is always closed on the way out.

    Args:
        site_name: ID of a site in the sites config.
    """
    site = get_site(site_name)
    if not site:
        print(f"Site not found: {site_name}")
        return

    # An explicit engine passed to launch_browser() wins over FORCE_ENGINE,
    # so the override has to be applied here for --engine to take effect.
    engine = FORCE_ENGINE or site.get("preferred_engine", "chromium")
    print(f"Opening {site['display_name']} with engine: {engine}...")
    engine, handle = await launch_browser(site_name, headless=False, engine=engine)
    try:
        # Both handle shapes carry the page as their third element.
        page = handle[2]
        await page.goto(site["url"], wait_until="domcontentloaded", timeout=30000)
        print(f"Opened: {await page.title()}")
        print("Press Enter to close the browser...")
        input()
    except Exception as e:
        print(f"Error: {e}")
    finally:
        await close_browser_by_engine(engine, handle)
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
async def cmd_search(query: str, dry_run: bool = False) -> None:
    """Route *query*, probe the route's sources, score them and suggest a deep dive.

    Phase 1 probes every enabled, configured source of the matched route.
    Phase 2 ranks the results by ``score_result``. Phase 3 lists the sources
    at or above ``quality_scoring.deep_dive_threshold`` (default 0.6), or the
    single best one if none qualifies. With *dry_run* only the plan is
    printed and no browser is launched.
    """
    routes = load_json(ROUTES_FILE)
    route = match_route(query, routes)
    print(f"\n=== Smart Search: {query} ===")
    print(f"Task type: {route['task_type']} - {route.get('description', '')}")

    sources = route.get("probe_sources", [])

    if dry_run:
        print(f"\n--- DRY RUN: would probe {len(sources)} sources ---")
        for source in sources:
            site = get_site(source["site"])
            if site:
                print(f" - {site['display_name']} ({site['name']}) engine={site.get('preferred_engine', 'chromium')} weight={source['weight']}")
                print(f" reason: {source.get('reason', '')}")
        print(f"\nDeep-dive action: {route.get('deep_dive_action', 'Let an AI summarize')}")
        return

    print(f"\n--- Phase 1: probe {len(sources)} sources ---")
    probe_results = {}
    for source in sources:
        site_name = source["site"]
        site = get_site(site_name)
        if not (site and site.get("enabled", True)):
            continue
        engine = site.get("preferred_engine", "chromium")
        print(f"\n>> Probing {site['display_name']} (engine={engine}, weight={source['weight']})...")
        outcome = await probe_one(site, query)
        probe_results[site_name] = {
            "site": site,
            "weight": source["weight"],
            "result": outcome,
            "reason": source.get("reason", ""),
        }
        score = score_result(outcome, source["weight"])
        print(f" Score: {score:.2f} - {source.get('reason', '')}")

    print("\n--- Phase 2: evaluate results ---")
    ranked = sorted(
        (
            (site_name, score_result(entry["result"], entry["weight"]), entry)
            for site_name, entry in probe_results.items()
        ),
        key=lambda item: item[1],
        reverse=True,
    )
    for _site_name, score, entry in ranked:
        print(f" {entry['site']['display_name']:<20} score={score:.2f}")

    threshold = routes.get("quality_scoring", {}).get("deep_dive_threshold", 0.6)
    # Fall back to the single best source when nothing clears the threshold.
    top = [item for item in ranked if item[1] >= threshold] or ranked[:1]

    if top:
        print(f"\n--- Phase 3: deep dive ---")
        print(f"Deep dive sources: {', '.join(entry['site']['display_name'] for _, _, entry in top)}")
        print(f"Suggested action: {route.get('deep_dive_action', 'Let an AI summarize')}")
        print("\n>>> AI assistant will decide the deep dive strategy based on the results above <<<")
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
def match_route(query: str, routes: dict[str, Any]) -> dict[str, Any]:
    """Return the first route with a trigger keyword contained in *query*.

    Matching is a case-insensitive substring test, in config order. When no
    route matches, ``routes["default_route"]`` is returned, or a bare
    ``general_search`` route with no probe sources if that key is absent.
    """
    haystack = query.lower()
    for candidate in routes.get("routes", []):
        for keyword in candidate.get("trigger_keywords", []):
            if keyword.lower() in haystack:
                return candidate
    fallback = {"task_type": "general_search", "probe_sources": []}
    return routes.get("default_route", fallback)
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
async def probe_one(site, query):
    """Submit *query* to one site in a fresh browser and capture the page text.

    Flow: open ``site["url"]``, type the query into the configured input
    selector and submit it (click, falling back to pressing Enter). If no
    input could be filled and the page is on baidu.com, load Baidu's search
    URL directly. The page is then checked for captcha/verification markers
    before the response text and WeChat article links are collected. The
    browser is always closed afterwards.

    The engine is ``FORCE_ENGINE`` (the ``--engine`` option) when set,
    otherwise the site's ``preferred_engine`` (default ``"chromium"``).

    Args:
        site: Site entry (uses ``name``, ``url``, ``selectors`` and
            ``preferred_engine``).
        query: Text to submit.

    Returns:
        A dict that always has ``success``, ``text``, ``length``, ``urls``,
        ``all_urls`` and ``engine``; failures add ``error``. ``text`` is
        capped at 3000 characters (500 for a captcha page) while ``length``
        is the untruncated size.
    """
    name = site["name"]
    sels = site.get("selectors") or {}
    # An explicit engine passed to launch_browser() wins over FORCE_ENGINE,
    # so the override has to be applied here for --engine to take effect.
    engine = FORCE_ENGINE or site.get("preferred_engine", "chromium")

    def failure(error, text="", length=0):
        # Same keys as the success result so callers can index either one.
        return {
            "success": False,
            "error": error,
            "text": text,
            "length": length,
            "urls": [],
            "all_urls": [],
            "engine": engine,
        }

    try:
        engine, handle = await launch_browser(name, headless=DEFAULT_HEADLESS, engine=engine)
        # Both handle shapes carry the page as their third element.
        page = handle[2]

        try:
            await page.goto(site["url"], wait_until="domcontentloaded", timeout=30000)
            await page.wait_for_timeout(2000)

            filled = False
            input_sel = sels.get("input", "textarea")
            submit_sel = sels.get("submit", "button[type='submit']")

            try:
                elem = page.locator(input_sel).first
                if await elem.count() > 0:
                    try:
                        await elem.fill(query, timeout=5000)
                    except Exception:
                        # Retry while skipping Playwright's actionability checks.
                        await elem.fill(query, force=True, timeout=5000)
                    filled = True
                    await page.wait_for_timeout(500)
                    try:
                        await page.locator(submit_sel).first.click(timeout=5000)
                    except Exception:
                        await page.keyboard.press("Enter")
                    await page.wait_for_timeout(5000)
            except Exception as e:
                print(f" Fill failed: {e}")

            if not filled:
                try:
                    if 'baidu.com' in page.url:
                        from urllib.parse import quote
                        search_url = f'https://www.baidu.com/s?wd={quote(query)}'
                        await page.goto(search_url, wait_until='domcontentloaded', timeout=30000)
                        await page.wait_for_timeout(5000)
                except Exception as e:
                    print(f" URL search also failed: {e}")

            body_text = await page.inner_text("body")
            verify_keywords = ['安全验证', '拖动左侧滑块', '请输入验证码', 'VerifyCode', 'captcha', 'wappass.baidu.com']
            if any(kw in body_text for kw in verify_keywords) or 'wappass.baidu.com' in page.url:
                return failure("captcha_or_verification_triggered", body_text[:500], len(body_text))

            response_sel = sels.get("response", "body")
            try:
                text = await page.locator(response_sel).first.inner_text(timeout=5000)
            except Exception:
                # Response selector missing or slow: use the whole page text.
                text = body_text

            try:
                js_links = await page.eval_on_selector_all(
                    'a',
                    'elements => elements.map(e => e.href).filter(h => h && (h.includes("mp.weixin.qq.com") || h.includes("weixin.sogou.com/link")))'
                )
                weixin_urls = js_links[:10]
            except Exception:
                weixin_urls = []

            return {
                "success": True,
                "text": text[:3000] if text else "",
                "length": len(text) if text else 0,
                "urls": weixin_urls,
                "all_urls": weixin_urls,
                "engine": engine,
            }
        finally:
            await close_browser_by_engine(engine, handle)
    except Exception as e:
        return failure(str(e))
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
def score_result(result, weight):
    """Score a probe result as ``min(length / 3000, 1.0) * weight``.

    Unsuccessful results score 0.0; a missing ``length`` counts as 0.
    """
    if result.get("success"):
        coverage = min(result.get("length", 0) / 3000, 1.0)
        return coverage * weight
    return 0.0
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
async def cmd_probe(query: str, dry_run: bool = False) -> None:
    """Probe every source of the route matched by *query* and print a preview.

    Sources that are not configured or are disabled are skipped, the same
    way ``cmd_search`` skips them.

    Args:
        query: Search text; also used to pick the route.
        dry_run: When True, only list the sources that would be probed and
            do not launch any browser.
    """
    routes = load_json(ROUTES_FILE)
    route = match_route(query, routes)
    print(f"\n=== Probe Search: {query} ===")
    print(f"Matched route: {route['task_type']}")

    for src in route.get("probe_sources", []):
        site = get_site(src["site"])
        if not site or not site.get("enabled", True):
            continue
        engine = site.get("preferred_engine", "chromium")
        print(f"\n>> {site['display_name']} (engine={engine})")
        if dry_run:
            print(" DRY RUN: not probed")
            continue
        result = await probe_one(site, query)
        print(f" success: {result.get('success')}")
        print(f" length: {result.get('length', 0)}")
        if result.get("text"):
            print(f" preview: {result['text'][:300]}...")
|
|
650
|
+
|
|
651
|
+
|
|
652
|
+
async def cmd_weixin(query: str, dry_run: bool = False) -> None:
    """Search WeChat articles for *query* through three sources in turn.

    Path 1 probes the ``sogou_weixin`` site, path 2 probes ``baidu`` with a
    ``site:mp.weixin.qq.com`` prefix, and path 3 asks the ``yuanbao`` site
    with a Chinese prompt. A path is skipped when its site is not
    configured. With *dry_run* only the configured sources are listed.
    """
    def show_links(headline, links):
        # Prints the headline, then at most the first five links, numbered.
        print(headline)
        for position, link in enumerate(links[:5], 1):
            print(f" {position}. {link}")

    print(f"\n=== WeChat Article Search: {query} ===")
    print("Strategy: sogou-weixin (Camoufox) + baidu site:mp.weixin.qq.com + yuanbao AI\n")

    if dry_run:
        print("DRY RUN: would query the following sources:")
        for source_name in ("sogou_weixin", "baidu", "yuanbao"):
            site = get_site(source_name)
            if site:
                print(f" - {site['display_name']} ({site['name']}) engine={site.get('preferred_engine', 'chromium')}")
        return

    print("--- Path 1: sogou weixin search (Camoufox) ---")
    sogou = get_site("sogou_weixin")
    if sogou:
        outcome = await probe_one(sogou, query)
        page_text = outcome.get("text", "")
        if any(marker in page_text for marker in ("验证码", "VerifyCode")):
            print("! Sogou requires captcha, falling back to path 2")
            outcome["success"] = False
        if not (outcome.get("success") and page_text):
            print(f"Sogou search failed: {outcome.get('error', 'unknown error')}")
        else:
            print(f"Sogou result length: {outcome['length']}")
            print(f"Preview:\n{page_text[:1000]}")
            links = outcome.get("urls", [])
            if links:
                show_links(f"\nFound {len(links)} related links (first 5):", links)

    print("\n--- Path 2: baidu site:mp.weixin.qq.com (Camoufox) ---")
    baidu = get_site("baidu")
    if baidu:
        outcome = await probe_one(baidu, f"site:mp.weixin.qq.com {query}")
        if not (outcome.get("success") and outcome.get("text")):
            print(f"Baidu search failed: {outcome.get('error', 'unknown error')}")
        else:
            print(f"Baidu result length: {outcome['length']}")
            print(f"Preview:\n{outcome['text'][:1500]}")
            links = extract_urls(outcome.get("text", ""))
            if links:
                show_links(f"\nFound {len(links)} possible article links:", links)

    print("\n--- Path 3: yuanbao AI with web search (Chromium, requires login) ---")
    yuanbao = get_site("yuanbao")
    if yuanbao:
        ai_query = f"请帮我搜索关于「{query}」的微信公众号文章,列出文章标题、作者和链接"
        outcome = await probe_one(yuanbao, ai_query)
        page_text = outcome.get("text", "")
        # The captured text mentioning login is treated as "not logged in".
        if any(marker in page_text for marker in ("登录", "未登录")):
            print("! Yuanbao requires login. Run: python browser_ai.py login yuanbao")
        elif outcome.get("success") and page_text:
            print(f"Yuanbao result length: {outcome['length']}")
            print(f"Preview:\n{page_text[:2000]}")
        else:
            print(f"Yuanbao search failed: {outcome.get('error', 'unknown error')}")
|
|
712
|
+
|
|
713
|
+
|
|
714
|
+
def extract_urls(text):
    """Extract article URLs from free text.

    WeChat article links (``mp.weixin.qq.com``) are returned first, followed
    by Sogou redirect links (``weixin.sogou.com/link?``). Only when neither
    kind is present does it fall back to any ``http(s)://`` URL.

    A URL ends at whitespace, ``<``, ``>``, quotes, ``)`` or the first
    non-ASCII character, so Chinese text or full-width punctuation that
    follows a link without a space is not swallowed. Trailing sentence
    punctuation is stripped and duplicates are removed, keeping the first
    occurrence.

    Args:
        text: Text to scan; ``None`` or empty yields ``[]``.

    Returns:
        List of unique URL strings in order of discovery.
    """
    if not text:
        return []

    # Characters allowed to continue a URL (ASCII only, see docstring).
    url_chars = r'[^\s<>"\')\u0080-\U0010ffff]+'
    specific_patterns = (
        r'https?://mp\.weixin\.qq\.com/' + url_chars,
        r'https?://weixin\.sogou\.com/link\?' + url_chars,
    )
    found = [match for pattern in specific_patterns for match in re.findall(pattern, text)]
    if not found:
        found = re.findall(r'https?://' + url_chars, text)

    cleaned = (url.rstrip('.,;:!?') for url in found)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(url for url in cleaned if url))
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
def parse_args() -> tuple[list[str], bool]:
    """Parse ``sys.argv`` and strip the global options from it.

    Recognised options may appear anywhere on the command line:

    * ``--headed`` / ``--headless`` set ``DEFAULT_HEADLESS`` (``--headless``
      wins when both are given).
    * ``--engine chromium|camoufox`` sets ``FORCE_ENGINE``. A missing or
      unknown value is reported on stderr and ignored, so a typo is not
      silently treated as Chromium.
    * ``--dry-run`` is returned as a flag.

    Returns:
        ``(args, dry_run)`` where *args* is the command followed by its
        positional arguments.
    """
    global DEFAULT_HEADLESS, FORCE_ENGINE
    args = sys.argv[1:]
    if "--headed" in args:
        DEFAULT_HEADLESS = False
        args.remove("--headed")
    if "--headless" in args:
        DEFAULT_HEADLESS = True
        args.remove("--headless")
    if "--engine" in args:
        idx = args.index("--engine")
        value = args[idx + 1] if idx + 1 < len(args) else None
        if value in ("chromium", "camoufox"):
            FORCE_ENGINE = value
        elif value is None:
            print("Warning: --engine needs a value (chromium|camoufox); ignored", file=sys.stderr)
        else:
            print(f"Warning: unknown engine '{value}' (expected chromium|camoufox); ignored", file=sys.stderr)
        # Drop the flag and, when present, its value.
        del args[idx:idx + 2]
    dry_run = False
    if "--dry-run" in args:
        dry_run = True
        args.remove("--dry-run")
    return args, dry_run
|
|
749
|
+
|
|
750
|
+
|
|
751
|
+
def main() -> None:
    """CLI entry point: set up config, parse arguments and dispatch the command.

    Prints the module docstring when no command is given or the command is
    unknown, and a usage line when a command is missing its argument.
    """
    ensure_config_layout()
    args, dry_run = parse_args()
    if not args:
        print(__doc__)
        return

    cmd, *cmd_args = args

    # Commands that take a free-text query and honour --dry-run.
    query_commands = {
        "search": cmd_search,
        "probe": cmd_probe,
        "weixin": cmd_weixin,
    }

    if cmd == "login":
        target = cmd_args[0] if cmd_args else None
        asyncio.run(cmd_login(target))
        return
    if cmd == "list":
        cmd_list()
        return
    if cmd == "add-site":
        cmd_add_site()
        return
    if cmd == "open":
        if cmd_args:
            asyncio.run(cmd_open(cmd_args[0]))
        else:
            print("Usage: python browser_ai.py open <site_name>")
        return
    if cmd in query_commands:
        if cmd_args:
            handler = query_commands[cmd]
            asyncio.run(handler(" ".join(cmd_args), dry_run=dry_run))
        else:
            print(f'Usage: python browser_ai.py {cmd} "keyword"')
        return

    print(f"Unknown command: {cmd}")
    print(__doc__)
|
|
790
|
+
|
|
791
|
+
|
|
792
|
+
if __name__ == "__main__":
|
|
793
|
+
main()
|