browserwright 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- browserwright/__init__.py +33 -0
- browserwright/__main__.py +6 -0
- browserwright/_executor/__init__.py +47 -0
- browserwright/_executor/__main__.py +9 -0
- browserwright/_executor/client.py +127 -0
- browserwright/_executor/process.py +652 -0
- browserwright/_executor/protocol.py +152 -0
- browserwright/api.py +66 -0
- browserwright/cdp.py +285 -0
- browserwright/cli.py +741 -0
- browserwright/daemon/__init__.py +8 -0
- browserwright/daemon/_ipc.py +444 -0
- browserwright/daemon/active_tab.py +183 -0
- browserwright/daemon/auth.py +395 -0
- browserwright/daemon/backends/__init__.py +59 -0
- browserwright/daemon/backends/base.py +120 -0
- browserwright/daemon/backends/cloud.py +222 -0
- browserwright/daemon/backends/env.py +119 -0
- browserwright/daemon/backends/extension.py +185 -0
- browserwright/daemon/backends/rdp.py +214 -0
- browserwright/daemon/cli.py +1437 -0
- browserwright/daemon/config.py +380 -0
- browserwright/daemon/doctor.py +179 -0
- browserwright/daemon/errors.py +34 -0
- browserwright/daemon/launch_chrome.py +353 -0
- browserwright/daemon/observability.py +181 -0
- browserwright/daemon/platforms.py +234 -0
- browserwright/daemon/resolver.py +72 -0
- browserwright/daemon/server/__init__.py +6 -0
- browserwright/daemon/server/daemon.py +229 -0
- browserwright/daemon/server/executor_registry.py +434 -0
- browserwright/daemon/server/extension_upstream.py +677 -0
- browserwright/daemon/server/facade.py +375 -0
- browserwright/daemon/server/facade_extension.py +969 -0
- browserwright/daemon/server/listener.py +1058 -0
- browserwright/daemon/server/proxy.py +1991 -0
- browserwright/daemon/server/relay.py +783 -0
- browserwright/daemon/server/state.py +432 -0
- browserwright/daemon/server/upstream.py +266 -0
- browserwright/daemon/userscripts.py +150 -0
- browserwright/discovery.py +213 -0
- browserwright/errors.py +177 -0
- browserwright/health.py +169 -0
- browserwright/install.py +628 -0
- browserwright/memory/__init__.py +15 -0
- browserwright/memory/_md.py +120 -0
- browserwright/memory/_yaml.py +217 -0
- browserwright/memory/global_mem.py +201 -0
- browserwright/memory/repl_mem.py +28 -0
- browserwright/memory/session_decisions.py +53 -0
- browserwright/memory/site_mem.py +381 -0
- browserwright/mode_b_client.py +590 -0
- browserwright/multitask.py +131 -0
- browserwright/output_schema.py +99 -0
- browserwright/primitives/__init__.py +67 -0
- browserwright/primitives/discovery_api.py +79 -0
- browserwright/primitives/http.py +42 -0
- browserwright/primitives/inspect.py +876 -0
- browserwright/primitives/interact.py +518 -0
- browserwright/primitives/page.py +556 -0
- browserwright/primitives/site.py +143 -0
- browserwright/release_install.py +466 -0
- browserwright/repl/__init__.py +6 -0
- browserwright/repl/_namespace.py +106 -0
- browserwright/repl/_smart_goto.py +236 -0
- browserwright/repl/inline.py +180 -0
- browserwright/repl/playwright_handle.py +449 -0
- browserwright/repl/snapshot.py +150 -0
- browserwright/session.py +229 -0
- browserwright/session_create.py +252 -0
- browserwright/session_ctx.py +24 -0
- browserwright/session_registry.py +133 -0
- browserwright/session_runtime.py +133 -0
- browserwright/site_skills_starter/github.com/SKILL.md +14 -0
- browserwright/site_skills_starter/github.com/memory.md +29 -0
- browserwright/site_skills_starter/github.com/tasks/list_issues.py +55 -0
- browserwright/site_skills_starter/google.com/SKILL.md +16 -0
- browserwright/site_skills_starter/google.com/memory.md +27 -0
- browserwright/site_skills_starter/google.com/tasks/search.py +53 -0
- browserwright/site_skills_starter/producthunt.com/SKILL.md +7 -0
- browserwright/site_skills_starter/producthunt.com/memory.md +26 -0
- browserwright/site_skills_starter/producthunt.com/tasks/today.py +64 -0
- browserwright/site_skills_starter/wikipedia.org/SKILL.md +7 -0
- browserwright/site_skills_starter/wikipedia.org/memory.md +22 -0
- browserwright/site_skills_starter/wikipedia.org/tasks/lookup.py +55 -0
- browserwright/site_skills_starter/ycombinator.com/SKILL.md +8 -0
- browserwright/site_skills_starter/ycombinator.com/memory.md +25 -0
- browserwright/site_skills_starter/ycombinator.com/tasks/front_page.py +63 -0
- browserwright/skill_doc.py +140 -0
- browserwright/skill_runtime.md +194 -0
- browserwright/subscriptions.py +213 -0
- browserwright/task_runner.py +125 -0
- browserwright/version.py +117 -0
- browserwright-0.6.2.dist-info/METADATA +12 -0
- browserwright-0.6.2.dist-info/RECORD +98 -0
- browserwright-0.6.2.dist-info/WHEEL +5 -0
- browserwright-0.6.2.dist-info/entry_points.txt +3 -0
- browserwright-0.6.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""Transparent reconnect-recovery for a session's tab binding.
|
|
2
|
+
|
|
3
|
+
A session's durable extension anchor is its Chrome tab-group id. The ledger
|
|
4
|
+
record carries a ``runtime`` cache (``current_target_id``, ``group_id``,
|
|
5
|
+
``owned_tab_ids``, ``updated_at``) as a fast path — the source of truth is the
|
|
6
|
+
live tab group keyed by that numeric id, recoverable via the daemon verb
|
|
7
|
+
``BrowserwrightDaemon.recoverSession``.
|
|
8
|
+
|
|
9
|
+
These helpers let primitives re-attach to a session's tab across daemon
|
|
10
|
+
restarts / extension reconnects / new ``bs run`` processes without the caller
|
|
11
|
+
doing anything: ``ensure_session_target`` runs a 3-step fallback (in-process →
|
|
12
|
+
ledger.runtime fast path → group-id recovery).
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import time
|
|
17
|
+
from collections import deque
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
from . import session_registry as reg
|
|
21
|
+
from .errors import CDPError
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _resolve_sid(sess) -> Optional[str]:
    """Return the id stored on ``sess.session_record``, or None.

    None is returned when the attribute is missing, is not a dict, or has no
    truthy ``"id"`` entry.
    """
    record = getattr(sess, "session_record", None)
    if not isinstance(record, dict):
        return None
    return record["id"] if record.get("id") else None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _resolve_record(sess) -> Optional[dict]:
    """Look up the session's record in the registry by its id.

    Returns None without touching the registry when no session id can be
    resolved from ``sess``; otherwise returns whatever ``reg.get`` yields.
    """
    sid = _resolve_sid(sess)
    return reg.get(sid) if sid else None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def persist_target(target_id: str, *, group_id: Optional[int] = None,
                   sess=None) -> None:
    """Write the current tab binding into the session record's ``runtime``.

    Args:
        target_id: Target id to store as ``current_target_id``.
        group_id: Group id to store alongside it (stored as given, including
            None).
        sess: Session to resolve the record id from; defaults to
            ``current_session()``.

    Does nothing when no session id can be resolved. Registry write failures
    are swallowed on purpose: caching is best-effort and must never break the
    calling primitive.

    NOTE(review): every call passes a complete ``runtime`` dict with
    ``owned_tab_ids`` set to ``[]`` and ``group_id`` set to the argument —
    confirm ``reg.update`` semantics if earlier values are meant to survive.
    """
    if sess is None:
        # Imported lazily, as in the original code.
        from .session import current_session
        sess = current_session()

    sid = _resolve_sid(sess)
    if sid:
        snapshot = {
            "current_target_id": target_id,
            "group_id": group_id,
            "owned_tab_ids": [],
            "updated_at": time.time(),
        }
        try:
            reg.update(sid, runtime=snapshot)
        except Exception:
            # Best-effort cache: a failed ledger write is deliberately ignored.
            pass
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def register_recovered(sess, payload: dict) -> Optional[str]:
    """Record a target/session pair from ``payload`` on ``sess`` and its CDP tables.

    Reads ``targetId`` and ``sessionId`` from ``payload``. When either is
    missing or falsy nothing is modified and None is returned. Otherwise the
    pair is stored in ``sess.cdp._sessions``, an event deque (maxlen 1024) is
    created for the session id if none exists yet, ``sess.current_target_id``
    is set, and the target id is returned.
    """
    tid, cdp_session = payload.get("targetId"), payload.get("sessionId")
    if not (tid and cdp_session):
        return None

    conn = sess.cdp
    conn._sessions[tid] = cdp_session
    # Keep an already-existing event buffer; only create one when absent.
    conn._events.setdefault(cdp_session, deque(maxlen=1024))
    sess.current_target_id = tid
    return tid
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def ensure_session_target(sess) -> Optional[str]:
    """Transparent 3-step recovery of the session's attached tab.

    1. ``sess.current_target_id`` already set → return it (in-process).
    2. ledger ``runtime.current_target_id`` → try ``cdp.attach(tid)`` (fast
       path, no group query). A ``CDPError`` from the attach falls through.
    3. group id: ``BrowserwrightDaemon.recoverSession`` by the persisted
       numeric group id → register + persist the new binding.

    Returns:
        The target id, or None when nothing could be recovered: no usable
        group id in the ledger, a ``CDPError`` from the daemon, or an
        empty/malformed payload.
    """
    if sess.current_target_id:
        return sess.current_target_id

    rec = _resolve_record(sess)
    # Read the runtime cache once; tolerate a missing or non-dict value so a
    # damaged ledger entry degrades to "nothing to recover" instead of raising.
    runtime = rec.get("runtime") if isinstance(rec, dict) else None
    if not isinstance(runtime, dict):
        runtime = {}

    # Step 2: ledger.runtime fast path.
    tid = runtime.get("current_target_id")
    if tid:
        try:
            sess.cdp.attach(tid)
        except CDPError:
            pass  # stale/closed tab — fall through to group recovery
        else:
            sess.current_target_id = tid
            return tid

    # Step 3: durable group recovery by the persisted numeric groupId — NOT the
    # title (names aren't unique; the session = the tab group, keyed by id).
    # Without a cached group id there is nothing to recover.
    gid = runtime.get("group_id")
    if not isinstance(gid, int) or gid < 0:
        return None
    # runtime is only non-empty when rec is a dict, so rec is a dict here.
    sid = rec.get("id")
    try:
        payload = sess.cdp.send(
            "BrowserwrightDaemon.recoverSession", groupId=gid, bsSession=sid,
        )
    except CDPError:
        return None
    if not payload or not isinstance(payload, dict):
        return None

    target_id = register_recovered(sess, payload)
    if target_id is None:
        return None

    # Keep the group id that was just used when the payload does not report
    # one; persisting None would erase the anchor for the next recovery.
    recovered_gid = payload.get("groupId")
    if recovered_gid is None:
        recovered_gid = gid
    persist_target(target_id, group_id=recovered_gid, sess=sess)
    return target_id
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# github.com
|
|
2
|
+
|
|
3
|
+
Browse public GitHub issues + repos without login.
|
|
4
|
+
|
|
5
|
+
## Conventions
|
|
6
|
+
|
|
7
|
+
- Public issue / PR listings render fully without login — no auth wall.
|
|
8
|
+
- Use `https://github.com/<owner>/<repo>/issues?q=...` and parse the
|
|
9
|
+
list view; the API would be more polite, but this skill is fundamentally
|
|
10
|
+
browser-first.
|
|
11
|
+
|
|
12
|
+
## Tasks
|
|
13
|
+
|
|
14
|
+
- `list_issues` — owner/repo (+ optional state filter) → list of issues.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
---
|
|
2
|
+
site: github
|
|
3
|
+
host_patterns: ["github.com", "www.github.com"]
|
|
4
|
+
aliases: ["github", "pull request", "issue tracker", "代码仓库"]
|
|
5
|
+
last_updated: 2026-05-18
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# github.com site memory
|
|
9
|
+
|
|
10
|
+
## 顶层 URL 结构
|
|
11
|
+
|
|
12
|
+
- repo home: https://github.com/<owner>/<repo>
|
|
13
|
+
- issue list: https://github.com/<owner>/<repo>/issues?state=open
|
|
14
|
+
- PR list: https://github.com/<owner>/<repo>/pulls?state=open
|
|
15
|
+
|
|
16
|
+
## 稳定 selectors
|
|
17
|
+
|
|
18
|
+
- 列表项: `div[aria-label="Issues"] [data-testid="issue-pr-title-link"]` 或经典 `a.js-navigation-open`
|
|
19
|
+
- 标题文字: `[data-testid="issue-pr-title-link"]`
|
|
20
|
+
- 状态徽标: `[data-testid="issue-state"]`
|
|
21
|
+
- 数字编号: `a[id^=issue_]`
|
|
22
|
+
|
|
23
|
+
## Known traps
|
|
24
|
+
|
|
25
|
+
- React-rendered "new issues UI" 在某些仓库上替代了 classic — 选择器要双路。
|
|
26
|
+
- 私有仓库会重定向到 login → AuthWall。
|
|
27
|
+
- `https://github.com/issues` 在未登录态下也会跳 login。
|
|
28
|
+
|
|
29
|
+
## Notes
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""List public GitHub issues for owner/repo without login.
|
|
2
|
+
|
|
3
|
+
Phase C surface: drives the injected Playwright ``page`` (reused, navigated in
|
|
4
|
+
place); reads the issue list with ``page.evaluate``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Argument schema for this task: one entry per accepted key of ``args``.
ARGS = {
    "owner": {
        "type": "str",
        "required": True,
        "desc": "github owner / org name",
    },
    "repo": {
        "type": "str",
        "required": True,
        "desc": "repo name",
    },
    "state": {
        "type": "str",
        "required": False,
        "default": "open",
        "desc": "open / closed / all",
    },
    "limit": {
        "type": "int",
        "required": False,
        "default": 20,
    },
}

# Task metadata constants.
OUTPUT = "list[{number: int, title: str, url: str, state: str}]"
TAGS = [
    "github",
    "issues",
    "list",
]
REQUIRES_LOGIN = False
ESTIMATED_DURATION_SEC = 10
LAST_VERIFIED = "2026-05-25"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def selftest():
    """Navigate the module-level ``page`` to GitHub's home page.

    Returns True when the resulting ``page.url`` starts with
    ``https://github.com``.
    """
    page.goto("https://github.com/")
    landed_on = page.url
    return landed_on.startswith("https://github.com")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def run(args, ctx=None):
    """List issues of ``owner/repo`` by scraping the GitHub issue list page.

    Args:
        args: Mapping with ``owner`` and ``repo`` (required), ``state``
            (``open`` / ``closed`` / ``all``, default ``open``) and ``limit``
            (default 20; negative values are treated as 0).
        ctx: Optional object whose truthy ``page`` attribute is used instead
            of the module-level ``page``.

    Returns:
        The list produced by the in-page script (dicts with ``number``,
        ``title``, ``url``, ``state``), or ``[]`` when it returns nothing.
    """
    from urllib.parse import quote, quote_plus

    pg = ctx.page if ctx is not None and getattr(ctx, "page", None) else page

    owner, repo = args["owner"], args["repo"]
    state = str(args.get("state") or "open").strip().lower()
    # A negative limit would make the in-page slice() drop items from the tail.
    limit = max(int(args.get("limit", 20)), 0)

    # "all" means no state filter; sending "state:all" would pass an
    # unsupported qualifier value to the search box.
    query = "is:issue" if state == "all" else f"is:issue state:{state}"
    # Escape path segments and the query so unusual characters cannot change
    # the URL structure.
    url = (
        f"https://github.com/{quote(str(owner), safe='')}/"
        f"{quote(str(repo), safe='')}/issues?q={quote_plus(query)}"
    )
    pg.goto(url)
    results = pg.evaluate(
        """
        (limit) => {
            const cards = Array.from(document.querySelectorAll(
                '[data-testid="issue-pr-title-link"], a.js-navigation-open'
            )).filter(a => /\\/issues\\/\\d+$/.test(a.getAttribute('href') || ''));
            return cards.slice(0, limit).map(a => {
                const m = a.getAttribute('href').match(/\\/issues\\/(\\d+)$/);
                const stateBadge = a.closest('li, div')?.querySelector('[data-testid="issue-state"]');
                return {
                    number: m ? parseInt(m[1], 10) : null,
                    title: a.innerText.trim(),
                    url: new URL(a.getAttribute('href'), location.origin).toString(),
                    state: stateBadge ? stateBadge.innerText.trim().toLowerCase() : '',
                };
            });
        }
        """,
        limit,
    )
    return results or []
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# google.com
|
|
2
|
+
|
|
3
|
+
Search Google and extract the top organic results.
|
|
4
|
+
|
|
5
|
+
## Conventions
|
|
6
|
+
|
|
7
|
+
- Navigate the bound `page` in place with `page.goto(...)` — reuse the working
|
|
8
|
+
tab, don't open a new one per query.
|
|
9
|
+
- The result selector `div.g h3` is stable across the consumer SERP. The mobile
|
|
10
|
+
layout uses `div[data-hveid] h3` — `tasks/search.py` falls back if needed.
|
|
11
|
+
- Don't try to log in. Google's bot detection is aggressive; the skill stays
|
|
12
|
+
unauthenticated and reads public SERP only.
|
|
13
|
+
|
|
14
|
+
## Tasks
|
|
15
|
+
|
|
16
|
+
- `search` — query → `list[{title, url, snippet}]`
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
---
|
|
2
|
+
site: google
|
|
3
|
+
host_patterns: ["google.com", "www.google.com"]
|
|
4
|
+
aliases: ["谷歌", "google search", "search the web"]
|
|
5
|
+
last_updated: 2026-05-18
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# google.com site memory
|
|
9
|
+
|
|
10
|
+
## 顶层 URL 结构
|
|
11
|
+
|
|
12
|
+
- 搜索: https://www.google.com/search?q=<query>&hl=en
|
|
13
|
+
- 图片搜索: https://www.google.com/search?tbm=isch&q=<query>
|
|
14
|
+
|
|
15
|
+
## 稳定 selectors
|
|
16
|
+
|
|
17
|
+
- 结果块: `div.g`
|
|
18
|
+
- 结果标题: `div.g h3` / `div[data-hveid] h3`
|
|
19
|
+
- 结果链接: `div.g a[href^=http]`
|
|
20
|
+
- 结果摘要: `div.g div[data-content-feature="1"]`
|
|
21
|
+
|
|
22
|
+
## Known traps
|
|
23
|
+
|
|
24
|
+
- 偶尔 SERP 被 "before you continue" cookie consent 拦截:先 dismiss `button#L2AGLb` 或 `form[action*="consent"]`。
|
|
25
|
+
- 中文 query 在某些区域返回 "did you mean…" 重定向,结果数为 0;要识别并提示 user。
|
|
26
|
+
|
|
27
|
+
## Notes
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Search Google for `query` and return the top N organic results.
|
|
2
|
+
|
|
3
|
+
Phase C surface: drives the injected Playwright ``page`` (reused, navigated in
|
|
4
|
+
place) and reads results via ``page.evaluate``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Argument schema for this task: one entry per accepted key of ``args``.
ARGS = {
    "query": {
        "type": "str",
        "required": True,
        "desc": "search query",
    },
    "limit": {
        "type": "int",
        "required": False,
        "default": 10,
    },
    "hl": {
        "type": "str",
        "required": False,
        "default": "en",
        "desc": "interface language",
    },
}

# Task metadata constants.
OUTPUT = "list[{title: str, url: str, snippet: str}]"
TAGS = [
    "search",
    "general",
]
REQUIRES_LOGIN = False
ESTIMATED_DURATION_SEC = 8
LAST_VERIFIED = "2026-05-25"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def selftest():
    """Navigate the module-level ``page`` to Google's home page.

    Returns True when ``google.com`` appears in the resulting ``page.url``.
    """
    page.goto("https://www.google.com/?hl=en")
    landed_on = page.url
    return "google.com" in landed_on
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def run(args, ctx=None):
    """Search Google for ``args["query"]`` and return up to ``limit`` results.

    Args:
        args: Mapping with ``query`` (required), ``limit`` (default 10;
            negative values are treated as 0) and ``hl`` (default ``en``).
        ctx: Optional object whose truthy ``page`` attribute is used instead
            of the module-level ``page``.

    Returns:
        A list of ``{title, url, snippet}`` dicts, de-duplicated by ``url``
        in page order, or ``[]`` when the in-page script returns nothing.
    """
    from urllib.parse import quote_plus

    pg = ctx.page if ctx is not None and getattr(ctx, "page", None) else page

    q = args["query"]
    limit = max(int(args.get("limit", 10)), 0)
    hl = args.get("hl") or "en"
    pg.goto(
        f"https://www.google.com/search?q={quote_plus(q)}&hl={quote_plus(str(hl))}"
    )
    # The script returns every candidate; trimming happens below, after
    # de-duplication, so duplicates cannot consume the limit.
    candidates = pg.evaluate(
        """
        () => Array.from(document.querySelectorAll('div.g, div[data-hveid]'))
            .map(div => {
                const a = div.querySelector('a[href^="http"]');
                const h = div.querySelector('h3');
                const sn = div.querySelector('div[data-content-feature="1"], div[role="link"] + div');
                if (!a || !h) return null;
                return {
                    title: h.innerText.trim(),
                    url: a.href,
                    snippet: (sn && sn.innerText.trim()) || '',
                };
            })
            .filter(Boolean)
        """
    )

    # The two selectors can match nested containers of the same result, which
    # then resolve to the same first link; keep the first entry per URL.
    unique = []
    seen_urls = set()
    for item in candidates or []:
        link = item.get("url") if isinstance(item, dict) else None
        if link in seen_urls:
            continue
        seen_urls.add(link)
        unique.append(item)
        if len(unique) >= limit:
            break
    return unique[:limit]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
---
|
|
2
|
+
site: producthunt
|
|
3
|
+
host_patterns: ["producthunt.com", "www.producthunt.com"]
|
|
4
|
+
aliases: ["product hunt", "ph", "产品猎手", "今日新品"]
|
|
5
|
+
last_updated: 2026-05-18
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# producthunt.com site memory
|
|
9
|
+
|
|
10
|
+
## 顶层 URL 结构
|
|
11
|
+
|
|
12
|
+
- 今日榜: https://www.producthunt.com/
|
|
13
|
+
|
|
14
|
+
## Known traps
|
|
15
|
+
|
|
16
|
+
- Cookie consent 横幅可能挡住第一屏;如果选择器返回空,先 dismiss 一次再重试。
|
|
17
|
+
- ProductHunt 偶尔 A/B 切 React tree,hard-coded selector 会变;当 selector
|
|
18
|
+
miss 时改用 anchor `a[href^="/posts/"]` 兜底。
|
|
19
|
+
|
|
20
|
+
## 稳定 selectors
|
|
21
|
+
|
|
22
|
+
- 卡片: `[data-test^="post-item"]` 或 `a[href^="/posts/"]`
|
|
23
|
+
- 名称: 卡片内 `a[href^="/posts/"]` 的 innerText
|
|
24
|
+
- vote 数: 卡片内 `[data-test="vote-button"]` 的 innerText
|
|
25
|
+
|
|
26
|
+
## Notes
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Today's top products on Product Hunt.
|
|
2
|
+
|
|
3
|
+
Phase C surface: drives the injected Playwright ``page`` (reused, navigated in
|
|
4
|
+
place); scrapes the feed with ``page.evaluate``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Argument schema for this task: one entry per accepted key of ``args``.
ARGS = {
    "limit": {
        "type": "int",
        "required": False,
        "default": 20,
    },
}

# Task metadata constants.
OUTPUT = "list[{name: str, url: str, tagline: str, votes: int}]"
TAGS = [
    "producthunt",
    "feed",
    "launches",
]
REQUIRES_LOGIN = False
ESTIMATED_DURATION_SEC = 10
LAST_VERIFIED = "2026-05-25"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def selftest():
    """Navigate the module-level ``page`` to the Product Hunt home page.

    Returns True when ``producthunt`` appears in the resulting ``page.url``.
    """
    page.goto("https://www.producthunt.com/")
    landed_on = page.url
    return "producthunt" in landed_on
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def run(args, ctx=None):
    """Scrape the Product Hunt home page and return the first ``limit`` products.

    Uses ``ctx.page`` when ``ctx`` carries a truthy ``page``, otherwise the
    module-level ``page``. The in-page script collects one entry per distinct
    ``/posts/<slug>`` anchor; the result is truncated to ``limit`` (default
    20) here. Returns ``[]`` when the script yields nothing.
    """
    if ctx is not None and getattr(ctx, "page", None):
        pg = ctx.page
    else:
        pg = page

    limit = int(args.get("limit", 20))
    pg.goto("https://www.producthunt.com/")

    scrape_js = """
        () => {
            // Hand-rolled scrape: walk every /posts/<slug> anchor on the page
            // and grab a sibling vote count if we can find one.
            const seen = new Set();
            const out = [];
            const anchors = document.querySelectorAll('a[href^="/posts/"]');
            anchors.forEach(a => {
                const slug = a.getAttribute('href').replace(/^\\/posts\\//, '').split(/[?#]/)[0];
                if (!slug || seen.has(slug)) return;
                const card = a.closest('[data-test^="post-item"], li, article, div');
                if (!card) return;
                const name = a.innerText.trim();
                if (!name) return;
                let votes = 0;
                const numberEl = card.querySelector('[data-test="vote-button"], button');
                if (numberEl) {
                    const m = (numberEl.innerText || '').replace(/,/g, '').match(/\\d+/);
                    if (m) votes = parseInt(m[0], 10);
                }
                let tagline = '';
                const para = card.querySelector('p, [class*="tagline"], [class*="description"]');
                if (para && para.innerText) tagline = para.innerText.trim();
                out.push({
                    name,
                    url: 'https://www.producthunt.com/posts/' + slug,
                    tagline,
                    votes,
                });
                seen.add(slug);
            });
            return out;
        }
        """
    products = pg.evaluate(scrape_js)
    if not products:
        return []
    return products[:limit]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
---
|
|
2
|
+
site: wikipedia
|
|
3
|
+
host_patterns: ["wikipedia.org", "en.wikipedia.org"]
|
|
4
|
+
aliases: ["wikipedia", "wiki", "维基百科"]
|
|
5
|
+
last_updated: 2026-05-18
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# wikipedia.org site memory
|
|
9
|
+
|
|
10
|
+
## 顶层 URL 结构
|
|
11
|
+
|
|
12
|
+
- 英文: https://en.wikipedia.org/wiki/<Title_With_Underscores>
|
|
13
|
+
- 中文: https://zh.wikipedia.org/wiki/<Title>
|
|
14
|
+
- 检索: https://<lang>.wikipedia.org/w/index.php?search=<query>&fulltext=1
|
|
15
|
+
|
|
16
|
+
## 稳定 selectors
|
|
17
|
+
|
|
18
|
+
- 摘要段: `div.mw-parser-output > p:not(.mw-empty-elt)`
|
|
19
|
+
- 章节标题: `h2 > span.mw-headline`
|
|
20
|
+
- 第一段: `div.mw-parser-output > p:not(.mw-empty-elt):first-of-type`
|
|
21
|
+
|
|
22
|
+
## Notes
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Lookup a Wikipedia article and return its first-paragraph summary + section TOC.
|
|
2
|
+
|
|
3
|
+
Phase C surface: drives the injected Playwright ``page`` (bound to the session's
|
|
4
|
+
current tab — reused, navigated in place). No ``new_tab`` / ``js`` primitives.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Argument schema for this task: one entry per accepted key of ``args``.
ARGS = {
    "title": {
        "type": "str",
        "required": True,
        "desc": "article title (free text)",
    },
    "lang": {
        "type": "str",
        "required": False,
        "default": "en",
        "desc": "wikipedia language subdomain (en/zh/ja/...)",
    },
}

# Task metadata constants.
OUTPUT = "{title: str, url: str, summary: str, sections: list[str]}"
TAGS = [
    "wikipedia",
    "lookup",
    "reference",
]
REQUIRES_LOGIN = False
ESTIMATED_DURATION_SEC = 6
LAST_VERIFIED = "2026-05-25"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def selftest():
    """Navigate the module-level ``page`` to the English "Wikipedia" article.

    Returns True when ``Wikipedia`` appears in ``page.title()``.
    """
    page.goto("https://en.wikipedia.org/wiki/Wikipedia")
    heading = page.title()
    return "Wikipedia" in heading
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def run(args, ctx=None):
    """Open a Wikipedia article and return its title, summary and section list.

    Args:
        args: Mapping with ``title`` (required, free text) and ``lang``
            (language subdomain, default ``en``).
        ctx: Optional object whose truthy ``page`` attribute is used instead
            of the module-level ``page``.

    Returns:
        ``{title, url, summary, sections}`` from the in-page script, or a
        fallback dict with the requested title/URL and empty summary/sections
        when the script returns nothing.
    """
    from urllib.parse import quote

    # `page` / `context` / `snapshot` are injected by the task runner (Phase C),
    # mirroring the heredoc namespace. Prefer ctx if a caller passed one.
    pg = ctx.page if ctx is not None and getattr(ctx, "page", None) else page

    title = args["title"]
    # An explicit None would otherwise produce the host "None.wikipedia.org".
    lang = args.get("lang") or "en"
    underscore_title = title.strip().replace(" ", "_")
    url = f"https://{lang}.wikipedia.org/wiki/{quote(underscore_title)}"
    pg.goto(url)
    info = pg.evaluate(
        """
        () => {
            const summary_p = document.querySelector(
                'div.mw-parser-output > p:not(.mw-empty-elt)'
            );
            // Legacy markup wraps heading text in span.mw-headline; the newer
            // MediaWiki markup puts it directly in div.mw-heading2 > h2.
            const sections = Array.from(
                document.querySelectorAll('h2 .mw-headline, .mw-heading2 > h2')
            ).map(el => el.innerText.trim());
            // Prefer the page heading: the document.title suffix is only
            // " - Wikipedia" on some language editions.
            const heading = document.querySelector('#firstHeading');
            const headingText = heading ? heading.innerText.trim() : '';
            return {
                title: headingText || document.title.replace(' - Wikipedia', '').trim(),
                url: location.href,
                summary: summary_p ? summary_p.innerText.trim() : '',
                sections,
            };
        }
        """
    )
    return info or {"title": title, "url": url, "summary": "", "sections": []}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
---
|
|
2
|
+
site: ycombinator.com
|
|
3
|
+
host_patterns: ["news.ycombinator.com", "ycombinator.com"]
|
|
4
|
+
aliases: ["hacker news", "hn", "黑客新闻"]
|
|
5
|
+
last_updated: 2026-05-18
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Hacker News site memory
|
|
9
|
+
|
|
10
|
+
## 顶层 URL 结构
|
|
11
|
+
|
|
12
|
+
- 首页 / top: https://news.ycombinator.com/
|
|
13
|
+
- newest: https://news.ycombinator.com/newest
|
|
14
|
+
- best: https://news.ycombinator.com/best
|
|
15
|
+
|
|
16
|
+
## 稳定 selectors
|
|
17
|
+
|
|
18
|
+
- 表格行: `tr.athing` — id 属性是 story id
|
|
19
|
+
- 标题链接: `tr.athing span.titleline > a`
|
|
20
|
+
- 分数: 紧邻 athing 之后的 `tr` 下的 `span.score`
|
|
21
|
+
- 评论数: 同上 `tr` 下面的最后一个 `a`
|
|
22
|
+
|
|
23
|
+
## Notes
|
|
24
|
+
|
|
25
|
+
- HN 上很少改 DOM。这里的 selector 在过去 10 年基本没变。
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Pull top stories from Hacker News (top/new/best).
|
|
2
|
+
|
|
3
|
+
Phase C surface: drives the injected Playwright ``page`` (reused, navigated in
|
|
4
|
+
place); reads the story list with ``page.evaluate``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Argument schema for this task: one entry per accepted key of ``args``.
ARGS = {
    "kind": {
        "type": "str",
        "required": False,
        "default": "top",
        "desc": "top / new / best",
    },
    "limit": {
        "type": "int",
        "required": False,
        "default": 30,
    },
}

# Task metadata constants.
OUTPUT = "list[{rank: int, id: int, title: str, url: str, score: int, comments: int}]"
TAGS = [
    "hn",
    "news",
    "feed",
]
REQUIRES_LOGIN = False
ESTIMATED_DURATION_SEC = 5
LAST_VERIFIED = "2026-05-25"

# Listing URL for each supported ``kind`` value.
_KINDS = {
    "top": "https://news.ycombinator.com/",
    "new": "https://news.ycombinator.com/newest",
    "best": "https://news.ycombinator.com/best",
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def selftest():
    """Navigate the module-level ``page`` to the Hacker News front page.

    Returns True when ``ycombinator.com`` appears in the resulting
    ``page.url``.
    """
    page.goto("https://news.ycombinator.com/")
    landed_on = page.url
    return "ycombinator.com" in landed_on
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def run(args, ctx=None):
    """Return up to ``limit`` stories from a Hacker News listing page.

    Args:
        args: Mapping with ``kind`` (``top`` / ``new`` / ``best``, default
            ``top``; unknown values fall back to the ``top`` URL) and
            ``limit`` (default 30; negative values are treated as 0).
        ctx: Optional object whose truthy ``page`` attribute is used instead
            of the module-level ``page``.

    Returns:
        A list of ``{rank, id, title, url, score, comments}`` dicts, or
        ``[]`` when the in-page script returns nothing.
    """
    pg = ctx.page if ctx is not None and getattr(ctx, "page", None) else page

    kind = args.get("kind", "top")
    # A negative limit would otherwise slice items off the tail.
    limit = max(int(args.get("limit", 30)), 0)
    url = _KINDS.get(kind, _KINDS["top"])
    pg.goto(url)
    rows = pg.evaluate(
        """
        () => {
            const out = [];
            const stories = document.querySelectorAll('tr.athing');
            stories.forEach((row, i) => {
                const a = row.querySelector('span.titleline > a');
                if (!a) return;
                const sub = row.nextElementSibling;
                const score = sub ? sub.querySelector('span.score') : null;
                const links = sub ? Array.from(sub.querySelectorAll('a')) : [];
                // Only a link whose text reads "<n> comment(s)" is a comment
                // count; taking the last link blindly could parse another
                // link's number on rows that have no comment link.
                const commentLink = links.reverse().find(
                    l => /\\d+\\s*comment/.test(l.innerText || '')
                );
                out.push({
                    rank: i + 1,
                    id: parseInt(row.id, 10),
                    title: a.innerText.trim(),
                    url: a.href,
                    score: score ? parseInt(score.innerText, 10) : 0,
                    comments: commentLink ? parseInt(commentLink.innerText, 10) || 0 : 0,
                });
            });
            return out;
        }
        """
    )
    return (rows or [])[:limit]
|