scrapekit-local 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapekit_local-0.1.0/.gitignore +32 -0
- scrapekit_local-0.1.0/PKG-INFO +9 -0
- scrapekit_local-0.1.0/pyproject.toml +24 -0
- scrapekit_local-0.1.0/src/scrapekit/__init__.py +3 -0
- scrapekit_local-0.1.0/src/scrapekit/cache/__init__.py +3 -0
- scrapekit_local-0.1.0/src/scrapekit/cache/profile.py +172 -0
- scrapekit_local-0.1.0/src/scrapekit/cli.py +83 -0
- scrapekit_local-0.1.0/src/scrapekit/dispatcher.py +158 -0
- scrapekit_local-0.1.0/src/scrapekit/extractor/__init__.py +4 -0
- scrapekit_local-0.1.0/src/scrapekit/extractor/cdp.py +201 -0
- scrapekit_local-0.1.0/src/scrapekit/extractor/http.py +243 -0
- scrapekit_local-0.1.0/src/scrapekit/extractor/state.py +66 -0
- scrapekit_local-0.1.0/src/scrapekit/models.py +96 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
dist/
|
|
7
|
+
build/
|
|
8
|
+
*.egg
|
|
9
|
+
|
|
10
|
+
# 虚拟环境
|
|
11
|
+
.venv/
|
|
12
|
+
venv/
|
|
13
|
+
env/
|
|
14
|
+
|
|
15
|
+
# 开发工具
|
|
16
|
+
.pytest_cache/
|
|
17
|
+
.mypy_cache/
|
|
18
|
+
.ruff_cache/
|
|
19
|
+
.coverage
|
|
20
|
+
htmlcov/
|
|
21
|
+
|
|
22
|
+
# IDE
|
|
23
|
+
.vscode/
|
|
24
|
+
.idea/
|
|
25
|
+
*.swp
|
|
26
|
+
|
|
27
|
+
# ScrapeKit 本地数据
|
|
28
|
+
.scrapekit/
|
|
29
|
+
|
|
30
|
+
# 系统
|
|
31
|
+
.DS_Store
|
|
32
|
+
Thumbs.db
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scrapekit-local
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local-first CLI web scraper with automatic strategy selection
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: beautifulsoup4>=4.14
|
|
7
|
+
Requires-Dist: httpx>=0.28
|
|
8
|
+
Requires-Dist: playwright>=1.44
|
|
9
|
+
Requires-Dist: typer>=0.12
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "scrapekit-local"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Local-first CLI web scraper with automatic strategy selection"
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"httpx>=0.28",
|
|
12
|
+
"beautifulsoup4>=4.14",
|
|
13
|
+
"typer>=0.12",
|
|
14
|
+
"playwright>=1.44",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.scripts]
|
|
18
|
+
scrape = "scrapekit.cli:app"
|
|
19
|
+
|
|
20
|
+
[tool.hatch.build.targets.wheel]
|
|
21
|
+
packages = ["src/scrapekit"]
|
|
22
|
+
|
|
23
|
+
[tool.hatch.build.targets.sdist]
|
|
24
|
+
include = ["src/"]
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""
|
|
2
|
+
scrapekit/cache/profile.py
|
|
3
|
+
===========================
|
|
4
|
+
Profile Cache — 按 domain 缓存最优抓取策略。
|
|
5
|
+
|
|
6
|
+
作用:
|
|
7
|
+
第一次访问某站点时,ScrapeKit 会探索最优策略(Level 1 / 2 / 3)。
|
|
8
|
+
成功后,将"配方"写入本地 JSON 文件。
|
|
9
|
+
后续请求直接跳到已知的最优层级,跳过探索开销。
|
|
10
|
+
|
|
11
|
+
存储位置:~/.scrapekit/profiles/<domain>.json
|
|
12
|
+
|
|
13
|
+
配方格式:
|
|
14
|
+
{
|
|
15
|
+
"domain": "sportsbet.com.au",
|
|
16
|
+
"preferred_level": 3,
|
|
17
|
+
"state_var": "window.__PRELOADED_STATE__", // Level 3 专用
|
|
18
|
+
"discovered_at": "2026-04-16T00:00:00Z",
|
|
19
|
+
"last_success": "2026-04-16T00:08:00Z",
|
|
20
|
+
"success_count": 12,
|
|
21
|
+
"failure_count": 0
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
v0.1 范围:
|
|
25
|
+
- domain 级别匹配(保守,安全)
|
|
26
|
+
- 30 天过期
|
|
27
|
+
- 提取失败时立即失效,触发重新探索
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import json
|
|
33
|
+
import os
|
|
34
|
+
from datetime import datetime, timezone, timedelta
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
from typing import Optional
|
|
37
|
+
from urllib.parse import urlparse
|
|
38
|
+
|
|
39
|
+
# 缓存目录
|
|
40
|
+
CACHE_DIR = Path.home() / ".scrapekit" / "profiles"
|
|
41
|
+
|
|
42
|
+
# 配方有效期(天)
|
|
43
|
+
STALE_DAYS = 30
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
# 工具函数
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
|
|
50
|
+
def _domain(url: str) -> str:
    """Extract the bare domain from *url*: no port, no leading ``www.``, lowercased."""
    parsed = urlparse(url)
    # A bare host like "example.org" parses with an empty netloc.
    host = (parsed.netloc or parsed.path).partition(":")[0]
    return host.removeprefix("www.").lower()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _profile_path(domain: str) -> Path:
    """Return the on-disk JSON path holding *domain*'s cached recipe."""
    return CACHE_DIR.joinpath(domain + ".json")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _now_iso() -> str:
    """Current UTC time as an ISO-8601 string (carries a ``+00:00`` offset)."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
# 公开 API
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
|
|
74
|
+
def load(url: str) -> Optional[dict]:
    """
    Read the cached recipe for *url*'s domain.

    A profile is treated as a miss (``None``) when:
    - no profile file exists for the domain,
    - the file is unreadable or not valid JSON,
    - ``discovered_at`` is empty/missing (the marker written by
      :func:`invalidate`), or
    - the recipe is older than ``STALE_DAYS`` (the file is then deleted).

    Returns:
        The recipe dict, or ``None`` on miss / expiry / invalidation.
    """
    domain = _domain(url)
    path = _profile_path(domain)

    if not path.exists():
        return None

    try:
        with open(path, encoding="utf-8") as f:
            profile = json.load(f)
    except (json.JSONDecodeError, OSError):
        return None

    # An empty discovered_at is the explicit invalidation marker set by
    # invalidate(). Previously `if discovered:` skipped the staleness check
    # for empty strings and RETURNED the stale profile, making invalidation
    # a no-op — treat it as a miss instead.
    discovered = profile.get("discovered_at", "")
    if not discovered:
        return None

    # Expire recipes older than STALE_DAYS.
    try:
        dt = datetime.fromisoformat(discovered)
        if datetime.now(timezone.utc) - dt > timedelta(days=STALE_DAYS):
            path.unlink(missing_ok=True)
            return None
    except (ValueError, TypeError):
        # ValueError: unparseable timestamp; TypeError: naive datetime in the
        # aware subtraction. Keep the profile rather than guess.
        pass

    return profile
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def save(url: str, result_level: int, state_var: Optional[str] = None) -> None:
    """
    Create or update the cached recipe for *url*'s domain.

    Args:
        url: The URL that was scraped successfully.
        result_level: The extraction level that actually worked (1 / 2 / 3).
        state_var: JS state variable name used for Level 3, if any.
    """
    domain = _domain(url)
    path = _profile_path(domain)
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    # Load any existing record so the counters carry over.
    existing: dict = {}
    if path.exists():
        try:
            with open(path, encoding="utf-8") as f:
                existing = json.load(f)
        except (json.JSONDecodeError, OSError):
            existing = {}

    now = _now_iso()
    profile = {
        "domain": domain,
        "preferred_level": result_level,
        "state_var": state_var,
        # Keep the first-seen timestamp across updates, but use `or now`
        # (not a .get default) so the empty invalidation marker written by
        # invalidate() is reset after a success — otherwise a once-invalidated
        # profile would stay marked invalid forever.
        "discovered_at": existing.get("discovered_at") or now,
        "last_success": now,
        "success_count": existing.get("success_count", 0) + 1,
        "failure_count": existing.get("failure_count", 0),
    }

    with open(path, "w", encoding="utf-8") as f:
        json.dump(profile, f, ensure_ascii=False, indent=2)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def invalidate(url: str) -> None:
    """
    Called after an extraction failure: bump the failure counter and blank
    ``discovered_at`` so the recipe is no longer trusted.

    The file itself is kept (failure history is preserved); only a corrupt
    or unreadable file is deleted outright.

    NOTE(review): this relies on load() treating an empty ``discovered_at``
    as a cache miss — confirm load() actually does so.
    """
    domain = _domain(url)
    path = _profile_path(domain)
    if path.exists():
        # Record the failure but keep the file for its history; just mark
        # the recipe as needing re-discovery.
        try:
            with open(path, encoding="utf-8") as f:
                profile = json.load(f)
            profile["failure_count"] = profile.get("failure_count", 0) + 1
            profile["discovered_at"] = ""  # blank marker: recipe must be re-discovered
            with open(path, "w", encoding="utf-8") as f:
                json.dump(profile, f, ensure_ascii=False, indent=2)
        except (json.JSONDecodeError, OSError):
            # Unreadable/corrupt profile: deleting it is the safe fallback.
            path.unlink(missing_ok=True)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def list_profiles() -> list[dict]:
    """Return every cached recipe (backs the ``scrape profile list`` command, Phase 2)."""
    if not CACHE_DIR.exists():
        return []
    collected: list[dict] = []
    for profile_file in sorted(CACHE_DIR.glob("*.json")):
        try:
            raw = profile_file.read_text(encoding="utf-8")
        except OSError:
            continue
        try:
            collected.append(json.loads(raw))
        except json.JSONDecodeError:
            continue
    return collected
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
scrapekit/cli.py
|
|
3
|
+
=================
|
|
4
|
+
CLI 入口(typer)
|
|
5
|
+
|
|
6
|
+
用法:
|
|
7
|
+
scrape <url>
|
|
8
|
+
scrape <url> --output md
|
|
9
|
+
scrape <url> --force-level 2
|
|
10
|
+
scrape <url> --verbose
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import sys
|
|
17
|
+
from enum import Enum
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
import typer
|
|
21
|
+
|
|
22
|
+
from scrapekit.dispatcher import dispatch
|
|
23
|
+
from scrapekit.models import ExtractionError
|
|
24
|
+
|
|
25
|
+
app = typer.Typer(
|
|
26
|
+
name="scrape",
|
|
27
|
+
help="Local-first web scraper with automatic strategy selection.",
|
|
28
|
+
add_completion=False,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class OutputFormat(str, Enum):
    """Output formats accepted by the ``--output`` option."""

    json = "json"
    md = "md"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@app.command()
def scrape(
    url: str = typer.Argument(..., help="Target URL to scrape"),
    output: OutputFormat = typer.Option(
        OutputFormat.json,
        "--output", "-o",
        help="Output format: json (default) or md (markdown)",
    ),
    force_level: Optional[int] = typer.Option(
        None,
        "--force-level", "-l",
        help="Force a specific extraction level (1=HTTP, 2=CDP)",
        min=1,
        max=2,
    ),
    cdp_endpoint: str = typer.Option(
        "http://localhost:9222",
        "--cdp",
        help="Chrome DevTools endpoint",
    ),
    verbose: bool = typer.Option(
        False,
        "--verbose", "-v",
        help="Print escalation steps to stderr",
    ),
) -> None:
    """Scrape a URL and output structured data."""

    try:
        # dispatch() is async; drive it to completion on a fresh event loop.
        result = asyncio.run(
            dispatch(
                url,
                force_level=force_level,
                cdp_endpoint=cdp_endpoint,
                verbose=verbose,
            )
        )
    except ExtractionError as e:
        # Error text goes to stderr so stdout stays clean for piping.
        typer.echo(f"Error: {e.reason}", err=True)
        raise typer.Exit(code=1)
    except KeyboardInterrupt:
        # 130 = conventional exit status for SIGINT (128 + 2).
        raise typer.Exit(code=130)

    if output == OutputFormat.json:
        typer.echo(result.to_json())
    else:
        typer.echo(result.to_markdown())
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""
|
|
2
|
+
scrapekit/dispatcher.py
|
|
3
|
+
========================
|
|
4
|
+
核心调度器:按策略阶梯自动升级。
|
|
5
|
+
|
|
6
|
+
Level 1 → Level 2 → Level 3(v0.1 只支持到 Level 2)
|
|
7
|
+
|
|
8
|
+
升级条件:
|
|
9
|
+
- Level 1 抛出 NeedsBrowserError → 升级到 Level 2
|
|
10
|
+
- Level 1 抛出 ExtractionError → 升级到 Level 2(网络失败也值得用浏览器重试)
|
|
11
|
+
- --force-level N → 跳过前面的层级
|
|
12
|
+
|
|
13
|
+
未来扩展:
|
|
14
|
+
- Level 2 → Level 3:检测到 __PRELOADED_STATE__ 等 JS state 变量后升级
|
|
15
|
+
- Profile Cache:命中缓存时直接跳到已知层级
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import asyncio
|
|
21
|
+
from typing import Optional
|
|
22
|
+
|
|
23
|
+
from scrapekit.models import ExtractResult, NeedsBrowserError, ExtractionError
|
|
24
|
+
from scrapekit.extractor import http_extract, cdp_extract, is_chrome_available
|
|
25
|
+
from scrapekit.cache import profile as profile_cache
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
# 主入口
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
|
|
32
|
+
async def dispatch(
    url: str,
    *,
    force_level: Optional[int] = None,
    cdp_endpoint: str = "http://localhost:9222",
    http_timeout: float = 15.0,
    page_timeout: float = 30.0,
    verbose: bool = False,
    use_cache: bool = True,
) -> ExtractResult:
    """
    Select the best extraction strategy for *url* and run it.

    Args:
        url: Target URL.
        force_level: Force a specific level (1/2/3) and skip auto-selection.
        cdp_endpoint: Chrome DevTools debugging endpoint.
        http_timeout: Level 1 HTTP timeout (seconds).
        page_timeout: Level 2/3 page-load timeout (seconds).
        verbose: Print escalation steps.
        use_cache: Whether to consult / update the Profile Cache.

    Returns:
        ExtractResult

    Raises:
        ExtractionError: All levels failed.
    """
    def log(msg: str) -> None:
        # Verbose-only progress logging.
        if verbose:
            print(f"[scrapekit] {msg}", flush=True)

    # -- Forced level -------------------------------------------------------
    if force_level == 1:
        log("forced Level 1 (HTTP)")
        return await http_extract(url, timeout=http_timeout)

    if force_level in (2, 3):
        log(f"forced Level 2/3 (CDP)")
        return await _try_level2(url, cdp_endpoint, page_timeout, log)

    # -- Profile Cache: on a hit, jump straight to the known level ----------
    if use_cache:
        cached = profile_cache.load(url)
        if cached:
            preferred = cached.get("preferred_level", 1)
            log(f"cache hit → Level {preferred} for {cached.get('domain')}")
            if preferred == 1:
                try:
                    result = await http_extract(url, timeout=http_timeout)
                    profile_cache.save(url, result.level)
                    return result
                except Exception:
                    # Cached recipe no longer works: drop it and fall through
                    # to the full auto-escalation below.
                    log("cache: Level 1 failed, invalidating and re-exploring")
                    profile_cache.invalidate(url)
            else:
                try:
                    result = await _try_level2(url, cdp_endpoint, page_timeout, log)
                    profile_cache.save(url, result.level,
                                       state_var=cached.get("state_var"))
                    return result
                except Exception:
                    log("cache: Level 2 failed, invalidating and re-exploring")
                    profile_cache.invalidate(url)

    # -- Auto-escalating exploration ----------------------------------------

    # Level 1: plain HTTP fetch.
    log(f"Level 1 → {url}")
    try:
        result = await http_extract(url, timeout=http_timeout)
        log(f"Level 1 succeeded ({result.elapsed_ms:.0f}ms)")
        if use_cache:
            profile_cache.save(url, result.level)
        return result

    except NeedsBrowserError as e:
        log(f"Level 1 → needs browser ({e.reason}), escalating to Level 2/3")

    except ExtractionError as e:
        # Even a network failure is worth retrying through a real browser.
        log(f"Level 1 → error ({e.reason}), escalating to Level 2/3")

    # Level 2/3: CDP against an existing Chrome (the CDP extractor attempts
    # Level 3 JS-state extraction internally).
    result = await _try_level2(url, cdp_endpoint, page_timeout, log)
    if use_cache:
        state_var = None
        if result.level == 3:
            from scrapekit.extractor.state import KNOWN_STATE_VARS
            state_var = KNOWN_STATE_VARS[0]  # v0.1: single fixed state var
        profile_cache.save(url, result.level, state_var=state_var)
    return result
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
async def _try_level2(
    url: str,
    cdp_endpoint: str,
    page_timeout: float,
    log,
) -> ExtractResult:
    """Run Level 2 extraction, surfacing an actionable error when Chrome is down."""
    from scrapekit.extractor.cdp import ChromeNotRunningError

    log(f"Level 2 → {url}")

    # Probe Chrome up front so the user sees an actionable message
    # instead of a low-level connection failure.
    chrome_up = await is_chrome_available(cdp_endpoint)
    if not chrome_up:
        raise ExtractionError(
            url,
            f"Level 1 failed and Chrome is not available at {cdp_endpoint}. "
            "Start Chrome with: chrome --remote-debugging-port=9222",
        )

    try:
        result = await cdp_extract(
            url,
            cdp_endpoint=cdp_endpoint,
            page_timeout=page_timeout,
        )
    except ChromeNotRunningError:
        raise  # already carries the "start Chrome" instructions
    except ExtractionError:
        raise  # already descriptive; pass through unchanged
    except Exception as e:
        raise ExtractionError(url, f"Level 2 unexpected error: {e}") from e

    log(f"Level 2 succeeded ({result.elapsed_ms:.0f}ms)")
    return result
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""
|
|
2
|
+
scrapekit/extractor/cdp.py
|
|
3
|
+
===========================
|
|
4
|
+
Level 2 — CDP + 已有 Chrome Session(Playwright connect_over_cdp)
|
|
5
|
+
|
|
6
|
+
核心优势:
|
|
7
|
+
复用用户已经登录的 Chrome,无需重新认证、无需存储凭据。
|
|
8
|
+
Chrome 需提前以 --remote-debugging-port=9222 启动(见 README)。
|
|
9
|
+
|
|
10
|
+
流程:
|
|
11
|
+
1. 连接到 localhost:9222
|
|
12
|
+
2. 取已有 context(继承登录态 / cookies)
|
|
13
|
+
3. 新开一个标签页,导航到目标 URL
|
|
14
|
+
4. 等待页面加载(domcontentloaded + networkidle)
|
|
15
|
+
5. 提取 HTML,复用 Level 1 的 BeautifulSoup 逻辑
|
|
16
|
+
6. 关闭标签页,不污染 Chrome 会话
|
|
17
|
+
|
|
18
|
+
错误处理:
|
|
19
|
+
- Chrome 未启动 → ChromeNotRunningError(清晰提示)
|
|
20
|
+
- 页面加载超时 → ExtractionError
|
|
21
|
+
- 其他 → ExtractionError
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import time
|
|
27
|
+
from typing import Optional
|
|
28
|
+
|
|
29
|
+
from bs4 import BeautifulSoup
|
|
30
|
+
|
|
31
|
+
from scrapekit.models import ExtractResult, ExtractionError
|
|
32
|
+
from scrapekit.extractor.http import (
|
|
33
|
+
_extract_title,
|
|
34
|
+
_extract_text,
|
|
35
|
+
_extract_links,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
# CDP 端口(可通过环境变量覆盖)
|
|
39
|
+
CDP_ENDPOINT = "http://localhost:9222"
|
|
40
|
+
|
|
41
|
+
# 等待策略
|
|
42
|
+
WAIT_UNTIL = "domcontentloaded" # 第一阶段:DOM 就绪
|
|
43
|
+
NETWORK_IDLE_TIMEOUT = 5_000 # ms,等 networkidle 的最长时间(不强制)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
# 专属异常
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
|
|
50
|
+
class ChromeNotRunningError(ExtractionError):
    """
    Raised when Chrome is not running with ``--remote-debugging-port=9222``.

    The message tells the user exactly how to start Chrome.
    """
    def __init__(self, endpoint: str = CDP_ENDPOINT):
        # url is unknown at connection time, hence the empty string.
        super().__init__(
            url="",
            reason=(
                f"Cannot connect to Chrome at {endpoint}. "
                "Start Chrome with: "
                "chrome --remote-debugging-port=9222 --user-data-dir=/tmp/chrome-debug"
            ),
        )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
# 公开 API
|
|
68
|
+
# ---------------------------------------------------------------------------
|
|
69
|
+
|
|
70
|
+
async def extract(
    url: str,
    *,
    cdp_endpoint: str = CDP_ENDPOINT,
    page_timeout: float = 30.0,
    wait_networkidle: bool = True,
) -> ExtractResult:
    """
    Level 2 CDP extraction (returns level=3 when a JS state variable is found).

    Args:
        url: Target URL.
        cdp_endpoint: Chrome DevTools endpoint, default http://localhost:9222.
        page_timeout: Page navigation timeout in seconds.
        wait_networkidle: Whether to wait for networkidle (lets an SPA finish
            rendering before the DOM is read).

    Returns:
        ExtractResult with level=2 (DOM path) or level=3 (JS state path).

    Raises:
        ChromeNotRunningError: Chrome is not running / remote debugging is off.
        ExtractionError: Page load or extraction failed.
    """
    try:
        from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
    except ImportError as e:
        raise ExtractionError(url, "playwright not installed: pip install playwright") from e

    t0 = time.perf_counter()

    async with async_playwright() as pw:
        # -- 1. Connect to the already-running Chrome ------------------------
        try:
            browser = await pw.chromium.connect_over_cdp(
                cdp_endpoint,
                timeout=5_000,  # 5s connect timeout: fail fast
            )
        except Exception as e:
            # Heuristic: connection-refused style messages mean Chrome is
            # simply not listening; anything else is an unexpected CDP error.
            err_str = str(e).lower()
            if any(k in err_str for k in ("connection refused", "connect", "econnrefused", "failed to connect")):
                raise ChromeNotRunningError(cdp_endpoint) from e
            raise ExtractionError(url, f"cdp connect error: {e}") from e

        # -- 2. Reuse an existing context (inherits login state / cookies) ---
        if browser.contexts:
            context = browser.contexts[0]
        else:
            # Rare: Chrome connected but exposes no context (windowless mode).
            context = await browser.new_context()

        # -- 3. Open a fresh tab ---------------------------------------------
        page = await context.new_page()

        try:
            # -- 4. Navigate --------------------------------------------------
            try:
                await page.goto(
                    url,
                    wait_until=WAIT_UNTIL,
                    timeout=page_timeout * 1000,  # Playwright expects ms
                )
            except PlaywrightTimeout as e:
                raise ExtractionError(url, f"page load timeout: {e}") from e
            except Exception as e:
                raise ExtractionError(url, f"navigation error: {e}") from e

            # -- 5. Optionally wait for networkidle ---------------------------
            # Playwright docs discourage networkidle in CI tests (flaky), but
            # for scraping it is the right call: let JS finish before reading
            # the DOM.
            if wait_networkidle:
                try:
                    await page.wait_for_load_state(
                        "networkidle",
                        timeout=NETWORK_IDLE_TIMEOUT,
                    )
                except PlaywrightTimeout:
                    # A networkidle timeout is not fatal; extract what we have.
                    pass

            # -- 6. Try Level 3 JS-state extraction (before the tab closes) ---
            from scrapekit.extractor.state import try_extract_state, state_to_text
            state_var, state_data = await try_extract_state(page)

            # -- 7. Capture the HTML (Level 2 DOM path) -----------------------
            html = await page.content()
            final_url = page.url

        finally:
            # Always close the tab so the Chrome session is left untouched.
            await page.close()

    elapsed = (time.perf_counter() - t0) * 1000

    # -- 8. Level 3: JS state found -> return structured data directly -------
    if state_data is not None:
        soup = BeautifulSoup(html, "html.parser")
        return ExtractResult(
            url=final_url,
            level=3,
            status_code=200,
            title=_extract_title(soup),
            text=state_to_text(state_var, state_data),
            links=_extract_links(soup, final_url),
            elapsed_ms=round(elapsed, 1),
        )

    # -- 9. Level 2: fall back to DOM extraction ------------------------------
    soup = BeautifulSoup(html, "html.parser")
    return ExtractResult(
        url=final_url,
        level=2,
        status_code=200,
        title=_extract_title(soup),
        # Fresh parse for the text pass: _extract_text mutates its tree.
        text=_extract_text(BeautifulSoup(html, "html.parser")),
        links=_extract_links(soup, final_url),
        elapsed_ms=round(elapsed, 1),
    )
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# ---------------------------------------------------------------------------
|
|
190
|
+
# 工具:检查 Chrome 是否可连接
|
|
191
|
+
# ---------------------------------------------------------------------------
|
|
192
|
+
|
|
193
|
+
async def is_chrome_available(cdp_endpoint: str = CDP_ENDPOINT) -> bool:
    """Quick probe: is Chrome listening on the given DevTools endpoint?"""
    import httpx

    version_url = f"{cdp_endpoint}/json/version"
    try:
        async with httpx.AsyncClient(timeout=2.0) as client:
            response = await client.get(version_url)
            return response.status_code == 200
    except Exception:
        # Any failure (refused connection, timeout, DNS...) means "not available".
        return False
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""
|
|
2
|
+
scrapekit/extractor/http.py
|
|
3
|
+
============================
|
|
4
|
+
Level 1 — HTTP 直取(httpx + beautifulsoup4)
|
|
5
|
+
|
|
6
|
+
核心职责:
|
|
7
|
+
1. 发起 HTTP 请求
|
|
8
|
+
2. 通过 needs_browser() 判断是否需要升级
|
|
9
|
+
3. 提取 title、正文、链接
|
|
10
|
+
4. 返回 ExtractResult,或 raise NeedsBrowserError
|
|
11
|
+
|
|
12
|
+
needs_browser() 版本:V3(经过 50+ URL 验证,准确率 96%)
|
|
13
|
+
关键改进:
|
|
14
|
+
- 检查前剥离 <noscript>,避免 PyPI 类误报
|
|
15
|
+
- 小页面误判修复:tlen < 200 同时要求 ratio < 0.15
|
|
16
|
+
- 尺寸感知阈值:大页面(>50KB)用 < 0.018,小页面用 < 0.05
|
|
17
|
+
- 4xx 直接升级(Level 1 已失败)
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import time
|
|
23
|
+
from urllib.parse import urljoin, urlparse
|
|
24
|
+
from typing import Optional
|
|
25
|
+
|
|
26
|
+
import httpx
|
|
27
|
+
from bs4 import BeautifulSoup
|
|
28
|
+
|
|
29
|
+
from scrapekit.models import ExtractResult, NeedsBrowserError, ExtractionError
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# 请求头(模拟真实 Chrome,减少 bot 拦截)
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
DEFAULT_HEADERS = {
|
|
37
|
+
"User-Agent": (
|
|
38
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
39
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
40
|
+
"Chrome/124.0.0.0 Safari/537.36"
|
|
41
|
+
),
|
|
42
|
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
43
|
+
"Accept-Language": "en-US,en;q=0.5",
|
|
44
|
+
"Accept-Encoding": "gzip, deflate, br",
|
|
45
|
+
"DNT": "1",
|
|
46
|
+
"Connection": "keep-alive",
|
|
47
|
+
"Upgrade-Insecure-Requests": "1",
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
# needs_browser() — V3
|
|
53
|
+
# ---------------------------------------------------------------------------
|
|
54
|
+
|
|
55
|
+
def needs_browser(response: httpx.Response) -> tuple[bool, str]:
    """
    Decide whether an HTTP response needs browser rendering.

    Heuristic V3 (validated on 50+ URLs, ~96% accuracy — see module docstring).

    Returns:
        (needs_browser, reason) — reason is "" when no browser is needed.
    """
    html = response.text

    # 1. Empty body -> API endpoint, not an SPA shell.
    if len(html) == 0:
        return False, ""

    # 2. 4xx -> Level 1 already failed; escalate (Chrome often gets past
    #    bot checks / login walls).
    if response.status_code in (400, 401, 403, 407):
        return True, f"http_{response.status_code}"

    # 3. Very short 2xx -> SPA shell.
    if len(html) < 500:
        return True, "short_response"

    # 4. Parse the HTML and strip <noscript> so fallback notices inside it
    #    don't trigger false positives.
    #    Counter-example: PyPI puts "Enable javascript to filter wheels"
    #    inside a <noscript>.
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.find_all("noscript"):
        tag.decompose()
    cleaned_html = str(soup).lower()

    # 5. Explicit "JS required" markers and empty SPA mount points.
    js_markers = [
        "enable javascript",
        "you need javascript",
        "javascript is required",
        "javascript is disabled",
        '<div id="app"></div>',
        "<div id='app'></div>",
        '<div id="root"></div>',
        "<div id='root'></div>",
    ]
    for marker in js_markers:
        if marker in cleaned_html:
            return True, f"js_marker:{marker[:30]}"

    # 6. Text-content analysis.
    text_content = soup.get_text(strip=True)
    text_len = len(text_content)
    html_len = len(html)  # denominator uses the raw HTML for consistency
    text_ratio = text_len / max(html_len, 1)

    # 6a. Near-zero ratio -> definitely an SPA (Instagram / YouTube style).
    if text_ratio < 0.005:
        return True, f"ratio_near_zero:{text_ratio:.4f}"

    # 6b. Barely any text AND a low ratio -> SPA shell.
    #     example.com (tlen=139, ratio=0.263) must NOT trigger;
    #     Facebook (tlen=111, ratio=0.072) must trigger.
    if text_len < 200 and text_ratio < 0.15:
        return True, f"tiny_shell:tlen={text_len}"

    # 6c. Size-aware ratio threshold: large pages are naturally tag-heavy.
    #     < 0.018 catches Airtable (0.015) / Notion (0.015) / Replit (0.014)
    #     without flagging GitHub (0.019) / BBC (0.031) / PyPI (0.075).
    if html_len > 50_000:
        if text_ratio < 0.018:
            return True, f"large_page_low_ratio:{text_ratio:.4f}"
    else:
        if text_ratio < 0.05:
            return True, f"small_page_low_ratio:{text_ratio:.4f}"

    return False, ""
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ---------------------------------------------------------------------------
|
|
129
|
+
# 内容提取
|
|
130
|
+
# ---------------------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
def _extract_title(soup: BeautifulSoup) -> str:
    """Page title from <title>, falling back to the first <h1>, else ""."""
    for tag_name in ("title", "h1"):
        node = soup.find(tag_name)
        if node:
            return node.get_text(strip=True)
    return ""
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _extract_text(soup: BeautifulSoup) -> str:
    """
    Extract the page's main textual content.

    Removes script / style / noscript plus header / footer / nav chrome,
    then keeps the non-empty text lines. NOTE: mutates *soup* in place.
    """
    for node in soup.find_all(["script", "style", "noscript", "header", "footer", "nav"]):
        node.decompose()

    # Prefer <main> / <article>; otherwise fall back to <body> or the whole tree.
    root = soup.find("main") or soup.find("article") or soup.find("body") or soup
    raw_lines = root.get_text(separator="\n").splitlines()
    kept = [stripped for line in raw_lines if (stripped := line.strip())]
    return "\n".join(kept)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _extract_links(soup: BeautifulSoup, base_url: str) -> list[str]:
    """All <a href> targets as deduplicated absolute http(s) URLs, in document order."""
    # A dict keeps insertion order, giving dedup-while-preserving-order for free.
    ordered: dict[str, None] = {}
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()
        if not href:
            continue
        if href.startswith(("#", "javascript:", "mailto:", "tel:")):
            continue
        absolute = urljoin(base_url, href)
        if urlparse(absolute).scheme in ("http", "https"):
            ordered.setdefault(absolute, None)
    return list(ordered)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# ---------------------------------------------------------------------------
|
|
176
|
+
# 公开 API
|
|
177
|
+
# ---------------------------------------------------------------------------
|
|
178
|
+
|
|
179
|
+
async def extract(
    url: str,
    *,
    timeout: float = 15.0,
    headers: Optional[dict] = None,
    client: Optional[httpx.AsyncClient] = None,
) -> ExtractResult:
    """
    Level 1 HTTP extraction.

    Args:
        url: Target URL.
        timeout: Request timeout in seconds (a caller-supplied *client* keeps
            its own timeout configuration).
        headers: Extra request headers, merged over DEFAULT_HEADERS.
        client: Reusable httpx.AsyncClient (one is created when omitted).

    Returns:
        ExtractResult

    Raises:
        NeedsBrowserError: The page needs browser rendering.
        ExtractionError: The request or parsing failed.
    """
    merged_headers = {**DEFAULT_HEADERS, **(headers or {})}

    async def _do_request(c: httpx.AsyncClient) -> ExtractResult:
        t0 = time.perf_counter()
        try:
            # Pass headers per-request so they also apply when the caller
            # supplied their own client — previously they were silently
            # dropped in that case. (For the internally created client this
            # is a no-op: the same headers are set at client level.)
            response = await c.get(url, headers=merged_headers, follow_redirects=True)
        except httpx.TimeoutException as e:
            raise ExtractionError(url, f"timeout: {e}") from e
        except httpx.RequestError as e:
            raise ExtractionError(url, f"request error: {e}") from e
        elapsed = (time.perf_counter() - t0) * 1000

        # Decide whether this response needs a browser.
        browser_needed, reason = needs_browser(response)
        if browser_needed:
            raise NeedsBrowserError(url, reason)

        # Parse the content.
        soup = BeautifulSoup(response.text, "html.parser")
        title = _extract_title(soup)
        # _extract_text mutates its soup, so give it a fresh parse.
        text = _extract_text(BeautifulSoup(response.text, "html.parser"))
        links = _extract_links(soup, str(response.url))

        return ExtractResult(
            url=str(response.url),
            level=1,
            status_code=response.status_code,
            title=title,
            text=text,
            links=links,
            elapsed_ms=round(elapsed, 1),
        )

    if client is not None:
        return await _do_request(client)

    async with httpx.AsyncClient(
        headers=merged_headers,
        timeout=httpx.Timeout(timeout),
        follow_redirects=True,
    ) as c:
        return await _do_request(c)
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
scrapekit/extractor/state.py
|
|
3
|
+
=============================
|
|
4
|
+
Level 3 — JS State 提取
|
|
5
|
+
|
|
6
|
+
核心思路:
|
|
7
|
+
现代 SSR 框架(Next.js / Nuxt / 自定义)会在 HTML 里注入完整数据集,
|
|
8
|
+
以 JavaScript 变量的形式存在(如 window.__PRELOADED_STATE__)。
|
|
9
|
+
这些数据在 DOM 渲染之前就已存在,可以直接用 page.evaluate() 提取,
|
|
10
|
+
比 DOM 解析更快、更完整、更稳定。
|
|
11
|
+
|
|
12
|
+
v0.1 范围:
|
|
13
|
+
只支持 window.__PRELOADED_STATE__(已在 Sportsbet.com.au 生产验证)。
|
|
14
|
+
其他变量(__NEXT_DATA__ / __NUXT__ 等)是 Phase 2 的扩展点。
|
|
15
|
+
|
|
16
|
+
与 Level 2 的关系:
|
|
17
|
+
Level 3 复用 Level 2 的 CDP 页面加载流程,
|
|
18
|
+
在页面加载完成后额外调用 page.evaluate() 尝试提取 state 变量。
|
|
19
|
+
- 成功 → 返回 level=3 结果(结构化 JSON)
|
|
20
|
+
- 失败 → 调用方回退到 Level 2 DOM 提取
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import json
|
|
26
|
+
from typing import Optional, Any
|
|
27
|
+
|
|
28
|
+
# JS state variables supported in v0.1 (in priority order).
# Phase 2 will extend this list and add HAR-based auto-discovery.
KNOWN_STATE_VARS: list[str] = [
    "window.__PRELOADED_STATE__",
]


async def try_extract_state(
    page,  # playwright Page object
    state_vars: Optional[list[str]] = None,
) -> tuple[Optional[str], Optional[Any]]:
    """
    Try to pull a known JS state variable out of an already-loaded page.

    Args:
        page: Playwright Page already navigated to the target URL.
        state_vars: Variable names to try, in priority order.
            Defaults to KNOWN_STATE_VARS.

    Returns:
        (var_name, data) if one was found,
        (None, None) if none matched.
    """
    # Fix: avoid a mutable default argument (was `= KNOWN_STATE_VARS`);
    # resolve the fallback at call time instead so the shared module
    # constant can never be mutated through the parameter default.
    if state_vars is None:
        state_vars = KNOWN_STATE_VARS

    for var in state_vars:
        try:
            # page.evaluate serializes the JS return value into Python.
            # A missing variable yields `undefined` in JS -> None in Python.
            data = await page.evaluate(f"() => {var}")
            if data is not None:
                return var, data
        except Exception:
            # Variable absent or JS execution error — try the next candidate.
            continue

    return None, None
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def state_to_text(var_name: str, data: Any) -> str:
    """Render a captured JS state object as human-readable JSON text.

    ``var_name`` identifies which variable the data came from; it is not
    currently embedded in the output.
    """
    rendered = json.dumps(data, ensure_ascii=False, indent=2)
    return rendered
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""
|
|
2
|
+
scrapekit/models.py
|
|
3
|
+
===================
|
|
4
|
+
共享数据类型:ExtractResult、异常类。
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
# 异常
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
class NeedsBrowserError(Exception):
    """Level 1 found that the page needs browser rendering; escalate to Level 2."""

    def __init__(self, url: str, reason: str = ""):
        super().__init__(f"Browser required for {url}: {reason}")
        self.url = url
        self.reason = reason
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ExtractionError(Exception):
    """An unrecoverable error occurred during extraction."""

    def __init__(self, url: str, reason: str = ""):
        super().__init__(f"Extraction failed for {url}: {reason}")
        self.url = url
        self.reason = reason
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
# 抓取结果
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
@dataclass
class ExtractResult:
    """
    Structured result of a single extraction.

    level:
        1 = direct HTTP fetch (httpx)
        2 = CDP + existing Chrome session
        3 = JS state extraction
    """
    url: str
    level: int
    status_code: int
    title: str
    text: str               # main textual content (BeautifulSoup extraction)
    links: list[str]        # all links on the page (absolute URLs)
    elapsed_ms: float
    extracted_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    error: Optional[str] = None

    # ── output formats ──────────────────────────────────────────────────

    def to_dict(self) -> dict:
        """Plain-dict view of every field, in declaration order."""
        keys = (
            "url", "level", "status_code", "title", "text",
            "links", "elapsed_ms", "extracted_at", "error",
        )
        return {k: getattr(self, k) for k in keys}

    def to_json(self, indent: int = 2) -> str:
        """JSON rendering of ``to_dict()``; non-ASCII kept verbatim."""
        return json.dumps(self.to_dict(), ensure_ascii=False, indent=indent)

    def to_markdown(self) -> str:
        """Markdown report: title, metadata blockquote, body text, link list.

        At most 50 links are listed; the remainder is summarized.
        """
        parts = [
            f"# {self.title or self.url}",
            "",
            f"> URL: {self.url} ",
            f"> Level: {self.level} ",
            f"> Extracted: {self.extracted_at}",
            "",
            "---",
            "",
            self.text,
        ]
        if self.links:
            parts.extend(["", "---", "", "## Links", ""])
            parts.extend(f"- {link}" for link in self.links[:50])
            hidden = len(self.links) - 50
            if hidden > 0:
                parts.append(f"- *(+{hidden} more)*")
        return "\n".join(parts)
|