@jackwener/opencli 1.7.8 → 1.7.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -14
- package/README.zh-CN.md +30 -10
- package/cli-manifest.json +646 -30
- package/clis/36kr/news.js +1 -1
- package/clis/apple-podcasts/commands.test.js +4 -4
- package/clis/apple-podcasts/episodes.js +1 -1
- package/clis/apple-podcasts/search.js +1 -1
- package/clis/apple-podcasts/top.js +1 -1
- package/clis/arxiv/paper.js +1 -1
- package/clis/arxiv/search.js +1 -1
- package/clis/band/mentions.js +3 -3
- package/clis/bbc/news.js +1 -1
- package/clis/bilibili/subtitle.js +2 -2
- package/clis/bloomberg/businessweek.js +1 -1
- package/clis/bloomberg/economics.js +1 -1
- package/clis/bloomberg/industries.js +1 -1
- package/clis/bloomberg/main.js +1 -1
- package/clis/bloomberg/markets.js +1 -1
- package/clis/bloomberg/opinions.js +1 -1
- package/clis/bloomberg/politics.js +1 -1
- package/clis/bloomberg/tech.js +1 -1
- package/clis/boss/search.js +49 -8
- package/clis/boss/search.test.js +78 -0
- package/clis/boss/send.js +3 -3
- package/clis/chatgpt/image.js +37 -8
- package/clis/chatgpt/image.test.js +92 -0
- package/clis/chatgpt/utils.js +39 -6
- package/clis/chatgpt/utils.test.js +63 -0
- package/clis/chatgpt-app/ask.js +1 -1
- package/clis/chatgpt-app/ax.js +4 -2
- package/clis/chatgpt-app/ax.test.js +12 -0
- package/clis/chatgpt-app/model.js +1 -1
- package/clis/chatgpt-app/new.js +1 -1
- package/clis/chatgpt-app/read.js +1 -1
- package/clis/chatgpt-app/send.js +1 -1
- package/clis/chatgpt-app/status.js +1 -1
- package/clis/chatwise/ask.js +2 -2
- package/clis/chatwise/model.js +2 -2
- package/clis/chatwise/send.js +2 -2
- package/clis/claude/ask.js +128 -0
- package/clis/claude/ask.test.js +338 -0
- package/clis/claude/commands.test.js +118 -0
- package/clis/claude/detail.js +29 -0
- package/clis/claude/history.js +31 -0
- package/clis/claude/new.js +21 -0
- package/clis/claude/read.js +24 -0
- package/clis/claude/send.js +41 -0
- package/clis/claude/status.js +24 -0
- package/clis/claude/utils.js +440 -0
- package/clis/claude/utils.test.js +148 -0
- package/clis/codex/ask.js +2 -2
- package/clis/codex/send.js +2 -2
- package/clis/ctrip/search.js +1 -1
- package/clis/ctrip/search.test.js +4 -4
- package/clis/cursor/ask.js +2 -2
- package/clis/cursor/composer.js +2 -2
- package/clis/cursor/send.js +2 -2
- package/clis/deepseek/ask.js +17 -4
- package/clis/deepseek/ask.test.js +46 -0
- package/clis/deepseek/utils.js +55 -16
- package/clis/deepseek/utils.test.js +124 -5
- package/clis/doubao/utils.js +53 -11
- package/clis/doubao/utils.test.js +22 -2
- package/clis/eastmoney/announcement.js +1 -1
- package/clis/eastmoney/convertible.js +1 -1
- package/clis/eastmoney/etf.js +1 -1
- package/clis/eastmoney/holders.js +1 -1
- package/clis/eastmoney/index-board.js +1 -1
- package/clis/eastmoney/kline.js +1 -1
- package/clis/eastmoney/kuaixun.js +1 -1
- package/clis/eastmoney/longhu.js +1 -1
- package/clis/eastmoney/money-flow.js +1 -1
- package/clis/eastmoney/northbound.js +1 -1
- package/clis/eastmoney/quote.js +1 -1
- package/clis/eastmoney/rank.js +1 -1
- package/clis/eastmoney/sectors.js +1 -1
- package/clis/facebook/marketplace-inbox.js +83 -0
- package/clis/facebook/marketplace-listings.js +83 -0
- package/clis/facebook/marketplace.test.js +91 -0
- package/clis/google/news.js +1 -1
- package/clis/google/suggest.js +1 -1
- package/clis/google/trends.js +1 -1
- package/clis/google-scholar/cite.js +74 -0
- package/clis/google-scholar/cite.test.js +47 -0
- package/clis/google-scholar/profile.js +92 -0
- package/clis/google-scholar/profile.test.js +49 -0
- package/clis/google-scholar/search.js +1 -1
- package/clis/google-scholar/search.test.js +15 -0
- package/clis/hf/top.js +1 -1
- package/clis/instagram/collection-create.js +57 -0
- package/clis/instagram/saved.js +21 -7
- package/clis/jd/item.js +679 -47
- package/clis/jd/item.test.js +318 -7
- package/clis/jd/item.test.ts +517 -0
- package/clis/lesswrong/comments.js +1 -1
- package/clis/lesswrong/curated.js +1 -1
- package/clis/lesswrong/frontpage.js +1 -1
- package/clis/lesswrong/new.js +1 -1
- package/clis/lesswrong/read.js +1 -1
- package/clis/lesswrong/sequences.js +1 -1
- package/clis/lesswrong/shortform.js +1 -1
- package/clis/lesswrong/tag.js +1 -1
- package/clis/lesswrong/tags.js +1 -1
- package/clis/lesswrong/top-month.js +1 -1
- package/clis/lesswrong/top-week.js +1 -1
- package/clis/lesswrong/top-year.js +1 -1
- package/clis/lesswrong/top.js +1 -1
- package/clis/lesswrong/user-posts.js +1 -1
- package/clis/lesswrong/user.js +1 -1
- package/clis/paperreview/commands.test.js +6 -6
- package/clis/paperreview/feedback.js +1 -1
- package/clis/paperreview/review.js +1 -1
- package/clis/paperreview/submit.js +1 -1
- package/clis/producthunt/posts.js +1 -1
- package/clis/producthunt/today.js +1 -1
- package/clis/sinablog/search.js +1 -1
- package/clis/sinafinance/news.js +1 -1
- package/clis/sinafinance/stock.js +1 -1
- package/clis/sinafinance/stock.test.js +2 -2
- package/clis/spotify/spotify.js +6 -6
- package/clis/substack/search.js +1 -1
- package/clis/toutiao/articles.js +5 -6
- package/clis/toutiao/articles.test.js +22 -15
- package/clis/twitter/followers.js +2 -2
- package/clis/twitter/following.js +224 -73
- package/clis/twitter/following.test.js +277 -0
- package/clis/twitter/post.js +184 -47
- package/clis/twitter/post.test.js +114 -34
- package/clis/uiverse/_shared.js +63 -4
- package/clis/uiverse/_shared.test.js +7 -0
- package/clis/uiverse/code.js +1 -0
- package/clis/uiverse/navigation.test.js +12 -0
- package/clis/uiverse/preview.js +1 -0
- package/clis/web/read.js +319 -81
- package/clis/web/read.test.js +221 -5
- package/clis/weibo/favorites.js +169 -0
- package/clis/weibo/favorites.test.js +114 -0
- package/clis/weibo/publish.js +282 -0
- package/clis/weibo/publish.test.js +183 -0
- package/clis/weread/ranking.js +1 -1
- package/clis/weread/search-regression.test.js +8 -8
- package/clis/weread/search.js +1 -1
- package/clis/wikipedia/random.js +1 -1
- package/clis/wikipedia/search.js +1 -1
- package/clis/wikipedia/summary.js +1 -1
- package/clis/wikipedia/trending.js +1 -1
- package/clis/xianyu/chat.js +3 -3
- package/clis/xianyu/item.js +2 -2
- package/clis/xianyu/item.test.js +3 -3
- package/clis/xiaohongshu/search.js +17 -2
- package/clis/xiaohongshu/search.test.js +37 -1
- package/clis/xiaoyuzhou/download.js +1 -1
- package/clis/xiaoyuzhou/download.test.js +3 -3
- package/clis/xiaoyuzhou/episode.js +1 -1
- package/clis/xiaoyuzhou/podcast-episodes.js +1 -1
- package/clis/xiaoyuzhou/podcast-episodes.test.js +2 -2
- package/clis/xiaoyuzhou/podcast.js +1 -1
- package/clis/xiaoyuzhou/transcript.js +1 -1
- package/clis/xiaoyuzhou/transcript.test.js +5 -5
- package/clis/yollomi/models.js +1 -1
- package/clis/youtube/channel.js +24 -1
- package/clis/youtube/channel.test.js +59 -0
- package/clis/zhihu/answer.js +21 -162
- package/clis/zhihu/answer.test.js +26 -53
- package/clis/zhihu/collection.js +197 -0
- package/clis/zhihu/collection.test.js +290 -0
- package/clis/zhihu/collections.js +127 -0
- package/clis/zhihu/collections.test.js +182 -0
- package/clis/zhihu/comment.js +24 -305
- package/clis/zhihu/comment.test.js +31 -35
- package/clis/zhihu/favorite.js +44 -182
- package/clis/zhihu/favorite.test.js +30 -167
- package/clis/zhihu/follow.js +25 -56
- package/clis/zhihu/follow.test.js +20 -23
- package/clis/zhihu/like.js +22 -67
- package/clis/zhihu/like.test.js +19 -42
- package/clis/zhihu/search.js +3 -2
- package/clis/zhihu/write-shared.js +8 -1
- package/clis/zhihu/write-shared.test.js +1 -0
- package/clis/zlibrary/commands.test.js +75 -0
- package/clis/zlibrary/info.js +47 -0
- package/clis/zlibrary/search.js +46 -0
- package/clis/zlibrary/utils.js +136 -0
- package/dist/src/adapter-source.d.ts +11 -0
- package/dist/src/adapter-source.js +24 -0
- package/dist/src/adapter-source.test.js +29 -0
- package/dist/src/browser/base-page.d.ts +3 -1
- package/dist/src/browser/base-page.js +76 -1
- package/dist/src/browser/base-page.test.d.ts +1 -0
- package/dist/src/browser/base-page.test.js +74 -0
- package/dist/src/browser/bridge.d.ts +1 -2
- package/dist/src/browser/bridge.js +40 -41
- package/dist/src/browser/cdp.d.ts +1 -0
- package/dist/src/browser/cdp.js +3 -3
- package/dist/src/browser/daemon-client.d.ts +38 -4
- package/dist/src/browser/daemon-client.js +24 -7
- package/dist/src/browser/daemon-client.test.js +49 -0
- package/dist/src/browser/daemon-lifecycle.d.ts +23 -0
- package/dist/src/browser/daemon-lifecycle.js +67 -0
- package/dist/src/browser/daemon-version.d.ts +4 -0
- package/dist/src/browser/daemon-version.js +12 -0
- package/dist/src/browser/errors.js +3 -0
- package/dist/src/browser/errors.test.js +3 -0
- package/dist/src/browser/network-cache.d.ts +1 -0
- package/dist/src/browser/page.d.ts +3 -1
- package/dist/src/browser/page.js +10 -2
- package/dist/src/browser/profile.d.ts +14 -0
- package/dist/src/browser/profile.js +85 -0
- package/dist/src/build-manifest.d.ts +2 -0
- package/dist/src/build-manifest.js +13 -3
- package/dist/src/build-manifest.test.js +20 -2
- package/dist/src/cli.d.ts +6 -0
- package/dist/src/cli.js +477 -35
- package/dist/src/cli.test.js +303 -2
- package/dist/src/commanderAdapter.js +17 -9
- package/dist/src/commanderAdapter.test.js +67 -2
- package/dist/src/commands/daemon.d.ts +2 -0
- package/dist/src/commands/daemon.js +42 -1
- package/dist/src/commands/daemon.test.js +103 -2
- package/dist/src/completion-shared.js +1 -2
- package/dist/src/completion.test.js +3 -2
- package/dist/src/daemon.js +125 -41
- package/dist/src/doctor.d.ts +5 -6
- package/dist/src/doctor.js +77 -19
- package/dist/src/doctor.test.js +117 -0
- package/dist/src/engine.test.js +6 -5
- package/dist/src/errors.d.ts +14 -8
- package/dist/src/errors.js +36 -30
- package/dist/src/errors.test.js +5 -5
- package/dist/src/execution.d.ts +4 -0
- package/dist/src/execution.js +173 -25
- package/dist/src/execution.test.js +171 -1
- package/dist/src/main.js +10 -0
- package/dist/src/observation/artifact.d.ts +16 -0
- package/dist/src/observation/artifact.js +260 -0
- package/dist/src/observation/artifact.test.d.ts +1 -0
- package/dist/src/observation/artifact.test.js +121 -0
- package/dist/src/observation/events.d.ts +89 -0
- package/dist/src/observation/events.js +1 -0
- package/dist/src/observation/index.d.ts +7 -0
- package/dist/src/observation/index.js +7 -0
- package/dist/src/observation/manager.d.ts +9 -0
- package/dist/src/observation/manager.js +27 -0
- package/dist/src/observation/manager.test.d.ts +1 -0
- package/dist/src/observation/manager.test.js +13 -0
- package/dist/src/observation/redaction.d.ts +11 -0
- package/dist/src/observation/redaction.js +81 -0
- package/dist/src/observation/redaction.test.d.ts +1 -0
- package/dist/src/observation/redaction.test.js +32 -0
- package/dist/src/observation/retention.d.ts +32 -0
- package/dist/src/observation/retention.js +160 -0
- package/dist/src/observation/retention.test.d.ts +1 -0
- package/dist/src/observation/retention.test.js +118 -0
- package/dist/src/observation/ring-buffer.d.ts +22 -0
- package/dist/src/observation/ring-buffer.js +45 -0
- package/dist/src/observation/ring-buffer.test.d.ts +1 -0
- package/dist/src/observation/ring-buffer.test.js +22 -0
- package/dist/src/observation/session.d.ts +25 -0
- package/dist/src/observation/session.js +50 -0
- package/dist/src/pipeline/executor.test.js +1 -0
- package/dist/src/pipeline/steps/download.test.js +1 -0
- package/dist/src/pipeline/steps/fetch.js +1 -21
- package/dist/src/pipeline/steps/fetch.test.js +6 -12
- package/dist/src/plugin-scaffold.js +1 -1
- package/dist/src/plugin-scaffold.test.js +1 -1
- package/dist/src/registry.d.ts +40 -9
- package/dist/src/registry.js +3 -1
- package/dist/src/runtime-detect.d.ts +10 -0
- package/dist/src/runtime-detect.js +19 -0
- package/dist/src/runtime-detect.test.js +12 -1
- package/dist/src/runtime.d.ts +2 -0
- package/dist/src/runtime.js +1 -0
- package/dist/src/types.d.ts +22 -0
- package/dist/src/update-check.d.ts +31 -1
- package/dist/src/update-check.js +62 -16
- package/dist/src/update-check.test.js +86 -1
- package/package.json +1 -1
- package/dist/src/diagnostic.d.ts +0 -63
- package/dist/src/diagnostic.js +0 -292
- package/dist/src/diagnostic.test.js +0 -302
- /package/dist/src/{diagnostic.test.d.ts → adapter-source.test.d.ts} +0 -0
package/clis/web/read.js
CHANGED
|
@@ -15,63 +15,204 @@
|
|
|
15
15
|
*/
|
|
16
16
|
import { cli, Strategy } from '@jackwener/opencli/registry';
|
|
17
17
|
import { downloadArticle } from '@jackwener/opencli/download/article-download';
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
18
|
+
|
|
19
|
+
// Delay (ms) between successive polls of the network capture buffer.
const NETWORK_IDLE_POLL_MS = 500;
// Length (ms) of the request-free window that counts as "network idle".
const NETWORK_IDLE_QUIET_MS = 1000;
|
|
21
|
+
|
|
22
|
+
/**
 * Return a promise that settles once a `setTimeout` of `ms` milliseconds fires.
 *
 * @param {number} ms - Delay passed straight to `setTimeout`.
 * @returns {Promise<void>} Resolves with no value after the timer fires.
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
25
|
+
|
|
26
|
+
/**
 * Interpret a loosely-typed flag value as a boolean.
 *
 * Only the boolean `true` and the strings "1", "true", "yes" and "on"
 * (compared case-insensitively) count as enabled; everything else is false.
 *
 * @param {unknown} value - Raw flag value.
 * @returns {boolean}
 */
function boolish(value) {
    if (typeof value === 'string') {
        const lowered = value.toLowerCase();
        return lowered === '1' || lowered === 'true' || lowered === 'yes' || lowered === 'on';
    }
    return value === true;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Coerce the iframe-handling option to one of its two supported values.
 *
 * @param {unknown} value - Raw option value; falsy values mean "use the default".
 * @returns {'same-origin' | 'none'} 'none' only when explicitly requested
 *   (case-insensitive); 'same-origin' for everything else.
 */
function normalizeFrameMode(value) {
    const requested = String(value || 'same-origin').toLowerCase();
    return requested === 'none' ? 'none' : 'same-origin';
}
|
|
37
|
+
|
|
38
|
+
/**
 * Coerce the readiness-policy option to one of its two supported values.
 *
 * @param {unknown} value - Raw option value; falsy values mean "use the default".
 * @returns {'domstable' | 'networkidle'} 'networkidle' only when explicitly
 *   requested (case-insensitive); 'domstable' for everything else.
 */
function normalizeWaitUntil(value) {
    const requested = String(value || 'domstable').toLowerCase();
    return requested === 'networkidle' ? 'networkidle' : 'domstable';
}
|
|
43
|
+
|
|
44
|
+
/**
 * Project a raw network-capture record onto the compact shape used by this file.
 *
 * Each field is type-checked and replaced by a neutral default when it is
 * missing or has the wrong type, so `null`/`undefined` input is accepted.
 *
 * @param {object | null | undefined} entry - Raw capture record.
 * @returns {{method: string, url: string, status: number, contentType: string,
 *   size: number, bodyTruncated: boolean}} `size` falls back to the length of
 *   the string `responsePreview` (0 when there is none).
 */
function normalizeNetworkEntry(entry) {
    const stringOr = (candidate, fallback) => (typeof candidate === 'string' ? candidate : fallback);
    const numberOr = (candidate, fallback) => (typeof candidate === 'number' ? candidate : fallback);
    const previewLength = stringOr(entry?.responsePreview, '').length;
    return {
        method: stringOr(entry?.method, 'GET'),
        url: stringOr(entry?.url, ''),
        status: numberOr(entry?.responseStatus, 0),
        contentType: stringOr(entry?.responseContentType, ''),
        size: numberOr(entry?.responseBodyFullSize, previewLength),
        bodyTruncated: entry?.responseBodyTruncated === true,
    };
}
|
|
55
|
+
|
|
56
|
+
/**
 * Decide whether a normalized network entry looks like an XHR/API call worth
 * listing in the diagnostics output.
 *
 * An entry is kept when it is neither a static asset nor analytics-style noise
 * and at least one of these holds: the content type looks like data, the URL
 * looks like an API endpoint, or the HTTP method is not GET.
 *
 * The static-asset test looks only at the path component of the URL. Testing
 * the whole URL misclassified API calls whose query string happened to end in
 * an asset extension (e.g. `/api/export?file=report.png`) and missed assets
 * followed by a `#fragment`.
 *
 * @param {{url?: string, contentType?: string, method?: string}} entry
 * @returns {boolean}
 */
function isInterestingNetworkEntry(entry) {
    const contentType = (entry.contentType || '').toLowerCase();
    const url = entry.url || '';
    const method = (entry.method || 'GET').toUpperCase();

    // Isolate the path: absolute URLs go through the URL parser, anything it
    // rejects (relative or malformed) just has its query/fragment cut off.
    let path;
    try {
        path = new URL(url).pathname;
    } catch {
        path = url.split(/[?#]/, 1)[0];
    }

    const staticAsset = /\.(js|css|png|jpg|jpeg|gif|svg|woff|woff2|ico|map)$/i.test(path);
    const noisy = /analytics|tracking|telemetry|beacon|pixel|gtag|fbevents/i.test(url);
    const apiLikeUrl = /\/(api|ajax|graphql|rest|service|handler)(\/|[?._-]|$)|\.(ashx|aspx|asmx|php)(\?|$)/i.test(url);
    // HTML only counts as data when it is served from an API-looking URL.
    const dataLikeContent = ct(contentType, 'json')
        || ct(contentType, 'xml')
        || ct(contentType, 'text/plain')
        || ct(contentType, 'javascript')
        || (apiLikeUrl && ct(contentType, 'text/html'));

    return (
        !staticAsset
        && !noisy
        && (dataLikeContent || apiLikeUrl || method !== 'GET')
    );

    // Tiny local helper so the content-type checks read uniformly.
    function ct(haystack, needle) {
        return haystack.includes(needle);
    }
}
|
|
74
|
+
|
|
75
|
+
/**
 * Read whatever the page's network capture currently holds, normalize it, and
 * append the usable entries to `sink`.
 *
 * This is best-effort: a page without `readNetworkCapture`, a failing read
 * (thrown synchronously or via a rejected promise) or a non-array result all
 * yield an empty batch rather than an error.
 *
 * @param {{readNetworkCapture?: Function}} page - Page object; the method is optional.
 * @param {Array<object>} sink - Accumulator that receives the new entries (mutated).
 * @returns {Promise<Array<object>>} The entries added by this call (those with a non-empty `url`).
 */
async function drainNetworkCapture(page, sink) {
    if (typeof page.readNetworkCapture !== 'function') return [];

    let raw;
    try {
        // `await` handles both promise and plain return values, and the
        // try/catch also covers a synchronous throw from the call itself.
        raw = await page.readNetworkCapture();
    } catch {
        raw = [];
    }
    if (!Array.isArray(raw)) return [];

    const entries = raw.map(normalizeNetworkEntry).filter((entry) => entry.url);
    // Push one by one: spreading a very large batch into push() can exceed
    // the engine's argument limit.
    for (const entry of entries) sink.push(entry);
    return entries;
}
|
|
82
|
+
|
|
83
|
+
/**
 * Try to switch on network capture for the page, if the page supports it.
 *
 * @param {{startNetworkCapture?: Function}} page - Page object; the method is optional.
 * @returns {Promise<*>} Whatever `page.startNetworkCapture('')` resolves to,
 *   or `false` when the method is missing or the call fails.
 */
async function maybeStartNetworkCapture(page) {
    if (page.startNetworkCapture) {
        try {
            const started = await page.startNetworkCapture('');
            return started;
        } catch {
            // Capture is optional; a failed start is reported as "not started".
        }
    }
    return false;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Poll the page's network capture until no new entries have appeared for
 * NETWORK_IDLE_QUIET_MS, or until the time budget runs out.
 *
 * Every polled batch is appended to `sink` by `drainNetworkCapture`.
 *
 * The quiet-window check runs before the deadline check on every iteration,
 * including the one at the deadline. The budget is clamped to at least 1 s,
 * which equals the quiet window, so checking the deadline first would report a
 * timeout on a completely idle network.
 *
 * @param {object} page - Page object handed to `drainNetworkCapture`.
 * @param {number|string} maxSeconds - Time budget in seconds; non-numeric or
 *   sub-second values are treated as 1 second.
 * @param {Array<object>} sink - Accumulator for captured entries (mutated).
 * @returns {Promise<{ok: true} | {ok: false, timedOut: true}>}
 */
async function waitForNetworkIdle(page, maxSeconds, sink) {
    const timeoutMs = Math.max(1, Number(maxSeconds) || 1) * 1000;
    const deadline = Date.now() + timeoutMs;
    let quietSince = Date.now();

    for (;;) {
        const entries = await drainNetworkCapture(page, sink);
        const now = Date.now();
        if (entries.length > 0) quietSince = now;
        if (now - quietSince >= NETWORK_IDLE_QUIET_MS) return { ok: true };

        const remaining = deadline - now;
        if (remaining <= 0) break;
        // Never sleep past the deadline; the final poll happens right at it.
        await sleep(Math.min(NETWORK_IDLE_POLL_MS, remaining));
    }
    return { ok: false, timedOut: true };
}
|
|
104
|
+
|
|
105
|
+
/**
 * Build the source text of an async IIFE that waits for a CSS selector.
 *
 * The generated script polls every 100 ms and looks for the selector first in
 * the main document and then in every iframe whose resolved `src` shares the
 * page origin. It resolves to one of:
 *   - `{ ok: true, scope: 'main' | 'iframe', url }` on a match,
 *   - `{ ok: false, invalidSelector: true, error }` when `querySelector`
 *     throws in the main document,
 *   - `{ ok: false, timedOut: true, selector }` once the timeout elapses.
 *
 * @param {string} selector - CSS selector; embedded as a JSON string literal.
 * @param {number} timeoutMs - Timeout in ms; values that coerce to 0/NaN become 10000.
 * @returns {string} JavaScript source for the caller to evaluate in the page.
 */
function buildWaitForSelectorAcrossFramesJs(selector, timeoutMs) {
    const selectorLiteral = JSON.stringify(selector);
    const timeoutLiteral = Number(timeoutMs) || 10000;
    return `
    (async () => {
      const selector = ${selectorLiteral};
      const timeoutAt = Date.now() + ${timeoutLiteral};
      const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));
      const sameOriginFrameDocs = () => Array.from(document.querySelectorAll('iframe')).map((frame) => {
        try {
          const href = new URL(frame.getAttribute('src') || frame.src || '', window.location.href).href;
          if (new URL(href).origin !== window.location.origin) return null;
          return { href, doc: frame.contentDocument };
        } catch {
          return null;
        }
      }).filter(Boolean);
      const findMatch = () => {
        try {
          if (document.querySelector(selector)) return { ok: true, scope: 'main', url: window.location.href };
        } catch (err) {
          return { ok: false, invalidSelector: true, error: String(err && err.message || err) };
        }
        for (const frame of sameOriginFrameDocs()) {
          try {
            if (frame.doc?.querySelector(selector)) return { ok: true, scope: 'iframe', url: frame.href };
          } catch {}
        }
        return { ok: false };
      };
      while (Date.now() < timeoutAt) {
        const found = findMatch();
        if (found.ok || found.invalidSelector) return found;
        await sleep(100);
      }
      return { ok: false, timedOut: true, selector };
    })()
  `;
}
|
|
142
|
+
|
|
143
|
+
function buildRenderAwareExtractorJs(options) {
|
|
144
|
+
return `
|
|
40
145
|
(() => {
|
|
146
|
+
const frameMode = ${JSON.stringify(options.frames)};
|
|
41
147
|
const result = {
|
|
42
148
|
title: '',
|
|
43
149
|
author: '',
|
|
44
150
|
publishTime: '',
|
|
45
151
|
contentHtml: '',
|
|
46
|
-
imageUrls: []
|
|
152
|
+
imageUrls: [],
|
|
153
|
+
diagnostics: {
|
|
154
|
+
url: window.location.href,
|
|
155
|
+
frames: [],
|
|
156
|
+
emptyContainers: [],
|
|
157
|
+
includedFrameCount: 0
|
|
158
|
+
}
|
|
159
|
+
};
|
|
160
|
+
|
|
161
|
+
const absolutize = (value, base) => {
|
|
162
|
+
if (!value || value.startsWith('data:') || value.startsWith('javascript:') || value.startsWith('#')) return value || '';
|
|
163
|
+
try { return new URL(value, base).href; } catch { return value; }
|
|
164
|
+
};
|
|
165
|
+
const absolutizeTree = (root, base) => {
|
|
166
|
+
root.querySelectorAll?.('[href]').forEach(el => el.setAttribute('href', absolutize(el.getAttribute('href'), base)));
|
|
167
|
+
root.querySelectorAll?.('[src]').forEach(el => el.setAttribute('src', absolutize(el.getAttribute('src'), base)));
|
|
168
|
+
root.querySelectorAll?.('[poster]').forEach(el => el.setAttribute('poster', absolutize(el.getAttribute('poster'), base)));
|
|
169
|
+
root.querySelectorAll?.('[action]').forEach(el => el.setAttribute('action', absolutize(el.getAttribute('action'), base)));
|
|
170
|
+
};
|
|
171
|
+
const textLen = (node) => (node?.textContent || '').replace(/\\s+/g, ' ').trim().length;
|
|
172
|
+
const describeFrame = (frame, index) => {
|
|
173
|
+
const rawSrc = frame.getAttribute('src') || frame.src || '';
|
|
174
|
+
let href = '';
|
|
175
|
+
try { href = new URL(rawSrc, window.location.href).href; } catch { href = rawSrc; }
|
|
176
|
+
let sameOrigin = false;
|
|
177
|
+
try { sameOrigin = href ? new URL(href).origin === window.location.origin : false; } catch {}
|
|
178
|
+
let accessible = false;
|
|
179
|
+
let title = frame.getAttribute('title') || frame.getAttribute('name') || frame.id || '';
|
|
180
|
+
let length = 0;
|
|
181
|
+
try {
|
|
182
|
+
accessible = !!frame.contentDocument;
|
|
183
|
+
title = title || frame.contentDocument?.title || '';
|
|
184
|
+
length = textLen(frame.contentDocument?.body);
|
|
185
|
+
} catch {}
|
|
186
|
+
return { index, src: href, title, sameOrigin, accessible, textLength: length };
|
|
187
|
+
};
|
|
188
|
+
const collectEmptyContainers = (root, scope, baseUrl) => {
|
|
189
|
+
const likely = 'table, tbody, ul[id], ol[id], div[id], section[id], [class*="grid"], [class*="data"], [class*="list"], [id*="grid"], [id*="data"], [id*="list"]';
|
|
190
|
+
root.querySelectorAll?.(likely).forEach((el) => {
|
|
191
|
+
const id = el.getAttribute('id') || '';
|
|
192
|
+
const cls = el.getAttribute('class') || '';
|
|
193
|
+
const name = [id, cls].join(' ').toLowerCase();
|
|
194
|
+
if (!/(grid|data|list|table|content|result)/.test(name) && !['TABLE', 'TBODY', 'UL', 'OL'].includes(el.nodeName)) return;
|
|
195
|
+
if (textLen(el) > 20) return;
|
|
196
|
+
result.diagnostics.emptyContainers.push({
|
|
197
|
+
scope,
|
|
198
|
+
url: baseUrl,
|
|
199
|
+
tag: el.tagName.toLowerCase(),
|
|
200
|
+
id,
|
|
201
|
+
className: cls,
|
|
202
|
+
});
|
|
203
|
+
});
|
|
47
204
|
};
|
|
48
205
|
|
|
49
|
-
// --- Title extraction ---
|
|
50
|
-
// Priority: og:title > <title> > first <h1>
|
|
51
206
|
const ogTitle = document.querySelector('meta[property="og:title"]');
|
|
52
|
-
if (ogTitle)
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
if (!result.title) {
|
|
56
|
-
result.title = document.title?.trim() || '';
|
|
57
|
-
}
|
|
58
|
-
if (!result.title) {
|
|
59
|
-
const h1 = document.querySelector('h1');
|
|
60
|
-
result.title = h1?.textContent?.trim() || 'untitled';
|
|
61
|
-
}
|
|
62
|
-
// Strip site suffix (e.g. " | Anthropic", " - Blog")
|
|
207
|
+
if (ogTitle) result.title = ogTitle.getAttribute('content')?.trim() || '';
|
|
208
|
+
if (!result.title) result.title = document.title?.trim() || '';
|
|
209
|
+
if (!result.title) result.title = document.querySelector('h1')?.textContent?.trim() || 'untitled';
|
|
63
210
|
result.title = result.title.replace(/\\s*[|\\-–—]\\s*[^|\\-–—]{1,30}$/, '').trim();
|
|
64
211
|
|
|
65
|
-
|
|
66
|
-
const authorMeta = document.querySelector(
|
|
67
|
-
'meta[name="author"], meta[property="article:author"], meta[name="twitter:creator"]'
|
|
68
|
-
);
|
|
212
|
+
const authorMeta = document.querySelector('meta[name="author"], meta[property="article:author"], meta[name="twitter:creator"]');
|
|
69
213
|
result.author = authorMeta?.getAttribute('content')?.trim() || '';
|
|
70
214
|
|
|
71
|
-
|
|
72
|
-
const timeMeta = document.querySelector(
|
|
73
|
-
'meta[property="article:published_time"], meta[name="date"], meta[name="publishdate"], time[datetime]'
|
|
74
|
-
);
|
|
215
|
+
const timeMeta = document.querySelector('meta[property="article:published_time"], meta[name="date"], meta[name="publishdate"], time[datetime]');
|
|
75
216
|
if (timeMeta) {
|
|
76
217
|
result.publishTime = timeMeta.getAttribute('content')
|
|
77
218
|
|| timeMeta.getAttribute('datetime')
|
|
@@ -79,34 +220,19 @@ const command = cli({
|
|
|
79
220
|
|| '';
|
|
80
221
|
}
|
|
81
222
|
|
|
82
|
-
// --- Content extraction ---
|
|
83
|
-
// Strategy: try semantic elements first, then fall back to largest text block
|
|
84
223
|
let contentEl = null;
|
|
85
|
-
|
|
86
|
-
// 1. <article>
|
|
87
224
|
const articles = document.querySelectorAll('article');
|
|
88
225
|
if (articles.length === 1) {
|
|
89
226
|
contentEl = articles[0];
|
|
90
227
|
} else if (articles.length > 1) {
|
|
91
|
-
// Pick the largest article by text length
|
|
92
228
|
let maxLen = 0;
|
|
93
229
|
articles.forEach(a => {
|
|
94
|
-
const len = a
|
|
230
|
+
const len = textLen(a);
|
|
95
231
|
if (len > maxLen) { maxLen = len; contentEl = a; }
|
|
96
232
|
});
|
|
97
233
|
}
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
if (!contentEl) {
|
|
101
|
-
contentEl = document.querySelector('[role="main"]');
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
// 3. <main>
|
|
105
|
-
if (!contentEl) {
|
|
106
|
-
contentEl = document.querySelector('main');
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
// 4. Largest text-dense block fallback
|
|
234
|
+
if (!contentEl) contentEl = document.querySelector('[role="main"]');
|
|
235
|
+
if (!contentEl) contentEl = document.querySelector('main');
|
|
110
236
|
if (!contentEl) {
|
|
111
237
|
const candidates = document.querySelectorAll(
|
|
112
238
|
'div[class*="content"], div[class*="article"], div[class*="post"], ' +
|
|
@@ -115,26 +241,51 @@ const command = cli({
|
|
|
115
241
|
);
|
|
116
242
|
let maxLen = 0;
|
|
117
243
|
candidates.forEach(c => {
|
|
118
|
-
const len = c
|
|
244
|
+
const len = textLen(c);
|
|
119
245
|
if (len > maxLen) { maxLen = len; contentEl = c; }
|
|
120
246
|
});
|
|
121
247
|
}
|
|
248
|
+
if (!contentEl || textLen(contentEl) < 200) contentEl = document.body;
|
|
249
|
+
|
|
250
|
+
const clone = contentEl.cloneNode(true);
|
|
251
|
+
absolutizeTree(clone, window.location.href);
|
|
252
|
+
|
|
253
|
+
const originalFrames = Array.from(contentEl.querySelectorAll('iframe'));
|
|
254
|
+
const clonedFrames = Array.from(clone.querySelectorAll('iframe'));
|
|
255
|
+
const allFrames = Array.from(document.querySelectorAll('iframe'));
|
|
256
|
+
result.diagnostics.frames = allFrames.map(describeFrame);
|
|
122
257
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
258
|
+
if (frameMode === 'same-origin') {
|
|
259
|
+
originalFrames.forEach((frame, index) => {
|
|
260
|
+
const cloned = clonedFrames[index];
|
|
261
|
+
if (!cloned) return;
|
|
262
|
+
const desc = describeFrame(frame, index);
|
|
263
|
+
if (!desc.sameOrigin || !desc.accessible) return;
|
|
264
|
+
try {
|
|
265
|
+
const doc = frame.contentDocument;
|
|
266
|
+
if (!doc?.body) return;
|
|
267
|
+
const frameBody = doc.body.cloneNode(true);
|
|
268
|
+
absolutizeTree(frameBody, desc.src || window.location.href);
|
|
269
|
+
collectEmptyContainers(frameBody, 'iframe', desc.src);
|
|
270
|
+
const section = document.createElement('section');
|
|
271
|
+
section.setAttribute('data-opencli-iframe-source', desc.src);
|
|
272
|
+
const heading = document.createElement('h2');
|
|
273
|
+
heading.textContent = '来自 iframe: ' + (desc.src || frame.getAttribute('src') || ('#' + index));
|
|
274
|
+
section.appendChild(heading);
|
|
275
|
+
Array.from(frameBody.childNodes).forEach(node => section.appendChild(node));
|
|
276
|
+
cloned.replaceWith(section);
|
|
277
|
+
result.diagnostics.includedFrameCount += 1;
|
|
278
|
+
} catch {}
|
|
279
|
+
});
|
|
126
280
|
}
|
|
127
281
|
|
|
128
|
-
|
|
129
|
-
|
|
282
|
+
collectEmptyContainers(clone, 'main', window.location.href);
|
|
283
|
+
|
|
130
284
|
const noise = 'nav, header, footer, aside, .sidebar, .nav, .menu, .footer, ' +
|
|
131
285
|
'.header, .comments, .comment, .ad, .ads, .advertisement, .social-share, ' +
|
|
132
286
|
'.related-posts, .newsletter, .cookie-banner, script, style, noscript, iframe';
|
|
133
287
|
clone.querySelectorAll(noise).forEach(el => el.remove());
|
|
134
288
|
|
|
135
|
-
// Deduplicate: some sites (e.g. Anthropic) render each paragraph twice
|
|
136
|
-
// (a visible version + a line-broken animation version with missing spaces).
|
|
137
|
-
// Compare by stripping ALL whitespace so "Hello world" matches "Helloworld".
|
|
138
289
|
const stripWS = (s) => (s || '').replace(/\\s+/g, '');
|
|
139
290
|
const dedup = (parent) => {
|
|
140
291
|
const children = Array.from(parent.children || []);
|
|
@@ -144,9 +295,7 @@ const command = cli({
|
|
|
144
295
|
const cur = stripWS(curRaw);
|
|
145
296
|
const prev = stripWS(prevRaw);
|
|
146
297
|
if (cur.length < 20 || prev.length < 20) continue;
|
|
147
|
-
// Exact match after whitespace strip, or >90% overlap
|
|
148
298
|
if (cur === prev) {
|
|
149
|
-
// Keep the one with more proper spacing (more spaces = better formatted)
|
|
150
299
|
const curSpaces = (curRaw.match(/ /g) || []).length;
|
|
151
300
|
const prevSpaces = (prevRaw.match(/ /g) || []).length;
|
|
152
301
|
if (curSpaces >= prevSpaces) children[i - 1].remove();
|
|
@@ -163,10 +312,6 @@ const command = cli({
|
|
|
163
312
|
if (el.children && el.children.length > 2) dedup(el);
|
|
164
313
|
});
|
|
165
314
|
|
|
166
|
-
// --- Lazy-load image src rewrite ---
|
|
167
|
-
// Many sites render <img src="placeholder.gif" data-src="real.jpg">.
|
|
168
|
-
// Promote the real URL onto src so both the markdown body and the
|
|
169
|
-
// image download list reference the same URL.
|
|
170
315
|
clone.querySelectorAll('img').forEach(img => {
|
|
171
316
|
const srcset = img.getAttribute('data-srcset') || '';
|
|
172
317
|
const srcsetFirst = srcset.split(',')[0]?.trim().split(' ')[0] || '';
|
|
@@ -174,12 +319,11 @@ const command = cli({
|
|
|
174
319
|
|| img.getAttribute('data-original')
|
|
175
320
|
|| img.getAttribute('data-lazy-src')
|
|
176
321
|
|| srcsetFirst;
|
|
177
|
-
if (real) img.setAttribute('src', real);
|
|
322
|
+
if (real) img.setAttribute('src', absolutize(real, window.location.href));
|
|
178
323
|
});
|
|
179
324
|
|
|
180
325
|
result.contentHtml = clone.innerHTML;
|
|
181
326
|
|
|
182
|
-
// --- Image extraction ---
|
|
183
327
|
const seen = new Set();
|
|
184
328
|
clone.querySelectorAll('img').forEach(img => {
|
|
185
329
|
const src = img.getAttribute('src') || '';
|
|
@@ -191,7 +335,87 @@ const command = cli({
|
|
|
191
335
|
|
|
192
336
|
return result;
|
|
193
337
|
})()
|
|
194
|
-
|
|
338
|
+
`;
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
/**
 * Render the extractor's diagnostics and the captured network entries as a
 * plain-text report.
 *
 * Sections, in order: header, page URL, frame summary (first 20 frames),
 * suspiciously empty containers (first 12, only when there are any), and the
 * network-capture summary followed by the first 20 entries accepted by
 * `isInterestingNetworkEntry`.
 *
 * @param {{diagnostics?: object} | null | undefined} data - Extractor result.
 * @param {Array<object>} networkEntries - Normalized network entries; a
 *   non-array value is treated as empty.
 * @param {boolean} captureSupported - Whether network capture was started.
 * @returns {string} Newline-terminated report.
 */
function formatDiagnostics(data, networkEntries, captureSupported) {
    const diag = data?.diagnostics || {};
    const frames = Array.isArray(diag.frames) ? diag.frames : [];
    const emptyContainers = Array.isArray(diag.emptyContainers) ? diag.emptyContainers : [];
    const entries = Array.isArray(networkEntries) ? networkEntries : [];

    // Build a CSS-like label such as `div#grid.data.list` for one container.
    // The class attribute is split on real whitespace: this is ordinary code,
    // not injected template text, so the regex takes a single backslash.
    const describeContainer = (item) => {
        const idPart = item.id ? `#${item.id}` : '';
        const classPart = String(item.className || '')
            .trim()
            .split(/\s+/)
            .filter(Boolean)
            .map((cls) => `.${cls}`)
            .join('');
        return `${item.tag}${idPart}${classPart}`;
    };

    const lines = [
        '[web-read diagnose]',
        `url: ${diag.url || '-'}`,
        `frames: ${frames.length}, included_same_origin: ${diag.includedFrameCount || 0}`,
    ];
    for (const frame of frames.slice(0, 20)) {
        lines.push(` [frame ${frame.index}] ${frame.sameOrigin ? 'same-origin' : 'cross-origin'} ${frame.accessible ? 'accessible' : 'blocked'} text=${frame.textLength || 0} ${frame.src || '-'}`);
    }

    if (emptyContainers.length > 0) {
        lines.push(`empty_containers: ${emptyContainers.length}`);
        for (const item of emptyContainers.slice(0, 12)) {
            lines.push(` ${item.scope}: ${describeContainer(item)} (${item.url || '-'})`);
        }
    }

    const interesting = entries.filter(isInterestingNetworkEntry);
    lines.push(`network_capture: ${captureSupported ? 'enabled' : 'unavailable'}, entries=${entries.length}, api_like=${interesting.length}`);
    for (const entry of interesting.slice(0, 20)) {
        lines.push(` ${entry.method} ${entry.status || '-'} ${entry.contentType || '-'} ${entry.url}`);
    }
    return `${lines.join('\n')}\n`;
}
|
|
364
|
+
|
|
365
|
+
const command = cli({
|
|
366
|
+
site: 'web',
|
|
367
|
+
name: 'read',
|
|
368
|
+
description: 'Fetch any web page and export as Markdown',
|
|
369
|
+
strategy: Strategy.COOKIE,
|
|
370
|
+
navigateBefore: false, // we handle navigation ourselves
|
|
371
|
+
args: [
|
|
372
|
+
{ name: 'url', required: true, help: 'Any web page URL' },
|
|
373
|
+
{ name: 'output', default: './web-articles', help: 'Output directory' },
|
|
374
|
+
{ name: 'download-images', type: 'boolean', default: true, help: 'Download images locally' },
|
|
375
|
+
{ name: 'wait', type: 'int', default: 3, help: 'Seconds to wait after page load' },
|
|
376
|
+
{ name: 'wait-for', valueRequired: true, help: 'CSS selector to wait for in the main document or same-origin iframes' },
|
|
377
|
+
{ name: 'wait-until', default: 'domstable', choices: ['domstable', 'networkidle'], help: 'Readiness policy after navigation: domstable or networkidle' },
|
|
378
|
+
{ name: 'frames', default: 'same-origin', choices: ['same-origin', 'none'], help: 'Iframe handling mode: same-origin or none' },
|
|
379
|
+
{ name: 'diagnose', type: 'boolean', default: false, help: 'Print render diagnostics (frames, empty containers, XHR/API-like requests) to stderr' },
|
|
380
|
+
{ name: 'stdout', type: 'boolean', default: false, help: 'Print markdown to stdout instead of saving to a file' },
|
|
381
|
+
],
|
|
382
|
+
columns: ['title', 'author', 'publish_time', 'status', 'size', 'saved'],
|
|
383
|
+
func: async (page, kwargs, debug = false) => {
|
|
384
|
+
const url = kwargs.url;
|
|
385
|
+
const waitSeconds = kwargs.wait ?? 3;
|
|
386
|
+
const waitUntil = normalizeWaitUntil(kwargs['wait-until']);
|
|
387
|
+
const frameMode = normalizeFrameMode(kwargs.frames);
|
|
388
|
+
const shouldDiagnose = boolish(kwargs.diagnose) || debug || !!process.env.OPENCLI_VERBOSE;
|
|
389
|
+
const networkEntries = [];
|
|
390
|
+
const captureSupported = (waitUntil === 'networkidle' || shouldDiagnose)
|
|
391
|
+
? await maybeStartNetworkCapture(page)
|
|
392
|
+
: false;
|
|
393
|
+
// Navigate to the target URL
|
|
394
|
+
await page.goto(url);
|
|
395
|
+
if (kwargs['wait-for']) {
|
|
396
|
+
const waitResult = await page.evaluate(buildWaitForSelectorAcrossFramesJs(String(kwargs['wait-for']), waitSeconds * 1000));
|
|
397
|
+
if (waitResult?.invalidSelector) {
|
|
398
|
+
throw new Error(`Invalid --wait-for selector "${kwargs['wait-for']}": ${waitResult.error || 'querySelector failed'}`);
|
|
399
|
+
}
|
|
400
|
+
if (!waitResult?.ok) {
|
|
401
|
+
throw new Error(`Timed out waiting for selector "${kwargs['wait-for']}" in main document or same-origin iframes`);
|
|
402
|
+
}
|
|
403
|
+
} else if (waitUntil !== 'networkidle') {
|
|
404
|
+
await page.wait(waitSeconds);
|
|
405
|
+
}
|
|
406
|
+
if (waitUntil === 'networkidle') {
|
|
407
|
+
if (!captureSupported) {
|
|
408
|
+
throw new Error('Network capture is unavailable, so --wait-until networkidle cannot be satisfied');
|
|
409
|
+
}
|
|
410
|
+
const idle = await waitForNetworkIdle(page, waitSeconds, networkEntries);
|
|
411
|
+
if (!idle?.ok) {
|
|
412
|
+
throw new Error(`Timed out waiting for network idle after ${waitSeconds}s`);
|
|
413
|
+
}
|
|
414
|
+
}
|
|
415
|
+
// Extract article content using browser-side heuristics
|
|
416
|
+
const data = await page.evaluate(buildRenderAwareExtractorJs({ frames: frameMode }));
|
|
417
|
+
if (captureSupported) await drainNetworkCapture(page, networkEntries);
|
|
418
|
+
if (shouldDiagnose) process.stderr.write(formatDiagnostics(data, networkEntries, captureSupported));
|
|
195
419
|
// Determine Referer from URL for image downloads
|
|
196
420
|
let referer = '';
|
|
197
421
|
try {
|
|
@@ -211,6 +435,12 @@ const command = cli({
|
|
|
211
435
|
downloadImages: kwargs['download-images'],
|
|
212
436
|
imageHeaders: referer ? { Referer: referer } : undefined,
|
|
213
437
|
stdout: kwargs.stdout,
|
|
438
|
+
configureTurndown: (td) => {
|
|
439
|
+
td.addRule('preserveButtons', {
|
|
440
|
+
filter: (node) => node.nodeName === 'BUTTON',
|
|
441
|
+
replacement: (content) => content,
|
|
442
|
+
});
|
|
443
|
+
},
|
|
214
444
|
});
|
|
215
445
|
// `--stdout` is a content-streaming mode. The markdown body already went
|
|
216
446
|
// to process.stdout inside downloadArticle(), so returning rows here
|
|
@@ -219,4 +449,12 @@ const command = cli({
|
|
|
219
449
|
return kwargs.stdout ? null : result;
|
|
220
450
|
},
|
|
221
451
|
});
|
|
222
|
-
export const __test__ = {
|
|
452
|
+
// Bundle of this module's command object and internal helpers, exported under
// the single `__test__` name (presumably for the test suite — confirm against
// the accompanying *.test.js file).
const __test__ = {
    command,
    buildRenderAwareExtractorJs,
    buildWaitForSelectorAcrossFramesJs,
    formatDiagnostics,
    isInterestingNetworkEntry,
    normalizeFrameMode,
    normalizeWaitUntil,
};
export { __test__ };
|