@jackwener/opencli 1.7.8 → 1.7.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281):
  1. package/README.md +49 -14
  2. package/README.zh-CN.md +30 -10
  3. package/cli-manifest.json +646 -30
  4. package/clis/36kr/news.js +1 -1
  5. package/clis/apple-podcasts/commands.test.js +4 -4
  6. package/clis/apple-podcasts/episodes.js +1 -1
  7. package/clis/apple-podcasts/search.js +1 -1
  8. package/clis/apple-podcasts/top.js +1 -1
  9. package/clis/arxiv/paper.js +1 -1
  10. package/clis/arxiv/search.js +1 -1
  11. package/clis/band/mentions.js +3 -3
  12. package/clis/bbc/news.js +1 -1
  13. package/clis/bilibili/subtitle.js +2 -2
  14. package/clis/bloomberg/businessweek.js +1 -1
  15. package/clis/bloomberg/economics.js +1 -1
  16. package/clis/bloomberg/industries.js +1 -1
  17. package/clis/bloomberg/main.js +1 -1
  18. package/clis/bloomberg/markets.js +1 -1
  19. package/clis/bloomberg/opinions.js +1 -1
  20. package/clis/bloomberg/politics.js +1 -1
  21. package/clis/bloomberg/tech.js +1 -1
  22. package/clis/boss/search.js +49 -8
  23. package/clis/boss/search.test.js +78 -0
  24. package/clis/boss/send.js +3 -3
  25. package/clis/chatgpt/image.js +37 -8
  26. package/clis/chatgpt/image.test.js +92 -0
  27. package/clis/chatgpt/utils.js +39 -6
  28. package/clis/chatgpt/utils.test.js +63 -0
  29. package/clis/chatgpt-app/ask.js +1 -1
  30. package/clis/chatgpt-app/ax.js +4 -2
  31. package/clis/chatgpt-app/ax.test.js +12 -0
  32. package/clis/chatgpt-app/model.js +1 -1
  33. package/clis/chatgpt-app/new.js +1 -1
  34. package/clis/chatgpt-app/read.js +1 -1
  35. package/clis/chatgpt-app/send.js +1 -1
  36. package/clis/chatgpt-app/status.js +1 -1
  37. package/clis/chatwise/ask.js +2 -2
  38. package/clis/chatwise/model.js +2 -2
  39. package/clis/chatwise/send.js +2 -2
  40. package/clis/claude/ask.js +128 -0
  41. package/clis/claude/ask.test.js +338 -0
  42. package/clis/claude/commands.test.js +118 -0
  43. package/clis/claude/detail.js +29 -0
  44. package/clis/claude/history.js +31 -0
  45. package/clis/claude/new.js +21 -0
  46. package/clis/claude/read.js +24 -0
  47. package/clis/claude/send.js +41 -0
  48. package/clis/claude/status.js +24 -0
  49. package/clis/claude/utils.js +440 -0
  50. package/clis/claude/utils.test.js +148 -0
  51. package/clis/codex/ask.js +2 -2
  52. package/clis/codex/send.js +2 -2
  53. package/clis/ctrip/search.js +1 -1
  54. package/clis/ctrip/search.test.js +4 -4
  55. package/clis/cursor/ask.js +2 -2
  56. package/clis/cursor/composer.js +2 -2
  57. package/clis/cursor/send.js +2 -2
  58. package/clis/deepseek/ask.js +17 -4
  59. package/clis/deepseek/ask.test.js +46 -0
  60. package/clis/deepseek/utils.js +55 -16
  61. package/clis/deepseek/utils.test.js +124 -5
  62. package/clis/doubao/utils.js +53 -11
  63. package/clis/doubao/utils.test.js +22 -2
  64. package/clis/eastmoney/announcement.js +1 -1
  65. package/clis/eastmoney/convertible.js +1 -1
  66. package/clis/eastmoney/etf.js +1 -1
  67. package/clis/eastmoney/holders.js +1 -1
  68. package/clis/eastmoney/index-board.js +1 -1
  69. package/clis/eastmoney/kline.js +1 -1
  70. package/clis/eastmoney/kuaixun.js +1 -1
  71. package/clis/eastmoney/longhu.js +1 -1
  72. package/clis/eastmoney/money-flow.js +1 -1
  73. package/clis/eastmoney/northbound.js +1 -1
  74. package/clis/eastmoney/quote.js +1 -1
  75. package/clis/eastmoney/rank.js +1 -1
  76. package/clis/eastmoney/sectors.js +1 -1
  77. package/clis/facebook/marketplace-inbox.js +83 -0
  78. package/clis/facebook/marketplace-listings.js +83 -0
  79. package/clis/facebook/marketplace.test.js +91 -0
  80. package/clis/google/news.js +1 -1
  81. package/clis/google/suggest.js +1 -1
  82. package/clis/google/trends.js +1 -1
  83. package/clis/google-scholar/cite.js +74 -0
  84. package/clis/google-scholar/cite.test.js +47 -0
  85. package/clis/google-scholar/profile.js +92 -0
  86. package/clis/google-scholar/profile.test.js +49 -0
  87. package/clis/google-scholar/search.js +1 -1
  88. package/clis/google-scholar/search.test.js +15 -0
  89. package/clis/hf/top.js +1 -1
  90. package/clis/instagram/collection-create.js +57 -0
  91. package/clis/instagram/saved.js +21 -7
  92. package/clis/jd/item.js +679 -47
  93. package/clis/jd/item.test.js +318 -7
  94. package/clis/jd/item.test.ts +517 -0
  95. package/clis/lesswrong/comments.js +1 -1
  96. package/clis/lesswrong/curated.js +1 -1
  97. package/clis/lesswrong/frontpage.js +1 -1
  98. package/clis/lesswrong/new.js +1 -1
  99. package/clis/lesswrong/read.js +1 -1
  100. package/clis/lesswrong/sequences.js +1 -1
  101. package/clis/lesswrong/shortform.js +1 -1
  102. package/clis/lesswrong/tag.js +1 -1
  103. package/clis/lesswrong/tags.js +1 -1
  104. package/clis/lesswrong/top-month.js +1 -1
  105. package/clis/lesswrong/top-week.js +1 -1
  106. package/clis/lesswrong/top-year.js +1 -1
  107. package/clis/lesswrong/top.js +1 -1
  108. package/clis/lesswrong/user-posts.js +1 -1
  109. package/clis/lesswrong/user.js +1 -1
  110. package/clis/paperreview/commands.test.js +6 -6
  111. package/clis/paperreview/feedback.js +1 -1
  112. package/clis/paperreview/review.js +1 -1
  113. package/clis/paperreview/submit.js +1 -1
  114. package/clis/producthunt/posts.js +1 -1
  115. package/clis/producthunt/today.js +1 -1
  116. package/clis/sinablog/search.js +1 -1
  117. package/clis/sinafinance/news.js +1 -1
  118. package/clis/sinafinance/stock.js +1 -1
  119. package/clis/sinafinance/stock.test.js +2 -2
  120. package/clis/spotify/spotify.js +6 -6
  121. package/clis/substack/search.js +1 -1
  122. package/clis/toutiao/articles.js +5 -6
  123. package/clis/toutiao/articles.test.js +22 -15
  124. package/clis/twitter/followers.js +2 -2
  125. package/clis/twitter/following.js +224 -73
  126. package/clis/twitter/following.test.js +277 -0
  127. package/clis/twitter/post.js +184 -47
  128. package/clis/twitter/post.test.js +114 -34
  129. package/clis/uiverse/_shared.js +63 -4
  130. package/clis/uiverse/_shared.test.js +7 -0
  131. package/clis/uiverse/code.js +1 -0
  132. package/clis/uiverse/navigation.test.js +12 -0
  133. package/clis/uiverse/preview.js +1 -0
  134. package/clis/web/read.js +319 -81
  135. package/clis/web/read.test.js +221 -5
  136. package/clis/weibo/favorites.js +169 -0
  137. package/clis/weibo/favorites.test.js +114 -0
  138. package/clis/weibo/publish.js +282 -0
  139. package/clis/weibo/publish.test.js +183 -0
  140. package/clis/weread/ranking.js +1 -1
  141. package/clis/weread/search-regression.test.js +8 -8
  142. package/clis/weread/search.js +1 -1
  143. package/clis/wikipedia/random.js +1 -1
  144. package/clis/wikipedia/search.js +1 -1
  145. package/clis/wikipedia/summary.js +1 -1
  146. package/clis/wikipedia/trending.js +1 -1
  147. package/clis/xianyu/chat.js +3 -3
  148. package/clis/xianyu/item.js +2 -2
  149. package/clis/xianyu/item.test.js +3 -3
  150. package/clis/xiaohongshu/search.js +17 -2
  151. package/clis/xiaohongshu/search.test.js +37 -1
  152. package/clis/xiaoyuzhou/download.js +1 -1
  153. package/clis/xiaoyuzhou/download.test.js +3 -3
  154. package/clis/xiaoyuzhou/episode.js +1 -1
  155. package/clis/xiaoyuzhou/podcast-episodes.js +1 -1
  156. package/clis/xiaoyuzhou/podcast-episodes.test.js +2 -2
  157. package/clis/xiaoyuzhou/podcast.js +1 -1
  158. package/clis/xiaoyuzhou/transcript.js +1 -1
  159. package/clis/xiaoyuzhou/transcript.test.js +5 -5
  160. package/clis/yollomi/models.js +1 -1
  161. package/clis/youtube/channel.js +24 -1
  162. package/clis/youtube/channel.test.js +59 -0
  163. package/clis/zhihu/answer.js +21 -162
  164. package/clis/zhihu/answer.test.js +26 -53
  165. package/clis/zhihu/collection.js +197 -0
  166. package/clis/zhihu/collection.test.js +290 -0
  167. package/clis/zhihu/collections.js +127 -0
  168. package/clis/zhihu/collections.test.js +182 -0
  169. package/clis/zhihu/comment.js +24 -305
  170. package/clis/zhihu/comment.test.js +31 -35
  171. package/clis/zhihu/favorite.js +44 -182
  172. package/clis/zhihu/favorite.test.js +30 -167
  173. package/clis/zhihu/follow.js +25 -56
  174. package/clis/zhihu/follow.test.js +20 -23
  175. package/clis/zhihu/like.js +22 -67
  176. package/clis/zhihu/like.test.js +19 -42
  177. package/clis/zhihu/search.js +3 -2
  178. package/clis/zhihu/write-shared.js +8 -1
  179. package/clis/zhihu/write-shared.test.js +1 -0
  180. package/clis/zlibrary/commands.test.js +75 -0
  181. package/clis/zlibrary/info.js +47 -0
  182. package/clis/zlibrary/search.js +46 -0
  183. package/clis/zlibrary/utils.js +136 -0
  184. package/dist/src/adapter-source.d.ts +11 -0
  185. package/dist/src/adapter-source.js +24 -0
  186. package/dist/src/adapter-source.test.js +29 -0
  187. package/dist/src/browser/base-page.d.ts +3 -1
  188. package/dist/src/browser/base-page.js +76 -1
  189. package/dist/src/browser/base-page.test.d.ts +1 -0
  190. package/dist/src/browser/base-page.test.js +74 -0
  191. package/dist/src/browser/bridge.d.ts +1 -2
  192. package/dist/src/browser/bridge.js +40 -41
  193. package/dist/src/browser/cdp.d.ts +1 -0
  194. package/dist/src/browser/cdp.js +3 -3
  195. package/dist/src/browser/daemon-client.d.ts +38 -4
  196. package/dist/src/browser/daemon-client.js +24 -7
  197. package/dist/src/browser/daemon-client.test.js +49 -0
  198. package/dist/src/browser/daemon-lifecycle.d.ts +23 -0
  199. package/dist/src/browser/daemon-lifecycle.js +67 -0
  200. package/dist/src/browser/daemon-version.d.ts +4 -0
  201. package/dist/src/browser/daemon-version.js +12 -0
  202. package/dist/src/browser/errors.js +3 -0
  203. package/dist/src/browser/errors.test.js +3 -0
  204. package/dist/src/browser/network-cache.d.ts +1 -0
  205. package/dist/src/browser/page.d.ts +3 -1
  206. package/dist/src/browser/page.js +10 -2
  207. package/dist/src/browser/profile.d.ts +14 -0
  208. package/dist/src/browser/profile.js +85 -0
  209. package/dist/src/build-manifest.d.ts +2 -0
  210. package/dist/src/build-manifest.js +13 -3
  211. package/dist/src/build-manifest.test.js +20 -2
  212. package/dist/src/cli.d.ts +6 -0
  213. package/dist/src/cli.js +477 -35
  214. package/dist/src/cli.test.js +303 -2
  215. package/dist/src/commanderAdapter.js +17 -9
  216. package/dist/src/commanderAdapter.test.js +67 -2
  217. package/dist/src/commands/daemon.d.ts +2 -0
  218. package/dist/src/commands/daemon.js +42 -1
  219. package/dist/src/commands/daemon.test.js +103 -2
  220. package/dist/src/completion-shared.js +1 -2
  221. package/dist/src/completion.test.js +3 -2
  222. package/dist/src/daemon.js +125 -41
  223. package/dist/src/doctor.d.ts +5 -6
  224. package/dist/src/doctor.js +77 -19
  225. package/dist/src/doctor.test.js +117 -0
  226. package/dist/src/engine.test.js +6 -5
  227. package/dist/src/errors.d.ts +14 -8
  228. package/dist/src/errors.js +36 -30
  229. package/dist/src/errors.test.js +5 -5
  230. package/dist/src/execution.d.ts +4 -0
  231. package/dist/src/execution.js +173 -25
  232. package/dist/src/execution.test.js +171 -1
  233. package/dist/src/main.js +10 -0
  234. package/dist/src/observation/artifact.d.ts +16 -0
  235. package/dist/src/observation/artifact.js +260 -0
  236. package/dist/src/observation/artifact.test.d.ts +1 -0
  237. package/dist/src/observation/artifact.test.js +121 -0
  238. package/dist/src/observation/events.d.ts +89 -0
  239. package/dist/src/observation/events.js +1 -0
  240. package/dist/src/observation/index.d.ts +7 -0
  241. package/dist/src/observation/index.js +7 -0
  242. package/dist/src/observation/manager.d.ts +9 -0
  243. package/dist/src/observation/manager.js +27 -0
  244. package/dist/src/observation/manager.test.d.ts +1 -0
  245. package/dist/src/observation/manager.test.js +13 -0
  246. package/dist/src/observation/redaction.d.ts +11 -0
  247. package/dist/src/observation/redaction.js +81 -0
  248. package/dist/src/observation/redaction.test.d.ts +1 -0
  249. package/dist/src/observation/redaction.test.js +32 -0
  250. package/dist/src/observation/retention.d.ts +32 -0
  251. package/dist/src/observation/retention.js +160 -0
  252. package/dist/src/observation/retention.test.d.ts +1 -0
  253. package/dist/src/observation/retention.test.js +118 -0
  254. package/dist/src/observation/ring-buffer.d.ts +22 -0
  255. package/dist/src/observation/ring-buffer.js +45 -0
  256. package/dist/src/observation/ring-buffer.test.d.ts +1 -0
  257. package/dist/src/observation/ring-buffer.test.js +22 -0
  258. package/dist/src/observation/session.d.ts +25 -0
  259. package/dist/src/observation/session.js +50 -0
  260. package/dist/src/pipeline/executor.test.js +1 -0
  261. package/dist/src/pipeline/steps/download.test.js +1 -0
  262. package/dist/src/pipeline/steps/fetch.js +1 -21
  263. package/dist/src/pipeline/steps/fetch.test.js +6 -12
  264. package/dist/src/plugin-scaffold.js +1 -1
  265. package/dist/src/plugin-scaffold.test.js +1 -1
  266. package/dist/src/registry.d.ts +40 -9
  267. package/dist/src/registry.js +3 -1
  268. package/dist/src/runtime-detect.d.ts +10 -0
  269. package/dist/src/runtime-detect.js +19 -0
  270. package/dist/src/runtime-detect.test.js +12 -1
  271. package/dist/src/runtime.d.ts +2 -0
  272. package/dist/src/runtime.js +1 -0
  273. package/dist/src/types.d.ts +22 -0
  274. package/dist/src/update-check.d.ts +31 -1
  275. package/dist/src/update-check.js +62 -16
  276. package/dist/src/update-check.test.js +86 -1
  277. package/package.json +1 -1
  278. package/dist/src/diagnostic.d.ts +0 -63
  279. package/dist/src/diagnostic.js +0 -292
  280. package/dist/src/diagnostic.test.js +0 -302
  281. /package/dist/src/{diagnostic.test.d.ts → adapter-source.test.d.ts} +0 -0
package/clis/web/read.js CHANGED
@@ -15,63 +15,204 @@
15
15
  */
16
16
  import { cli, Strategy } from '@jackwener/opencli/registry';
17
17
  import { downloadArticle } from '@jackwener/opencli/download/article-download';
18
- const command = cli({
19
- site: 'web',
20
- name: 'read',
21
- description: 'Fetch any web page and export as Markdown',
22
- strategy: Strategy.COOKIE,
23
- navigateBefore: false, // we handle navigation ourselves
24
- args: [
25
- { name: 'url', required: true, help: 'Any web page URL' },
26
- { name: 'output', default: './web-articles', help: 'Output directory' },
27
- { name: 'download-images', type: 'boolean', default: true, help: 'Download images locally' },
28
- { name: 'wait', type: 'int', default: 3, help: 'Seconds to wait after page load' },
29
- { name: 'stdout', type: 'boolean', default: false, help: 'Print markdown to stdout instead of saving to a file' },
30
- ],
31
- columns: ['title', 'author', 'publish_time', 'status', 'size', 'saved'],
32
- func: async (page, kwargs) => {
33
- const url = kwargs.url;
34
- const waitSeconds = kwargs.wait ?? 3;
35
- // Navigate to the target URL
36
- await page.goto(url);
37
- await page.wait(waitSeconds);
38
- // Extract article content using browser-side heuristics
39
- const data = await page.evaluate(`
18
+
19
+ const NETWORK_IDLE_QUIET_MS = 1000;
20
+ const NETWORK_IDLE_POLL_MS = 500;
21
+
22
+ function sleep(ms) {
23
+ return new Promise(resolve => setTimeout(resolve, ms));
24
+ }
25
+
26
+ function boolish(value) {
27
+ if (value === true) return true;
28
+ if (typeof value === 'string') return ['1', 'true', 'yes', 'on'].includes(value.toLowerCase());
29
+ return false;
30
+ }
31
+
32
+ function normalizeFrameMode(value) {
33
+ const mode = String(value || 'same-origin').toLowerCase();
34
+ if (['same-origin', 'none'].includes(mode)) return mode;
35
+ return 'same-origin';
36
+ }
37
+
38
+ function normalizeWaitUntil(value) {
39
+ const waitUntil = String(value || 'domstable').toLowerCase();
40
+ if (['domstable', 'networkidle'].includes(waitUntil)) return waitUntil;
41
+ return 'domstable';
42
+ }
43
+
44
+ function normalizeNetworkEntry(entry) {
45
+ const preview = typeof entry?.responsePreview === 'string' ? entry.responsePreview : '';
46
+ return {
47
+ method: typeof entry?.method === 'string' ? entry.method : 'GET',
48
+ url: typeof entry?.url === 'string' ? entry.url : '',
49
+ status: typeof entry?.responseStatus === 'number' ? entry.responseStatus : 0,
50
+ contentType: typeof entry?.responseContentType === 'string' ? entry.responseContentType : '',
51
+ size: typeof entry?.responseBodyFullSize === 'number' ? entry.responseBodyFullSize : preview.length,
52
+ bodyTruncated: entry?.responseBodyTruncated === true,
53
+ };
54
+ }
55
+
56
+ function isInterestingNetworkEntry(entry) {
57
+ const ct = (entry.contentType || '').toLowerCase();
58
+ const url = entry.url || '';
59
+ const method = (entry.method || 'GET').toUpperCase();
60
+ const staticAsset = /\.(js|css|png|jpg|jpeg|gif|svg|woff|woff2|ico|map)(\?|$)/i.test(url);
61
+ const noisy = /analytics|tracking|telemetry|beacon|pixel|gtag|fbevents/i.test(url);
62
+ const apiLikeUrl = /\/(api|ajax|graphql|rest|service|handler)(\/|[?._-]|$)|\.(ashx|aspx|asmx|php)(\?|$)/i.test(url);
63
+ const dataLikeContent = ct.includes('json')
64
+ || ct.includes('xml')
65
+ || ct.includes('text/plain')
66
+ || ct.includes('javascript')
67
+ || (apiLikeUrl && ct.includes('text/html'));
68
+ return (
69
+ !staticAsset
70
+ && !noisy
71
+ && (dataLikeContent || apiLikeUrl || method !== 'GET')
72
+ );
73
+ }
74
+
75
+ async function drainNetworkCapture(page, sink) {
76
+ if (!page.readNetworkCapture) return [];
77
+ const raw = await page.readNetworkCapture().catch(() => []);
78
+ const entries = Array.isArray(raw) ? raw.map(normalizeNetworkEntry).filter(entry => entry.url) : [];
79
+ sink.push(...entries);
80
+ return entries;
81
+ }
82
+
83
+ async function maybeStartNetworkCapture(page) {
84
+ if (!page.startNetworkCapture) return false;
85
+ try {
86
+ return await page.startNetworkCapture('');
87
+ } catch {
88
+ return false;
89
+ }
90
+ }
91
+
92
+ async function waitForNetworkIdle(page, maxSeconds, sink) {
93
+ const timeoutMs = Math.max(1, Number(maxSeconds) || 1) * 1000;
94
+ const deadline = Date.now() + timeoutMs;
95
+ let quietSince = Date.now();
96
+ while (Date.now() < deadline) {
97
+ const entries = await drainNetworkCapture(page, sink);
98
+ if (entries.length > 0) quietSince = Date.now();
99
+ if (Date.now() - quietSince >= NETWORK_IDLE_QUIET_MS) return { ok: true };
100
+ await sleep(NETWORK_IDLE_POLL_MS);
101
+ }
102
+ return { ok: false, timedOut: true };
103
+ }
104
+
105
+ function buildWaitForSelectorAcrossFramesJs(selector, timeoutMs) {
106
+ return `
107
+ (async () => {
108
+ const selector = ${JSON.stringify(selector)};
109
+ const timeoutAt = Date.now() + ${Number(timeoutMs) || 10000};
110
+ const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));
111
+ const sameOriginFrameDocs = () => Array.from(document.querySelectorAll('iframe')).map((frame) => {
112
+ try {
113
+ const href = new URL(frame.getAttribute('src') || frame.src || '', window.location.href).href;
114
+ if (new URL(href).origin !== window.location.origin) return null;
115
+ return { href, doc: frame.contentDocument };
116
+ } catch {
117
+ return null;
118
+ }
119
+ }).filter(Boolean);
120
+ const findMatch = () => {
121
+ try {
122
+ if (document.querySelector(selector)) return { ok: true, scope: 'main', url: window.location.href };
123
+ } catch (err) {
124
+ return { ok: false, invalidSelector: true, error: String(err && err.message || err) };
125
+ }
126
+ for (const frame of sameOriginFrameDocs()) {
127
+ try {
128
+ if (frame.doc?.querySelector(selector)) return { ok: true, scope: 'iframe', url: frame.href };
129
+ } catch {}
130
+ }
131
+ return { ok: false };
132
+ };
133
+ while (Date.now() < timeoutAt) {
134
+ const found = findMatch();
135
+ if (found.ok || found.invalidSelector) return found;
136
+ await sleep(100);
137
+ }
138
+ return { ok: false, timedOut: true, selector };
139
+ })()
140
+ `;
141
+ }
142
+
143
+ function buildRenderAwareExtractorJs(options) {
144
+ return `
40
145
  (() => {
146
+ const frameMode = ${JSON.stringify(options.frames)};
41
147
  const result = {
42
148
  title: '',
43
149
  author: '',
44
150
  publishTime: '',
45
151
  contentHtml: '',
46
- imageUrls: []
152
+ imageUrls: [],
153
+ diagnostics: {
154
+ url: window.location.href,
155
+ frames: [],
156
+ emptyContainers: [],
157
+ includedFrameCount: 0
158
+ }
159
+ };
160
+
161
+ const absolutize = (value, base) => {
162
+ if (!value || value.startsWith('data:') || value.startsWith('javascript:') || value.startsWith('#')) return value || '';
163
+ try { return new URL(value, base).href; } catch { return value; }
164
+ };
165
+ const absolutizeTree = (root, base) => {
166
+ root.querySelectorAll?.('[href]').forEach(el => el.setAttribute('href', absolutize(el.getAttribute('href'), base)));
167
+ root.querySelectorAll?.('[src]').forEach(el => el.setAttribute('src', absolutize(el.getAttribute('src'), base)));
168
+ root.querySelectorAll?.('[poster]').forEach(el => el.setAttribute('poster', absolutize(el.getAttribute('poster'), base)));
169
+ root.querySelectorAll?.('[action]').forEach(el => el.setAttribute('action', absolutize(el.getAttribute('action'), base)));
170
+ };
171
+ const textLen = (node) => (node?.textContent || '').replace(/\\s+/g, ' ').trim().length;
172
+ const describeFrame = (frame, index) => {
173
+ const rawSrc = frame.getAttribute('src') || frame.src || '';
174
+ let href = '';
175
+ try { href = new URL(rawSrc, window.location.href).href; } catch { href = rawSrc; }
176
+ let sameOrigin = false;
177
+ try { sameOrigin = href ? new URL(href).origin === window.location.origin : false; } catch {}
178
+ let accessible = false;
179
+ let title = frame.getAttribute('title') || frame.getAttribute('name') || frame.id || '';
180
+ let length = 0;
181
+ try {
182
+ accessible = !!frame.contentDocument;
183
+ title = title || frame.contentDocument?.title || '';
184
+ length = textLen(frame.contentDocument?.body);
185
+ } catch {}
186
+ return { index, src: href, title, sameOrigin, accessible, textLength: length };
187
+ };
188
+ const collectEmptyContainers = (root, scope, baseUrl) => {
189
+ const likely = 'table, tbody, ul[id], ol[id], div[id], section[id], [class*="grid"], [class*="data"], [class*="list"], [id*="grid"], [id*="data"], [id*="list"]';
190
+ root.querySelectorAll?.(likely).forEach((el) => {
191
+ const id = el.getAttribute('id') || '';
192
+ const cls = el.getAttribute('class') || '';
193
+ const name = [id, cls].join(' ').toLowerCase();
194
+ if (!/(grid|data|list|table|content|result)/.test(name) && !['TABLE', 'TBODY', 'UL', 'OL'].includes(el.nodeName)) return;
195
+ if (textLen(el) > 20) return;
196
+ result.diagnostics.emptyContainers.push({
197
+ scope,
198
+ url: baseUrl,
199
+ tag: el.tagName.toLowerCase(),
200
+ id,
201
+ className: cls,
202
+ });
203
+ });
47
204
  };
48
205
 
49
- // --- Title extraction ---
50
- // Priority: og:title > <title> > first <h1>
51
206
  const ogTitle = document.querySelector('meta[property="og:title"]');
52
- if (ogTitle) {
53
- result.title = ogTitle.getAttribute('content')?.trim() || '';
54
- }
55
- if (!result.title) {
56
- result.title = document.title?.trim() || '';
57
- }
58
- if (!result.title) {
59
- const h1 = document.querySelector('h1');
60
- result.title = h1?.textContent?.trim() || 'untitled';
61
- }
62
- // Strip site suffix (e.g. " | Anthropic", " - Blog")
207
+ if (ogTitle) result.title = ogTitle.getAttribute('content')?.trim() || '';
208
+ if (!result.title) result.title = document.title?.trim() || '';
209
+ if (!result.title) result.title = document.querySelector('h1')?.textContent?.trim() || 'untitled';
63
210
  result.title = result.title.replace(/\\s*[|\\-–—]\\s*[^|\\-–—]{1,30}$/, '').trim();
64
211
 
65
- // --- Author extraction ---
66
- const authorMeta = document.querySelector(
67
- 'meta[name="author"], meta[property="article:author"], meta[name="twitter:creator"]'
68
- );
212
+ const authorMeta = document.querySelector('meta[name="author"], meta[property="article:author"], meta[name="twitter:creator"]');
69
213
  result.author = authorMeta?.getAttribute('content')?.trim() || '';
70
214
 
71
- // --- Publish time extraction ---
72
- const timeMeta = document.querySelector(
73
- 'meta[property="article:published_time"], meta[name="date"], meta[name="publishdate"], time[datetime]'
74
- );
215
+ const timeMeta = document.querySelector('meta[property="article:published_time"], meta[name="date"], meta[name="publishdate"], time[datetime]');
75
216
  if (timeMeta) {
76
217
  result.publishTime = timeMeta.getAttribute('content')
77
218
  || timeMeta.getAttribute('datetime')
@@ -79,34 +220,19 @@ const command = cli({
79
220
  || '';
80
221
  }
81
222
 
82
- // --- Content extraction ---
83
- // Strategy: try semantic elements first, then fall back to largest text block
84
223
  let contentEl = null;
85
-
86
- // 1. <article>
87
224
  const articles = document.querySelectorAll('article');
88
225
  if (articles.length === 1) {
89
226
  contentEl = articles[0];
90
227
  } else if (articles.length > 1) {
91
- // Pick the largest article by text length
92
228
  let maxLen = 0;
93
229
  articles.forEach(a => {
94
- const len = a.textContent?.length || 0;
230
+ const len = textLen(a);
95
231
  if (len > maxLen) { maxLen = len; contentEl = a; }
96
232
  });
97
233
  }
98
-
99
- // 2. [role="main"]
100
- if (!contentEl) {
101
- contentEl = document.querySelector('[role="main"]');
102
- }
103
-
104
- // 3. <main>
105
- if (!contentEl) {
106
- contentEl = document.querySelector('main');
107
- }
108
-
109
- // 4. Largest text-dense block fallback
234
+ if (!contentEl) contentEl = document.querySelector('[role="main"]');
235
+ if (!contentEl) contentEl = document.querySelector('main');
110
236
  if (!contentEl) {
111
237
  const candidates = document.querySelectorAll(
112
238
  'div[class*="content"], div[class*="article"], div[class*="post"], ' +
@@ -115,26 +241,51 @@ const command = cli({
115
241
  );
116
242
  let maxLen = 0;
117
243
  candidates.forEach(c => {
118
- const len = c.textContent?.length || 0;
244
+ const len = textLen(c);
119
245
  if (len > maxLen) { maxLen = len; contentEl = c; }
120
246
  });
121
247
  }
248
+ if (!contentEl || textLen(contentEl) < 200) contentEl = document.body;
249
+
250
+ const clone = contentEl.cloneNode(true);
251
+ absolutizeTree(clone, window.location.href);
252
+
253
+ const originalFrames = Array.from(contentEl.querySelectorAll('iframe'));
254
+ const clonedFrames = Array.from(clone.querySelectorAll('iframe'));
255
+ const allFrames = Array.from(document.querySelectorAll('iframe'));
256
+ result.diagnostics.frames = allFrames.map(describeFrame);
122
257
 
123
- // 5. Last resort: document.body
124
- if (!contentEl || (contentEl.textContent?.length || 0) < 200) {
125
- contentEl = document.body;
258
+ if (frameMode === 'same-origin') {
259
+ originalFrames.forEach((frame, index) => {
260
+ const cloned = clonedFrames[index];
261
+ if (!cloned) return;
262
+ const desc = describeFrame(frame, index);
263
+ if (!desc.sameOrigin || !desc.accessible) return;
264
+ try {
265
+ const doc = frame.contentDocument;
266
+ if (!doc?.body) return;
267
+ const frameBody = doc.body.cloneNode(true);
268
+ absolutizeTree(frameBody, desc.src || window.location.href);
269
+ collectEmptyContainers(frameBody, 'iframe', desc.src);
270
+ const section = document.createElement('section');
271
+ section.setAttribute('data-opencli-iframe-source', desc.src);
272
+ const heading = document.createElement('h2');
273
+ heading.textContent = '来自 iframe: ' + (desc.src || frame.getAttribute('src') || ('#' + index));
274
+ section.appendChild(heading);
275
+ Array.from(frameBody.childNodes).forEach(node => section.appendChild(node));
276
+ cloned.replaceWith(section);
277
+ result.diagnostics.includedFrameCount += 1;
278
+ } catch {}
279
+ });
126
280
  }
127
281
 
128
- // Clean up noise elements before extraction
129
- const clone = contentEl.cloneNode(true);
282
+ collectEmptyContainers(clone, 'main', window.location.href);
283
+
130
284
  const noise = 'nav, header, footer, aside, .sidebar, .nav, .menu, .footer, ' +
131
285
  '.header, .comments, .comment, .ad, .ads, .advertisement, .social-share, ' +
132
286
  '.related-posts, .newsletter, .cookie-banner, script, style, noscript, iframe';
133
287
  clone.querySelectorAll(noise).forEach(el => el.remove());
134
288
 
135
- // Deduplicate: some sites (e.g. Anthropic) render each paragraph twice
136
- // (a visible version + a line-broken animation version with missing spaces).
137
- // Compare by stripping ALL whitespace so "Hello world" matches "Helloworld".
138
289
  const stripWS = (s) => (s || '').replace(/\\s+/g, '');
139
290
  const dedup = (parent) => {
140
291
  const children = Array.from(parent.children || []);
@@ -144,9 +295,7 @@ const command = cli({
144
295
  const cur = stripWS(curRaw);
145
296
  const prev = stripWS(prevRaw);
146
297
  if (cur.length < 20 || prev.length < 20) continue;
147
- // Exact match after whitespace strip, or >90% overlap
148
298
  if (cur === prev) {
149
- // Keep the one with more proper spacing (more spaces = better formatted)
150
299
  const curSpaces = (curRaw.match(/ /g) || []).length;
151
300
  const prevSpaces = (prevRaw.match(/ /g) || []).length;
152
301
  if (curSpaces >= prevSpaces) children[i - 1].remove();
@@ -163,10 +312,6 @@ const command = cli({
163
312
  if (el.children && el.children.length > 2) dedup(el);
164
313
  });
165
314
 
166
- // --- Lazy-load image src rewrite ---
167
- // Many sites render <img src="placeholder.gif" data-src="real.jpg">.
168
- // Promote the real URL onto src so both the markdown body and the
169
- // image download list reference the same URL.
170
315
  clone.querySelectorAll('img').forEach(img => {
171
316
  const srcset = img.getAttribute('data-srcset') || '';
172
317
  const srcsetFirst = srcset.split(',')[0]?.trim().split(' ')[0] || '';
@@ -174,12 +319,11 @@ const command = cli({
174
319
  || img.getAttribute('data-original')
175
320
  || img.getAttribute('data-lazy-src')
176
321
  || srcsetFirst;
177
- if (real) img.setAttribute('src', real);
322
+ if (real) img.setAttribute('src', absolutize(real, window.location.href));
178
323
  });
179
324
 
180
325
  result.contentHtml = clone.innerHTML;
181
326
 
182
- // --- Image extraction ---
183
327
  const seen = new Set();
184
328
  clone.querySelectorAll('img').forEach(img => {
185
329
  const src = img.getAttribute('src') || '';
@@ -191,7 +335,87 @@ const command = cli({
191
335
 
192
336
  return result;
193
337
  })()
194
- `);
338
+ `;
339
+ }
340
+
341
/**
 * Build the plain-text `[web-read diagnose]` report for one extraction run.
 *
 * The report lists the page URL, the frames the extractor saw, containers it
 * found empty, and the captured network requests that look like API calls.
 * Each section is capped (20 frames, 12 empty containers, 20 requests) so the
 * output stays readable on busy pages.
 *
 * @param {object|null|undefined} data - Extractor result; only `data.diagnostics` is read.
 * @param {Array<object>} networkEntries - Captured requests (`method`, `status`, `contentType`, `url`).
 *   A non-array value is treated as "no entries".
 * @param {boolean} captureSupported - Whether network capture was started for this run.
 * @returns {string} Newline-joined report, terminated by a trailing newline.
 */
function formatDiagnostics(data, networkEntries, captureSupported) {
  const diag = data?.diagnostics || {};
  // Normalize once so the counts and the loops below always agree.
  const frames = Array.isArray(diag.frames) ? diag.frames : [];
  const emptyContainers = Array.isArray(diag.emptyContainers) ? diag.emptyContainers : [];
  const entries = Array.isArray(networkEntries) ? networkEntries : [];

  const lines = [];
  lines.push('[web-read diagnose]');
  lines.push(`url: ${diag.url || '-'}`);
  lines.push(`frames: ${frames.length}, included_same_origin: ${diag.includedFrameCount || 0}`);
  for (const frame of frames.slice(0, 20)) {
    lines.push(` [frame ${frame.index}] ${frame.sameOrigin ? 'same-origin' : 'cross-origin'} ${frame.accessible ? 'accessible' : 'blocked'} text=${frame.textLength || 0} ${frame.src || '-'}`);
  }

  if (emptyContainers.length > 0) {
    lines.push(`empty_containers: ${emptyContainers.length}`);
    for (const item of emptyContainers.slice(0, 12)) {
      // This is a regex literal in ordinary JS (not inside a template string),
      // so whitespace is `\s`; `\\s` would match a literal backslash + "s" and
      // leave multi-class names unsplit.
      const classes = item.className
        ? String(item.className).trim().split(/\s+/).filter(Boolean)
        : [];
      const idPart = item.id ? `#${item.id}` : '';
      // Skip the class part entirely when nothing is left, to avoid a dangling ".".
      const classPart = classes.length > 0 ? `.${classes.join('.')}` : '';
      const selector = `${item.tag}${idPart}${classPart}`;
      lines.push(` ${item.scope}: ${selector} (${item.url || '-'})`);
    }
  }

  const interesting = entries.filter(isInterestingNetworkEntry);
  lines.push(`network_capture: ${captureSupported ? 'enabled' : 'unavailable'}, entries=${entries.length}, api_like=${interesting.length}`);
  for (const entry of interesting.slice(0, 20)) {
    lines.push(` ${entry.method} ${entry.status || '-'} ${entry.contentType || '-'} ${entry.url}`);
  }

  return `${lines.join('\n')}\n`;
}
364
+
365
+ const command = cli({
366
+ site: 'web',
367
+ name: 'read',
368
+ description: 'Fetch any web page and export as Markdown',
369
+ strategy: Strategy.COOKIE,
370
+ navigateBefore: false, // we handle navigation ourselves
371
+ args: [
372
+ { name: 'url', required: true, help: 'Any web page URL' },
373
+ { name: 'output', default: './web-articles', help: 'Output directory' },
374
+ { name: 'download-images', type: 'boolean', default: true, help: 'Download images locally' },
375
+ { name: 'wait', type: 'int', default: 3, help: 'Seconds to wait after page load' },
376
+ { name: 'wait-for', valueRequired: true, help: 'CSS selector to wait for in the main document or same-origin iframes' },
377
+ { name: 'wait-until', default: 'domstable', choices: ['domstable', 'networkidle'], help: 'Readiness policy after navigation: domstable or networkidle' },
378
+ { name: 'frames', default: 'same-origin', choices: ['same-origin', 'none'], help: 'Iframe handling mode: same-origin or none' },
379
+ { name: 'diagnose', type: 'boolean', default: false, help: 'Print render diagnostics (frames, empty containers, XHR/API-like requests) to stderr' },
380
+ { name: 'stdout', type: 'boolean', default: false, help: 'Print markdown to stdout instead of saving to a file' },
381
+ ],
382
+ columns: ['title', 'author', 'publish_time', 'status', 'size', 'saved'],
383
+ func: async (page, kwargs, debug = false) => {
384
+ const url = kwargs.url;
385
+ const waitSeconds = kwargs.wait ?? 3;
386
+ const waitUntil = normalizeWaitUntil(kwargs['wait-until']);
387
+ const frameMode = normalizeFrameMode(kwargs.frames);
388
+ const shouldDiagnose = boolish(kwargs.diagnose) || debug || !!process.env.OPENCLI_VERBOSE;
389
+ const networkEntries = [];
390
+ const captureSupported = (waitUntil === 'networkidle' || shouldDiagnose)
391
+ ? await maybeStartNetworkCapture(page)
392
+ : false;
393
+ // Navigate to the target URL
394
+ await page.goto(url);
395
+ if (kwargs['wait-for']) {
396
+ const waitResult = await page.evaluate(buildWaitForSelectorAcrossFramesJs(String(kwargs['wait-for']), waitSeconds * 1000));
397
+ if (waitResult?.invalidSelector) {
398
+ throw new Error(`Invalid --wait-for selector "${kwargs['wait-for']}": ${waitResult.error || 'querySelector failed'}`);
399
+ }
400
+ if (!waitResult?.ok) {
401
+ throw new Error(`Timed out waiting for selector "${kwargs['wait-for']}" in main document or same-origin iframes`);
402
+ }
403
+ } else if (waitUntil !== 'networkidle') {
404
+ await page.wait(waitSeconds);
405
+ }
406
+ if (waitUntil === 'networkidle') {
407
+ if (!captureSupported) {
408
+ throw new Error('Network capture is unavailable, so --wait-until networkidle cannot be satisfied');
409
+ }
410
+ const idle = await waitForNetworkIdle(page, waitSeconds, networkEntries);
411
+ if (!idle?.ok) {
412
+ throw new Error(`Timed out waiting for network idle after ${waitSeconds}s`);
413
+ }
414
+ }
415
+ // Extract article content using browser-side heuristics
416
+ const data = await page.evaluate(buildRenderAwareExtractorJs({ frames: frameMode }));
417
+ if (captureSupported) await drainNetworkCapture(page, networkEntries);
418
+ if (shouldDiagnose) process.stderr.write(formatDiagnostics(data, networkEntries, captureSupported));
195
419
  // Determine Referer from URL for image downloads
196
420
  let referer = '';
197
421
  try {
@@ -211,6 +435,12 @@ const command = cli({
211
435
  downloadImages: kwargs['download-images'],
212
436
  imageHeaders: referer ? { Referer: referer } : undefined,
213
437
  stdout: kwargs.stdout,
438
+ configureTurndown: (td) => {
439
+ td.addRule('preserveButtons', {
440
+ filter: (node) => node.nodeName === 'BUTTON',
441
+ replacement: (content) => content,
442
+ });
443
+ },
214
444
  });
215
445
  // `--stdout` is a content-streaming mode. The markdown body already went
216
446
  // to process.stdout inside downloadArticle(), so returning rows here
@@ -219,4 +449,12 @@ const command = cli({
219
449
  return kwargs.stdout ? null : result;
220
450
  },
221
451
  });
222
- export const __test__ = { command };
452
+ export const __test__ = {
453
+ command,
454
+ buildRenderAwareExtractorJs,
455
+ buildWaitForSelectorAcrossFramesJs,
456
+ formatDiagnostics,
457
+ isInterestingNetworkEntry,
458
+ normalizeFrameMode,
459
+ normalizeWaitUntil,
460
+ };