tt-help-cli-ycl 1.3.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/package.json +1 -1
  2. package/src/auto-core.mjs +174 -0
  3. package/src/cli/auto.js +94 -0
  4. package/src/cli/explore.js +117 -0
  5. package/src/cli/progress.js +111 -0
  6. package/src/cli/scrape.js +47 -0
  7. package/src/cli/utils.js +18 -0
  8. package/src/cli/videos.js +41 -0
  9. package/src/cli/watch.js +28 -0
  10. package/src/data-store.mjs +213 -0
  11. package/src/{explore-core.cjs → explore-core.mjs} +148 -157
  12. package/src/{get-user-videos-core.cjs → get-user-videos-core.mjs} +6 -23
  13. package/src/lib/args.js +19 -38
  14. package/src/lib/auto-browser.mjs +5 -12
  15. package/src/lib/browser/anti-detect.js +23 -0
  16. package/src/lib/browser/cdp.js +142 -0
  17. package/src/lib/browser/launch.js +43 -0
  18. package/src/lib/browser/page.js +62 -0
  19. package/src/lib/constants.js +13 -95
  20. package/src/lib/delay.js +54 -0
  21. package/src/lib/explore.js +16 -123
  22. package/src/lib/fetcher.js +3 -18
  23. package/src/lib/get-user-videos-browser.mjs +1 -6
  24. package/src/lib/io.js +8 -30
  25. package/src/lib/parser.js +1 -1
  26. package/src/lib/retry.js +44 -0
  27. package/src/lib/scrape-browser.mjs +1 -6
  28. package/src/lib/scrape.js +5 -4
  29. package/src/lib/url.js +52 -0
  30. package/src/main.mjs +59 -822
  31. package/src/scraper/{core.cjs → core.mjs} +25 -57
  32. package/src/scraper/modules/{comment-extractor.cjs → comment-extractor.mjs} +23 -15
  33. package/src/scraper/modules/follow-extractor.mjs +121 -0
  34. package/src/scraper/modules/{guess-extractor.cjs → guess-extractor.mjs} +3 -5
  35. package/src/scraper/modules/page-error-detector.mjs +68 -0
  36. package/src/scraper/modules/page-helpers.mjs +44 -0
  37. package/src/scraper/modules/scroll-collector.mjs +189 -0
  38. package/src/watch/public/index.html +139 -64
  39. package/src/watch/server.mjs +234 -153
  40. package/src/auto-core.cjs +0 -367
  41. package/src/data-store.cjs +0 -69
  42. package/src/get-user-videos.cjs +0 -59
  43. package/src/scraper/index.cjs +0 -97
  44. package/src/scraper/modules/follow-extractor.cjs +0 -112
  45. package/src/scraper/modules/page-helpers.cjs +0 -422
  46. package/src/scraper/modules/scroll-collector.cjs +0 -173
  47. package/src/scraper/modules/video-scanner.cjs +0 -43
@@ -1,4 +1,4 @@
1
- const {
1
+ import {
2
2
  closeCommentPanel,
3
3
  delay,
4
4
  ensureBrowserReady,
@@ -6,101 +6,78 @@ const {
6
6
  setDelayConfig,
7
7
  getDelayConfig,
8
8
  retryWithBackoff,
9
- } = require("./modules/page-helpers.cjs");
10
- const { extractCommentAuthors } = require("./modules/comment-extractor.cjs");
11
- const { extractGuessVideos } = require("./modules/guess-extractor.cjs");
9
+ } from './modules/page-helpers.mjs';
10
+ import { extractCommentAuthors } from './modules/comment-extractor.mjs';
11
+ import { extractGuessVideos } from './modules/guess-extractor.mjs';
12
12
 
13
13
  async function scrapeSingleVideo(page, maxComments, maxGuess, log, location = 'ES') {
14
14
  const config = getDelayConfig();
15
15
 
16
- await page
17
- .waitForSelector('[class*="VideoMeta"]', { timeout: 10000 })
18
- .catch(() => {});
16
+ await page.waitForSelector('[class*="VideoMeta"]', { timeout: 10000 }).catch(() => {});
19
17
  await delay(Math.round(config.commentMax * 0.3), config.commentMax);
20
18
 
21
19
  const userData = await page.evaluate(() => {
22
20
  const result = {};
23
-
24
- const m = window.location.href.match(/\/@([^\/]+)\/video/);
21
+ const m = window.location.href.match(/\/@([^/]+)\/video/);
25
22
  if (m) result.uniqueId = m[1];
26
-
27
23
  const authorEls = document.querySelectorAll('[class*="Author"]');
28
24
  for (const el of authorEls) {
29
- const text = (el.textContent || "").trim();
30
- if (text && !text.includes("TikTok") && !text.includes("Share")) {
25
+ const text = (el.textContent || '').trim();
26
+ if (text && !text.includes('TikTok') && !text.includes('Share')) {
31
27
  result.nickname = text;
32
28
  break;
33
29
  }
34
30
  }
35
-
36
31
  const html = document.documentElement.outerHTML;
37
32
  const locMatch = html.match(/"locationCreated":"([^"]*)/);
38
33
  if (locMatch) result.locationCreated = locMatch[1];
39
-
40
34
  return result;
41
35
  });
42
36
 
43
- const videoAuthor = userData.uniqueId ? "@" + userData.uniqueId : null;
44
- if (!videoAuthor) {
45
- throw new Error("无法获取视频作者");
46
- }
37
+ const videoAuthor = userData.uniqueId ? '@' + userData.uniqueId : null;
38
+ if (!videoAuthor) throw new Error('无法获取视频作者');
47
39
 
48
40
  let guessVideos = [];
49
41
  let commentUsers = [];
50
42
 
51
43
  if (userData.locationCreated === location) {
52
44
  guessVideos = await extractGuessVideos(page, maxGuess);
53
-
54
45
  commentUsers = await extractCommentAuthors(page, maxComments);
55
46
  await closeCommentPanel(page);
56
47
  await delay(Math.round(config.commentMax * 0.3), config.commentMax);
57
48
  }
58
49
 
59
- const uniqueUsers = [...new Set(commentUsers)];
60
-
61
50
  return {
62
51
  videoAuthor,
63
52
  uniqueId: userData.uniqueId,
64
53
  nickname: userData.nickname,
65
54
  locationCreated: userData.locationCreated,
66
- commentUsers: uniqueUsers,
55
+ commentUsers: [...new Set(commentUsers)],
67
56
  guessVideos,
68
57
  };
69
58
  }
70
59
 
71
60
  async function runScrape(options) {
72
61
  const {
73
- videoUrl,
74
- maxVideos = 20,
75
- maxComments = 999,
76
- maxGuess = 10,
77
- preset = null,
78
- switchMax = null,
79
- commentMax = null,
62
+ videoUrl, maxVideos = 20, maxComments = 999, maxGuess = 10,
63
+ preset = null, switchMax = null, commentMax = null,
80
64
  log = console.error,
81
- browser: externalBrowser = null,
82
- page: externalPage = null,
65
+ browser: externalBrowser = null, page: externalPage = null,
83
66
  } = options;
84
67
 
85
68
  if (preset) {
86
69
  setDelayConfig(preset);
87
70
  } else if (switchMax || commentMax) {
88
- setDelayConfig({
89
- switchMax: switchMax || 5000,
90
- commentMax: commentMax || 3000,
91
- });
71
+ setDelayConfig({ switchMax: switchMax || 5000, commentMax: commentMax || 3000 });
92
72
  }
93
73
 
94
74
  const config = getDelayConfig();
95
-
96
75
  let browser, page;
97
76
  const isExternal = !!(externalBrowser && externalPage);
98
77
 
99
78
  if (!isExternal) {
100
79
  log(`视频地址: ${videoUrl}`);
101
- log(
102
- `视频数: ${maxVideos}, 评论数: ${maxComments}, 猜你喜欢: ${maxGuess}, 切换延迟: ${config.switchMax}ms, 评论延迟: ${config.commentMax}ms`,
103
- );
80
+ log(`视频数: ${maxVideos}, 评论数: ${maxComments}, 猜你喜欢: ${maxGuess}, 切换延迟: ${config.switchMax}ms, 评论延迟: ${config.commentMax}ms`);
104
81
  }
105
82
 
106
83
  if (isExternal) {
@@ -116,7 +93,7 @@ async function runScrape(options) {
116
93
  }
117
94
  }
118
95
 
119
- await retryWithBackoff(() => page.goto(videoUrl, { waitUntil: "load", timeout: 30000 }), { log });
96
+ await retryWithBackoff(() => page.goto(videoUrl, { waitUntil: 'load', timeout: 30000 }), { log });
120
97
  await delay(Math.round(config.switchMax * 0.5), config.switchMax);
121
98
  await closeCommentPanel(page);
122
99
  await delay(Math.round(config.commentMax * 0.5), config.commentMax);
@@ -138,9 +115,7 @@ async function runScrape(options) {
138
115
  log(`[${i + 1}/${maxVideos}] 跳过: ${e.message}`);
139
116
  if (i < maxVideos - 1) {
140
117
  await page.evaluate(() => {
141
- const container = document.querySelector(
142
- '[class*="ColumnListContainer"]',
143
- );
118
+ const container = document.querySelector('[class*="ColumnListContainer"]');
144
119
  if (container) container.scrollTop += 700;
145
120
  else window.scrollBy(0, 700);
146
121
  });
@@ -148,37 +123,30 @@ async function runScrape(options) {
148
123
  }
149
124
  continue;
150
125
  }
126
+
151
127
  allResults.push(result);
152
128
  videoAuthors.add(result.videoAuthor);
153
- result.commentUsers.forEach((u) => commentUsers.add(u));
129
+ result.commentUsers.forEach(u => commentUsers.add(u));
154
130
  allCommentAuthorsList.push(...result.commentUsers);
155
131
  if (result.guessVideos) {
156
132
  allGuessVideos.push(...result.guessVideos);
157
- result.guessVideos.forEach((v) => {
158
- if (v.author) allGuessAuthors.add(v.author);
159
- });
133
+ result.guessVideos.forEach(v => { if (v.author) allGuessAuthors.add(v.author); });
160
134
  }
161
135
 
162
136
  if ((i + 1) % 5 === 0 || i === 0) {
163
- log(
164
- `[${i + 1}/${maxVideos}] ${result.videoAuthor} | 昵称: ${result.nickname || "-"} | 评论用户: ${result.commentUsers.length} | 猜你喜欢: ${result.guessVideos ? result.guessVideos.length : 0}`,
165
- );
137
+ log(`[${i + 1}/${maxVideos}] ${result.videoAuthor} | 昵称: ${result.nickname || '-'} | 评论用户: ${result.commentUsers.length} | 猜你喜欢: ${result.guessVideos ? result.guessVideos.length : 0}`);
166
138
  }
167
139
 
168
140
  if (i < maxVideos - 1) {
169
141
  await page.evaluate(() => {
170
- const container = document.querySelector(
171
- '[class*="ColumnListContainer"]',
172
- );
142
+ const container = document.querySelector('[class*="ColumnListContainer"]');
173
143
  if (container) container.scrollTop += 700;
174
144
  });
175
145
  await delay(2000, config.switchMax);
176
146
  }
177
147
  }
178
148
 
179
- log(
180
- `\n结果: 视频作者 ${videoAuthors.size} | 评论用户 ${commentUsers.size} | 总评论 ${allCommentAuthorsList.length} | 猜你喜欢作者 ${allGuessAuthors.size} | 总猜中视频 ${allGuessVideos.length}`,
181
- );
149
+ log(`\n结果: 视频作者 ${videoAuthors.size} | 评论用户 ${commentUsers.size} | 总评论 ${allCommentAuthorsList.length} | 猜你喜欢作者 ${allGuessAuthors.size} | 总猜中视频 ${allGuessVideos.length}`);
182
150
 
183
151
  const videoDetails = {};
184
152
  for (const r of allResults) {
@@ -211,4 +179,4 @@ async function runScrape(options) {
211
179
  return { output, browser, isExternal };
212
180
  }
213
181
 
214
- module.exports = { scrapeSingleVideo, runScrape };
182
+ export { scrapeSingleVideo, runScrape };
@@ -1,13 +1,22 @@
1
- const { delay, getDelayConfig, closeCommentPanel } = require('./page-helpers.cjs');
2
- const { scrollAndCollect } = require('./scroll-collector.cjs');
1
+ import { delay, getDelayConfig, closeCommentPanel } from "./page-helpers.mjs";
2
+ import { scrollAndCollect } from "./scroll-collector.mjs";
3
3
 
4
4
  async function openCommentPanel(page) {
5
5
  const tabs = page.locator('[class*="tabbar-item"]');
6
- const commentTab = tabs.filter({ hasText: '评论' }).first();
6
+ const commentTab = tabs.filter({ hasText: "评论" }).first();
7
7
  await commentTab.click();
8
- const config = getDelayConfig();
9
- await delay(Math.round(config.commentMax * 0.5), config.commentMax);
10
- await page.waitForSelector('[class*="CommentListContainer"]', { timeout: 5000 }).catch(() => {});
8
+ await page
9
+ .waitForSelector('[class*="CommentListContainer"]', { timeout: 5000 })
10
+ .catch(() => {});
11
+ await page
12
+ .waitForFunction(
13
+ () => {
14
+ const list = document.querySelector('[class*="CommentListContainer"]');
15
+ return list && list.children.length > 0;
16
+ },
17
+ { timeout: 10000 },
18
+ )
19
+ .catch(() => {});
11
20
  }
12
21
 
13
22
  async function extractCommentAuthors(page, maxComments = 10) {
@@ -20,14 +29,15 @@ async function extractCommentAuthors(page, maxComments = 10) {
20
29
  collectFn: (container) => {
21
30
  const list = document.querySelector('[class*="CommentListContainer"]');
22
31
  if (!list) return { items: [] };
23
-
24
32
  const authors = [];
25
- Array.from(list.children).forEach(wrapper => {
26
- const link = wrapper.querySelector('[class*="UsernameContentWrapper"] a');
33
+ Array.from(list.children).forEach((wrapper) => {
34
+ const link = wrapper.querySelector(
35
+ '[class*="UsernameContentWrapper"] a',
36
+ );
27
37
  if (link) {
28
- const href = link.href || link.getAttribute('href');
38
+ const href = link.href || link.getAttribute("href");
29
39
  const m = href && href.match(/@([^/]+)/);
30
- if (m) authors.push('@' + m[1]);
40
+ if (m) authors.push("@" + m[1]);
31
41
  }
32
42
  });
33
43
  return { items: authors };
@@ -35,7 +45,7 @@ async function extractCommentAuthors(page, maxComments = 10) {
35
45
  uniqueKey: (a) => a,
36
46
  maxItems: maxComments,
37
47
  delayRange: [Math.round(config.commentMax * 0.3), config.commentMax],
38
- staleThreshold: 3,
48
+ staleThreshold: 2,
39
49
  });
40
50
 
41
51
  await closeCommentPanel(page);
@@ -44,6 +54,4 @@ async function extractCommentAuthors(page, maxComments = 10) {
44
54
  return allAuthors.slice(0, maxComments);
45
55
  }
46
56
 
47
- module.exports = {
48
- extractCommentAuthors,
49
- };
57
+ export { extractCommentAuthors };
@@ -0,0 +1,121 @@
1
+ import { delay, getDelayConfig } from "./page-helpers.mjs";
2
+ import { scrollAndCollect } from "./scroll-collector.mjs";
3
+
4
+ const FILTER_WORDS = ["主页", "已关注", "粉丝", "推荐"];
5
+
6
+ async function waitForListContent(page, minChildren = 1, timeout = 15000) {
7
+ await page
8
+ .waitForFunction(
9
+ (min) => {
10
+ const container = document.querySelector(
11
+ "[class*=DivUserListContainer]",
12
+ );
13
+ return container && container.children.length >= min;
14
+ },
15
+ minChildren,
16
+ { timeout },
17
+ )
18
+ .catch(() => {});
19
+ }
20
+
21
+ async function openFollowModal(page) {
22
+ const el = await page.$("[data-e2e=following]");
23
+ if (!el) {
24
+ throw new Error(
25
+ "未找到 [data-e2e=following] 元素,请确认当前页面为用户主页",
26
+ );
27
+ }
28
+ await el.evaluate((el) => el.parentElement.click());
29
+ await page
30
+ .waitForSelector("[class*=DivUserListContainer]", { timeout: 5000 })
31
+ .catch(() => {
32
+ throw new Error("关注弹窗未出现 DivUserListContainer");
33
+ });
34
+ await waitForListContent(page, 1, 3000);
35
+ }
36
+
37
+ async function switchToFollowersTab(page) {
38
+ await page.evaluate(() => {
39
+ const tabs = document.querySelectorAll("[class*=DivTabItem]");
40
+ for (const tab of tabs) {
41
+ if (tab.textContent?.includes("粉丝")) {
42
+ tab.click();
43
+ return;
44
+ }
45
+ }
46
+ throw new Error("未找到粉丝 Tab");
47
+ });
48
+ await waitForListContent(page, 1, 3000);
49
+ }
50
+
51
+ async function closeFollowModal(page) {
52
+ await page.evaluate(() => {
53
+ const closeBtn = document.querySelector("[data-e2e=follow-popup-close]");
54
+ if (closeBtn) closeBtn.click();
55
+ });
56
+ await page.waitForTimeout(500);
57
+ }
58
+
59
+ function createUserCollectFn() {
60
+ return (container) => {
61
+ const FILTER_WORDS = ["主页", "已关注", "粉丝", "推荐"];
62
+ const modal = document.querySelector("[class*=eyhy6180]");
63
+ const root = modal || document;
64
+ const users = [];
65
+ const seen = new Set();
66
+ const links = root.querySelectorAll('a[href*="/@"]');
67
+ for (const link of links) {
68
+ const match = link.href.match(/@([^/?]+)/);
69
+ if (!match) continue;
70
+ const handle = "@" + decodeURIComponent(match[1]);
71
+ const text = (link.textContent || "").trim();
72
+ if (text.length <= 2) continue;
73
+ if (FILTER_WORDS.includes(text)) continue;
74
+ if (seen.has(handle)) continue;
75
+ seen.add(handle);
76
+ users.push({ handle, displayName: text });
77
+ }
78
+ return { items: users };
79
+ };
80
+ }
81
+
82
+ async function extractUsersFromModal(page, maxUsers) {
83
+ const config = getDelayConfig();
84
+ const minDelay = Math.max(300, Math.round(config.commentMax * 0.3));
85
+ const maxDelay = Math.max(800, config.commentMax);
86
+
87
+ const allUsers = await scrollAndCollect(page, {
88
+ container: "[class*=DivUserListContainer]",
89
+ findScrollable: false,
90
+ collectFn: createUserCollectFn(),
91
+ uniqueKey: (u) => u.handle,
92
+ maxItems: maxUsers,
93
+ delayRange: [minDelay, maxDelay],
94
+ staleThreshold: 2,
95
+ });
96
+
97
+ return allUsers.slice(0, maxUsers);
98
+ }
99
+
100
+ async function extractFollowAndFollowers(page, options = {}) {
101
+ const { maxFollowing = 999, maxFollowers = 999, log = () => {} } = options;
102
+
103
+ await openFollowModal(page);
104
+
105
+ const following = await extractUsersFromModal(page, maxFollowing);
106
+ log(` 已关注: ${following.length}`);
107
+
108
+ await switchToFollowersTab(page);
109
+
110
+ const followers = await extractUsersFromModal(page, maxFollowers);
111
+ log(` 粉丝: ${followers.length}`);
112
+
113
+ await closeFollowModal(page);
114
+
115
+ return {
116
+ following: following.map((u) => [u.handle, u.displayName]),
117
+ followers: followers.map((u) => [u.handle, u.displayName]),
118
+ };
119
+ }
120
+
121
+ export { extractFollowAndFollowers };
@@ -1,5 +1,5 @@
1
- const { delay, getDelayConfig, closeCommentPanel } = require('./page-helpers.cjs');
2
- const { scrollAndCollect } = require('./scroll-collector.cjs');
1
+ import { delay, getDelayConfig, closeCommentPanel } from './page-helpers.mjs';
2
+ import { scrollAndCollect } from './scroll-collector.mjs';
3
3
 
4
4
  async function openGuessTab(page) {
5
5
  const tabs = page.locator('[class*="tabbar-item"]');
@@ -48,6 +48,4 @@ async function extractGuessVideos(page, maxVideos = 10) {
48
48
  return allVideos.slice(0, maxVideos);
49
49
  }
50
50
 
51
- module.exports = {
52
- extractGuessVideos,
53
- };
51
+ export { extractGuessVideos };
@@ -0,0 +1,68 @@
1
+ const PATTERNS = {
2
+ login_required: [
3
+ "登录 TikTok",
4
+ "登录后查看",
5
+ "查看需登录",
6
+ "Log in to TikTok",
7
+ "Login to TikTok",
8
+ "观众管理功能",
9
+ "Viewer management",
10
+ "私密账号",
11
+ "私密状态",
12
+ ],
13
+ captcha: [
14
+ "captcha",
15
+ "verify",
16
+ "验证码",
17
+ "点击下一步",
18
+ "Press and hold",
19
+ "slide to verify",
20
+ "滑动验证",
21
+ "人机验证",
22
+ "安全验证",
23
+ ],
24
+ rate_limited: [
25
+ "访问过于频繁",
26
+ "操作过于频繁",
27
+ "too many requests",
28
+ "rate limit",
29
+ "稍后再试",
30
+ "try again later",
31
+ "请稍后再来",
32
+ ],
33
+ region_blocked: [
34
+ "地区限制",
35
+ "not available in your",
36
+ "此内容不可用",
37
+ "content not available",
38
+ "currently unavailable",
39
+ "抱歉,此内容",
40
+ "此页面不可用",
41
+ ],
42
+ not_found: [
43
+ "页面不存在",
44
+ "page not found",
45
+ "找不到",
46
+ "Couldn't find this",
47
+ "nothing here",
48
+ "此页面不存在",
49
+ "没有内容",
50
+ ],
51
+ };
52
+
53
+ export async function detectPageError(page) {
54
+ return page.evaluate((patterns) => {
55
+ const bodyText = document.body.innerText;
56
+ const lower = bodyText.toLowerCase();
57
+
58
+ for (const [type, phrases] of Object.entries(patterns)) {
59
+ for (const phrase of phrases) {
60
+ if (lower.includes(phrase.toLowerCase())) {
61
+ return type;
62
+ }
63
+ }
64
+ }
65
+
66
+ return null;
67
+ }, PATTERNS);
68
+ }
@@ -0,0 +1,44 @@
1
+ import {
2
+ delay,
3
+ getDelayConfig,
4
+ setDelayConfig,
5
+ listDelayPresets,
6
+ DELAY_PRESETS,
7
+ } from '../../lib/delay.js';
8
+ import { ensureBrowserReady } from '../../lib/browser/cdp.js';
9
+ import {
10
+ ensureTikTokPage,
11
+ closeCommentPanel,
12
+ findTikTokPage,
13
+ getOrCreatePage,
14
+ } from '../../lib/browser/page.js';
15
+ import { retryWithBackoff, isRetryableError } from '../../lib/retry.js';
16
+ import {
17
+ extractUserSection,
18
+ parseUserSection,
19
+ extractLocationCreated,
20
+ USER_SECTION_SIZE,
21
+ } from '../../lib/parser.js';
22
+ import { detectPageError } from './page-error-detector.mjs';
23
+
24
+ export {
25
+ delay,
26
+ setDelayConfig,
27
+ getDelayConfig,
28
+ listDelayPresets,
29
+ DELAY_PRESETS,
30
+ ensureBrowserReady,
31
+ ensureTikTokPage,
32
+ closeCommentPanel,
33
+ findTikTokPage,
34
+ getOrCreatePage,
35
+ retryWithBackoff,
36
+ isRetryableError,
37
+ extractUserSection,
38
+ parseUserSection,
39
+ extractLocationCreated,
40
+ USER_SECTION_SIZE,
41
+ detectPageError,
42
+ };
43
+
44
+ export const CDP_PORT = 9222;
@@ -0,0 +1,189 @@
1
+ import { delay } from "../../lib/delay.js";
2
+ import { detectPageError } from "./page-error-detector.mjs";
3
+
4
+ async function doCollect(
5
+ page,
6
+ { container, findScrollable, fnStr, extraArgs },
7
+ ) {
8
+ return page.evaluate(
9
+ ({ fn: fnStr, containerSelector, findScrollableFlag, args }) => {
10
+ let el;
11
+ if (!containerSelector) {
12
+ el = window;
13
+ } else {
14
+ el = document.querySelector(containerSelector);
15
+ if (!el) {
16
+ el = window;
17
+ } else if (findScrollableFlag) {
18
+ let current = el;
19
+ let found = false;
20
+ while (current && current !== document.body) {
21
+ if (current.scrollHeight > current.clientHeight + 10) {
22
+ el = current;
23
+ found = true;
24
+ break;
25
+ }
26
+ current = current.parentElement;
27
+ }
28
+ if (!found) el = document.body;
29
+ }
30
+ }
31
+ const fn = eval("(" + fnStr + ")");
32
+ return fn(el, args);
33
+ },
34
+ {
35
+ fn: fnStr,
36
+ containerSelector: container,
37
+ findScrollableFlag: findScrollable,
38
+ args: extraArgs,
39
+ },
40
+ );
41
+ }
42
+
43
+ const LOADING_SELECTORS = [
44
+ '[class*="loading"]',
45
+ '[class*="Loading"]',
46
+ '[class*="spinner"]',
47
+ '[class*="Spinner"]',
48
+ '[class*="skeleton"]',
49
+ '[class*="Skeleton"]',
50
+ '[aria-busy="true"]',
51
+ ];
52
+
53
+ async function waitForLoading(page) {
54
+ const maxWait = 5000;
55
+ const startTime = Date.now();
56
+ while (Date.now() - startTime < maxWait) {
57
+ const isLoading = await page.evaluate((sels) => {
58
+ if (document.readyState !== "complete") return true;
59
+ for (const sel of sels) {
60
+ const el = document.querySelector(sel);
61
+ if (el && el.offsetParent !== null) return true;
62
+ }
63
+ return false;
64
+ }, LOADING_SELECTORS);
65
+ if (!isLoading) return;
66
+ await delay(300, 600);
67
+ }
68
+ }
69
+
70
+ export async function scrollAndCollect(page, options) {
71
+ const {
72
+ container,
73
+ findScrollable = false,
74
+ collectFn,
75
+ extraArgs,
76
+ delayRange = [800, 1500],
77
+ maxItems,
78
+ maxRounds = 200,
79
+ staleThreshold = 3,
80
+ uniqueKey,
81
+ onRound,
82
+ } = options;
83
+
84
+ if (!collectFn) throw new Error("collectFn is required");
85
+
86
+ const fnStr =
87
+ typeof collectFn === "function" ? collectFn.toString() : collectFn;
88
+ const allItems = [];
89
+ const seenKeys = uniqueKey ? new Set() : null;
90
+ let staleCount = 0;
91
+
92
+ const processItems = (result) => {
93
+ const raw = result.items || [];
94
+ const newItems = uniqueKey
95
+ ? raw.filter((item) => {
96
+ const key = uniqueKey(item);
97
+ if (seenKeys.has(key)) return false;
98
+ seenKeys.add(key);
99
+ return true;
100
+ })
101
+ : raw;
102
+ allItems.push(...newItems);
103
+ return newItems;
104
+ };
105
+
106
+ const isDone = (newItems) => {
107
+ if (maxItems !== undefined && allItems.length >= maxItems) return true;
108
+ if (newItems.length === 0) {
109
+ staleCount++;
110
+ if (staleCount >= staleThreshold) return true;
111
+ } else {
112
+ staleCount = 0;
113
+ }
114
+ return false;
115
+ };
116
+
117
+ const collectCtx = { container, findScrollable, fnStr, extraArgs };
118
+
119
+ const pageError = await detectPageError(page);
120
+
121
+ if (pageError) return [];
122
+
123
+ await waitForLoading(page);
124
+ let result = await doCollect(page, collectCtx);
125
+ let newItems = processItems(result);
126
+ if (onRound) onRound(0, newItems, allItems);
127
+ if (isDone(newItems)) return allItems;
128
+
129
+ for (let round = 1; round < maxRounds; round++) {
130
+ await threePhaseScroll(page, { container, findScrollable });
131
+ await delay(delayRange[0], delayRange[1]);
132
+ await waitForLoading(page);
133
+
134
+ result = await doCollect(page, collectCtx);
135
+ newItems = processItems(result);
136
+
137
+ if (onRound) onRound(round, newItems, allItems);
138
+
139
+ if (isDone(newItems)) break;
140
+ }
141
+
142
+ return allItems;
143
+ }
144
+
145
+ async function threePhaseScroll(page, { container, findScrollable }) {
146
+ await page.evaluate(
147
+ async (opts) => {
148
+ let el;
149
+ if (!opts.container) {
150
+ el = window;
151
+ } else {
152
+ el = document.querySelector(opts.container);
153
+ if (!el) {
154
+ el = window;
155
+ } else if (opts.findScrollable) {
156
+ let current = el;
157
+ let found = false;
158
+ while (current && current !== document.body) {
159
+ if (current.scrollHeight > current.clientHeight + 10) {
160
+ el = current;
161
+ found = true;
162
+ break;
163
+ }
164
+ current = current.parentElement;
165
+ }
166
+ if (!found) el = document.body;
167
+ }
168
+ }
169
+
170
+ const randDelay = (min, max) =>
171
+ new Promise((r) => setTimeout(r, min + Math.random() * (max - min)));
172
+
173
+ if (el === window) {
174
+ window.scrollBy(0, window.innerHeight);
175
+ await randDelay(400, 800);
176
+ window.scrollBy(0, -200);
177
+ await randDelay(200, 400);
178
+ window.scrollBy(0, window.innerHeight);
179
+ } else {
180
+ el.scrollTop = el.scrollHeight;
181
+ await randDelay(400, 800);
182
+ el.scrollTop -= 100 + Math.random() * 100;
183
+ await randDelay(200, 400);
184
+ el.scrollTop = el.scrollHeight;
185
+ }
186
+ },
187
+ { container, findScrollable },
188
+ );
189
+ }