@blocklet/crawler 2.1.231 → 2.1.233

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -32,7 +32,9 @@ const getPageContent = async ({
32
32
  let page = await (0, _utils.initPage)();
33
33
  let pageContent = null;
34
34
  try {
35
- const response = await page.goto(url);
35
+ const response = await page.goto(url, {
36
+ timeout: 20 * 1e3
37
+ });
36
38
  const statusCode = response.status();
37
39
  if (![200, 304].includes(statusCode)) {
38
40
  throw new Error(`Request failed with status ${statusCode}, in ${url}`);
@@ -231,18 +233,14 @@ const crawlBlocklet = async () => {
231
233
  }
232
234
  });
233
235
  _utils.logger.info(...crawlerLogText("success"));
234
- await _utils.puppeteer.trimCache();
235
- _utils.logger.info("Puppeteer trim cache success");
236
- if (global.gc) {
237
- global.gc();
238
- }
236
+ await (0, _utils.closeBrowser)();
239
237
  };
240
238
  exports.crawlBlocklet = crawlBlocklet;
241
239
  const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
242
240
  let cronCrawlBlockletJob = null;
243
241
  const initCronCrawlBlocklet = ({
244
- time = "0 0 */12 * * *",
245
- // every 12 hours
242
+ time = "0 0 */24 * * *",
243
+ // every 24 hours
246
244
  options
247
245
  } = {}) => {
248
246
  if (!cronCrawlBlockletJob) {
@@ -97,7 +97,11 @@ exports.clearCheckBrowserTimer = clearCheckBrowserTimer;
97
97
  const closeBrowser = async () => {
98
98
  try {
99
99
  if (browser) {
100
- await browser.close();
100
+ const pages = await browser.pages().catch(() => []);
101
+ await Promise.all(pages.map(page => page.close().catch(() => {})));
102
+ await browser.close().catch(err => {
103
+ logger.warn("Browser close failed with error:", err);
104
+ });
101
105
  browser = null;
102
106
  await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
103
107
  }
@@ -107,7 +111,9 @@ const closeBrowser = async () => {
107
111
  if (global.gc) {
108
112
  global.gc();
109
113
  }
110
- } catch (error) {}
114
+ } catch (error) {
115
+ logger.error("Failed to close browser:", error);
116
+ }
111
117
  };
112
118
  exports.closeBrowser = closeBrowser;
113
119
  const getBrowser = async () => {
@@ -134,16 +140,17 @@ const getBrowser = async () => {
134
140
  try {
135
141
  browser = await _puppeteer.default.launch({
136
142
  headless: true,
137
- // stable headless
138
- // headless: false, // debug
139
- // dumpio: true,
140
143
  args: [
141
144
  // docs: https://peter.sh/experiments/chromium-command-line-switches/
142
145
  "--no-first-run",
143
146
  // '--no-startup-window',
144
147
  "--hide-scrollbars", "--no-sandbox", "--no-zygote",
145
148
  // '--single-process',
146
- "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas"]
149
+ "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas", "--disable-extensions", "--js-flags=--max_old_space_size=512",
150
+ // 限制V8内存
151
+ "--disable-background-networking", "--disable-default-apps", "--disable-web-security",
152
+ // 允许跨域请求
153
+ "--disable-software-rasterizer", "--disable-crash-reporter"]
147
154
  });
148
155
  logger.info("Launch browser success");
149
156
  const browserWSEndpoint = await browser.wsEndpoint();
@@ -158,11 +165,15 @@ const getBrowser = async () => {
158
165
  let count = 0;
159
166
  checkBrowserTimer = setInterval(async () => {
160
167
  if (browser) {
161
- const pages = await browser.pages();
168
+ const pages = await browser.pages().catch(() => []);
162
169
  if (pages.length === 1 && pages[0].url() === "about:blank") {
163
170
  count++;
171
+ logger.debug(`Browser inactive count: ${count}/3`);
172
+ } else {
173
+ count = 0;
164
174
  }
165
175
  if (count >= 3) {
176
+ logger.info("Browser inactive for 3 minutes, closing...");
166
177
  await closeBrowser();
167
178
  }
168
179
  }
@@ -398,8 +409,9 @@ const cachePool = exports.cachePool = (0, _genericPool.createPool)({
398
409
  }
399
410
  }
400
411
  }, {
401
- max: 10,
402
- min: 1
412
+ max: 2,
413
+ // 2 clients
414
+ min: 0
403
415
  // evictionRunIntervalMillis: 0,
404
416
  });
405
417
  const withCache = async cb => {
@@ -8,7 +8,6 @@ import {
8
8
  isAcceptCrawler,
9
9
  env,
10
10
  components,
11
- puppeteer,
12
11
  getComponentInfo,
13
12
  sleep,
14
13
  closeBrowser,
@@ -31,7 +30,9 @@ export const getPageContent = async ({ url, formatPageContent }) => {
31
30
  let page = await initPage();
32
31
  let pageContent = null;
33
32
  try {
34
- const response = await page.goto(url);
33
+ const response = await page.goto(url, {
34
+ timeout: 20 * 1e3
35
+ });
35
36
  const statusCode = response.status();
36
37
  if (![200, 304].includes(statusCode)) {
37
38
  throw new Error(`Request failed with status ${statusCode}, in ${url}`);
@@ -212,17 +213,13 @@ export const crawlBlocklet = async () => {
212
213
  }
213
214
  });
214
215
  logger.info(...crawlerLogText("success"));
215
- await puppeteer.trimCache();
216
- logger.info("Puppeteer trim cache success");
217
- if (global.gc) {
218
- global.gc();
219
- }
216
+ await closeBrowser();
220
217
  };
221
218
  const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
222
219
  let cronCrawlBlockletJob = null;
223
220
  export const initCronCrawlBlocklet = ({
224
- time = "0 0 */12 * * *",
225
- // every 12 hours
221
+ time = "0 0 */24 * * *",
222
+ // every 24 hours
226
223
  options
227
224
  } = {}) => {
228
225
  if (!cronCrawlBlockletJob) {
@@ -41,7 +41,12 @@ export const clearCheckBrowserTimer = () => {
41
41
  export const closeBrowser = async () => {
42
42
  try {
43
43
  if (browser) {
44
- await browser.close();
44
+ const pages = await browser.pages().catch(() => []);
45
+ await Promise.all(pages.map((page) => page.close().catch(() => {
46
+ })));
47
+ await browser.close().catch((err) => {
48
+ logger.warn("Browser close failed with error:", err);
49
+ });
45
50
  browser = null;
46
51
  await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
47
52
  }
@@ -52,6 +57,7 @@ export const closeBrowser = async () => {
52
57
  global.gc();
53
58
  }
54
59
  } catch (error) {
60
+ logger.error("Failed to close browser:", error);
55
61
  }
56
62
  };
57
63
  export const getBrowser = async () => {
@@ -78,9 +84,6 @@ export const getBrowser = async () => {
78
84
  try {
79
85
  browser = await puppeteer.launch({
80
86
  headless: true,
81
- // stable headless
82
- // headless: false, // debug
83
- // dumpio: true,
84
87
  args: [
85
88
  // docs: https://peter.sh/experiments/chromium-command-line-switches/
86
89
  "--no-first-run",
@@ -93,7 +96,16 @@ export const getBrowser = async () => {
93
96
  "--disable-gpu",
94
97
  "--disable-dev-shm-usage",
95
98
  "--disable-site-isolation-trials",
96
- "--disable-accelerated-2d-canvas"
99
+ "--disable-accelerated-2d-canvas",
100
+ "--disable-extensions",
101
+ "--js-flags=--max_old_space_size=512",
102
+ // 限制V8内存
103
+ "--disable-background-networking",
104
+ "--disable-default-apps",
105
+ "--disable-web-security",
106
+ // 允许跨域请求
107
+ "--disable-software-rasterizer",
108
+ "--disable-crash-reporter"
97
109
  ]
98
110
  });
99
111
  logger.info("Launch browser success");
@@ -109,11 +121,15 @@ export const getBrowser = async () => {
109
121
  let count = 0;
110
122
  checkBrowserTimer = setInterval(async () => {
111
123
  if (browser) {
112
- const pages = await browser.pages();
124
+ const pages = await browser.pages().catch(() => []);
113
125
  if (pages.length === 1 && pages[0].url() === "about:blank") {
114
126
  count++;
127
+ logger.debug(`Browser inactive count: ${count}/3`);
128
+ } else {
129
+ count = 0;
115
130
  }
116
131
  if (count >= 3) {
132
+ logger.info("Browser inactive for 3 minutes, closing...");
117
133
  await closeBrowser();
118
134
  }
119
135
  }
@@ -423,8 +439,9 @@ export const cachePool = createPool(
423
439
  }
424
440
  },
425
441
  {
426
- max: 10,
427
- min: 1
442
+ max: 2,
443
+ // 2 clients
444
+ min: 0
428
445
  // evictionRunIntervalMillis: 0,
429
446
  }
430
447
  );
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blocklet/crawler",
3
- "version": "2.1.231",
3
+ "version": "2.1.233",
4
4
  "description": "blocklet crawler lib",
5
5
  "publishConfig": {
6
6
  "access": "public"