@blocklet/crawler 2.1.232 → 2.1.234

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -32,7 +32,9 @@ const getPageContent = async ({
32
32
  let page = await (0, _utils.initPage)();
33
33
  let pageContent = null;
34
34
  try {
35
- const response = await page.goto(url);
35
+ const response = await page.goto(url, {
36
+ timeout: 20 * 1e3
37
+ });
36
38
  const statusCode = response.status();
37
39
  if (![200, 304].includes(statusCode)) {
38
40
  throw new Error(`Request failed with status ${statusCode}, in ${url}`);
@@ -231,18 +233,14 @@ const crawlBlocklet = async () => {
231
233
  }
232
234
  });
233
235
  _utils.logger.info(...crawlerLogText("success"));
234
- await _utils.puppeteer.trimCache();
235
- _utils.logger.info("Puppeteer trim cache success");
236
- if (global.gc) {
237
- global.gc();
238
- }
236
+ await (0, _utils.closeBrowser)();
239
237
  };
240
238
  exports.crawlBlocklet = crawlBlocklet;
241
239
  const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
242
240
  let cronCrawlBlockletJob = null;
243
241
  const initCronCrawlBlocklet = ({
244
- time = "0 0 */12 * * *",
245
- // every 12 hours
242
+ time = "0 0 */24 * * *",
243
+ // every 24 hours
246
244
  options
247
245
  } = {}) => {
248
246
  if (!cronCrawlBlockletJob) {
@@ -63,6 +63,7 @@ Object.keys(_config).forEach(function (key) {
63
63
  });
64
64
  var _util = require("util");
65
65
  var _child_process = require("child_process");
66
+ var _component = require("@blocklet/sdk/lib/component");
66
67
  function _getRequireWildcardCache(e) { if ("function" != typeof WeakMap) return null; var r = new WeakMap(), t = new WeakMap(); return (_getRequireWildcardCache = function (e) { return e ? t : r; })(e); }
67
68
  function _interopRequireWildcard(e, r) { if (!r && e && e.__esModule) return e; if (null === e || "object" != typeof e && "function" != typeof e) return { default: e }; var t = _getRequireWildcardCache(r); if (t && t.has(e)) return t.get(e); var n = { __proto__: null }, a = Object.defineProperty && Object.getOwnPropertyDescriptor; for (var u in e) if ("default" !== u && {}.hasOwnProperty.call(e, u)) { var i = a ? Object.getOwnPropertyDescriptor(e, u) : null; i && (i.get || i.set) ? Object.defineProperty(n, u, i) : n[u] = e[u]; } return n.default = e, t && t.set(e, n), n; }
68
69
  function _interopRequireDefault(e) { return e && e.__esModule ? e : { default: e }; }
@@ -97,7 +98,11 @@ exports.clearCheckBrowserTimer = clearCheckBrowserTimer;
97
98
  const closeBrowser = async () => {
98
99
  try {
99
100
  if (browser) {
100
- await browser.close();
101
+ const pages = await browser.pages().catch(() => []);
102
+ await Promise.all(pages.map(page => page.close().catch(() => {})));
103
+ await browser.close().catch(err => {
104
+ logger.warn("Browser close failed with error:", err);
105
+ });
101
106
  browser = null;
102
107
  await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
103
108
  }
@@ -107,7 +112,9 @@ const closeBrowser = async () => {
107
112
  if (global.gc) {
108
113
  global.gc();
109
114
  }
110
- } catch (error) {}
115
+ } catch (error) {
116
+ logger.error("Failed to close browser:", error);
117
+ }
111
118
  };
112
119
  exports.closeBrowser = closeBrowser;
113
120
  const getBrowser = async () => {
@@ -134,16 +141,17 @@ const getBrowser = async () => {
134
141
  try {
135
142
  browser = await _puppeteer.default.launch({
136
143
  headless: true,
137
- // stable headless
138
- // headless: false, // debug
139
- // dumpio: true,
140
144
  args: [
141
145
  // docs: https://peter.sh/experiments/chromium-command-line-switches/
142
146
  "--no-first-run",
143
147
  // '--no-startup-window',
144
148
  "--hide-scrollbars", "--no-sandbox", "--no-zygote",
145
149
  // '--single-process',
146
- "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas"]
150
+ "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas", "--disable-extensions", "--js-flags=--max_old_space_size=512",
151
+ // 限制V8内存
152
+ "--disable-background-networking", "--disable-default-apps", "--disable-web-security",
153
+ // 允许跨域请求
154
+ "--disable-software-rasterizer", "--disable-crash-reporter"]
147
155
  });
148
156
  logger.info("Launch browser success");
149
157
  const browserWSEndpoint = await browser.wsEndpoint();
@@ -158,11 +166,15 @@ const getBrowser = async () => {
158
166
  let count = 0;
159
167
  checkBrowserTimer = setInterval(async () => {
160
168
  if (browser) {
161
- const pages = await browser.pages();
169
+ const pages = await browser.pages().catch(() => []);
162
170
  if (pages.length === 1 && pages[0].url() === "about:blank") {
163
171
  count++;
172
+ logger.debug(`Browser inactive count: ${count}/3`);
173
+ } else {
174
+ count = 0;
164
175
  }
165
176
  if (count >= 3) {
177
+ logger.info("Browser inactive for 3 minutes, closing...");
166
178
  await closeBrowser();
167
179
  }
168
180
  }
@@ -338,6 +350,19 @@ async function detectBestRedisUrl() {
338
350
  const possibleUrls = [
339
351
  // environment variable priority
340
352
  process.env.REDIS_URL,
353
+ // by web component endpoint
354
+ (() => {
355
+ try {
356
+ const endpoint = (0, _component.getComponentWebEndpoint)(_config.default.env.componentDid);
357
+ if (endpoint) {
358
+ const url = new URL(endpoint);
359
+ return `redis://${url.hostname}:6379`;
360
+ }
361
+ } catch (err) {
362
+ logger.warn(`Failed to get component endpoint: ${err.message}`);
363
+ }
364
+ return null;
365
+ })(),
341
366
  // default gateway
342
367
  defaultGateway ? `redis://${defaultGateway}:6379` : null,
343
368
  // common Docker gateway
@@ -354,7 +379,7 @@ async function detectBestRedisUrl() {
354
379
  });
355
380
  testClient.on("error", () => {});
356
381
  await Promise.race([testClient.connect(), new Promise((_, reject) => setTimeout(() => reject(new Error("Connection timeout")), 2e3))]);
357
- const pingResult = await testClient.ping();
382
+ await testClient.ping();
358
383
  await testClient.disconnect();
359
384
  logger.info(`\u2705 Found available Redis connection: ${url}`);
360
385
  cachedRedisUrl = url;
@@ -398,8 +423,9 @@ const cachePool = exports.cachePool = (0, _genericPool.createPool)({
398
423
  }
399
424
  }
400
425
  }, {
401
- max: 10,
402
- min: 1
426
+ max: 2,
427
+ // 2 clients
428
+ min: 0
403
429
  // evictionRunIntervalMillis: 0,
404
430
  });
405
431
  const withCache = async cb => {
@@ -8,7 +8,6 @@ import {
8
8
  isAcceptCrawler,
9
9
  env,
10
10
  components,
11
- puppeteer,
12
11
  getComponentInfo,
13
12
  sleep,
14
13
  closeBrowser,
@@ -31,7 +30,9 @@ export const getPageContent = async ({ url, formatPageContent }) => {
31
30
  let page = await initPage();
32
31
  let pageContent = null;
33
32
  try {
34
- const response = await page.goto(url);
33
+ const response = await page.goto(url, {
34
+ timeout: 20 * 1e3
35
+ });
35
36
  const statusCode = response.status();
36
37
  if (![200, 304].includes(statusCode)) {
37
38
  throw new Error(`Request failed with status ${statusCode}, in ${url}`);
@@ -212,17 +213,13 @@ export const crawlBlocklet = async () => {
212
213
  }
213
214
  });
214
215
  logger.info(...crawlerLogText("success"));
215
- await puppeteer.trimCache();
216
- logger.info("Puppeteer trim cache success");
217
- if (global.gc) {
218
- global.gc();
219
- }
216
+ await closeBrowser();
220
217
  };
221
218
  const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
222
219
  let cronCrawlBlockletJob = null;
223
220
  export const initCronCrawlBlocklet = ({
224
- time = "0 0 */12 * * *",
225
- // every 12 hours
221
+ time = "0 0 */24 * * *",
222
+ // every 24 hours
226
223
  options
227
224
  } = {}) => {
228
225
  if (!cronCrawlBlockletJob) {
@@ -13,6 +13,7 @@ import uniq from "lodash/uniq";
13
13
  import config from "@blocklet/sdk/lib/config";
14
14
  import { promisify } from "util";
15
15
  import { exec } from "child_process";
16
+ import { getComponentWebEndpoint } from "@blocklet/sdk/lib/component";
16
17
  export * from "@blocklet/sdk/lib/config";
17
18
  const { logger } = config;
18
19
  const execAsync = promisify(exec);
@@ -41,7 +42,12 @@ export const clearCheckBrowserTimer = () => {
41
42
  export const closeBrowser = async () => {
42
43
  try {
43
44
  if (browser) {
44
- await browser.close();
45
+ const pages = await browser.pages().catch(() => []);
46
+ await Promise.all(pages.map((page) => page.close().catch(() => {
47
+ })));
48
+ await browser.close().catch((err) => {
49
+ logger.warn("Browser close failed with error:", err);
50
+ });
45
51
  browser = null;
46
52
  await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
47
53
  }
@@ -52,6 +58,7 @@ export const closeBrowser = async () => {
52
58
  global.gc();
53
59
  }
54
60
  } catch (error) {
61
+ logger.error("Failed to close browser:", error);
55
62
  }
56
63
  };
57
64
  export const getBrowser = async () => {
@@ -78,9 +85,6 @@ export const getBrowser = async () => {
78
85
  try {
79
86
  browser = await puppeteer.launch({
80
87
  headless: true,
81
- // stable headless
82
- // headless: false, // debug
83
- // dumpio: true,
84
88
  args: [
85
89
  // docs: https://peter.sh/experiments/chromium-command-line-switches/
86
90
  "--no-first-run",
@@ -93,7 +97,16 @@ export const getBrowser = async () => {
93
97
  "--disable-gpu",
94
98
  "--disable-dev-shm-usage",
95
99
  "--disable-site-isolation-trials",
96
- "--disable-accelerated-2d-canvas"
100
+ "--disable-accelerated-2d-canvas",
101
+ "--disable-extensions",
102
+ "--js-flags=--max_old_space_size=512",
103
+ // 限制V8内存
104
+ "--disable-background-networking",
105
+ "--disable-default-apps",
106
+ "--disable-web-security",
107
+ // 允许跨域请求
108
+ "--disable-software-rasterizer",
109
+ "--disable-crash-reporter"
97
110
  ]
98
111
  });
99
112
  logger.info("Launch browser success");
@@ -109,11 +122,15 @@ export const getBrowser = async () => {
109
122
  let count = 0;
110
123
  checkBrowserTimer = setInterval(async () => {
111
124
  if (browser) {
112
- const pages = await browser.pages();
125
+ const pages = await browser.pages().catch(() => []);
113
126
  if (pages.length === 1 && pages[0].url() === "about:blank") {
114
127
  count++;
128
+ logger.debug(`Browser inactive count: ${count}/3`);
129
+ } else {
130
+ count = 0;
115
131
  }
116
132
  if (count >= 3) {
133
+ logger.info("Browser inactive for 3 minutes, closing...");
117
134
  await closeBrowser();
118
135
  }
119
136
  }
@@ -358,6 +375,19 @@ async function detectBestRedisUrl() {
358
375
  const possibleUrls = [
359
376
  // environment variable priority
360
377
  process.env.REDIS_URL,
378
+ // by web component endpoint
379
+ (() => {
380
+ try {
381
+ const endpoint = getComponentWebEndpoint(config.env.componentDid);
382
+ if (endpoint) {
383
+ const url = new URL(endpoint);
384
+ return `redis://${url.hostname}:6379`;
385
+ }
386
+ } catch (err) {
387
+ logger.warn(`Failed to get component endpoint: ${err.message}`);
388
+ }
389
+ return null;
390
+ })(),
361
391
  // default gateway
362
392
  defaultGateway ? `redis://${defaultGateway}:6379` : null,
363
393
  // common Docker gateway
@@ -377,7 +407,7 @@ async function detectBestRedisUrl() {
377
407
  testClient.connect(),
378
408
  new Promise((_, reject) => setTimeout(() => reject(new Error("Connection timeout")), 2e3))
379
409
  ]);
380
- const pingResult = await testClient.ping();
410
+ await testClient.ping();
381
411
  await testClient.disconnect();
382
412
  logger.info(`\u2705 Found available Redis connection: ${url}`);
383
413
  cachedRedisUrl = url;
@@ -423,8 +453,9 @@ export const cachePool = createPool(
423
453
  }
424
454
  },
425
455
  {
426
- max: 10,
427
- min: 1
456
+ max: 2,
457
+ // 2 clients
458
+ min: 0
428
459
  // evictionRunIntervalMillis: 0,
429
460
  }
430
461
  );
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blocklet/crawler",
3
- "version": "2.1.232",
3
+ "version": "2.1.234",
4
4
  "description": "blocklet crawler lib",
5
5
  "publishConfig": {
6
6
  "access": "public"