@blocklet/crawler 2.1.219 → 2.1.221

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -203,7 +203,9 @@ const initPage = async ({
203
203
  return page;
204
204
  };
205
205
  exports.initPage = initPage;
206
- const botUserAgents = [/bot/i, /spider/i, /facebookexternalhit/i, /simplepie/i, /yahooseeker/i, /embedly/i, /quora link preview/i, /outbrain/i, /vkshare/i, /monit/i, /Pingability/i, /Monitoring/i, /WinHttpRequest/i, /Apache-HttpClient/i, /getprismatic.com/i, /python-requests/i, /Twurly/i, /yandex/i, /browserproxy/i, /crawler/i, /Qwantify/i, /Yahoo/i, /pinterest/i, /Tumblr/i, /Tumblr Agent/i, /WhatsApp/i, /Google-Structured-Data-Testing-Tool/i, /Google-InspectionTool/i, /Googlebot/i, /GPTBot/i, /Applebot/i];
206
+ const botUserAgents = [/bot/i, /spider/i, /facebookexternalhit/i, /simplepie/i, /yahooseeker/i, /embedly/i, /quora link preview/i, /outbrain/i, /vkshare/i, /monit/i, /Pingability/i, /Monitoring/i, /WinHttpRequest/i, /Apache-HttpClient/i, /getprismatic.com/i, /python-requests/i, /Twurly/i, /yandex/i, /browserproxy/i, /crawler/i, /Qwantify/i, /Yahoo/i, /pinterest/i, /Tumblr/i, /Tumblr Agent/i, /WhatsApp/i, /Google-Structured-Data-Testing-Tool/i, /Google-InspectionTool/i, /Googlebot/i, /GPTBot/i, /Applebot/i,
207
+ // AI bots
208
+ /Anthropic-ai/i, /Claude-Web/i, /anthropic-ai-scraper/i, /Google-Extended/i, /GoogleOther/i, /CCBot\/\d/i, /Bytespider/i, /BingBot/i, /Baiduspider/i, /Sogou/i, /Perplexity/i, /Cohere-ai/i, /xlts-bot/i, /THAAS/i, /YisouSpider/i, /AlibabaGroup/i, /adaptive-edge-crawler/i];
207
209
  const isSpider = ua => botUserAgents.some(spider => {
208
210
  return spider.test(ua);
209
211
  });
@@ -386,16 +388,35 @@ const useCache = exports.useCache = {
386
388
  };
387
389
  const checkBrowserDownloaded = async () => {
388
390
  try {
391
+ const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || "/usr/bin/chromium";
392
+ if (_fsExtra.default.existsSync(executablePath)) {
393
+ try {
394
+ const puppeteer2 = await Promise.resolve().then(() => require("puppeteer"));
395
+ const browser2 = await puppeteer2.launch({
396
+ executablePath,
397
+ args: ["--no-sandbox", "--disable-setuid-sandbox"],
398
+ headless: true
399
+ });
400
+ await browser2.close();
401
+ logger.info(`System Chromium found and tested successfully: ${executablePath}`);
402
+ return;
403
+ } catch (err) {
404
+ logger.warn(`System Chromium exists but test failed, will try to download`);
405
+ }
406
+ }
389
407
  const {
390
408
  downloadBrowser
391
409
  } = await (async () => {
392
410
  try {
393
411
  return await Promise.resolve().then(() => require("@blocklet/puppeteer/internal/node/install.js"));
394
- } catch {
412
+ } catch (err) {
395
413
  logger.warn("Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer.");
396
414
  }
397
415
  })();
398
- await downloadBrowser();
416
+ if (downloadBrowser) {
417
+ await downloadBrowser();
418
+ logger.info("Browser download completed successfully");
419
+ }
399
420
  } catch (error) {
400
421
  logger.warn("Browser download failed", error);
401
422
  }
@@ -179,7 +179,25 @@ const botUserAgents = [
179
179
  /Google-InspectionTool/i,
180
180
  /Googlebot/i,
181
181
  /GPTBot/i,
182
- /Applebot/i
182
+ /Applebot/i,
183
+ // AI bots
184
+ /Anthropic-ai/i,
185
+ /Claude-Web/i,
186
+ /anthropic-ai-scraper/i,
187
+ /Google-Extended/i,
188
+ /GoogleOther/i,
189
+ /CCBot\/\d/i,
190
+ /Bytespider/i,
191
+ /BingBot/i,
192
+ /Baiduspider/i,
193
+ /Sogou/i,
194
+ /Perplexity/i,
195
+ /Cohere-ai/i,
196
+ /xlts-bot/i,
197
+ /THAAS/i,
198
+ /YisouSpider/i,
199
+ /AlibabaGroup/i,
200
+ /adaptive-edge-crawler/i
183
201
  ];
184
202
  const isSpider = (ua) => botUserAgents.some((spider) => {
185
203
  return spider.test(ua);
@@ -386,16 +404,35 @@ export const useCache = {
386
404
  };
387
405
  export const checkBrowserDownloaded = async () => {
388
406
  try {
407
+ const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || "/usr/bin/chromium";
408
+ if (fs.existsSync(executablePath)) {
409
+ try {
410
+ const puppeteer2 = await import("puppeteer");
411
+ const browser2 = await puppeteer2.launch({
412
+ executablePath,
413
+ args: ["--no-sandbox", "--disable-setuid-sandbox"],
414
+ headless: true
415
+ });
416
+ await browser2.close();
417
+ logger.info(`System Chromium found and tested successfully: ${executablePath}`);
418
+ return;
419
+ } catch (err) {
420
+ logger.warn(`System Chromium exists but test failed, will try to download`);
421
+ }
422
+ }
389
423
  const { downloadBrowser } = await (async () => {
390
424
  try {
391
425
  return await import("@blocklet/puppeteer/internal/node/install.js");
392
- } catch {
426
+ } catch (err) {
393
427
  logger.warn(
394
428
  "Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer."
395
429
  );
396
430
  }
397
431
  })();
398
- await downloadBrowser();
432
+ if (downloadBrowser) {
433
+ await downloadBrowser();
434
+ logger.info("Browser download completed successfully");
435
+ }
399
436
  } catch (error) {
400
437
  logger.warn("Browser download failed", error);
401
438
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blocklet/crawler",
3
- "version": "2.1.219",
3
+ "version": "2.1.221",
4
4
  "description": "blocklet crawler lib",
5
5
  "publishConfig": {
6
6
  "access": "public"