@blocklet/crawler 2.1.219 → 2.1.221
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -203,7 +203,9 @@ const initPage = async ({
   return page;
 };
 exports.initPage = initPage;
-const botUserAgents = [/bot/i, /spider/i, /facebookexternalhit/i, /simplepie/i, /yahooseeker/i, /embedly/i, /quora link preview/i, /outbrain/i, /vkshare/i, /monit/i, /Pingability/i, /Monitoring/i, /WinHttpRequest/i, /Apache-HttpClient/i, /getprismatic.com/i, /python-requests/i, /Twurly/i, /yandex/i, /browserproxy/i, /crawler/i, /Qwantify/i, /Yahoo/i, /pinterest/i, /Tumblr/i, /Tumblr Agent/i, /WhatsApp/i, /Google-Structured-Data-Testing-Tool/i, /Google-InspectionTool/i, /Googlebot/i, /GPTBot/i, /Applebot/i];
+const botUserAgents = [/bot/i, /spider/i, /facebookexternalhit/i, /simplepie/i, /yahooseeker/i, /embedly/i, /quora link preview/i, /outbrain/i, /vkshare/i, /monit/i, /Pingability/i, /Monitoring/i, /WinHttpRequest/i, /Apache-HttpClient/i, /getprismatic.com/i, /python-requests/i, /Twurly/i, /yandex/i, /browserproxy/i, /crawler/i, /Qwantify/i, /Yahoo/i, /pinterest/i, /Tumblr/i, /Tumblr Agent/i, /WhatsApp/i, /Google-Structured-Data-Testing-Tool/i, /Google-InspectionTool/i, /Googlebot/i, /GPTBot/i, /Applebot/i,
+// AI bots
+/Anthropic-ai/i, /Claude-Web/i, /anthropic-ai-scraper/i, /Google-Extended/i, /GoogleOther/i, /CCBot\/\d/i, /Bytespider/i, /BingBot/i, /Baiduspider/i, /Sogou/i, /Perplexity/i, /Cohere-ai/i, /xlts-bot/i, /THAAS/i, /YisouSpider/i, /AlibabaGroup/i, /adaptive-edge-crawler/i];
 const isSpider = ua => botUserAgents.some(spider => {
   return spider.test(ua);
 });
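The widened list means AI crawlers now take the same bot path in isSpider as traditional search spiders. A standalone sketch of the check, using only a small subset of the patterns from the hunk above and made-up user-agent strings:

// Standalone reproduction of the isSpider check; only a few patterns from the
// full array above are copied here for brevity.
const botUserAgents = [/bot/i, /GPTBot/i, /Anthropic-ai/i, /Claude-Web/i, /CCBot\/\d/i, /Bytespider/i, /Perplexity/i];
const isSpider = (ua) => botUserAgents.some((spider) => spider.test(ua));

// Made-up user-agent strings, for illustration only.
console.log(isSpider("Mozilla/5.0 (compatible; GPTBot/1.0)"));     // true
console.log(isSpider("CCBot/2.0 (https://commoncrawl.org/faq/)")); // true
console.log(isSpider("Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0 Safari/537.36")); // false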
@@ -386,16 +388,35 @@ const useCache = exports.useCache = {
 };
 const checkBrowserDownloaded = async () => {
   try {
+    const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || "/usr/bin/chromium";
+    if (_fsExtra.default.existsSync(executablePath)) {
+      try {
+        const puppeteer2 = await Promise.resolve().then(() => require("puppeteer"));
+        const browser2 = await puppeteer2.launch({
+          executablePath,
+          args: ["--no-sandbox", "--disable-setuid-sandbox"],
+          headless: true
+        });
+        await browser2.close();
+        logger.info(`System Chromium found and tested successfully: ${executablePath}`);
+        return;
+      } catch (err) {
+        logger.warn(`System Chromium exists but test failed, will try to download`);
+      }
+    }
     const {
       downloadBrowser
     } = await (async () => {
       try {
         return await Promise.resolve().then(() => require("@blocklet/puppeteer/internal/node/install.js"));
-      } catch {
+      } catch (err) {
         logger.warn("Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer.");
       }
     })();
-
+    if (downloadBrowser) {
+      await downloadBrowser();
+      logger.info("Browser download completed successfully");
+    }
   } catch (error) {
     logger.warn("Browser download failed", error);
   }
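In both builds, checkBrowserDownloaded now probes for a system Chromium before attempting a download: it resolves the path from PUPPETEER_EXECUTABLE_PATH (defaulting to /usr/bin/chromium), test-launches the binary once, and only falls back to downloadBrowser when the binary is missing or fails to start; the download itself is also guarded, so it only runs when the installer module actually resolved. A minimal standalone sketch of the probe step, assuming puppeteer and fs-extra are installed and with the package's logger replaced by console:

const fs = require("fs-extra");
const puppeteer = require("puppeteer");

// Probe a system Chromium the same way the new checkBrowserDownloaded does.
// Resolves to true when the binary exists and can be launched headlessly.
async function probeSystemChromium() {
  const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || "/usr/bin/chromium";
  if (!fs.existsSync(executablePath)) return false;
  try {
    const browser = await puppeteer.launch({
      executablePath,
      args: ["--no-sandbox", "--disable-setuid-sandbox"],
      headless: true,
    });
    await browser.close();
    console.info(`System Chromium found and tested successfully: ${executablePath}`);
    return true;
  } catch {
    console.warn("System Chromium exists but test failed, will try to download");
    return false;
  }
}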
@@ -179,7 +179,25 @@ const botUserAgents = [
   /Google-InspectionTool/i,
   /Googlebot/i,
   /GPTBot/i,
-  /Applebot/i
+  /Applebot/i,
+  // AI bots
+  /Anthropic-ai/i,
+  /Claude-Web/i,
+  /anthropic-ai-scraper/i,
+  /Google-Extended/i,
+  /GoogleOther/i,
+  /CCBot\/\d/i,
+  /Bytespider/i,
+  /BingBot/i,
+  /Baiduspider/i,
+  /Sogou/i,
+  /Perplexity/i,
+  /Cohere-ai/i,
+  /xlts-bot/i,
+  /THAAS/i,
+  /YisouSpider/i,
+  /AlibabaGroup/i,
+  /adaptive-edge-crawler/i
 ];
 const isSpider = (ua) => botUserAgents.some((spider) => {
   return spider.test(ua);
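The ES module build receives the same additions in its multi-line array form. When tuning such a list it can help to see exactly which entries fire for a given user agent; a small hypothetical helper (only a subset of the patterns above is copied here):

// Hypothetical debugging helper: report every pattern that matches a UA,
// useful for checking why a request was classified as a crawler.
const botUserAgents = [/bot/i, /crawler/i, /GPTBot/i, /Anthropic-ai/i, /Claude-Web/i, /Google-Extended/i, /Bytespider/i, /Perplexity/i];

const matchedPatterns = (ua) => botUserAgents.filter((re) => re.test(ua)).map(String);

console.log(matchedPatterns("Mozilla/5.0 (compatible; Google-Extended/1.0)"));
// => [ '/Google-Extended/i' ]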
@@ -386,16 +404,35 @@ export const useCache = {
 };
 export const checkBrowserDownloaded = async () => {
   try {
+    const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || "/usr/bin/chromium";
+    if (fs.existsSync(executablePath)) {
+      try {
+        const puppeteer2 = await import("puppeteer");
+        const browser2 = await puppeteer2.launch({
+          executablePath,
+          args: ["--no-sandbox", "--disable-setuid-sandbox"],
+          headless: true
+        });
+        await browser2.close();
+        logger.info(`System Chromium found and tested successfully: ${executablePath}`);
+        return;
+      } catch (err) {
+        logger.warn(`System Chromium exists but test failed, will try to download`);
+      }
+    }
     const { downloadBrowser } = await (async () => {
       try {
         return await import("@blocklet/puppeteer/internal/node/install.js");
-      } catch {
+      } catch (err) {
         logger.warn(
           "Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer."
         );
       }
     })();
-
+    if (downloadBrowser) {
+      await downloadBrowser();
+      logger.info("Browser download completed successfully");
+    }
   } catch (error) {
     logger.warn("Browser download failed", error);
   }
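For deployments that already ship a Chromium binary, the new probe means the Puppeteer browser download can be skipped entirely by pointing the environment variable at it before the check runs. A hypothetical usage sketch; the root-level re-export of checkBrowserDownloaded and the binary path are assumptions, neither is shown in this diff:

// Assumed: the package re-exports checkBrowserDownloaded at its root (not shown in the diff).
process.env.PUPPETEER_EXECUTABLE_PATH = "/usr/lib/chromium/chromium"; // example path

const { checkBrowserDownloaded } = await import("@blocklet/crawler");
await checkBrowserDownloaded();
// If the binary exists and launches, the new code logs
// "System Chromium found and tested successfully: ..." and returns early;
// otherwise it falls back to @blocklet/puppeteer's downloadBrowser().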