@arcblock/crawler 1.3.2 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/crawler.js +7 -1
- package/lib/cjs/utils.js +30 -18
- package/lib/esm/crawler.js +7 -1
- package/lib/esm/utils.js +30 -18
- package/package.json +1 -1
package/lib/cjs/crawler.js
CHANGED
|
@@ -242,7 +242,7 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
|
|
|
242
242
|
// get html
|
|
243
243
|
try {
|
|
244
244
|
const data = yield page.evaluate(() => {
|
|
245
|
-
var _a;
|
|
245
|
+
var _a, _b;
|
|
246
246
|
// add meta tag to record crawler
|
|
247
247
|
const meta = document.createElement('meta');
|
|
248
248
|
meta.name = 'arcblock-crawler';
|
|
@@ -251,6 +251,12 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
|
|
|
251
251
|
// get title and meta description
|
|
252
252
|
const title = document.title || '';
|
|
253
253
|
const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
|
|
254
|
+
// remove all <noscript> tags from the document
|
|
255
|
+
(_b = document.querySelectorAll('noscript')) === null || _b === void 0 ? void 0 : _b.forEach((el) => {
|
|
256
|
+
if (el === null || el === void 0 ? void 0 : el.remove) {
|
|
257
|
+
el.remove();
|
|
258
|
+
}
|
|
259
|
+
});
|
|
254
260
|
return {
|
|
255
261
|
html: document.documentElement.outerHTML,
|
|
256
262
|
title,
|
package/lib/cjs/utils.js
CHANGED
|
@@ -70,24 +70,36 @@ const botUserAgents = [
|
|
|
70
70
|
/Googlebot/i,
|
|
71
71
|
/GPTBot/i,
|
|
72
72
|
/Applebot/i,
|
|
73
|
-
// AI bots
|
|
74
|
-
/
|
|
75
|
-
/
|
|
76
|
-
/
|
|
77
|
-
/
|
|
78
|
-
/
|
|
79
|
-
/
|
|
80
|
-
/
|
|
81
|
-
/
|
|
82
|
-
/
|
|
83
|
-
/
|
|
84
|
-
/
|
|
85
|
-
/
|
|
86
|
-
/
|
|
87
|
-
/
|
|
88
|
-
/
|
|
89
|
-
/
|
|
90
|
-
/
|
|
73
|
+
// AI bots - condensed patterns
|
|
74
|
+
/-AI\b/i, // Matches any token ending with "-AI", anywhere in the user agent
|
|
75
|
+
/-Bot\b/i, // Matches any token ending with "-Bot", anywhere in the user agent
|
|
76
|
+
/-Agent\b/i, // Matches any token ending with "-Agent", anywhere in the user agent
|
|
77
|
+
/-User\b/i, // Matches any token ending with "-User", anywhere in the user agent
|
|
78
|
+
/\bAI\b/i, // Matches standalone "AI" word
|
|
79
|
+
/\bGPT/i, // GPT variants
|
|
80
|
+
/\bClaude/i, // Claude variants
|
|
81
|
+
/\bBard\b/i, // Google Bard
|
|
82
|
+
/\bGemini\b/i, // Google Gemini
|
|
83
|
+
/\bLlama\b/i, // Meta Llama
|
|
84
|
+
/\bChatGPT/i, // ChatGPT variants
|
|
85
|
+
/\bOpenAI/i, // OpenAI
|
|
86
|
+
/\bAnthropic/i, // Anthropic
|
|
87
|
+
/\bPerplexity/i, // Perplexity
|
|
88
|
+
/\bCohere/i, // Cohere
|
|
89
|
+
/\bHuggingFace/i, // Hugging Face
|
|
90
|
+
/\bStability/i, // Stability AI
|
|
91
|
+
/\bMidjourney/i, // Midjourney
|
|
92
|
+
/\bDALL-E/i, // DALL-E
|
|
93
|
+
/\bMeta-External/i, // Meta external agents
|
|
94
|
+
/\bGoogle-/i, // Google agents
|
|
95
|
+
/\bLLM/i, // LLM
|
|
96
|
+
/\bBytespider/i, // ByteDance spider
|
|
97
|
+
/\bBaiduspider/i, // Baidu spider
|
|
98
|
+
/\bYandexBot/i, // Yandex bot
|
|
99
|
+
/\bDuckDuckBot/i, // DuckDuckGo bot
|
|
100
|
+
/\bLinkedInBot/i, // LinkedIn bot
|
|
101
|
+
/\bTwitterbot/i, // Twitter bot
|
|
102
|
+
/\bCCBot/i, // Common Crawl bot
|
|
91
103
|
];
|
|
92
104
|
/**
|
|
93
105
|
* A default set of file extensions for static assets that do not need to be proxied.
|
package/lib/esm/crawler.js
CHANGED
|
@@ -231,7 +231,7 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
|
|
|
231
231
|
// get html
|
|
232
232
|
try {
|
|
233
233
|
const data = yield page.evaluate(() => {
|
|
234
|
-
var _a;
|
|
234
|
+
var _a, _b;
|
|
235
235
|
// add meta tag to record crawler
|
|
236
236
|
const meta = document.createElement('meta');
|
|
237
237
|
meta.name = 'arcblock-crawler';
|
|
@@ -240,6 +240,12 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
|
|
|
240
240
|
// get title and meta description
|
|
241
241
|
const title = document.title || '';
|
|
242
242
|
const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
|
|
243
|
+
// remove all <noscript> tags from the document
|
|
244
|
+
(_b = document.querySelectorAll('noscript')) === null || _b === void 0 ? void 0 : _b.forEach((el) => {
|
|
245
|
+
if (el === null || el === void 0 ? void 0 : el.remove) {
|
|
246
|
+
el.remove();
|
|
247
|
+
}
|
|
248
|
+
});
|
|
243
249
|
return {
|
|
244
250
|
html: document.documentElement.outerHTML,
|
|
245
251
|
title,
|
package/lib/esm/utils.js
CHANGED
|
@@ -59,24 +59,36 @@ const botUserAgents = [
|
|
|
59
59
|
/Googlebot/i,
|
|
60
60
|
/GPTBot/i,
|
|
61
61
|
/Applebot/i,
|
|
62
|
-
// AI bots
|
|
63
|
-
/
|
|
64
|
-
/
|
|
65
|
-
/
|
|
66
|
-
/
|
|
67
|
-
/
|
|
68
|
-
/
|
|
69
|
-
/
|
|
70
|
-
/
|
|
71
|
-
/
|
|
72
|
-
/
|
|
73
|
-
/
|
|
74
|
-
/
|
|
75
|
-
/
|
|
76
|
-
/
|
|
77
|
-
/
|
|
78
|
-
/
|
|
79
|
-
/
|
|
62
|
+
// AI bots - condensed patterns
|
|
63
|
+
/-AI\b/i, // Matches any token ending with "-AI", anywhere in the user agent
|
|
64
|
+
/-Bot\b/i, // Matches any token ending with "-Bot", anywhere in the user agent
|
|
65
|
+
/-Agent\b/i, // Matches any token ending with "-Agent", anywhere in the user agent
|
|
66
|
+
/-User\b/i, // Matches any token ending with "-User", anywhere in the user agent
|
|
67
|
+
/\bAI\b/i, // Matches standalone "AI" word
|
|
68
|
+
/\bGPT/i, // GPT variants
|
|
69
|
+
/\bClaude/i, // Claude variants
|
|
70
|
+
/\bBard\b/i, // Google Bard
|
|
71
|
+
/\bGemini\b/i, // Google Gemini
|
|
72
|
+
/\bLlama\b/i, // Meta Llama
|
|
73
|
+
/\bChatGPT/i, // ChatGPT variants
|
|
74
|
+
/\bOpenAI/i, // OpenAI
|
|
75
|
+
/\bAnthropic/i, // Anthropic
|
|
76
|
+
/\bPerplexity/i, // Perplexity
|
|
77
|
+
/\bCohere/i, // Cohere
|
|
78
|
+
/\bHuggingFace/i, // Hugging Face
|
|
79
|
+
/\bStability/i, // Stability AI
|
|
80
|
+
/\bMidjourney/i, // Midjourney
|
|
81
|
+
/\bDALL-E/i, // DALL-E
|
|
82
|
+
/\bMeta-External/i, // Meta external agents
|
|
83
|
+
/\bGoogle-/i, // Google agents
|
|
84
|
+
/\bLLM/i, // LLM
|
|
85
|
+
/\bBytespider/i, // ByteDance spider
|
|
86
|
+
/\bBaiduspider/i, // Baidu spider
|
|
87
|
+
/\bYandexBot/i, // Yandex bot
|
|
88
|
+
/\bDuckDuckBot/i, // DuckDuckGo bot
|
|
89
|
+
/\bLinkedInBot/i, // LinkedIn bot
|
|
90
|
+
/\bTwitterbot/i, // Twitter bot
|
|
91
|
+
/\bCCBot/i, // Common Crawl bot
|
|
80
92
|
];
|
|
81
93
|
/**
|
|
82
94
|
* A default set of file extensions for static assets that do not need to be proxied.
|