@arcblock/crawler 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -242,7 +242,7 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
242
242
  // get html
243
243
  try {
244
244
  const data = yield page.evaluate(() => {
245
- var _a;
245
+ var _a, _b;
246
246
  // add meta tag to record crawler
247
247
  const meta = document.createElement('meta');
248
248
  meta.name = 'arcblock-crawler';
@@ -251,6 +251,12 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
251
251
  // get title and meta description
252
252
  const title = document.title || '';
253
253
  const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
254
+ // remove all <noscript> tags from the document
255
+ (_b = document.querySelectorAll('noscript')) === null || _b === void 0 ? void 0 : _b.forEach((el) => {
256
+ if (el === null || el === void 0 ? void 0 : el.remove) {
257
+ el.remove();
258
+ }
259
+ });
254
260
  return {
255
261
  html: document.documentElement.outerHTML,
256
262
  title,
package/lib/cjs/utils.js CHANGED
@@ -70,24 +70,36 @@ const botUserAgents = [
70
70
  /Googlebot/i,
71
71
  /GPTBot/i,
72
72
  /Applebot/i,
73
- // AI bots
74
- /Anthropic-ai/i,
75
- /Claude-Web/i,
76
- /anthropic-ai-scraper/i,
77
- /Google-Extended/i,
78
- /GoogleOther/i,
79
- /CCBot\/\d/i,
80
- /Bytespider/i,
81
- /BingBot/i,
82
- /Baiduspider/i,
83
- /Sogou/i,
84
- /Perplexity/i,
85
- /Cohere-ai/i,
86
- /xlts-bot/i,
87
- /THAAS/i,
88
- /YisouSpider/i,
89
- /AlibabaGroup/i,
90
- /adaptive-edge-crawler/i,
73
+ // AI bots - condensed patterns
74
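+ /-AI\b/i, // Matches "-AI" followed by a word boundary, anywhere in the string
75
+ /-Bot\b/i, // Matches "-Bot" followed by a word boundary, anywhere in the string
76
+ /-Agent\b/i, // Matches "-Agent" followed by a word boundary, anywhere in the string
77
+ /-User\b/i, // Matches "-User" followed by a word boundary, anywhere in the string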
78
+ /\bAI\b/i, // Matches standalone "AI" word
79
+ /\bGPT/i, // GPT variants
80
+ /\bClaude/i, // Claude variants
81
+ /\bBard\b/i, // Google Bard
82
+ /\bGemini\b/i, // Google Gemini
83
+ /\bLlama\b/i, // Meta Llama
84
+ /\bChatGPT/i, // ChatGPT variants
85
+ /\bOpenAI/i, // OpenAI
86
+ /\bAnthropic/i, // Anthropic
87
+ /\bPerplexity/i, // Perplexity
88
+ /\bCohere/i, // Cohere
89
+ /\bHuggingFace/i, // Hugging Face
90
+ /\bStability/i, // Stability AI
91
+ /\bMidjourney/i, // Midjourney
92
+ /\bDALL-E/i, // DALL-E
93
+ /\bMeta-External/i, // Meta external agents
94
+ /\bGoogle-/i, // Google agents
95
+ /\bLLM/i, // LLM
96
+ /\bBytespider/i, // ByteDance spider
97
+ /\bBaiduspider/i, // Baidu spider
98
+ /\bYandexBot/i, // Yandex bot
99
+ /\bDuckDuckBot/i, // DuckDuckGo bot
100
+ /\bLinkedInBot/i, // LinkedIn bot
101
+ /\bTwitterbot/i, // Twitter bot
102
+ /\bCCBot/i, // Common Crawl bot
91
103
  ];
92
104
  /**
93
105
  * A default set of file extensions for static assets that do not need to be proxied.
@@ -231,7 +231,7 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
231
231
  // get html
232
232
  try {
233
233
  const data = yield page.evaluate(() => {
234
- var _a;
234
+ var _a, _b;
235
235
  // add meta tag to record crawler
236
236
  const meta = document.createElement('meta');
237
237
  meta.name = 'arcblock-crawler';
@@ -240,6 +240,12 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
240
240
  // get title and meta description
241
241
  const title = document.title || '';
242
242
  const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
243
+ // remove all <noscript> tags from the document
244
+ (_b = document.querySelectorAll('noscript')) === null || _b === void 0 ? void 0 : _b.forEach((el) => {
245
+ if (el === null || el === void 0 ? void 0 : el.remove) {
246
+ el.remove();
247
+ }
248
+ });
243
249
  return {
244
250
  html: document.documentElement.outerHTML,
245
251
  title,
package/lib/esm/utils.js CHANGED
@@ -59,24 +59,36 @@ const botUserAgents = [
59
59
  /Googlebot/i,
60
60
  /GPTBot/i,
61
61
  /Applebot/i,
62
- // AI bots
63
- /Anthropic-ai/i,
64
- /Claude-Web/i,
65
- /anthropic-ai-scraper/i,
66
- /Google-Extended/i,
67
- /GoogleOther/i,
68
- /CCBot\/\d/i,
69
- /Bytespider/i,
70
- /BingBot/i,
71
- /Baiduspider/i,
72
- /Sogou/i,
73
- /Perplexity/i,
74
- /Cohere-ai/i,
75
- /xlts-bot/i,
76
- /THAAS/i,
77
- /YisouSpider/i,
78
- /AlibabaGroup/i,
79
- /adaptive-edge-crawler/i,
62
+ // AI bots - condensed patterns
63
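+ /-AI\b/i, // Matches "-AI" followed by a word boundary, anywhere in the string
64
+ /-Bot\b/i, // Matches "-Bot" followed by a word boundary, anywhere in the string
65
+ /-Agent\b/i, // Matches "-Agent" followed by a word boundary, anywhere in the string
66
+ /-User\b/i, // Matches "-User" followed by a word boundary, anywhere in the string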
67
+ /\bAI\b/i, // Matches standalone "AI" word
68
+ /\bGPT/i, // GPT variants
69
+ /\bClaude/i, // Claude variants
70
+ /\bBard\b/i, // Google Bard
71
+ /\bGemini\b/i, // Google Gemini
72
+ /\bLlama\b/i, // Meta Llama
73
+ /\bChatGPT/i, // ChatGPT variants
74
+ /\bOpenAI/i, // OpenAI
75
+ /\bAnthropic/i, // Anthropic
76
+ /\bPerplexity/i, // Perplexity
77
+ /\bCohere/i, // Cohere
78
+ /\bHuggingFace/i, // Hugging Face
79
+ /\bStability/i, // Stability AI
80
+ /\bMidjourney/i, // Midjourney
81
+ /\bDALL-E/i, // DALL-E
82
+ /\bMeta-External/i, // Meta external agents
83
+ /\bGoogle-/i, // Google agents
84
+ /\bLLM/i, // LLM
85
+ /\bBytespider/i, // ByteDance spider
86
+ /\bBaiduspider/i, // Baidu spider
87
+ /\bYandexBot/i, // Yandex bot
88
+ /\bDuckDuckBot/i, // DuckDuckGo bot
89
+ /\bLinkedInBot/i, // LinkedIn bot
90
+ /\bTwitterbot/i, // Twitter bot
91
+ /\bCCBot/i, // Common Crawl bot
80
92
  ];
81
93
  /**
82
94
  * A default set of file extensions for static assets that do not need to be proxied.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arcblock/crawler",
3
- "version": "1.3.2",
3
+ "version": "1.3.3",
4
4
  "main": "lib/cjs/index.js",
5
5
  "module": "lib/esm/index.js",
6
6
  "types": "lib/cjs/index.d.ts",