@arcblock/crawler 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -242,7 +242,7 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
242
242
  // get html
243
243
  try {
244
244
  const data = yield page.evaluate(() => {
245
- var _a;
245
+ var _a, _b;
246
246
  // add meta tag to record crawler
247
247
  const meta = document.createElement('meta');
248
248
  meta.name = 'arcblock-crawler';
@@ -251,6 +251,12 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
251
251
  // get title and meta description
252
252
  const title = document.title || '';
253
253
  const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
254
+ // remove all <noscript> tags from the document
255
+ (_b = document.querySelectorAll('noscript')) === null || _b === void 0 ? void 0 : _b.forEach((el) => {
256
+ if (el === null || el === void 0 ? void 0 : el.remove) {
257
+ el.remove();
258
+ }
259
+ });
254
260
  return {
255
261
  html: document.documentElement.outerHTML,
256
262
  title,
@@ -44,8 +44,18 @@ function formatSnapshot(snapshot, columns) {
44
44
  }
45
45
  // format html path to string
46
46
  if (data.html) {
47
- const html = yield promises_1.default.readFile(node_path_1.default.join(config_1.config.dataDir, data.html));
48
- data.html = html.toString();
47
+ try {
48
+ const html = yield promises_1.default.readFile(node_path_1.default.join(config_1.config.dataDir, data.html));
49
+ data.html = html.toString();
50
+ }
51
+ catch (err) {
52
+ config_1.logger.error('Failed to read html', {
53
+ err,
54
+ dataDir: config_1.config.dataDir,
55
+ snapshot,
56
+ });
57
+ data.html = '';
58
+ }
49
59
  }
50
60
  // remove sensitive options that should not be returned
51
61
  if (data.options) {
@@ -104,10 +114,15 @@ function deleteSnapshots(where_1) {
104
114
  });
105
115
  const jobIds = yield Promise.all(snapshots.map((snapshot) => __awaiter(this, void 0, void 0, function* () {
106
116
  try {
107
- yield Promise.all([
108
- snapshot.html && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.html)),
109
- snapshot.screenshot && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.screenshot)),
110
- ]);
117
+ try {
118
+ yield Promise.all([
119
+ snapshot.html && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.html)),
120
+ snapshot.screenshot && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.screenshot)),
121
+ ]);
122
+ }
123
+ catch (err) {
124
+ config_1.logger.error('Failed to delete snapshot', { err, snapshot, dataDir: config_1.config.dataDir });
125
+ }
111
126
  yield snapshot.destroy({ transaction: txn });
112
127
  return snapshot.jobId;
113
128
  }
package/lib/cjs/utils.js CHANGED
@@ -70,24 +70,36 @@ const botUserAgents = [
70
70
  /Googlebot/i,
71
71
  /GPTBot/i,
72
72
  /Applebot/i,
73
- // AI bots
74
- /Anthropic-ai/i,
75
- /Claude-Web/i,
76
- /anthropic-ai-scraper/i,
77
- /Google-Extended/i,
78
- /GoogleOther/i,
79
- /CCBot\/\d/i,
80
- /Bytespider/i,
81
- /BingBot/i,
82
- /Baiduspider/i,
83
- /Sogou/i,
84
- /Perplexity/i,
85
- /Cohere-ai/i,
86
- /xlts-bot/i,
87
- /THAAS/i,
88
- /YisouSpider/i,
89
- /AlibabaGroup/i,
90
- /adaptive-edge-crawler/i,
73
+ // AI bots - condensed patterns
74
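+ /-AI\b/i, // Matches "-AI" followed by a word boundary, anywhere in the string
75
+ /-Bot\b/i, // Matches "-Bot" followed by a word boundary, anywhere in the string
76
+ /-Agent\b/i, // Matches "-Agent" followed by a word boundary, anywhere in the string
77
+ /-User\b/i, // Matches "-User" followed by a word boundary, anywhere in the string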
78
+ /\bAI\b/i, // Matches standalone "AI" word
79
+ /\bGPT/i, // GPT variants
80
+ /\bClaude/i, // Claude variants
81
+ /\bBard\b/i, // Google Bard
82
+ /\bGemini\b/i, // Google Gemini
83
+ /\bLlama\b/i, // Meta Llama
84
+ /\bChatGPT/i, // ChatGPT variants
85
+ /\bOpenAI/i, // OpenAI
86
+ /\bAnthropic/i, // Anthropic
87
+ /\bPerplexity/i, // Perplexity
88
+ /\bCohere/i, // Cohere
89
+ /\bHuggingFace/i, // Hugging Face
90
+ /\bStability/i, // Stability AI
91
+ /\bMidjourney/i, // Midjourney
92
+ /\bDALL-E/i, // DALL-E
93
+ /\bMeta-External/i, // Meta external agents
94
+ /\bGoogle-/i, // Google agents
95
+ /\bLLM/i, // LLM
96
+ /\bBytespider/i, // ByteDance spider
97
+ /\bBaiduspider/i, // Baidu spider
98
+ /\bYandexBot/i, // Yandex bot
99
+ /\bDuckDuckBot/i, // DuckDuckGo bot
100
+ /\bLinkedInBot/i, // LinkedIn bot
101
+ /\bTwitterbot/i, // Twitter bot
102
+ /\bCCBot/i, // Common Crawl bot
91
103
  ];
92
104
  /**
93
105
  * A default set of file extensions for static assets that do not need to be proxied.
@@ -231,7 +231,7 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
231
231
  // get html
232
232
  try {
233
233
  const data = yield page.evaluate(() => {
234
- var _a;
234
+ var _a, _b;
235
235
  // add meta tag to record crawler
236
236
  const meta = document.createElement('meta');
237
237
  meta.name = 'arcblock-crawler';
@@ -240,6 +240,12 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
240
240
  // get title and meta description
241
241
  const title = document.title || '';
242
242
  const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
243
+ // remove all <noscript> tags from the document
244
+ (_b = document.querySelectorAll('noscript')) === null || _b === void 0 ? void 0 : _b.forEach((el) => {
245
+ if (el === null || el === void 0 ? void 0 : el.remove) {
246
+ el.remove();
247
+ }
248
+ });
243
249
  return {
244
250
  html: document.documentElement.outerHTML,
245
251
  title,
@@ -34,8 +34,18 @@ export function formatSnapshot(snapshot, columns) {
34
34
  }
35
35
  // format html path to string
36
36
  if (data.html) {
37
- const html = yield fs.readFile(path.join(config.dataDir, data.html));
38
- data.html = html.toString();
37
+ try {
38
+ const html = yield fs.readFile(path.join(config.dataDir, data.html));
39
+ data.html = html.toString();
40
+ }
41
+ catch (err) {
42
+ logger.error('Failed to read html', {
43
+ err,
44
+ dataDir: config.dataDir,
45
+ snapshot,
46
+ });
47
+ data.html = '';
48
+ }
39
49
  }
40
50
  // remove sensitive options that should not be returned
41
51
  if (data.options) {
@@ -94,10 +104,15 @@ export function deleteSnapshots(where_1) {
94
104
  });
95
105
  const jobIds = yield Promise.all(snapshots.map((snapshot) => __awaiter(this, void 0, void 0, function* () {
96
106
  try {
97
- yield Promise.all([
98
- snapshot.html && fs.unlink(path.join(config.dataDir, snapshot.html)),
99
- snapshot.screenshot && fs.unlink(path.join(config.dataDir, snapshot.screenshot)),
100
- ]);
107
+ try {
108
+ yield Promise.all([
109
+ snapshot.html && fs.unlink(path.join(config.dataDir, snapshot.html)),
110
+ snapshot.screenshot && fs.unlink(path.join(config.dataDir, snapshot.screenshot)),
111
+ ]);
112
+ }
113
+ catch (err) {
114
+ logger.error('Failed to delete snapshot', { err, snapshot, dataDir: config.dataDir });
115
+ }
101
116
  yield snapshot.destroy({ transaction: txn });
102
117
  return snapshot.jobId;
103
118
  }
package/lib/esm/utils.js CHANGED
@@ -59,24 +59,36 @@ const botUserAgents = [
59
59
  /Googlebot/i,
60
60
  /GPTBot/i,
61
61
  /Applebot/i,
62
- // AI bots
63
- /Anthropic-ai/i,
64
- /Claude-Web/i,
65
- /anthropic-ai-scraper/i,
66
- /Google-Extended/i,
67
- /GoogleOther/i,
68
- /CCBot\/\d/i,
69
- /Bytespider/i,
70
- /BingBot/i,
71
- /Baiduspider/i,
72
- /Sogou/i,
73
- /Perplexity/i,
74
- /Cohere-ai/i,
75
- /xlts-bot/i,
76
- /THAAS/i,
77
- /YisouSpider/i,
78
- /AlibabaGroup/i,
79
- /adaptive-edge-crawler/i,
62
+ // AI bots - condensed patterns
63
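+ /-AI\b/i, // Matches "-AI" followed by a word boundary, anywhere in the string
64
+ /-Bot\b/i, // Matches "-Bot" followed by a word boundary, anywhere in the string
65
+ /-Agent\b/i, // Matches "-Agent" followed by a word boundary, anywhere in the string
66
+ /-User\b/i, // Matches "-User" followed by a word boundary, anywhere in the string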
67
+ /\bAI\b/i, // Matches standalone "AI" word
68
+ /\bGPT/i, // GPT variants
69
+ /\bClaude/i, // Claude variants
70
+ /\bBard\b/i, // Google Bard
71
+ /\bGemini\b/i, // Google Gemini
72
+ /\bLlama\b/i, // Meta Llama
73
+ /\bChatGPT/i, // ChatGPT variants
74
+ /\bOpenAI/i, // OpenAI
75
+ /\bAnthropic/i, // Anthropic
76
+ /\bPerplexity/i, // Perplexity
77
+ /\bCohere/i, // Cohere
78
+ /\bHuggingFace/i, // Hugging Face
79
+ /\bStability/i, // Stability AI
80
+ /\bMidjourney/i, // Midjourney
81
+ /\bDALL-E/i, // DALL-E
82
+ /\bMeta-External/i, // Meta external agents
83
+ /\bGoogle-/i, // Google agents
84
+ /\bLLM/i, // LLM
85
+ /\bBytespider/i, // ByteDance spider
86
+ /\bBaiduspider/i, // Baidu spider
87
+ /\bYandexBot/i, // Yandex bot
88
+ /\bDuckDuckBot/i, // DuckDuckGo bot
89
+ /\bLinkedInBot/i, // LinkedIn bot
90
+ /\bTwitterbot/i, // Twitter bot
91
+ /\bCCBot/i, // Common Crawl bot
80
92
  ];
81
93
  /**
82
94
  * A default set of file extensions for static assets that do not need to be proxied.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arcblock/crawler",
3
- "version": "1.3.2",
3
+ "version": "1.3.4",
4
4
  "main": "lib/cjs/index.js",
5
5
  "module": "lib/esm/index.js",
6
6
  "types": "lib/cjs/index.d.ts",