npm - @arcblock/crawler - Versions diffs - 1.3.2 → 1.3.4 - Mend

@arcblock/crawler 1.3.2 → 1.3.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (7)

package/lib/cjs/crawler.js +7 -1
package/lib/cjs/services/snapshot.js +21 -6
package/lib/cjs/utils.js +30 -18
package/lib/esm/crawler.js +7 -1
package/lib/esm/services/snapshot.js +21 -6
package/lib/esm/utils.js +30 -18
package/package.json +1 -1

package/lib/cjs/crawler.js CHANGED Viewed

@@ -242,7 +242,7 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
         // get html
         try {
             const data = yield page.evaluate(() => {
-                var _a;
+                var _a, _b;
                 // add meta tag to record crawler
                 const meta = document.createElement('meta');
                 meta.name = 'arcblock-crawler';
@@ -251,6 +251,12 @@ const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], voi
                 // get title and meta description
                 const title = document.title || '';
                 const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
+                // remove all <noscript> tags from the document
+                (_b = document.querySelectorAll('noscript')) === null || _b === void 0 ? void 0 : _b.forEach((el) => {
+                    if (el === null || el === void 0 ? void 0 : el.remove) {
+                        el.remove();
+                    }
+                });
                 return {
                     html: document.documentElement.outerHTML,
                     title,

package/lib/cjs/services/snapshot.js CHANGED Viewed

@@ -44,8 +44,18 @@ function formatSnapshot(snapshot, columns) {
         }
         // format html path to string
         if (data.html) {
-            const html = yield promises_1.default.readFile(node_path_1.default.join(config_1.config.dataDir, data.html));
-            data.html = html.toString();
+            try {
+                const html = yield promises_1.default.readFile(node_path_1.default.join(config_1.config.dataDir, data.html));
+                data.html = html.toString();
+            }
+            catch (err) {
+                config_1.logger.error('Failed to read html', {
+                    err,
+                    dataDir: config_1.config.dataDir,
+                    snapshot,
+                });
+                data.html = '';
+            }
         }
         // remove sensitive options that should not be returned
         if (data.options) {
@@ -104,10 +114,15 @@ function deleteSnapshots(where_1) {
         });
         const jobIds = yield Promise.all(snapshots.map((snapshot) => __awaiter(this, void 0, void 0, function* () {
             try {
-                yield Promise.all([
-                    snapshot.html && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.html)),
-                    snapshot.screenshot && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.screenshot)),
-                ]);
+                try {
+                    yield Promise.all([
+                        snapshot.html && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.html)),
+                        snapshot.screenshot && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.screenshot)),
+                    ]);
+                }
+                catch (err) {
+                    config_1.logger.error('Failed to delete snapshot', { err, snapshot, dataDir: config_1.config.dataDir });
+                }
                 yield snapshot.destroy({ transaction: txn });
                 return snapshot.jobId;
             }

package/lib/cjs/utils.js CHANGED Viewed

@@ -70,24 +70,36 @@ const botUserAgents = [
     /Googlebot/i,
     /GPTBot/i,
     /Applebot/i,
-    // AI bots
-    /Anthropic-ai/i,
-    /Claude-Web/i,
-    /anthropic-ai-scraper/i,
-    /Google-Extended/i,
-    /GoogleOther/i,
-    /CCBot\/\d/i,
-    /Bytespider/i,
-    /BingBot/i,
-    /Baiduspider/i,
-    /Sogou/i,
-    /Perplexity/i,
-    /Cohere-ai/i,
-    /xlts-bot/i,
-    /THAAS/i,
-    /YisouSpider/i,
-    /AlibabaGroup/i,
-    /adaptive-edge-crawler/i,
+    // AI bots - condensed patterns
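+    /-AI\b/i, // Matches "-AI" followed by a word boundary, anywhere in the string
+    /-Bot\b/i, // Matches "-Bot" followed by a word boundary, anywhere in the string
+    /-Agent\b/i, // Matches "-Agent" followed by a word boundary, anywhere in the string
+    /-User\b/i, // Matches "-User" followed by a word boundary, anywhere in the string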
+    /\bAI\b/i, // Matches standalone "AI" word
+    /\bGPT/i, // GPT variants
+    /\bClaude/i, // Claude variants
+    /\bBard\b/i, // Google Bard
+    /\bGemini\b/i, // Google Gemini
+    /\bLlama\b/i, // Meta Llama
+    /\bChatGPT/i, // ChatGPT variants
+    /\bOpenAI/i, // OpenAI
+    /\bAnthropic/i, // Anthropic
+    /\bPerplexity/i, // Perplexity
+    /\bCohere/i, // Cohere
+    /\bHuggingFace/i, // Hugging Face
+    /\bStability/i, // Stability AI
+    /\bMidjourney/i, // Midjourney
+    /\bDALL-E/i, // DALL-E
+    /\bMeta-External/i, // Meta external agents
+    /\bGoogle-/i, // Google agents
+    /\bLLM/i, // LLM
+    /\bBytespider/i, // ByteDance spider
+    /\bBaiduspider/i, // Baidu spider
+    /\bYandexBot/i, // Yandex bot
+    /\bDuckDuckBot/i, // DuckDuckGo bot
+    /\bLinkedInBot/i, // LinkedIn bot
+    /\bTwitterbot/i, // Twitter bot
+    /\bCCBot/i, // Common Crawl bot
 ];
 /**
  * A default set of file extensions for static assets that do not need to be proxied.

package/lib/esm/crawler.js CHANGED Viewed

@@ -231,7 +231,7 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
         // get html
         try {
             const data = yield page.evaluate(() => {
-                var _a;
+                var _a, _b;
                 // add meta tag to record crawler
                 const meta = document.createElement('meta');
                 meta.name = 'arcblock-crawler';
@@ -240,6 +240,12 @@ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_
                 // get title and meta description
                 const title = document.title || '';
                 const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
+                // remove all <noscript> tags from the document
+                (_b = document.querySelectorAll('noscript')) === null || _b === void 0 ? void 0 : _b.forEach((el) => {
+                    if (el === null || el === void 0 ? void 0 : el.remove) {
+                        el.remove();
+                    }
+                });
                 return {
                     html: document.documentElement.outerHTML,
                     title,

package/lib/esm/services/snapshot.js CHANGED Viewed

@@ -34,8 +34,18 @@ export function formatSnapshot(snapshot, columns) {
         }
         // format html path to string
         if (data.html) {
-            const html = yield fs.readFile(path.join(config.dataDir, data.html));
-            data.html = html.toString();
+            try {
+                const html = yield fs.readFile(path.join(config.dataDir, data.html));
+                data.html = html.toString();
+            }
+            catch (err) {
+                logger.error('Failed to read html', {
+                    err,
+                    dataDir: config.dataDir,
+                    snapshot,
+                });
+                data.html = '';
+            }
         }
         // remove sensitive options that should not be returned
         if (data.options) {
@@ -94,10 +104,15 @@ export function deleteSnapshots(where_1) {
         });
         const jobIds = yield Promise.all(snapshots.map((snapshot) => __awaiter(this, void 0, void 0, function* () {
             try {
-                yield Promise.all([
-                    snapshot.html && fs.unlink(path.join(config.dataDir, snapshot.html)),
-                    snapshot.screenshot && fs.unlink(path.join(config.dataDir, snapshot.screenshot)),
-                ]);
+                try {
+                    yield Promise.all([
+                        snapshot.html && fs.unlink(path.join(config.dataDir, snapshot.html)),
+                        snapshot.screenshot && fs.unlink(path.join(config.dataDir, snapshot.screenshot)),
+                    ]);
+                }
+                catch (err) {
+                    logger.error('Failed to delete snapshot', { err, snapshot, dataDir: config.dataDir });
+                }
                 yield snapshot.destroy({ transaction: txn });
                 return snapshot.jobId;
             }

package/lib/esm/utils.js CHANGED Viewed

@@ -59,24 +59,36 @@ const botUserAgents = [
     /Googlebot/i,
     /GPTBot/i,
     /Applebot/i,
-    // AI bots
-    /Anthropic-ai/i,
-    /Claude-Web/i,
-    /anthropic-ai-scraper/i,
-    /Google-Extended/i,
-    /GoogleOther/i,
-    /CCBot\/\d/i,
-    /Bytespider/i,
-    /BingBot/i,
-    /Baiduspider/i,
-    /Sogou/i,
-    /Perplexity/i,
-    /Cohere-ai/i,
-    /xlts-bot/i,
-    /THAAS/i,
-    /YisouSpider/i,
-    /AlibabaGroup/i,
-    /adaptive-edge-crawler/i,
+    // AI bots - condensed patterns
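+    /-AI\b/i, // Matches "-AI" followed by a word boundary, anywhere in the string
+    /-Bot\b/i, // Matches "-Bot" followed by a word boundary, anywhere in the string
+    /-Agent\b/i, // Matches "-Agent" followed by a word boundary, anywhere in the string
+    /-User\b/i, // Matches "-User" followed by a word boundary, anywhere in the string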
+    /\bAI\b/i, // Matches standalone "AI" word
+    /\bGPT/i, // GPT variants
+    /\bClaude/i, // Claude variants
+    /\bBard\b/i, // Google Bard
+    /\bGemini\b/i, // Google Gemini
+    /\bLlama\b/i, // Meta Llama
+    /\bChatGPT/i, // ChatGPT variants
+    /\bOpenAI/i, // OpenAI
+    /\bAnthropic/i, // Anthropic
+    /\bPerplexity/i, // Perplexity
+    /\bCohere/i, // Cohere
+    /\bHuggingFace/i, // Hugging Face
+    /\bStability/i, // Stability AI
+    /\bMidjourney/i, // Midjourney
+    /\bDALL-E/i, // DALL-E
+    /\bMeta-External/i, // Meta external agents
+    /\bGoogle-/i, // Google agents
+    /\bLLM/i, // LLM
+    /\bBytespider/i, // ByteDance spider
+    /\bBaiduspider/i, // Baidu spider
+    /\bYandexBot/i, // Yandex bot
+    /\bDuckDuckBot/i, // DuckDuckGo bot
+    /\bLinkedInBot/i, // LinkedIn bot
+    /\bTwitterbot/i, // Twitter bot
+    /\bCCBot/i, // Common Crawl bot
 ];
 /**
  * A default set of file extensions for static assets that do not need to be proxied.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@arcblock/crawler",
-  "version": "1.3.2",
+  "version": "1.3.4",
   "main": "lib/cjs/index.js",
   "module": "lib/esm/index.js",
   "types": "lib/cjs/index.d.ts",