@blocklet/crawler 2.1.245 → 2.1.247

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,11 @@ import { PageOptions } from './types';
  export { getRelativePath };
  export declare const getPageContent: ({ url, formatPageContent }: PageOptions) => Promise<string>;
  export declare const getUrlInfoFromCache: (url: string) => Promise<any>;
- export declare const setUrlInfoToCache: ({ url, content, lastmod, }: {
+ export declare const setUrlInfoToCache: ({ url, content, lastmod, nextDate, }: {
  url: string;
  content: string;
  lastmod?: string;
+ nextDate?: string;
  }) => Promise<any>;
  export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, autoCloseBrowserCount, }: {
  urls: string[] | string;
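
The new optional nextDate field lets a caller pin the next crawl time explicitly. A minimal sketch of a call against the updated declaration, assuming setUrlInfoToCache is reachable from the package entry point (the exact import path is not shown in this diff):

```js
// Sketch only: the import path is an assumption, not taken from this diff.
const { setUrlInfoToCache } = require('@blocklet/crawler');

async function cacheExamplePage() {
  await setUrlInfoToCache({
    url: 'https://example.com/docs/getting-started',
    content: '<html>...</html>',
    lastmod: '2024-06-01T00:00:00.000Z',
    // New in 2.1.247: optional; when omitted it is derived from lastmod.
    nextDate: '2024-06-15T00:00:00.000Z',
  });
}
```
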
@@ -71,18 +71,41 @@ const getUrlInfoFromCache = async url => {
  return cache;
  };
  exports.getUrlInfoFromCache = getUrlInfoFromCache;
+ function getNextCrawlDate(lastmod) {
+ const now = /* @__PURE__ */new Date();
+ const lastModTime = lastmod ? new Date(lastmod).getTime() || 0 : 0;
+ const daysDiff = Math.max(0, (now.getTime() - lastModTime) / (24 * 60 * 60 * 1e3));
+ const CRAWL_INTERVALS = /* @__PURE__ */new Map([[[-1, 0], 1],
+ // no lastmod
+ [[0, 3], 1],
+ // active within 3 days
+ [[3, 7], 3],
+ // active within 7 days
+ [[7, 30], 14],
+ // active within 30 days
+ [[30, 90], 30],
+ // active within 90 days
+ [[90, Infinity], 365]
+ // inactive long term
+ ]);
+ const interval = Array.from(CRAWL_INTERVALS.entries()).find(([[min, max]]) => lastmod ? daysDiff > min && daysDiff <= max : min === -1)?.[1] || 90;
+ return new Date(now.getTime() + interval * 24 * 60 * 60 * 1e3).toISOString();
+ }
  const setUrlInfoToCache = async ({
  url,
  content,
- lastmod
+ lastmod,
+ nextDate
  }) => {
  if (!content || !url) {
  return;
  }
+ const lastmodValue = lastmod || ( /* @__PURE__ */new Date()).toISOString();
  return await _utils.useCache.set((0, _utils.getRelativePath)(url), {
  content,
- lastmod: lastmod || ( /* @__PURE__ */new Date()).toISOString(),
- updatedAt: ( /* @__PURE__ */new Date()).toISOString()
+ lastmod: lastmodValue,
+ updatedAt: ( /* @__PURE__ */new Date()).toISOString(),
+ nextDate: nextDate || getNextCrawlDate(lastmodValue)
  });
  };
  exports.setUrlInfoToCache = setUrlInfoToCache;
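
For readers following the new getNextCrawlDate logic, a hand-traced example (illustrative values only, not package code):

```js
// Hand-traced example: a page last modified 10 days ago falls into the
// (7, 30] bucket, so the crawl interval is 14 days.
const now = new Date('2024-06-15T00:00:00.000Z');
const lastmod = '2024-06-05T00:00:00.000Z';
const daysDiff = (now.getTime() - new Date(lastmod).getTime()) / (24 * 60 * 60 * 1e3); // 10
const nextDate = new Date(now.getTime() + 14 * 24 * 60 * 60 * 1e3).toISOString();
// => '2024-06-29T00:00:00.000Z'
// Without a lastmod the (-1, 0] bucket applies (revisit in 1 day); if no bucket
// matches, the fallback interval is 90 days.
```
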
@@ -90,7 +113,7 @@ const crawlUrl = async ({
  urls,
  lastmodMap,
  formatPageContent,
- autoCloseBrowserCount = 30
+ autoCloseBrowserCount = 50
  }) => {
  if (typeof urls === "string") {
  urls = [urls];
@@ -104,7 +127,7 @@ const crawlUrl = async ({
  try {
  if (index % autoCloseBrowserCount === 0) {
  await (0, _utils.closeBrowser)({
- trimCache: false
+ trimCache: index % (autoCloseBrowserCount * 5) === 0
  });
  }
  const canCrawl = await (0, _utils.isAcceptCrawler)(url);
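
The browser is still recycled every autoCloseBrowserCount URLs (default now 50), but the cache trim only runs on every fifth recycle. A quick sketch of which crawl indexes trigger what, assuming the defaults:

```js
// Illustrative only, using the new default autoCloseBrowserCount of 50:
//   index % 50  === 0 -> close and relaunch the browser
//   index % 250 === 0 -> the close also trims the Puppeteer cache
for (const index of [0, 50, 100, 250, 500]) {
  console.log(index, {
    restartBrowser: index % 50 === 0,
    trimCache: index % (50 * 5) === 0,
  });
}
// 0 -> restart + trim, 50/100 -> restart only, 250/500 -> restart + trim
```
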
@@ -114,10 +137,12 @@ const crawlUrl = async ({
  formatPageContent
  });
  if (pageContent) {
+ const lastmodValue = lastmodMap?.get(url) || ( /* @__PURE__ */new Date()).toISOString();
  await setUrlInfoToCache({
  url,
  content: pageContent,
- lastmod: lastmodMap?.get(url)
+ lastmod: lastmodValue,
+ nextDate: getNextCrawlDate(lastmodValue)
  });
  _utils.logger.info(`Crawler[${index}] ${url} success`);
  } else if (retryCount < 3) {
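
On the read side, a caller could use the cached nextDate to decide whether a URL is due for another crawl. A hypothetical helper (not part of the package) built on the exported getUrlInfoFromCache:

```js
// Hypothetical caller-side check (not part of the package), assuming the
// cache entry shape written by setUrlInfoToCache above.
async function isDueForRecrawl(url) {
  const cached = await getUrlInfoFromCache(url);
  if (!cached?.nextDate) return true; // never cached, or a pre-2.1.247 entry
  return new Date(cached.nextDate).getTime() <= Date.now();
}
```
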
@@ -245,10 +270,6 @@ const crawlBlocklet = async () => {
  } catch (error) {}
  }
  };
- removeElements("script");
- removeElements("style");
- removeElements("link");
- removeElements("noscript");
  removeElements('[id="uploader-container"]');
  removeElements('[class^="uppy-"]');
  removeElements('[id="point-up-component"]');
@@ -262,7 +283,9 @@ const crawlBlocklet = async () => {
  }
  });
  _utils.logger.info(...crawlerLogText("success"));
- await (0, _utils.closeBrowser)();
+ await (0, _utils.closeBrowser)({
+ trimCache: true
+ });
  };
  exports.crawlBlocklet = crawlBlocklet;
  const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
@@ -280,7 +303,7 @@ const initCronCrawlBlocklet = ({
  time,
  fn: (0, _debounce.default)(crawlBlocklet),
  options: {
- runOnInit: true,
+ runOnInit: false,
  ...options
  }
  }],
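
With runOnInit now defaulting to false, the cron job no longer crawls immediately on startup. Because the caller-supplied options are spread after the default, the old behaviour can still be requested explicitly; a sketch, assuming the public signature is unchanged:

```js
// Sketch only: the cron expression is a placeholder.
initCronCrawlBlocklet({
  time: '0 3 * * *',
  options: { runOnInit: true }, // overrides the new runOnInit: false default
});
```
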
@@ -10,7 +10,7 @@ export declare const getBrowser: () => Promise<any>;
  export declare const CRAWLER_FLAG = "x-crawler";
  export declare const isSelfCrawler: (req: any) => boolean;
  export declare const initPage: ({ abortResourceTypes, }?: {
- abortResourceTypes?: never[] | undefined;
+ abortResourceTypes?: string[] | undefined;
  }) => Promise<any>;
  export declare const getDefaultRobotsUrl: (req: any) => string;
  export declare const getDefaultSitemapUrl: (req: any) => string;
@@ -111,6 +111,16 @@ const closeBrowser = async ({
  clearCheckBrowserTimer();
  if (trimCache) {
  await _puppeteer.default.trimCache();
+ const {
+ cacheDirectory,
+ temporaryDirectory
+ } = getPuppeteerrc();
+ if (cacheDirectory) {
+ _fsExtra.default.emptyDirSync(cacheDirectory);
+ }
+ if (temporaryDirectory) {
+ _fsExtra.default.emptyDirSync(temporaryDirectory);
+ }
  }
  logger.info("Close browser success");
  if (global.gc) {
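
trimCache now also wipes Puppeteer's configured cache and temporary directories via fs-extra. For reference, emptyDirSync removes a directory's contents but keeps (or creates) the directory itself:

```js
// Illustrative only: the path is a placeholder, not the package's real directory.
const fs = require('fs-extra');
fs.emptyDirSync('/tmp/puppeteer-cache-example'); // contents removed, directory kept
```
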
@@ -147,15 +157,11 @@ const getBrowser = async () => {
  headless: true,
  args: [
  // docs: https://peter.sh/experiments/chromium-command-line-switches/
- "--no-first-run",
- // '--no-startup-window',
- "--hide-scrollbars", "--no-sandbox", "--no-zygote",
- // '--single-process',
- "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas", "--disable-extensions", "--js-flags=--max_old_space_size=512",
+ "--no-first-run", "--hide-scrollbars", "--no-sandbox", "--no-zygote", "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas", "--disable-extensions", "--js-flags=--max_old_space_size=512",
  // limit V8 memory
- "--disable-background-networking", "--disable-default-apps", "--disable-web-security",
- // allow cross-origin requests
- "--disable-software-rasterizer", "--disable-crash-reporter"]
+ "--disable-background-networking", "--disable-default-apps",
+ // '--disable-web-security', // allow cross-origin requests
+ "--disable-software-rasterizer", "--disable-crash-reporter", "--disable-service-workers", "--no-startup-window", "--single-process", "--disable-gpu", "--disable-notifications", "--disable-infobars"]
  });
  logger.info("Launch browser success");
  const browserWSEndpoint = await browser.wsEndpoint();
@@ -201,8 +207,19 @@ const isSelfCrawler = req => {
  };
  exports.isSelfCrawler = isSelfCrawler;
  const initPage = async ({
- abortResourceTypes = []
- // ['image', 'stylesheet', 'font']
+ abortResourceTypes = ["image",
+ // images
+ "media",
+ // media files
+ "font",
+ // fonts
+ "websocket",
+ // websocket connections
+ "manifest",
+ // manifest files
+ "other"
+ // other resources
+ ]
  } = {}) => {
  const browser2 = await getBrowser();
  const page = await browser2.newPage();
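
Since the declaration now accepts string[], the heavier default block list can still be overridden per call. A sketch, assuming initPage is imported from the package's utils module and using Puppeteer's request resource type names:

```js
// Sketch only: also abort stylesheets, for example, while allowing websockets.
async function openLightweightPage() {
  const page = await initPage({
    abortResourceTypes: ['image', 'media', 'font', 'stylesheet'],
  });
  return page;
}
```
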
@@ -3,10 +3,11 @@ import { PageOptions } from './types';
  export { getRelativePath };
  export declare const getPageContent: ({ url, formatPageContent }: PageOptions) => Promise<string>;
  export declare const getUrlInfoFromCache: (url: string) => Promise<any>;
- export declare const setUrlInfoToCache: ({ url, content, lastmod, }: {
+ export declare const setUrlInfoToCache: ({ url, content, lastmod, nextDate, }: {
  url: string;
  content: string;
  lastmod?: string;
+ nextDate?: string;
  }) => Promise<any>;
  export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, autoCloseBrowserCount, }: {
  urls: string[] | string;
@@ -58,25 +58,51 @@ export const getUrlInfoFromCache = async (url) => {
  const cache = await useCache.get(getRelativePath(url));
  return cache;
  };
+ function getNextCrawlDate(lastmod) {
+ const now = /* @__PURE__ */ new Date();
+ const lastModTime = lastmod ? new Date(lastmod).getTime() || 0 : 0;
+ const daysDiff = Math.max(0, (now.getTime() - lastModTime) / (24 * 60 * 60 * 1e3));
+ const CRAWL_INTERVALS = /* @__PURE__ */ new Map([
+ [[-1, 0], 1],
+ // no lastmod
+ [[0, 3], 1],
+ // active within 3 days
+ [[3, 7], 3],
+ // active within 7 days
+ [[7, 30], 14],
+ // active within 30 days
+ [[30, 90], 30],
+ // active within 90 days
+ [[90, Infinity], 365]
+ // inactive long term
+ ]);
+ const interval = Array.from(CRAWL_INTERVALS.entries()).find(
+ ([[min, max]]) => lastmod ? daysDiff > min && daysDiff <= max : min === -1
+ )?.[1] || 90;
+ return new Date(now.getTime() + interval * 24 * 60 * 60 * 1e3).toISOString();
+ }
  export const setUrlInfoToCache = async ({
  url,
  content,
- lastmod
+ lastmod,
+ nextDate
  }) => {
  if (!content || !url) {
  return;
  }
+ const lastmodValue = lastmod || (/* @__PURE__ */ new Date()).toISOString();
  return await useCache.set(getRelativePath(url), {
  content,
- lastmod: lastmod || (/* @__PURE__ */ new Date()).toISOString(),
- updatedAt: (/* @__PURE__ */ new Date()).toISOString()
+ lastmod: lastmodValue,
+ updatedAt: (/* @__PURE__ */ new Date()).toISOString(),
+ nextDate: nextDate || getNextCrawlDate(lastmodValue)
  });
  };
  export const crawlUrl = async ({
  urls,
  lastmodMap,
  formatPageContent,
- autoCloseBrowserCount = 30
+ autoCloseBrowserCount = 50
  }) => {
  if (typeof urls === "string") {
  urls = [urls];
@@ -86,7 +112,7 @@ export const crawlUrl = async ({
  try {
  if (index % autoCloseBrowserCount === 0) {
  await closeBrowser({
- trimCache: false
+ trimCache: index % (autoCloseBrowserCount * 5) === 0
  });
  }
  const canCrawl = await isAcceptCrawler(url);
@@ -96,7 +122,13 @@ export const crawlUrl = async ({
  formatPageContent
  });
  if (pageContent) {
- await setUrlInfoToCache({ url, content: pageContent, lastmod: lastmodMap?.get(url) });
+ const lastmodValue = lastmodMap?.get(url) || (/* @__PURE__ */ new Date()).toISOString();
+ await setUrlInfoToCache({
+ url,
+ content: pageContent,
+ lastmod: lastmodValue,
+ nextDate: getNextCrawlDate(lastmodValue)
+ });
  logger.info(`Crawler[${index}] ${url} success`);
  } else if (retryCount < 3) {
  retryCount++;
@@ -213,10 +245,6 @@ export const crawlBlocklet = async () => {
  }
  }
  };
- removeElements("script");
- removeElements("style");
- removeElements("link");
- removeElements("noscript");
  removeElements('[id="uploader-container"]');
  removeElements('[class^="uppy-"]');
  removeElements('[id="point-up-component"]');
@@ -230,7 +258,9 @@ export const crawlBlocklet = async () => {
  }
  });
  logger.info(...crawlerLogText("success"));
- await closeBrowser();
+ await closeBrowser({
+ trimCache: true
+ });
  };
  const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
  let cronCrawlBlockletJob = null;
@@ -247,7 +277,7 @@ export const initCronCrawlBlocklet = ({
  name: CRON_CRAWL_BLOCKLET_KEY,
  time,
  fn: debounce(crawlBlocklet),
- options: { runOnInit: true, ...options }
+ options: { runOnInit: false, ...options }
  }
  ],
  onError: (err) => {
@@ -10,7 +10,7 @@ export declare const getBrowser: () => Promise<any>;
  export declare const CRAWLER_FLAG = "x-crawler";
  export declare const isSelfCrawler: (req: any) => boolean;
  export declare const initPage: ({ abortResourceTypes, }?: {
- abortResourceTypes?: never[] | undefined;
+ abortResourceTypes?: string[] | undefined;
  }) => Promise<any>;
  export declare const getDefaultRobotsUrl: (req: any) => string;
  export declare const getDefaultSitemapUrl: (req: any) => string;
@@ -54,6 +54,13 @@ export const closeBrowser = async ({ trimCache = true } = {}) => {
  clearCheckBrowserTimer();
  if (trimCache) {
  await puppeteer.trimCache();
+ const { cacheDirectory, temporaryDirectory } = getPuppeteerrc();
+ if (cacheDirectory) {
+ fs.emptyDirSync(cacheDirectory);
+ }
+ if (temporaryDirectory) {
+ fs.emptyDirSync(temporaryDirectory);
+ }
  }
  logger.info("Close browser success");
  if (global.gc) {
@@ -90,11 +97,9 @@ export const getBrowser = async () => {
  args: [
  // docs: https://peter.sh/experiments/chromium-command-line-switches/
  "--no-first-run",
- // '--no-startup-window',
  "--hide-scrollbars",
  "--no-sandbox",
  "--no-zygote",
- // '--single-process',
  "--disable-setuid-sandbox",
  "--disable-gpu",
  "--disable-dev-shm-usage",
@@ -105,10 +110,15 @@ export const getBrowser = async () => {
  // limit V8 memory
  "--disable-background-networking",
  "--disable-default-apps",
- "--disable-web-security",
- // allow cross-origin requests
+ // '--disable-web-security', // allow cross-origin requests
  "--disable-software-rasterizer",
- "--disable-crash-reporter"
+ "--disable-crash-reporter",
+ "--disable-service-workers",
+ "--no-startup-window",
+ "--single-process",
+ "--disable-gpu",
+ "--disable-notifications",
+ "--disable-infobars"
  ]
  });
  logger.info("Launch browser success");
@@ -153,8 +163,20 @@ export const isSelfCrawler = (req) => {
  return req.get(CRAWLER_FLAG) === "true" || `${ua}`.toLowerCase().indexOf("headless") !== -1;
  };
  export const initPage = async ({
- abortResourceTypes = []
- // ['image', 'stylesheet', 'font']
+ abortResourceTypes = [
+ "image",
+ // images
+ "media",
+ // media files
+ "font",
+ // fonts
+ "websocket",
+ // websocket connections
+ "manifest",
+ // manifest files
+ "other"
+ // other resources
+ ]
  } = {}) => {
  const browser2 = await getBrowser();
  const page = await browser2.newPage();
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@blocklet/crawler",
- "version": "2.1.245",
+ "version": "2.1.247",
  "description": "blocklet crawler lib",
  "publishConfig": {
  "access": "public"