@blocklet/crawler 2.1.245 → 2.1.247

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,11 @@ import { PageOptions } from './types';
  export { getRelativePath };
  export declare const getPageContent: ({ url, formatPageContent }: PageOptions) => Promise<string>;
  export declare const getUrlInfoFromCache: (url: string) => Promise<any>;
- export declare const setUrlInfoToCache: ({ url, content, lastmod, }: {
+ export declare const setUrlInfoToCache: ({ url, content, lastmod, nextDate, }: {
  url: string;
  content: string;
  lastmod?: string;
+ nextDate?: string;
  }) => Promise<any>;
  export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, autoCloseBrowserCount, }: {
  urls: string[] | string;
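
The new optional nextDate field lets a caller pin the next crawl time explicitly. A minimal sketch of a call against the updated declaration, assuming setUrlInfoToCache is reachable from the package entry point (the exact import path is not shown in this diff):

```js
// Sketch only: the import path is an assumption, not taken from this diff.
const { setUrlInfoToCache } = require('@blocklet/crawler');

async function cacheExamplePage() {
  await setUrlInfoToCache({
    url: 'https://example.com/docs/getting-started',
    content: '<html>...</html>',
    lastmod: '2024-06-01T00:00:00.000Z',
    // New in 2.1.247: optional; when omitted it is derived from lastmod.
    nextDate: '2024-06-15T00:00:00.000Z',
  });
}
```
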
@@ -71,18 +71,41 @@ const getUrlInfoFromCache = async url => {
  return cache;
  };
  exports.getUrlInfoFromCache = getUrlInfoFromCache;
+ function getNextCrawlDate(lastmod) {
+ const now = /* @__PURE__ */new Date();
+ const lastModTime = lastmod ? new Date(lastmod).getTime() || 0 : 0;
+ const daysDiff = Math.max(0, (now.getTime() - lastModTime) / (24 * 60 * 60 * 1e3));
+ const CRAWL_INTERVALS = /* @__PURE__ */new Map([[[-1, 0], 1],
+ // no lastmod
+ [[0, 3], 1],
+ // active within 3 days
+ [[3, 7], 3],
+ // active within 7 days
+ [[7, 30], 14],
+ // active within 30 days
+ [[30, 90], 30],
+ // active within 90 days
+ [[90, Infinity], 365]
+ // inactive long term
+ ]);
+ const interval = Array.from(CRAWL_INTERVALS.entries()).find(([[min, max]]) => lastmod ? daysDiff > min && daysDiff <= max : min === -1)?.[1] || 90;
+ return new Date(now.getTime() + interval * 24 * 60 * 60 * 1e3).toISOString();
+ }
  const setUrlInfoToCache = async ({
  url,
  content,
- lastmod
+ lastmod,
+ nextDate
  }) => {
  if (!content || !url) {
  return;
  }
+ const lastmodValue = lastmod || ( /* @__PURE__ */new Date()).toISOString();
  return await _utils.useCache.set((0, _utils.getRelativePath)(url), {
  content,
- lastmod: lastmod || ( /* @__PURE__ */new Date()).toISOString(),
- updatedAt: ( /* @__PURE__ */new Date()).toISOString()
+ lastmod: lastmodValue,
+ updatedAt: ( /* @__PURE__ */new Date()).toISOString(),
+ nextDate: nextDate || getNextCrawlDate(lastmodValue)
  });
  };
  exports.setUrlInfoToCache = setUrlInfoToCache;
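
For readers following the new getNextCrawlDate logic, a hand-traced example (illustrative values only, not package code):

```js
// Hand-traced example: a page last modified 10 days ago falls into the
// (7, 30] bucket, so the crawl interval is 14 days.
const now = new Date('2024-06-15T00:00:00.000Z');
const lastmod = '2024-06-05T00:00:00.000Z';
const daysDiff = (now.getTime() - new Date(lastmod).getTime()) / (24 * 60 * 60 * 1e3); // 10
const nextDate = new Date(now.getTime() + 14 * 24 * 60 * 60 * 1e3).toISOString();
// => '2024-06-29T00:00:00.000Z'
// Without a lastmod the (-1, 0] bucket applies (revisit in 1 day); if no bucket
// matches, the fallback interval is 90 days.
```
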
@@ -90,7 +113,7 @@ const crawlUrl = async ({
  urls,
  lastmodMap,
  formatPageContent,
- autoCloseBrowserCount = 30
+ autoCloseBrowserCount = 50
  }) => {
  if (typeof urls === "string") {
  urls = [urls];
@@ -104,7 +127,7 @@ const crawlUrl = async ({
  try {
  if (index % autoCloseBrowserCount === 0) {
  await (0, _utils.closeBrowser)({
- trimCache: false
+ trimCache: index % (autoCloseBrowserCount * 5) === 0
  });
  }
  const canCrawl = await (0, _utils.isAcceptCrawler)(url);
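
The browser is still recycled every autoCloseBrowserCount URLs (default now 50), but the cache trim only runs on every fifth recycle. A quick sketch of which crawl indexes trigger what, assuming the defaults:

```js
// Illustrative only, using the new default autoCloseBrowserCount of 50:
//   index % 50  === 0 -> close and relaunch the browser
//   index % 250 === 0 -> the close also trims the Puppeteer cache
for (const index of [0, 50, 100, 250, 500]) {
  console.log(index, {
    restartBrowser: index % 50 === 0,
    trimCache: index % (50 * 5) === 0,
  });
}
// 0 -> restart + trim, 50/100 -> restart only, 250/500 -> restart + trim
```
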
@@ -114,10 +137,12 @@ const crawlUrl = async ({
  formatPageContent
  });
  if (pageContent) {
+ const lastmodValue = lastmodMap?.get(url) || ( /* @__PURE__ */new Date()).toISOString();
  await setUrlInfoToCache({
  url,
  content: pageContent,
- lastmod: lastmodMap?.get(url)
+ lastmod: lastmodValue,
+ nextDate: getNextCrawlDate(lastmodValue)
  });
  _utils.logger.info(`Crawler[${index}] ${url} success`);
  } else if (retryCount < 3) {
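
On the read side, a caller could use the cached nextDate to decide whether a URL is due for another crawl. A hypothetical helper (not part of the package) built on the exported getUrlInfoFromCache:

```js
// Hypothetical caller-side check (not part of the package), assuming the
// cache entry shape written by setUrlInfoToCache above.
async function isDueForRecrawl(url) {
  const cached = await getUrlInfoFromCache(url);
  if (!cached?.nextDate) return true; // never cached, or a pre-2.1.247 entry
  return new Date(cached.nextDate).getTime() <= Date.now();
}
```
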
@@ -245,10 +270,6 @@ const crawlBlocklet = async () => {
  } catch (error) {}
  }
  };
- removeElements("script");
- removeElements("style");
- removeElements("link");
- removeElements("noscript");
  removeElements('[id="uploader-container"]');
  removeElements('[class^="uppy-"]');
  removeElements('[id="point-up-component"]');
@@ -262,7 +283,9 @@ const crawlBlocklet = async () => {
  }
  });
  _utils.logger.info(...crawlerLogText("success"));
- await (0, _utils.closeBrowser)();
+ await (0, _utils.closeBrowser)({
+ trimCache: true
+ });
  };
  exports.crawlBlocklet = crawlBlocklet;
  const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
@@ -280,7 +303,7 @@ const initCronCrawlBlocklet = ({
  time,
  fn: (0, _debounce.default)(crawlBlocklet),
  options: {
- runOnInit: true,
+ runOnInit: false,
  ...options
  }
  }],
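
With runOnInit now defaulting to false, the cron job no longer crawls immediately on startup. Because the caller-supplied options are spread after the default, the old behaviour can still be requested explicitly; a sketch, assuming the public signature is unchanged:

```js
// Sketch only: the cron expression is a placeholder.
initCronCrawlBlocklet({
  time: '0 3 * * *',
  options: { runOnInit: true }, // overrides the new runOnInit: false default
});
```
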
@@ -10,7 +10,7 @@ export declare const getBrowser: () => Promise<any>;
  export declare const CRAWLER_FLAG = "x-crawler";
  export declare const isSelfCrawler: (req: any) => boolean;
  export declare const initPage: ({ abortResourceTypes, }?: {
- abortResourceTypes?: never[] | undefined;
+ abortResourceTypes?: string[] | undefined;
  }) => Promise<any>;
  export declare const getDefaultRobotsUrl: (req: any) => string;
  export declare const getDefaultSitemapUrl: (req: any) => string;
@@ -111,6 +111,16 @@ const closeBrowser = async ({
  clearCheckBrowserTimer();
  if (trimCache) {
  await _puppeteer.default.trimCache();
+ const {
+ cacheDirectory,
+ temporaryDirectory
+ } = getPuppeteerrc();
+ if (cacheDirectory) {
+ _fsExtra.default.emptyDirSync(cacheDirectory);
+ }
+ if (temporaryDirectory) {
+ _fsExtra.default.emptyDirSync(temporaryDirectory);
+ }
  }
  logger.info("Close browser success");
  if (global.gc) {
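
trimCache now also wipes Puppeteer's configured cache and temporary directories via fs-extra. For reference, emptyDirSync removes a directory's contents but keeps (or creates) the directory itself:

```js
// Illustrative only: the path is a placeholder, not the package's real directory.
const fs = require('fs-extra');
fs.emptyDirSync('/tmp/puppeteer-cache-example'); // contents removed, directory kept
```
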
@@ -147,15 +157,11 @@ const getBrowser = async () => {
  headless: true,
  args: [
  // docs: https://peter.sh/experiments/chromium-command-line-switches/
- "--no-first-run",
- // '--no-startup-window',
- "--hide-scrollbars", "--no-sandbox", "--no-zygote",
- // '--single-process',
- "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas", "--disable-extensions", "--js-flags=--max_old_space_size=512",
+ "--no-first-run", "--hide-scrollbars", "--no-sandbox", "--no-zygote", "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas", "--disable-extensions", "--js-flags=--max_old_space_size=512",
  // limit V8 memory
- "--disable-background-networking", "--disable-default-apps", "--disable-web-security",
- // allow cross-origin requests
- "--disable-software-rasterizer", "--disable-crash-reporter"]
+ "--disable-background-networking", "--disable-default-apps",
+ // '--disable-web-security', // allow cross-origin requests
+ "--disable-software-rasterizer", "--disable-crash-reporter", "--disable-service-workers", "--no-startup-window", "--single-process", "--disable-gpu", "--disable-notifications", "--disable-infobars"]
  });
  logger.info("Launch browser success");
  const browserWSEndpoint = await browser.wsEndpoint();
@@ -201,8 +207,19 @@ const isSelfCrawler = req => {
  };
  exports.isSelfCrawler = isSelfCrawler;
  const initPage = async ({
- abortResourceTypes = []
- // ['image', 'stylesheet', 'font']
+ abortResourceTypes = ["image",
+ // images
+ "media",
+ // media files
+ "font",
+ // fonts
+ "websocket",
+ // websocket connections
+ "manifest",
+ // manifest files
+ "other"
+ // other resources
+ ]
  } = {}) => {
  const browser2 = await getBrowser();
  const page = await browser2.newPage();
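
Since the declaration now accepts string[], the heavier default block list can still be overridden per call. A sketch, assuming initPage is imported from the package's utils module and using Puppeteer's request resource type names:

```js
// Sketch only: also abort stylesheets, for example, while allowing websockets.
async function openLightweightPage() {
  const page = await initPage({
    abortResourceTypes: ['image', 'media', 'font', 'stylesheet'],
  });
  return page;
}
```
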
@@ -3,10 +3,11 @@ import { PageOptions } from './types';
  export { getRelativePath };
  export declare const getPageContent: ({ url, formatPageContent }: PageOptions) => Promise<string>;
  export declare const getUrlInfoFromCache: (url: string) => Promise<any>;
- export declare const setUrlInfoToCache: ({ url, content, lastmod, }: {
+ export declare const setUrlInfoToCache: ({ url, content, lastmod, nextDate, }: {
  url: string;
  content: string;
  lastmod?: string;
+ nextDate?: string;
  }) => Promise<any>;
  export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, autoCloseBrowserCount, }: {
  urls: string[] | string;
@@ -58,25 +58,51 @@ export const getUrlInfoFromCache = async (url) => {
  const cache = await useCache.get(getRelativePath(url));
  return cache;
  };
+ function getNextCrawlDate(lastmod) {
+ const now = /* @__PURE__ */ new Date();
+ const lastModTime = lastmod ? new Date(lastmod).getTime() || 0 : 0;
+ const daysDiff = Math.max(0, (now.getTime() - lastModTime) / (24 * 60 * 60 * 1e3));
+ const CRAWL_INTERVALS = /* @__PURE__ */ new Map([
+ [[-1, 0], 1],
+ // no lastmod
+ [[0, 3], 1],
+ // active within 3 days
+ [[3, 7], 3],
+ // active within 7 days
+ [[7, 30], 14],
+ // active within 30 days
+ [[30, 90], 30],
+ // active within 90 days
+ [[90, Infinity], 365]
+ // inactive long term
+ ]);
+ const interval = Array.from(CRAWL_INTERVALS.entries()).find(
+ ([[min, max]]) => lastmod ? daysDiff > min && daysDiff <= max : min === -1
+ )?.[1] || 90;
+ return new Date(now.getTime() + interval * 24 * 60 * 60 * 1e3).toISOString();
+ }
  export const setUrlInfoToCache = async ({
  url,
  content,
- lastmod
+ lastmod,
+ nextDate
  }) => {
  if (!content || !url) {
  return;
  }
+ const lastmodValue = lastmod || (/* @__PURE__ */ new Date()).toISOString();
  return await useCache.set(getRelativePath(url), {
  content,
- lastmod: lastmod || (/* @__PURE__ */ new Date()).toISOString(),
- updatedAt: (/* @__PURE__ */ new Date()).toISOString()
+ lastmod: lastmodValue,
+ updatedAt: (/* @__PURE__ */ new Date()).toISOString(),
+ nextDate: nextDate || getNextCrawlDate(lastmodValue)
  });
  };
  export const crawlUrl = async ({
  urls,
  lastmodMap,
  formatPageContent,
- autoCloseBrowserCount = 30
+ autoCloseBrowserCount = 50
  }) => {
  if (typeof urls === "string") {
  urls = [urls];
@@ -86,7 +112,7 @@ export const crawlUrl = async ({
  try {
  if (index % autoCloseBrowserCount === 0) {
  await closeBrowser({
- trimCache: false
+ trimCache: index % (autoCloseBrowserCount * 5) === 0
  });
  }
  const canCrawl = await isAcceptCrawler(url);
@@ -96,7 +122,13 @@ export const crawlUrl = async ({
  formatPageContent
  });
  if (pageContent) {
- await setUrlInfoToCache({ url, content: pageContent, lastmod: lastmodMap?.get(url) });
+ const lastmodValue = lastmodMap?.get(url) || (/* @__PURE__ */ new Date()).toISOString();
+ await setUrlInfoToCache({
+ url,
+ content: pageContent,
+ lastmod: lastmodValue,
+ nextDate: getNextCrawlDate(lastmodValue)
+ });
  logger.info(`Crawler[${index}] ${url} success`);
  } else if (retryCount < 3) {
  retryCount++;
@@ -213,10 +245,6 @@ export const crawlBlocklet = async () => {
  }
  }
  };
- removeElements("script");
- removeElements("style");
- removeElements("link");
- removeElements("noscript");
  removeElements('[id="uploader-container"]');
  removeElements('[class^="uppy-"]');
  removeElements('[id="point-up-component"]');
@@ -230,7 +258,9 @@ export const crawlBlocklet = async () => {
  }
  });
  logger.info(...crawlerLogText("success"));
- await closeBrowser();
+ await closeBrowser({
+ trimCache: true
+ });
  };
  const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
  let cronCrawlBlockletJob = null;
@@ -247,7 +277,7 @@ export const initCronCrawlBlocklet = ({
  name: CRON_CRAWL_BLOCKLET_KEY,
  time,
  fn: debounce(crawlBlocklet),
- options: { runOnInit: true, ...options }
+ options: { runOnInit: false, ...options }
  }
  ],
  onError: (err) => {
@@ -10,7 +10,7 @@ export declare const getBrowser: () => Promise<any>;
  export declare const CRAWLER_FLAG = "x-crawler";
  export declare const isSelfCrawler: (req: any) => boolean;
  export declare const initPage: ({ abortResourceTypes, }?: {
- abortResourceTypes?: never[] | undefined;
+ abortResourceTypes?: string[] | undefined;
  }) => Promise<any>;
  export declare const getDefaultRobotsUrl: (req: any) => string;
  export declare const getDefaultSitemapUrl: (req: any) => string;
@@ -54,6 +54,13 @@ export const closeBrowser = async ({ trimCache = true } = {}) => {
  clearCheckBrowserTimer();
  if (trimCache) {
  await puppeteer.trimCache();
+ const { cacheDirectory, temporaryDirectory } = getPuppeteerrc();
+ if (cacheDirectory) {
+ fs.emptyDirSync(cacheDirectory);
+ }
+ if (temporaryDirectory) {
+ fs.emptyDirSync(temporaryDirectory);
+ }
  }
  logger.info("Close browser success");
  if (global.gc) {
@@ -90,11 +97,9 @@ export const getBrowser = async () => {
  args: [
  // docs: https://peter.sh/experiments/chromium-command-line-switches/
  "--no-first-run",
- // '--no-startup-window',
  "--hide-scrollbars",
  "--no-sandbox",
  "--no-zygote",
- // '--single-process',
  "--disable-setuid-sandbox",
  "--disable-gpu",
  "--disable-dev-shm-usage",
@@ -105,10 +110,15 @@ export const getBrowser = async () => {
  // limit V8 memory
  "--disable-background-networking",
  "--disable-default-apps",
- "--disable-web-security",
- // allow cross-origin requests
+ // '--disable-web-security', // allow cross-origin requests
  "--disable-software-rasterizer",
- "--disable-crash-reporter"
+ "--disable-crash-reporter",
+ "--disable-service-workers",
+ "--no-startup-window",
+ "--single-process",
+ "--disable-gpu",
+ "--disable-notifications",
+ "--disable-infobars"
  ]
  });
  logger.info("Launch browser success");
@@ -153,8 +163,20 @@ export const isSelfCrawler = (req) => {
  return req.get(CRAWLER_FLAG) === "true" || `${ua}`.toLowerCase().indexOf("headless") !== -1;
  };
  export const initPage = async ({
- abortResourceTypes = []
- // ['image', 'stylesheet', 'font']
+ abortResourceTypes = [
+ "image",
+ // images
+ "media",
+ // media files
+ "font",
+ // fonts
+ "websocket",
+ // websocket connections
+ "manifest",
+ // manifest files
+ "other"
+ // other resources
+ ]
  } = {}) => {
  const browser2 = await getBrowser();
  const page = await browser2.newPage();
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@blocklet/crawler",
- "version": "2.1.245",
+ "version": "2.1.247",
  "description": "blocklet crawler lib",
  "publishConfig": {
  "access": "public"