@blocklet/crawler 2.1.246 → 2.1.248
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/middlewares/crawler.d.ts +2 -1
- package/lib/cjs/middlewares/crawler.js +35 -12
- package/lib/cjs/middlewares/utils.d.ts +1 -1
- package/lib/cjs/middlewares/utils.js +27 -10
- package/lib/es/middlewares/crawler.d.ts +2 -1
- package/lib/es/middlewares/crawler.js +42 -12
- package/lib/es/middlewares/utils.d.ts +1 -1
- package/lib/es/middlewares/utils.js +29 -7
- package/package.json +1 -1
|
@@ -3,10 +3,11 @@ import { PageOptions } from './types';
|
|
|
3
3
|
export { getRelativePath };
|
|
4
4
|
export declare const getPageContent: ({ url, formatPageContent }: PageOptions) => Promise<string>;
|
|
5
5
|
export declare const getUrlInfoFromCache: (url: string) => Promise<any>;
|
|
6
|
-
export declare const setUrlInfoToCache: ({ url, content, lastmod, }: {
|
|
6
|
+
export declare const setUrlInfoToCache: ({ url, content, lastmod, nextDate, }: {
|
|
7
7
|
url: string;
|
|
8
8
|
content: string;
|
|
9
9
|
lastmod?: string;
|
|
10
|
+
nextDate?: string;
|
|
10
11
|
}) => Promise<any>;
|
|
11
12
|
export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, autoCloseBrowserCount, }: {
|
|
12
13
|
urls: string[] | string;
|
|
@@ -71,18 +71,41 @@ const getUrlInfoFromCache = async url => {
|
|
|
71
71
|
return cache;
|
|
72
72
|
};
|
|
73
73
|
exports.getUrlInfoFromCache = getUrlInfoFromCache;
|
|
74
|
+
function getNextCrawlDate(lastmod) {
|
|
75
|
+
const now = /* @__PURE__ */new Date();
|
|
76
|
+
const lastModTime = lastmod ? new Date(lastmod).getTime() || 0 : 0;
|
|
77
|
+
const daysDiff = Math.max(0, (now.getTime() - lastModTime) / (24 * 60 * 60 * 1e3));
|
|
78
|
+
const CRAWL_INTERVALS = /* @__PURE__ */new Map([[[-1, 0], 1],
|
|
79
|
+
// 无 lastmod
|
|
80
|
+
[[0, 3], 1],
|
|
81
|
+
// 3 天内活跃
|
|
82
|
+
[[3, 7], 3],
|
|
83
|
+
// 7 天内活跃
|
|
84
|
+
[[7, 30], 14],
|
|
85
|
+
// 30 天内活跃
|
|
86
|
+
[[30, 90], 30],
|
|
87
|
+
// 90 天内活跃
|
|
88
|
+
[[90, Infinity], 365]
|
|
89
|
+
// 长期不活跃
|
|
90
|
+
]);
|
|
91
|
+
const interval = Array.from(CRAWL_INTERVALS.entries()).find(([[min, max]]) => lastmod ? daysDiff > min && daysDiff <= max : min === -1)?.[1] || 90;
|
|
92
|
+
return new Date(now.getTime() + interval * 24 * 60 * 60 * 1e3).toISOString();
|
|
93
|
+
}
|
|
74
94
|
const setUrlInfoToCache = async ({
|
|
75
95
|
url,
|
|
76
96
|
content,
|
|
77
|
-
lastmod
|
|
97
|
+
lastmod,
|
|
98
|
+
nextDate
|
|
78
99
|
}) => {
|
|
79
100
|
if (!content || !url) {
|
|
80
101
|
return;
|
|
81
102
|
}
|
|
103
|
+
const lastmodValue = lastmod || ( /* @__PURE__ */new Date()).toISOString();
|
|
82
104
|
return await _utils.useCache.set((0, _utils.getRelativePath)(url), {
|
|
83
105
|
content,
|
|
84
|
-
lastmod:
|
|
85
|
-
updatedAt: ( /* @__PURE__ */new Date()).toISOString()
|
|
106
|
+
lastmod: lastmodValue,
|
|
107
|
+
updatedAt: ( /* @__PURE__ */new Date()).toISOString(),
|
|
108
|
+
nextDate: nextDate || getNextCrawlDate(lastmodValue)
|
|
86
109
|
});
|
|
87
110
|
};
|
|
88
111
|
exports.setUrlInfoToCache = setUrlInfoToCache;
|
|
@@ -90,7 +113,7 @@ const crawlUrl = async ({
|
|
|
90
113
|
urls,
|
|
91
114
|
lastmodMap,
|
|
92
115
|
formatPageContent,
|
|
93
|
-
autoCloseBrowserCount =
|
|
116
|
+
autoCloseBrowserCount = 50
|
|
94
117
|
}) => {
|
|
95
118
|
if (typeof urls === "string") {
|
|
96
119
|
urls = [urls];
|
|
@@ -104,7 +127,7 @@ const crawlUrl = async ({
|
|
|
104
127
|
try {
|
|
105
128
|
if (index % autoCloseBrowserCount === 0) {
|
|
106
129
|
await (0, _utils.closeBrowser)({
|
|
107
|
-
trimCache:
|
|
130
|
+
trimCache: index % (autoCloseBrowserCount * 5) === 0
|
|
108
131
|
});
|
|
109
132
|
}
|
|
110
133
|
const canCrawl = await (0, _utils.isAcceptCrawler)(url);
|
|
@@ -114,10 +137,12 @@ const crawlUrl = async ({
|
|
|
114
137
|
formatPageContent
|
|
115
138
|
});
|
|
116
139
|
if (pageContent) {
|
|
140
|
+
const lastmodValue = lastmodMap?.get(url) || ( /* @__PURE__ */new Date()).toISOString();
|
|
117
141
|
await setUrlInfoToCache({
|
|
118
142
|
url,
|
|
119
143
|
content: pageContent,
|
|
120
|
-
lastmod:
|
|
144
|
+
lastmod: lastmodValue,
|
|
145
|
+
nextDate: getNextCrawlDate(lastmodValue)
|
|
121
146
|
});
|
|
122
147
|
_utils.logger.info(`Crawler[${index}] ${url} success`);
|
|
123
148
|
} else if (retryCount < 3) {
|
|
@@ -245,10 +270,6 @@ const crawlBlocklet = async () => {
|
|
|
245
270
|
} catch (error) {}
|
|
246
271
|
}
|
|
247
272
|
};
|
|
248
|
-
removeElements("script");
|
|
249
|
-
removeElements("style");
|
|
250
|
-
removeElements("link");
|
|
251
|
-
removeElements("noscript");
|
|
252
273
|
removeElements('[id="uploader-container"]');
|
|
253
274
|
removeElements('[class^="uppy-"]');
|
|
254
275
|
removeElements('[id="point-up-component"]');
|
|
@@ -262,7 +283,9 @@ const crawlBlocklet = async () => {
|
|
|
262
283
|
}
|
|
263
284
|
});
|
|
264
285
|
_utils.logger.info(...crawlerLogText("success"));
|
|
265
|
-
await (0, _utils.closeBrowser)(
|
|
286
|
+
await (0, _utils.closeBrowser)({
|
|
287
|
+
trimCache: true
|
|
288
|
+
});
|
|
266
289
|
};
|
|
267
290
|
exports.crawlBlocklet = crawlBlocklet;
|
|
268
291
|
const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
|
|
@@ -280,7 +303,7 @@ const initCronCrawlBlocklet = ({
|
|
|
280
303
|
time,
|
|
281
304
|
fn: (0, _debounce.default)(crawlBlocklet),
|
|
282
305
|
options: {
|
|
283
|
-
runOnInit:
|
|
306
|
+
runOnInit: false,
|
|
284
307
|
...options
|
|
285
308
|
}
|
|
286
309
|
}],
|
|
@@ -10,7 +10,7 @@ export declare const getBrowser: () => Promise<any>;
|
|
|
10
10
|
export declare const CRAWLER_FLAG = "x-crawler";
|
|
11
11
|
export declare const isSelfCrawler: (req: any) => boolean;
|
|
12
12
|
export declare const initPage: ({ abortResourceTypes, }?: {
|
|
13
|
-
abortResourceTypes?:
|
|
13
|
+
abortResourceTypes?: string[] | undefined;
|
|
14
14
|
}) => Promise<any>;
|
|
15
15
|
export declare const getDefaultRobotsUrl: (req: any) => string;
|
|
16
16
|
export declare const getDefaultSitemapUrl: (req: any) => string;
|
|
@@ -111,6 +111,16 @@ const closeBrowser = async ({
|
|
|
111
111
|
clearCheckBrowserTimer();
|
|
112
112
|
if (trimCache) {
|
|
113
113
|
await _puppeteer.default.trimCache();
|
|
114
|
+
const {
|
|
115
|
+
cacheDirectory,
|
|
116
|
+
temporaryDirectory
|
|
117
|
+
} = getPuppeteerrc();
|
|
118
|
+
if (cacheDirectory) {
|
|
119
|
+
_fsExtra.default.emptyDirSync(cacheDirectory);
|
|
120
|
+
}
|
|
121
|
+
if (temporaryDirectory) {
|
|
122
|
+
_fsExtra.default.emptyDirSync(temporaryDirectory);
|
|
123
|
+
}
|
|
114
124
|
}
|
|
115
125
|
logger.info("Close browser success");
|
|
116
126
|
if (global.gc) {
|
|
@@ -147,15 +157,11 @@ const getBrowser = async () => {
|
|
|
147
157
|
headless: true,
|
|
148
158
|
args: [
|
|
149
159
|
// docs: https://peter.sh/experiments/chromium-command-line-switches/
|
|
150
|
-
"--no-first-run",
|
|
151
|
-
// '--no-startup-window',
|
|
152
|
-
"--hide-scrollbars", "--no-sandbox", "--no-zygote",
|
|
153
|
-
// '--single-process',
|
|
154
|
-
"--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas", "--disable-extensions", "--js-flags=--max_old_space_size=512",
|
|
160
|
+
"--no-first-run", "--hide-scrollbars", "--no-sandbox", "--no-zygote", "--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas", "--disable-extensions", "--js-flags=--max_old_space_size=512",
|
|
155
161
|
// 限制V8内存
|
|
156
|
-
"--disable-background-networking", "--disable-default-apps",
|
|
157
|
-
// 允许跨域请求
|
|
158
|
-
"--disable-software-rasterizer", "--disable-crash-reporter"]
|
|
162
|
+
"--disable-background-networking", "--disable-default-apps",
|
|
163
|
+
// '--disable-web-security', // 允许跨域请求
|
|
164
|
+
"--disable-software-rasterizer", "--disable-crash-reporter", "--disable-service-workers", "--no-startup-window", "--single-process", "--disable-gpu", "--disable-notifications", "--disable-infobars"]
|
|
159
165
|
});
|
|
160
166
|
logger.info("Launch browser success");
|
|
161
167
|
const browserWSEndpoint = await browser.wsEndpoint();
|
|
@@ -201,8 +207,19 @@ const isSelfCrawler = req => {
|
|
|
201
207
|
};
|
|
202
208
|
exports.isSelfCrawler = isSelfCrawler;
|
|
203
209
|
const initPage = async ({
|
|
204
|
-
abortResourceTypes = [
|
|
205
|
-
//
|
|
210
|
+
abortResourceTypes = ["image",
|
|
211
|
+
// 图片
|
|
212
|
+
"media",
|
|
213
|
+
// 媒体文件
|
|
214
|
+
"font",
|
|
215
|
+
// 字体
|
|
216
|
+
"websocket",
|
|
217
|
+
// websocket 连接
|
|
218
|
+
"manifest",
|
|
219
|
+
// manifest 文件
|
|
220
|
+
"other"
|
|
221
|
+
// 其他资源
|
|
222
|
+
]
|
|
206
223
|
} = {}) => {
|
|
207
224
|
const browser2 = await getBrowser();
|
|
208
225
|
const page = await browser2.newPage();
|
|
@@ -3,10 +3,11 @@ import { PageOptions } from './types';
|
|
|
3
3
|
export { getRelativePath };
|
|
4
4
|
export declare const getPageContent: ({ url, formatPageContent }: PageOptions) => Promise<string>;
|
|
5
5
|
export declare const getUrlInfoFromCache: (url: string) => Promise<any>;
|
|
6
|
-
export declare const setUrlInfoToCache: ({ url, content, lastmod, }: {
|
|
6
|
+
export declare const setUrlInfoToCache: ({ url, content, lastmod, nextDate, }: {
|
|
7
7
|
url: string;
|
|
8
8
|
content: string;
|
|
9
9
|
lastmod?: string;
|
|
10
|
+
nextDate?: string;
|
|
10
11
|
}) => Promise<any>;
|
|
11
12
|
export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, autoCloseBrowserCount, }: {
|
|
12
13
|
urls: string[] | string;
|
|
@@ -58,25 +58,51 @@ export const getUrlInfoFromCache = async (url) => {
|
|
|
58
58
|
const cache = await useCache.get(getRelativePath(url));
|
|
59
59
|
return cache;
|
|
60
60
|
};
|
|
61
|
+
function getNextCrawlDate(lastmod) {
|
|
62
|
+
const now = /* @__PURE__ */ new Date();
|
|
63
|
+
const lastModTime = lastmod ? new Date(lastmod).getTime() || 0 : 0;
|
|
64
|
+
const daysDiff = Math.max(0, (now.getTime() - lastModTime) / (24 * 60 * 60 * 1e3));
|
|
65
|
+
const CRAWL_INTERVALS = /* @__PURE__ */ new Map([
|
|
66
|
+
[[-1, 0], 1],
|
|
67
|
+
// 无 lastmod
|
|
68
|
+
[[0, 3], 1],
|
|
69
|
+
// 3 天内活跃
|
|
70
|
+
[[3, 7], 3],
|
|
71
|
+
// 7 天内活跃
|
|
72
|
+
[[7, 30], 14],
|
|
73
|
+
// 30 天内活跃
|
|
74
|
+
[[30, 90], 30],
|
|
75
|
+
// 90 天内活跃
|
|
76
|
+
[[90, Infinity], 365]
|
|
77
|
+
// 长期不活跃
|
|
78
|
+
]);
|
|
79
|
+
const interval = Array.from(CRAWL_INTERVALS.entries()).find(
|
|
80
|
+
([[min, max]]) => lastmod ? daysDiff > min && daysDiff <= max : min === -1
|
|
81
|
+
)?.[1] || 90;
|
|
82
|
+
return new Date(now.getTime() + interval * 24 * 60 * 60 * 1e3).toISOString();
|
|
83
|
+
}
|
|
61
84
|
export const setUrlInfoToCache = async ({
|
|
62
85
|
url,
|
|
63
86
|
content,
|
|
64
|
-
lastmod
|
|
87
|
+
lastmod,
|
|
88
|
+
nextDate
|
|
65
89
|
}) => {
|
|
66
90
|
if (!content || !url) {
|
|
67
91
|
return;
|
|
68
92
|
}
|
|
93
|
+
const lastmodValue = lastmod || (/* @__PURE__ */ new Date()).toISOString();
|
|
69
94
|
return await useCache.set(getRelativePath(url), {
|
|
70
95
|
content,
|
|
71
|
-
lastmod:
|
|
72
|
-
updatedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
96
|
+
lastmod: lastmodValue,
|
|
97
|
+
updatedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
98
|
+
nextDate: nextDate || getNextCrawlDate(lastmodValue)
|
|
73
99
|
});
|
|
74
100
|
};
|
|
75
101
|
export const crawlUrl = async ({
|
|
76
102
|
urls,
|
|
77
103
|
lastmodMap,
|
|
78
104
|
formatPageContent,
|
|
79
|
-
autoCloseBrowserCount =
|
|
105
|
+
autoCloseBrowserCount = 50
|
|
80
106
|
}) => {
|
|
81
107
|
if (typeof urls === "string") {
|
|
82
108
|
urls = [urls];
|
|
@@ -86,7 +112,7 @@ export const crawlUrl = async ({
|
|
|
86
112
|
try {
|
|
87
113
|
if (index % autoCloseBrowserCount === 0) {
|
|
88
114
|
await closeBrowser({
|
|
89
|
-
trimCache:
|
|
115
|
+
trimCache: index % (autoCloseBrowserCount * 5) === 0
|
|
90
116
|
});
|
|
91
117
|
}
|
|
92
118
|
const canCrawl = await isAcceptCrawler(url);
|
|
@@ -96,7 +122,13 @@ export const crawlUrl = async ({
|
|
|
96
122
|
formatPageContent
|
|
97
123
|
});
|
|
98
124
|
if (pageContent) {
|
|
99
|
-
|
|
125
|
+
const lastmodValue = lastmodMap?.get(url) || (/* @__PURE__ */ new Date()).toISOString();
|
|
126
|
+
await setUrlInfoToCache({
|
|
127
|
+
url,
|
|
128
|
+
content: pageContent,
|
|
129
|
+
lastmod: lastmodValue,
|
|
130
|
+
nextDate: getNextCrawlDate(lastmodValue)
|
|
131
|
+
});
|
|
100
132
|
logger.info(`Crawler[${index}] ${url} success`);
|
|
101
133
|
} else if (retryCount < 3) {
|
|
102
134
|
retryCount++;
|
|
@@ -213,10 +245,6 @@ export const crawlBlocklet = async () => {
|
|
|
213
245
|
}
|
|
214
246
|
}
|
|
215
247
|
};
|
|
216
|
-
removeElements("script");
|
|
217
|
-
removeElements("style");
|
|
218
|
-
removeElements("link");
|
|
219
|
-
removeElements("noscript");
|
|
220
248
|
removeElements('[id="uploader-container"]');
|
|
221
249
|
removeElements('[class^="uppy-"]');
|
|
222
250
|
removeElements('[id="point-up-component"]');
|
|
@@ -230,7 +258,9 @@ export const crawlBlocklet = async () => {
|
|
|
230
258
|
}
|
|
231
259
|
});
|
|
232
260
|
logger.info(...crawlerLogText("success"));
|
|
233
|
-
await closeBrowser(
|
|
261
|
+
await closeBrowser({
|
|
262
|
+
trimCache: true
|
|
263
|
+
});
|
|
234
264
|
};
|
|
235
265
|
const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
|
|
236
266
|
let cronCrawlBlockletJob = null;
|
|
@@ -247,7 +277,7 @@ export const initCronCrawlBlocklet = ({
|
|
|
247
277
|
name: CRON_CRAWL_BLOCKLET_KEY,
|
|
248
278
|
time,
|
|
249
279
|
fn: debounce(crawlBlocklet),
|
|
250
|
-
options: { runOnInit:
|
|
280
|
+
options: { runOnInit: false, ...options }
|
|
251
281
|
}
|
|
252
282
|
],
|
|
253
283
|
onError: (err) => {
|
|
@@ -10,7 +10,7 @@ export declare const getBrowser: () => Promise<any>;
|
|
|
10
10
|
export declare const CRAWLER_FLAG = "x-crawler";
|
|
11
11
|
export declare const isSelfCrawler: (req: any) => boolean;
|
|
12
12
|
export declare const initPage: ({ abortResourceTypes, }?: {
|
|
13
|
-
abortResourceTypes?:
|
|
13
|
+
abortResourceTypes?: string[] | undefined;
|
|
14
14
|
}) => Promise<any>;
|
|
15
15
|
export declare const getDefaultRobotsUrl: (req: any) => string;
|
|
16
16
|
export declare const getDefaultSitemapUrl: (req: any) => string;
|
|
@@ -54,6 +54,13 @@ export const closeBrowser = async ({ trimCache = true } = {}) => {
|
|
|
54
54
|
clearCheckBrowserTimer();
|
|
55
55
|
if (trimCache) {
|
|
56
56
|
await puppeteer.trimCache();
|
|
57
|
+
const { cacheDirectory, temporaryDirectory } = getPuppeteerrc();
|
|
58
|
+
if (cacheDirectory) {
|
|
59
|
+
fs.emptyDirSync(cacheDirectory);
|
|
60
|
+
}
|
|
61
|
+
if (temporaryDirectory) {
|
|
62
|
+
fs.emptyDirSync(temporaryDirectory);
|
|
63
|
+
}
|
|
57
64
|
}
|
|
58
65
|
logger.info("Close browser success");
|
|
59
66
|
if (global.gc) {
|
|
@@ -90,11 +97,9 @@ export const getBrowser = async () => {
|
|
|
90
97
|
args: [
|
|
91
98
|
// docs: https://peter.sh/experiments/chromium-command-line-switches/
|
|
92
99
|
"--no-first-run",
|
|
93
|
-
// '--no-startup-window',
|
|
94
100
|
"--hide-scrollbars",
|
|
95
101
|
"--no-sandbox",
|
|
96
102
|
"--no-zygote",
|
|
97
|
-
// '--single-process',
|
|
98
103
|
"--disable-setuid-sandbox",
|
|
99
104
|
"--disable-gpu",
|
|
100
105
|
"--disable-dev-shm-usage",
|
|
@@ -105,10 +110,15 @@ export const getBrowser = async () => {
|
|
|
105
110
|
// 限制V8内存
|
|
106
111
|
"--disable-background-networking",
|
|
107
112
|
"--disable-default-apps",
|
|
108
|
-
|
|
109
|
-
// 允许跨域请求
|
|
113
|
+
// '--disable-web-security', // 允许跨域请求
|
|
110
114
|
"--disable-software-rasterizer",
|
|
111
|
-
"--disable-crash-reporter"
|
|
115
|
+
"--disable-crash-reporter",
|
|
116
|
+
"--disable-service-workers",
|
|
117
|
+
"--no-startup-window",
|
|
118
|
+
"--single-process",
|
|
119
|
+
"--disable-gpu",
|
|
120
|
+
"--disable-notifications",
|
|
121
|
+
"--disable-infobars"
|
|
112
122
|
]
|
|
113
123
|
});
|
|
114
124
|
logger.info("Launch browser success");
|
|
@@ -153,8 +163,20 @@ export const isSelfCrawler = (req) => {
|
|
|
153
163
|
return req.get(CRAWLER_FLAG) === "true" || `${ua}`.toLowerCase().indexOf("headless") !== -1;
|
|
154
164
|
};
|
|
155
165
|
export const initPage = async ({
|
|
156
|
-
abortResourceTypes = [
|
|
157
|
-
|
|
166
|
+
abortResourceTypes = [
|
|
167
|
+
"image",
|
|
168
|
+
// 图片
|
|
169
|
+
"media",
|
|
170
|
+
// 媒体文件
|
|
171
|
+
"font",
|
|
172
|
+
// 字体
|
|
173
|
+
"websocket",
|
|
174
|
+
// websocket 连接
|
|
175
|
+
"manifest",
|
|
176
|
+
// manifest 文件
|
|
177
|
+
"other"
|
|
178
|
+
// 其他资源
|
|
179
|
+
]
|
|
158
180
|
} = {}) => {
|
|
159
181
|
const browser2 = await getBrowser();
|
|
160
182
|
const page = await browser2.newPage();
|