@blocklet/crawler 2.1.232 → 2.1.233
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -32,7 +32,9 @@ const getPageContent = async ({
|
|
|
32
32
|
let page = await (0, _utils.initPage)();
|
|
33
33
|
let pageContent = null;
|
|
34
34
|
try {
|
|
35
|
-
const response = await page.goto(url
|
|
35
|
+
const response = await page.goto(url, {
|
|
36
|
+
timeout: 20 * 1e3
|
|
37
|
+
});
|
|
36
38
|
const statusCode = response.status();
|
|
37
39
|
if (![200, 304].includes(statusCode)) {
|
|
38
40
|
throw new Error(`Request failed with status ${statusCode}, in ${url}`);
|
|
@@ -231,18 +233,14 @@ const crawlBlocklet = async () => {
|
|
|
231
233
|
}
|
|
232
234
|
});
|
|
233
235
|
_utils.logger.info(...crawlerLogText("success"));
|
|
234
|
-
await _utils.
|
|
235
|
-
_utils.logger.info("Puppeteer trim cache success");
|
|
236
|
-
if (global.gc) {
|
|
237
|
-
global.gc();
|
|
238
|
-
}
|
|
236
|
+
await (0, _utils.closeBrowser)();
|
|
239
237
|
};
|
|
240
238
|
exports.crawlBlocklet = crawlBlocklet;
|
|
241
239
|
const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
|
|
242
240
|
let cronCrawlBlockletJob = null;
|
|
243
241
|
const initCronCrawlBlocklet = ({
|
|
244
|
-
time = "0 0 */
|
|
245
|
-
// every
|
|
242
|
+
time = "0 0 */24 * * *",
|
|
243
|
+
// every 24 hours
|
|
246
244
|
options
|
|
247
245
|
} = {}) => {
|
|
248
246
|
if (!cronCrawlBlockletJob) {
|
|
@@ -97,7 +97,11 @@ exports.clearCheckBrowserTimer = clearCheckBrowserTimer;
|
|
|
97
97
|
const closeBrowser = async () => {
|
|
98
98
|
try {
|
|
99
99
|
if (browser) {
|
|
100
|
-
await browser.
|
|
100
|
+
const pages = await browser.pages().catch(() => []);
|
|
101
|
+
await Promise.all(pages.map(page => page.close().catch(() => {})));
|
|
102
|
+
await browser.close().catch(err => {
|
|
103
|
+
logger.warn("Browser close failed with error:", err);
|
|
104
|
+
});
|
|
101
105
|
browser = null;
|
|
102
106
|
await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
|
|
103
107
|
}
|
|
@@ -107,7 +111,9 @@ const closeBrowser = async () => {
|
|
|
107
111
|
if (global.gc) {
|
|
108
112
|
global.gc();
|
|
109
113
|
}
|
|
110
|
-
} catch (error) {
|
|
114
|
+
} catch (error) {
|
|
115
|
+
logger.error("Failed to close browser:", error);
|
|
116
|
+
}
|
|
111
117
|
};
|
|
112
118
|
exports.closeBrowser = closeBrowser;
|
|
113
119
|
const getBrowser = async () => {
|
|
@@ -134,16 +140,17 @@ const getBrowser = async () => {
|
|
|
134
140
|
try {
|
|
135
141
|
browser = await _puppeteer.default.launch({
|
|
136
142
|
headless: true,
|
|
137
|
-
// stable headless
|
|
138
|
-
// headless: false, // debug
|
|
139
|
-
// dumpio: true,
|
|
140
143
|
args: [
|
|
141
144
|
// docs: https://peter.sh/experiments/chromium-command-line-switches/
|
|
142
145
|
"--no-first-run",
|
|
143
146
|
// '--no-startup-window',
|
|
144
147
|
"--hide-scrollbars", "--no-sandbox", "--no-zygote",
|
|
145
148
|
// '--single-process',
|
|
146
|
-
"--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas"
|
|
149
|
+
"--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas", "--disable-extensions", "--js-flags=--max_old_space_size=512",
|
|
150
|
+
// 限制V8内存
|
|
151
|
+
"--disable-background-networking", "--disable-default-apps", "--disable-web-security",
|
|
152
|
+
// 允许跨域请求
|
|
153
|
+
"--disable-software-rasterizer", "--disable-crash-reporter"]
|
|
147
154
|
});
|
|
148
155
|
logger.info("Launch browser success");
|
|
149
156
|
const browserWSEndpoint = await browser.wsEndpoint();
|
|
@@ -158,11 +165,15 @@ const getBrowser = async () => {
|
|
|
158
165
|
let count = 0;
|
|
159
166
|
checkBrowserTimer = setInterval(async () => {
|
|
160
167
|
if (browser) {
|
|
161
|
-
const pages = await browser.pages();
|
|
168
|
+
const pages = await browser.pages().catch(() => []);
|
|
162
169
|
if (pages.length === 1 && pages[0].url() === "about:blank") {
|
|
163
170
|
count++;
|
|
171
|
+
logger.debug(`Browser inactive count: ${count}/3`);
|
|
172
|
+
} else {
|
|
173
|
+
count = 0;
|
|
164
174
|
}
|
|
165
175
|
if (count >= 3) {
|
|
176
|
+
logger.info("Browser inactive for 3 minutes, closing...");
|
|
166
177
|
await closeBrowser();
|
|
167
178
|
}
|
|
168
179
|
}
|
|
@@ -398,8 +409,9 @@ const cachePool = exports.cachePool = (0, _genericPool.createPool)({
|
|
|
398
409
|
}
|
|
399
410
|
}
|
|
400
411
|
}, {
|
|
401
|
-
max:
|
|
402
|
-
|
|
412
|
+
max: 2,
|
|
413
|
+
// 2 clients
|
|
414
|
+
min: 0
|
|
403
415
|
// evictionRunIntervalMillis: 0,
|
|
404
416
|
});
|
|
405
417
|
const withCache = async cb => {
|
|
@@ -8,7 +8,6 @@ import {
|
|
|
8
8
|
isAcceptCrawler,
|
|
9
9
|
env,
|
|
10
10
|
components,
|
|
11
|
-
puppeteer,
|
|
12
11
|
getComponentInfo,
|
|
13
12
|
sleep,
|
|
14
13
|
closeBrowser,
|
|
@@ -31,7 +30,9 @@ export const getPageContent = async ({ url, formatPageContent }) => {
|
|
|
31
30
|
let page = await initPage();
|
|
32
31
|
let pageContent = null;
|
|
33
32
|
try {
|
|
34
|
-
const response = await page.goto(url
|
|
33
|
+
const response = await page.goto(url, {
|
|
34
|
+
timeout: 20 * 1e3
|
|
35
|
+
});
|
|
35
36
|
const statusCode = response.status();
|
|
36
37
|
if (![200, 304].includes(statusCode)) {
|
|
37
38
|
throw new Error(`Request failed with status ${statusCode}, in ${url}`);
|
|
@@ -212,17 +213,13 @@ export const crawlBlocklet = async () => {
|
|
|
212
213
|
}
|
|
213
214
|
});
|
|
214
215
|
logger.info(...crawlerLogText("success"));
|
|
215
|
-
await
|
|
216
|
-
logger.info("Puppeteer trim cache success");
|
|
217
|
-
if (global.gc) {
|
|
218
|
-
global.gc();
|
|
219
|
-
}
|
|
216
|
+
await closeBrowser();
|
|
220
217
|
};
|
|
221
218
|
const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
|
|
222
219
|
let cronCrawlBlockletJob = null;
|
|
223
220
|
export const initCronCrawlBlocklet = ({
|
|
224
|
-
time = "0 0 */
|
|
225
|
-
// every
|
|
221
|
+
time = "0 0 */24 * * *",
|
|
222
|
+
// every 24 hours
|
|
226
223
|
options
|
|
227
224
|
} = {}) => {
|
|
228
225
|
if (!cronCrawlBlockletJob) {
|
|
@@ -41,7 +41,12 @@ export const clearCheckBrowserTimer = () => {
|
|
|
41
41
|
export const closeBrowser = async () => {
|
|
42
42
|
try {
|
|
43
43
|
if (browser) {
|
|
44
|
-
await browser.
|
|
44
|
+
const pages = await browser.pages().catch(() => []);
|
|
45
|
+
await Promise.all(pages.map((page) => page.close().catch(() => {
|
|
46
|
+
})));
|
|
47
|
+
await browser.close().catch((err) => {
|
|
48
|
+
logger.warn("Browser close failed with error:", err);
|
|
49
|
+
});
|
|
45
50
|
browser = null;
|
|
46
51
|
await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
|
|
47
52
|
}
|
|
@@ -52,6 +57,7 @@ export const closeBrowser = async () => {
|
|
|
52
57
|
global.gc();
|
|
53
58
|
}
|
|
54
59
|
} catch (error) {
|
|
60
|
+
logger.error("Failed to close browser:", error);
|
|
55
61
|
}
|
|
56
62
|
};
|
|
57
63
|
export const getBrowser = async () => {
|
|
@@ -78,9 +84,6 @@ export const getBrowser = async () => {
|
|
|
78
84
|
try {
|
|
79
85
|
browser = await puppeteer.launch({
|
|
80
86
|
headless: true,
|
|
81
|
-
// stable headless
|
|
82
|
-
// headless: false, // debug
|
|
83
|
-
// dumpio: true,
|
|
84
87
|
args: [
|
|
85
88
|
// docs: https://peter.sh/experiments/chromium-command-line-switches/
|
|
86
89
|
"--no-first-run",
|
|
@@ -93,7 +96,16 @@ export const getBrowser = async () => {
|
|
|
93
96
|
"--disable-gpu",
|
|
94
97
|
"--disable-dev-shm-usage",
|
|
95
98
|
"--disable-site-isolation-trials",
|
|
96
|
-
"--disable-accelerated-2d-canvas"
|
|
99
|
+
"--disable-accelerated-2d-canvas",
|
|
100
|
+
"--disable-extensions",
|
|
101
|
+
"--js-flags=--max_old_space_size=512",
|
|
102
|
+
// 限制V8内存
|
|
103
|
+
"--disable-background-networking",
|
|
104
|
+
"--disable-default-apps",
|
|
105
|
+
"--disable-web-security",
|
|
106
|
+
// 允许跨域请求
|
|
107
|
+
"--disable-software-rasterizer",
|
|
108
|
+
"--disable-crash-reporter"
|
|
97
109
|
]
|
|
98
110
|
});
|
|
99
111
|
logger.info("Launch browser success");
|
|
@@ -109,11 +121,15 @@ export const getBrowser = async () => {
|
|
|
109
121
|
let count = 0;
|
|
110
122
|
checkBrowserTimer = setInterval(async () => {
|
|
111
123
|
if (browser) {
|
|
112
|
-
const pages = await browser.pages();
|
|
124
|
+
const pages = await browser.pages().catch(() => []);
|
|
113
125
|
if (pages.length === 1 && pages[0].url() === "about:blank") {
|
|
114
126
|
count++;
|
|
127
|
+
logger.debug(`Browser inactive count: ${count}/3`);
|
|
128
|
+
} else {
|
|
129
|
+
count = 0;
|
|
115
130
|
}
|
|
116
131
|
if (count >= 3) {
|
|
132
|
+
logger.info("Browser inactive for 3 minutes, closing...");
|
|
117
133
|
await closeBrowser();
|
|
118
134
|
}
|
|
119
135
|
}
|
|
@@ -423,8 +439,9 @@ export const cachePool = createPool(
|
|
|
423
439
|
}
|
|
424
440
|
},
|
|
425
441
|
{
|
|
426
|
-
max:
|
|
427
|
-
|
|
442
|
+
max: 2,
|
|
443
|
+
// 2 clients
|
|
444
|
+
min: 0
|
|
428
445
|
// evictionRunIntervalMillis: 0,
|
|
429
446
|
}
|
|
430
447
|
);
|