@blocklet/crawler 2.1.232 → 2.1.234
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -32,7 +32,9 @@ const getPageContent = async ({
|
|
|
32
32
|
let page = await (0, _utils.initPage)();
|
|
33
33
|
let pageContent = null;
|
|
34
34
|
try {
|
|
35
|
-
const response = await page.goto(url
|
|
35
|
+
const response = await page.goto(url, {
|
|
36
|
+
timeout: 20 * 1e3
|
|
37
|
+
});
|
|
36
38
|
const statusCode = response.status();
|
|
37
39
|
if (![200, 304].includes(statusCode)) {
|
|
38
40
|
throw new Error(`Request failed with status ${statusCode}, in ${url}`);
|
|
@@ -231,18 +233,14 @@ const crawlBlocklet = async () => {
|
|
|
231
233
|
}
|
|
232
234
|
});
|
|
233
235
|
_utils.logger.info(...crawlerLogText("success"));
|
|
234
|
-
await _utils.
|
|
235
|
-
_utils.logger.info("Puppeteer trim cache success");
|
|
236
|
-
if (global.gc) {
|
|
237
|
-
global.gc();
|
|
238
|
-
}
|
|
236
|
+
await (0, _utils.closeBrowser)();
|
|
239
237
|
};
|
|
240
238
|
exports.crawlBlocklet = crawlBlocklet;
|
|
241
239
|
const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
|
|
242
240
|
let cronCrawlBlockletJob = null;
|
|
243
241
|
const initCronCrawlBlocklet = ({
|
|
244
|
-
time = "0 0 */
|
|
245
|
-
// every
|
|
242
|
+
time = "0 0 */24 * * *",
|
|
243
|
+
// every 24 hours
|
|
246
244
|
options
|
|
247
245
|
} = {}) => {
|
|
248
246
|
if (!cronCrawlBlockletJob) {
|
|
@@ -63,6 +63,7 @@ Object.keys(_config).forEach(function (key) {
|
|
|
63
63
|
});
|
|
64
64
|
var _util = require("util");
|
|
65
65
|
var _child_process = require("child_process");
|
|
66
|
+
var _component = require("@blocklet/sdk/lib/component");
|
|
66
67
|
function _getRequireWildcardCache(e) { if ("function" != typeof WeakMap) return null; var r = new WeakMap(), t = new WeakMap(); return (_getRequireWildcardCache = function (e) { return e ? t : r; })(e); }
|
|
67
68
|
function _interopRequireWildcard(e, r) { if (!r && e && e.__esModule) return e; if (null === e || "object" != typeof e && "function" != typeof e) return { default: e }; var t = _getRequireWildcardCache(r); if (t && t.has(e)) return t.get(e); var n = { __proto__: null }, a = Object.defineProperty && Object.getOwnPropertyDescriptor; for (var u in e) if ("default" !== u && {}.hasOwnProperty.call(e, u)) { var i = a ? Object.getOwnPropertyDescriptor(e, u) : null; i && (i.get || i.set) ? Object.defineProperty(n, u, i) : n[u] = e[u]; } return n.default = e, t && t.set(e, n), n; }
|
|
68
69
|
function _interopRequireDefault(e) { return e && e.__esModule ? e : { default: e }; }
|
|
@@ -97,7 +98,11 @@ exports.clearCheckBrowserTimer = clearCheckBrowserTimer;
|
|
|
97
98
|
const closeBrowser = async () => {
|
|
98
99
|
try {
|
|
99
100
|
if (browser) {
|
|
100
|
-
await browser.
|
|
101
|
+
const pages = await browser.pages().catch(() => []);
|
|
102
|
+
await Promise.all(pages.map(page => page.close().catch(() => {})));
|
|
103
|
+
await browser.close().catch(err => {
|
|
104
|
+
logger.warn("Browser close failed with error:", err);
|
|
105
|
+
});
|
|
101
106
|
browser = null;
|
|
102
107
|
await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
|
|
103
108
|
}
|
|
@@ -107,7 +112,9 @@ const closeBrowser = async () => {
|
|
|
107
112
|
if (global.gc) {
|
|
108
113
|
global.gc();
|
|
109
114
|
}
|
|
110
|
-
} catch (error) {
|
|
115
|
+
} catch (error) {
|
|
116
|
+
logger.error("Failed to close browser:", error);
|
|
117
|
+
}
|
|
111
118
|
};
|
|
112
119
|
exports.closeBrowser = closeBrowser;
|
|
113
120
|
const getBrowser = async () => {
|
|
@@ -134,16 +141,17 @@ const getBrowser = async () => {
|
|
|
134
141
|
try {
|
|
135
142
|
browser = await _puppeteer.default.launch({
|
|
136
143
|
headless: true,
|
|
137
|
-
// stable headless
|
|
138
|
-
// headless: false, // debug
|
|
139
|
-
// dumpio: true,
|
|
140
144
|
args: [
|
|
141
145
|
// docs: https://peter.sh/experiments/chromium-command-line-switches/
|
|
142
146
|
"--no-first-run",
|
|
143
147
|
// '--no-startup-window',
|
|
144
148
|
"--hide-scrollbars", "--no-sandbox", "--no-zygote",
|
|
145
149
|
// '--single-process',
|
|
146
|
-
"--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas"
|
|
150
|
+
"--disable-setuid-sandbox", "--disable-gpu", "--disable-dev-shm-usage", "--disable-site-isolation-trials", "--disable-accelerated-2d-canvas", "--disable-extensions", "--js-flags=--max_old_space_size=512",
|
|
151
|
+
// limit V8 heap memory (refers to the preceding --js-flags=--max_old_space_size=512 entry)
|
|
152
|
+
"--disable-background-networking", "--disable-default-apps", "--disable-web-security",
|
|
153
|
+
// allow cross-origin requests (refers to the preceding --disable-web-security entry)
|
|
154
|
+
"--disable-software-rasterizer", "--disable-crash-reporter"]
|
|
147
155
|
});
|
|
148
156
|
logger.info("Launch browser success");
|
|
149
157
|
const browserWSEndpoint = await browser.wsEndpoint();
|
|
@@ -158,11 +166,15 @@ const getBrowser = async () => {
|
|
|
158
166
|
let count = 0;
|
|
159
167
|
checkBrowserTimer = setInterval(async () => {
|
|
160
168
|
if (browser) {
|
|
161
|
-
const pages = await browser.pages();
|
|
169
|
+
const pages = await browser.pages().catch(() => []);
|
|
162
170
|
if (pages.length === 1 && pages[0].url() === "about:blank") {
|
|
163
171
|
count++;
|
|
172
|
+
logger.debug(`Browser inactive count: ${count}/3`);
|
|
173
|
+
} else {
|
|
174
|
+
count = 0;
|
|
164
175
|
}
|
|
165
176
|
if (count >= 3) {
|
|
177
|
+
logger.info("Browser inactive for 3 minutes, closing...");
|
|
166
178
|
await closeBrowser();
|
|
167
179
|
}
|
|
168
180
|
}
|
|
@@ -338,6 +350,19 @@ async function detectBestRedisUrl() {
|
|
|
338
350
|
const possibleUrls = [
|
|
339
351
|
// environment variable priority
|
|
340
352
|
process.env.REDIS_URL,
|
|
353
|
+
// by web component endpoint
|
|
354
|
+
(() => {
|
|
355
|
+
try {
|
|
356
|
+
const endpoint = (0, _component.getComponentWebEndpoint)(_config.default.env.componentDid);
|
|
357
|
+
if (endpoint) {
|
|
358
|
+
const url = new URL(endpoint);
|
|
359
|
+
return `redis://${url.hostname}:6379`;
|
|
360
|
+
}
|
|
361
|
+
} catch (err) {
|
|
362
|
+
logger.warn(`Failed to get component endpoint: ${err.message}`);
|
|
363
|
+
}
|
|
364
|
+
return null;
|
|
365
|
+
})(),
|
|
341
366
|
// default gateway
|
|
342
367
|
defaultGateway ? `redis://${defaultGateway}:6379` : null,
|
|
343
368
|
// common Docker gateway
|
|
@@ -354,7 +379,7 @@ async function detectBestRedisUrl() {
|
|
|
354
379
|
});
|
|
355
380
|
testClient.on("error", () => {});
|
|
356
381
|
await Promise.race([testClient.connect(), new Promise((_, reject) => setTimeout(() => reject(new Error("Connection timeout")), 2e3))]);
|
|
357
|
-
|
|
382
|
+
await testClient.ping();
|
|
358
383
|
await testClient.disconnect();
|
|
359
384
|
logger.info(`\u2705 Found available Redis connection: ${url}`);
|
|
360
385
|
cachedRedisUrl = url;
|
|
@@ -398,8 +423,9 @@ const cachePool = exports.cachePool = (0, _genericPool.createPool)({
|
|
|
398
423
|
}
|
|
399
424
|
}
|
|
400
425
|
}, {
|
|
401
|
-
max:
|
|
402
|
-
|
|
426
|
+
max: 2,
|
|
427
|
+
// 2 clients
|
|
428
|
+
min: 0
|
|
403
429
|
// evictionRunIntervalMillis: 0,
|
|
404
430
|
});
|
|
405
431
|
const withCache = async cb => {
|
|
@@ -8,7 +8,6 @@ import {
|
|
|
8
8
|
isAcceptCrawler,
|
|
9
9
|
env,
|
|
10
10
|
components,
|
|
11
|
-
puppeteer,
|
|
12
11
|
getComponentInfo,
|
|
13
12
|
sleep,
|
|
14
13
|
closeBrowser,
|
|
@@ -31,7 +30,9 @@ export const getPageContent = async ({ url, formatPageContent }) => {
|
|
|
31
30
|
let page = await initPage();
|
|
32
31
|
let pageContent = null;
|
|
33
32
|
try {
|
|
34
|
-
const response = await page.goto(url
|
|
33
|
+
const response = await page.goto(url, {
|
|
34
|
+
timeout: 20 * 1e3
|
|
35
|
+
});
|
|
35
36
|
const statusCode = response.status();
|
|
36
37
|
if (![200, 304].includes(statusCode)) {
|
|
37
38
|
throw new Error(`Request failed with status ${statusCode}, in ${url}`);
|
|
@@ -212,17 +213,13 @@ export const crawlBlocklet = async () => {
|
|
|
212
213
|
}
|
|
213
214
|
});
|
|
214
215
|
logger.info(...crawlerLogText("success"));
|
|
215
|
-
await
|
|
216
|
-
logger.info("Puppeteer trim cache success");
|
|
217
|
-
if (global.gc) {
|
|
218
|
-
global.gc();
|
|
219
|
-
}
|
|
216
|
+
await closeBrowser();
|
|
220
217
|
};
|
|
221
218
|
const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
|
|
222
219
|
let cronCrawlBlockletJob = null;
|
|
223
220
|
export const initCronCrawlBlocklet = ({
|
|
224
|
-
time = "0 0 */
|
|
225
|
-
// every
|
|
221
|
+
time = "0 0 */24 * * *",
|
|
222
|
+
// every 24 hours
|
|
226
223
|
options
|
|
227
224
|
} = {}) => {
|
|
228
225
|
if (!cronCrawlBlockletJob) {
|
|
@@ -13,6 +13,7 @@ import uniq from "lodash/uniq";
|
|
|
13
13
|
import config from "@blocklet/sdk/lib/config";
|
|
14
14
|
import { promisify } from "util";
|
|
15
15
|
import { exec } from "child_process";
|
|
16
|
+
import { getComponentWebEndpoint } from "@blocklet/sdk/lib/component";
|
|
16
17
|
export * from "@blocklet/sdk/lib/config";
|
|
17
18
|
const { logger } = config;
|
|
18
19
|
const execAsync = promisify(exec);
|
|
@@ -41,7 +42,12 @@ export const clearCheckBrowserTimer = () => {
|
|
|
41
42
|
export const closeBrowser = async () => {
|
|
42
43
|
try {
|
|
43
44
|
if (browser) {
|
|
44
|
-
await browser.
|
|
45
|
+
const pages = await browser.pages().catch(() => []);
|
|
46
|
+
await Promise.all(pages.map((page) => page.close().catch(() => {
|
|
47
|
+
})));
|
|
48
|
+
await browser.close().catch((err) => {
|
|
49
|
+
logger.warn("Browser close failed with error:", err);
|
|
50
|
+
});
|
|
45
51
|
browser = null;
|
|
46
52
|
await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
|
|
47
53
|
}
|
|
@@ -52,6 +58,7 @@ export const closeBrowser = async () => {
|
|
|
52
58
|
global.gc();
|
|
53
59
|
}
|
|
54
60
|
} catch (error) {
|
|
61
|
+
logger.error("Failed to close browser:", error);
|
|
55
62
|
}
|
|
56
63
|
};
|
|
57
64
|
export const getBrowser = async () => {
|
|
@@ -78,9 +85,6 @@ export const getBrowser = async () => {
|
|
|
78
85
|
try {
|
|
79
86
|
browser = await puppeteer.launch({
|
|
80
87
|
headless: true,
|
|
81
|
-
// stable headless
|
|
82
|
-
// headless: false, // debug
|
|
83
|
-
// dumpio: true,
|
|
84
88
|
args: [
|
|
85
89
|
// docs: https://peter.sh/experiments/chromium-command-line-switches/
|
|
86
90
|
"--no-first-run",
|
|
@@ -93,7 +97,16 @@ export const getBrowser = async () => {
|
|
|
93
97
|
"--disable-gpu",
|
|
94
98
|
"--disable-dev-shm-usage",
|
|
95
99
|
"--disable-site-isolation-trials",
|
|
96
|
-
"--disable-accelerated-2d-canvas"
|
|
100
|
+
"--disable-accelerated-2d-canvas",
|
|
101
|
+
"--disable-extensions",
|
|
102
|
+
"--js-flags=--max_old_space_size=512",
|
|
103
|
+
// limit V8 heap memory (refers to the preceding --js-flags=--max_old_space_size=512 entry)
|
|
104
|
+
"--disable-background-networking",
|
|
105
|
+
"--disable-default-apps",
|
|
106
|
+
"--disable-web-security",
|
|
107
|
+
// allow cross-origin requests (refers to the preceding --disable-web-security entry)
|
|
108
|
+
"--disable-software-rasterizer",
|
|
109
|
+
"--disable-crash-reporter"
|
|
97
110
|
]
|
|
98
111
|
});
|
|
99
112
|
logger.info("Launch browser success");
|
|
@@ -109,11 +122,15 @@ export const getBrowser = async () => {
|
|
|
109
122
|
let count = 0;
|
|
110
123
|
checkBrowserTimer = setInterval(async () => {
|
|
111
124
|
if (browser) {
|
|
112
|
-
const pages = await browser.pages();
|
|
125
|
+
const pages = await browser.pages().catch(() => []);
|
|
113
126
|
if (pages.length === 1 && pages[0].url() === "about:blank") {
|
|
114
127
|
count++;
|
|
128
|
+
logger.debug(`Browser inactive count: ${count}/3`);
|
|
129
|
+
} else {
|
|
130
|
+
count = 0;
|
|
115
131
|
}
|
|
116
132
|
if (count >= 3) {
|
|
133
|
+
logger.info("Browser inactive for 3 minutes, closing...");
|
|
117
134
|
await closeBrowser();
|
|
118
135
|
}
|
|
119
136
|
}
|
|
@@ -358,6 +375,19 @@ async function detectBestRedisUrl() {
|
|
|
358
375
|
const possibleUrls = [
|
|
359
376
|
// environment variable priority
|
|
360
377
|
process.env.REDIS_URL,
|
|
378
|
+
// by web component endpoint
|
|
379
|
+
(() => {
|
|
380
|
+
try {
|
|
381
|
+
const endpoint = getComponentWebEndpoint(config.env.componentDid);
|
|
382
|
+
if (endpoint) {
|
|
383
|
+
const url = new URL(endpoint);
|
|
384
|
+
return `redis://${url.hostname}:6379`;
|
|
385
|
+
}
|
|
386
|
+
} catch (err) {
|
|
387
|
+
logger.warn(`Failed to get component endpoint: ${err.message}`);
|
|
388
|
+
}
|
|
389
|
+
return null;
|
|
390
|
+
})(),
|
|
361
391
|
// default gateway
|
|
362
392
|
defaultGateway ? `redis://${defaultGateway}:6379` : null,
|
|
363
393
|
// common Docker gateway
|
|
@@ -377,7 +407,7 @@ async function detectBestRedisUrl() {
|
|
|
377
407
|
testClient.connect(),
|
|
378
408
|
new Promise((_, reject) => setTimeout(() => reject(new Error("Connection timeout")), 2e3))
|
|
379
409
|
]);
|
|
380
|
-
|
|
410
|
+
await testClient.ping();
|
|
381
411
|
await testClient.disconnect();
|
|
382
412
|
logger.info(`\u2705 Found available Redis connection: ${url}`);
|
|
383
413
|
cachedRedisUrl = url;
|
|
@@ -423,8 +453,9 @@ export const cachePool = createPool(
|
|
|
423
453
|
}
|
|
424
454
|
},
|
|
425
455
|
{
|
|
426
|
-
max:
|
|
427
|
-
|
|
456
|
+
max: 2,
|
|
457
|
+
// 2 clients
|
|
458
|
+
min: 0
|
|
428
459
|
// evictionRunIntervalMillis: 0,
|
|
429
460
|
}
|
|
430
461
|
);
|