@arcblock/crawler 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/blocklet.d.ts +6 -0
- package/dist/blocklet.js +199 -0
- package/dist/cache.d.ts +10 -0
- package/dist/cache.js +119 -0
- package/dist/config.d.ts +10 -0
- package/dist/config.js +17 -0
- package/dist/crawler.d.ts +28 -0
- package/dist/crawler.js +314 -0
- package/dist/db/index.d.ts +1 -0
- package/dist/db/index.js +41 -0
- package/dist/db/job.d.ts +33 -0
- package/dist/db/job.js +54 -0
- package/dist/db/snapshot.d.ts +31 -0
- package/dist/db/snapshot.js +52 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +45 -0
- package/dist/middleware.d.ts +4 -0
- package/dist/middleware.js +44 -0
- package/dist/puppeteer.d.ts +16 -0
- package/dist/puppeteer.js +318 -0
- package/dist/utils.d.ts +15 -0
- package/dist/utils.js +239 -0
- package/esm/blocklet.d.ts +6 -0
- package/esm/blocklet.js +190 -0
- package/esm/cache.d.ts +10 -0
- package/esm/cache.js +114 -0
- package/esm/config.d.ts +10 -0
- package/esm/config.js +11 -0
- package/esm/crawler.d.ts +28 -0
- package/esm/crawler.js +301 -0
- package/esm/db/index.d.ts +1 -0
- package/esm/db/index.js +35 -0
- package/esm/db/job.d.ts +33 -0
- package/esm/db/job.js +50 -0
- package/esm/db/snapshot.d.ts +31 -0
- package/esm/db/snapshot.js +48 -0
- package/esm/index.d.ts +6 -0
- package/esm/index.js +26 -0
- package/esm/middleware.d.ts +4 -0
- package/esm/middleware.js +41 -0
- package/esm/puppeteer.d.ts +16 -0
- package/esm/puppeteer.js +272 -0
- package/esm/utils.d.ts +15 -0
- package/esm/utils.js +220 -0
- package/package.json +10 -3
- package/src/blocklet.ts +0 -223
- package/src/cache.ts +0 -117
- package/src/config.ts +0 -13
- package/src/crawler.ts +0 -364
- package/src/db/index.ts +0 -27
- package/src/db/job.ts +0 -93
- package/src/db/snapshot.ts +0 -89
- package/src/index.ts +0 -19
- package/src/middleware.ts +0 -46
- package/src/puppeteer.ts +0 -296
- package/src/utils.ts +0 -240
- package/third.d.ts +0 -1
- package/tsconfig.json +0 -9
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
36
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
37
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
38
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
39
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
40
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
41
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
42
|
+
});
|
|
43
|
+
};
|
|
44
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
45
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
46
|
+
};
|
|
47
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
48
|
+
exports.closeBrowser = exports.getBrowser = exports.puppeteer = void 0;
|
|
49
|
+
exports.ensurePuppeteerrc = ensurePuppeteerrc;
|
|
50
|
+
exports.ensureBrowser = ensureBrowser;
|
|
51
|
+
exports.connectBrowser = connectBrowser;
|
|
52
|
+
exports.launchBrowser = launchBrowser;
|
|
53
|
+
exports.initPage = initPage;
|
|
54
|
+
// import fs from 'fs-extra';
|
|
55
|
+
// import path from 'path';
|
|
56
|
+
const puppeteer_1 = __importDefault(require("@blocklet/puppeteer"));
|
|
57
|
+
exports.puppeteer = puppeteer_1.default;
|
|
58
|
+
const config_1 = require("@blocklet/sdk/lib/config");
|
|
59
|
+
const fs_extra_1 = __importDefault(require("fs-extra"));
|
|
60
|
+
const path_1 = __importDefault(require("path"));
|
|
61
|
+
const timers_1 = require("timers");
|
|
62
|
+
const cache_1 = require("./cache");
|
|
63
|
+
const config_2 = require("./config");
|
|
64
|
+
const utils_1 = require("./utils");
|
|
65
|
+
// let puppeteerConfig: {
|
|
66
|
+
// cacheDirectory: string;
|
|
67
|
+
// temporaryDirectory: string;
|
|
68
|
+
// };
|
|
69
|
+
// Cache key under which the browser's WebSocket endpoint record is stored.
// Falls back to the literal 'unknown' when env.appId is not set.
const BROWSER_WS_ENDPOINT_KEY = `browserWSEndpoint-${config_1.env.appId || 'unknown'}`;
// Values written to the `status` field of the cached endpoint record.
const BrowserStatus = {
    Launching: 'Launching',
    Ready: 'Ready',
};
// Browser instance currently held by this module (assigned on launch/connect, reset to null on close).
let browser;
// Handle of the inactivity-check interval started by checkBrowserActivated().
let browserActivatedTimer;
|
|
76
|
+
/**
 * Create the Puppeteer cache/tmp directories under `config.cacheDir` and write a
 * `.puppeteerrc.js` file into `config.appDir` that points Puppeteer at them.
 *
 * @returns {Promise<{cacheDirectory: string, temporaryDirectory: string}>} the
 *   configuration object that was serialized into the rc file.
 */
function ensurePuppeteerrc() {
    return __awaiter(this, void 0, void 0, function* () {
        const puppeteerConfig = {
            cacheDirectory: path_1.default.join(config_2.config.cacheDir, 'puppeteer', 'cache'),
            temporaryDirectory: path_1.default.join(config_2.config.cacheDir, 'puppeteer', 'tmp'),
        };
        const rcFile = path_1.default.join(config_2.config.appDir, '.puppeteerrc.js');
        // Both directories and the rc file must exist before the file is written.
        yield Promise.all([
            fs_extra_1.default.ensureDir(puppeteerConfig.cacheDirectory),
            fs_extra_1.default.ensureDir(puppeteerConfig.temporaryDirectory),
            fs_extra_1.default.ensureFile(rcFile),
        ]);
        const serialized = JSON.stringify(puppeteerConfig, null, 2);
        yield fs_extra_1.default.writeFile(rcFile, `module.exports = ${serialized}`);
        config_2.logger.debug(`Puppeteerrc file created at ${rcFile}`, puppeteerConfig);
        return puppeteerConfig;
    });
}
|
|
93
|
+
/**
 * Make sure a browser binary is available and, optionally, that it can start.
 *
 * Steps:
 *  1. Write the Puppeteer rc file (see ensurePuppeteerrc).
 *  2. If the executable at PUPPETEER_EXECUTABLE_PATH (default `/usr/bin/chromium`)
 *     does not exist, try to load Puppeteer's installer and download a browser.
 *     A missing installer is logged and skipped rather than treated as fatal.
 *  3. When `config.testOnInitialize` is set, launch a browser once and close it.
 *
 * @returns {Promise<void>}
 * @throws {Error} when the test launch fails or yields no browser.
 */
function ensureBrowser() {
    return __awaiter(this, void 0, void 0, function* () {
        const puppeteerConfig = yield ensurePuppeteerrc();
        const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium';
        config_2.logger.info('executablePath', executablePath);
        if (!fs_extra_1.default.existsSync(executablePath)) {
            config_2.logger.info('start download browser', puppeteerConfig);
            // The installer module may be absent; keep `installer` null in that case
            // so that the download step is skipped instead of destructuring `undefined`.
            let installer = null;
            try {
                // @ts-ignore
                // eslint-disable-next-line import/extensions
                installer = yield Promise.resolve().then(() => __importStar(require('@blocklet/puppeteer/internal/node/install.js')));
            }
            catch (err) {
                config_2.logger.warn('Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer.', err);
            }
            const downloadBrowser = installer ? installer.downloadBrowser : undefined;
            if (downloadBrowser) {
                yield downloadBrowser();
                config_2.logger.info('Browser download completed successfully');
            }
        }
        // try to launch browser
        if (config_2.config.testOnInitialize) {
            const launched = yield launchBrowser();
            if (!launched) {
                throw new Error('Failed to launch browser');
            }
            yield (0, exports.closeBrowser)();
        }
        config_2.logger.info('Puppeteer is ready');
    });
}
|
|
126
|
+
/**
 * Attach to a browser whose WebSocket endpoint is recorded in the cache.
 *
 * While the cached record reports the `Launching` status, the cache is polled
 * again after a random delay of up to one second. If connecting fails the
 * cached record is removed.
 *
 * @returns {Promise<object|null>} the connected browser, or `null` when no
 *   endpoint is cached or the connection attempt failed.
 */
function connectBrowser() {
    return __awaiter(this, void 0, void 0, function* () {
        let record = yield cache_1.useCache.get(BROWSER_WS_ENDPOINT_KEY);
        // retry if browser is launching
        while (record && record.status === BrowserStatus.Launching) {
            yield (0, utils_1.sleep)(Math.floor(Math.random() * 1000));
            record = yield cache_1.useCache.get(BROWSER_WS_ENDPOINT_KEY);
        }
        if (!record) {
            return null;
        }
        try {
            browser = yield puppeteer_1.default.connect({ browserWSEndpoint: record.endpoint });
            config_2.logger.info('Connect browser success');
            return browser;
        }
        catch (err) {
            config_2.logger.warn('Connect browser failed, clear endpoint', err);
            yield cache_1.useCache.remove(BROWSER_WS_ENDPOINT_KEY);
            return null;
        }
    });
}
|
|
151
|
+
/**
 * Launch a headless browser and publish its WebSocket endpoint in the cache.
 *
 * The cache record is first written with the `Launching` status, then
 * overwritten with the endpoint and the `Ready` status once the launch
 * succeeded. On launch failure the record is removed and the error rethrown.
 *
 * @returns {Promise<object>} the launched browser.
 * @throws whatever `puppeteer.launch` throws.
 */
function launchBrowser() {
    return __awaiter(this, void 0, void 0, function* () {
        // docs: https://peter.sh/experiments/chromium-command-line-switches/
        const chromiumArgs = [
            '--no-first-run',
            '--hide-scrollbars',
            '--no-sandbox',
            '--no-zygote',
            '--disable-setuid-sandbox',
            '--disable-gpu',
            '--disable-dev-shm-usage',
            '--disable-site-isolation-trials',
            '--disable-accelerated-2d-canvas',
            '--disable-extensions',
            '--js-flags=--max_old_space_size=512', // limit V8 memory
            '--disable-background-networking',
            '--disable-default-apps',
            // '--disable-web-security', // allow cross-origin requests
            '--disable-software-rasterizer',
            '--disable-crash-reporter',
            '--disable-service-workers',
            '--disable-notifications',
            '--disable-infobars',
            '--font-render-hinting=none',
        ];
        // Announce the pending launch so that connectBrowser() callers wait for it.
        yield cache_1.useCache.set(BROWSER_WS_ENDPOINT_KEY, {
            endpoint: null,
            status: BrowserStatus.Launching,
        });
        try {
            // @ts-ignore
            browser = yield puppeteer_1.default.launch({ headless: true, args: chromiumArgs });
            config_2.logger.info('Launch browser success');
        }
        catch (error) {
            config_2.logger.error('launch browser failed: ', error);
            // cleanup browser endpoint
            yield cache_1.useCache.remove(BROWSER_WS_ENDPOINT_KEY);
            throw error;
        }
        // save browserWSEndpoint to cache
        const endpoint = yield browser.wsEndpoint();
        yield cache_1.useCache.set(BROWSER_WS_ENDPOINT_KEY, {
            endpoint,
            status: BrowserStatus.Ready,
        });
        return browser;
    });
}
|
|
202
|
+
/**
 * (Re)start the inactivity watchdog for the module-level browser.
 *
 * Every 60 seconds the open pages are inspected. A tick counts as "inactive"
 * when the browser has exactly one page and that page is `about:blank`; any
 * other state resets the counter. After three consecutive inactive ticks the
 * browser is closed via closeBrowser({ trimCache: true }), which also clears
 * this timer.
 *
 * Errors raised inside a tick are logged instead of escaping as an unhandled
 * promise rejection, because nothing awaits the interval callback.
 */
function checkBrowserActivated() {
    clearBrowserActivatedTimer();
    let count = 0;
    browserActivatedTimer = (0, timers_1.setInterval)(() => __awaiter(this, void 0, void 0, function* () {
        try {
            if (!browser) {
                return;
            }
            // A failing pages() call is treated as "no pages", which resets the counter.
            const pages = yield browser.pages().catch(() => []);
            const firstPage = pages[0];
            const isIdle = pages.length === 1 && firstPage != null && firstPage.url() === 'about:blank';
            if (isIdle) {
                count++;
                config_2.logger.debug(`Browser inactive count: ${count}/3`);
            }
            else {
                count = 0; // reset the counter
            }
            if (count >= 3) {
                config_2.logger.info('Browser inactive for 3 minutes, closing...');
                yield (0, exports.closeBrowser)({
                    trimCache: true,
                });
            }
        }
        catch (err) {
            config_2.logger.error('Failed to check browser activity:', err);
        }
    }), 1000 * 60);
}
|
|
225
|
+
/**
 * Stop the inactivity watchdog, if one is running, and forget its handle.
 */
function clearBrowserActivatedTimer() {
    if (!browserActivatedTimer) {
        return;
    }
    (0, timers_1.clearInterval)(browserActivatedTimer);
    browserActivatedTimer = null;
}
|
|
231
|
+
/**
 * Return the browser held by this module, acquiring one if necessary.
 *
 * Acquisition order: wait a random 0-5 s, try to connect to an endpoint
 * recorded in the cache, otherwise launch a new browser and start the
 * inactivity watchdog for it.
 *
 * @returns {Promise<object>} the browser instance.
 * @throws {Error} when neither connecting nor launching produced a browser.
 */
const getBrowser = () => __awaiter(void 0, void 0, void 0, function* () {
    if (!browser) {
        // sleep random time (0 ~ 5s), to avoid concurrent blocklet
        yield (0, utils_1.sleep)(Math.floor(Math.random() * 1000 * 5));
        // try to connect browser
        const connected = yield connectBrowser();
        if (connected) {
            config_2.logger.debug('getBrowser.connectedBrowser');
            browser = connected;
        }
        else {
            // try to launch browser
            const launched = yield launchBrowser();
            if (!launched) {
                throw new Error('No browser to use, should install redis or browser');
            }
            config_2.logger.debug('getBrowser.launchedBrowser');
            browser = launched;
            checkBrowserActivated();
        }
    }
    return browser;
});
|
|
253
|
+
exports.getBrowser = getBrowser;
|
|
254
|
+
/**
 * Shut down the module-level browser and clean up after it.
 *
 * Each phase (closing pages, closing the browser, trimming the cache) is
 * best-effort: a failure is logged and the next phase still runs. Afterwards
 * the browser reference is reset, the inactivity timer is stopped and the
 * cached WebSocket endpoint record is removed.
 *
 * @param {{trimCache?: boolean}} [options] `trimCache` (default `true`) controls
 *   whether `puppeteer.trimCache()` is invoked.
 * @returns {Promise<void>} resolves immediately when no browser is held.
 */
const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0, function* ({ trimCache = true } = {}) {
    if (!browser) {
        return;
    }
    // Phase 1: close all pages
    try {
        const openPages = yield browser.pages();
        const closing = openPages.map((page) => page.close());
        yield Promise.all(closing);
    }
    catch (err) {
        config_2.logger.error('Failed to close all pages:', err);
    }
    // Phase 2: close browser
    try {
        yield browser.close();
    }
    catch (err) {
        config_2.logger.error('Failed to close browser:', err);
    }
    // Phase 3: clear cache
    try {
        if (trimCache) {
            yield puppeteer_1.default.trimCache();
            config_2.logger.info('Trim cache success');
        }
        // global.gc is only checked for truthiness before being called.
        if (global.gc) {
            global.gc();
        }
    }
    catch (err) {
        config_2.logger.error('Failed to clear browser cache:', err);
    }
    browser = null;
    clearBrowserActivatedTimer();
    yield cache_1.useCache.remove(BROWSER_WS_ENDPOINT_KEY);
    config_2.logger.info('Close browser success');
});
|
|
294
|
+
exports.closeBrowser = closeBrowser;
|
|
295
|
+
/**
 * Open a new page on the shared browser and apply the crawler's defaults:
 * a 1440x900 viewport and the crawler flag header set to 'true'.
 *
 * @param {{abortResourceTypes?: string[]}} [options] when `abortResourceTypes`
 *   is non-empty, request interception is enabled and every request whose
 *   `resourceType()` is in the list is aborted; all other requests continue.
 * @returns {Promise<object>} the configured page.
 * @throws rethrows any error raised while configuring the page; in that case
 *   the page is closed first so that the tab is not leaked.
 */
function initPage() {
    return __awaiter(this, arguments, void 0, function* ({ abortResourceTypes = [] } = {}) {
        const activeBrowser = yield (0, exports.getBrowser)();
        const page = yield activeBrowser.newPage();
        try {
            yield page.setViewport({ width: 1440, height: 900 });
            // page setting
            // add custom headers
            yield page.setExtraHTTPHeaders({
                [utils_1.CRAWLER_FLAG]: 'true',
            });
            // abort resource types
            if (abortResourceTypes.length > 0) {
                yield page.setRequestInterception(true);
                page.on('request', (req) => {
                    // @ts-ignore
                    if (abortResourceTypes.includes(req.resourceType())) {
                        return req.abort();
                    }
                    return req.continue();
                });
            }
        }
        catch (err) {
            // The caller never receives this page, so nobody else could close it.
            // A failure of close() itself is secondary to the original error.
            yield page.close().catch(() => undefined);
            throw err;
        }
        return page;
    });
}
|
package/dist/utils.d.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/** Shared axios instance used for robots.txt and sitemap requests. */
export declare const api: import("axios").AxiosInstance;
/** Returns a promise that resolves after `ms` milliseconds. */
export declare const sleep: (ms: number) => Promise<unknown>;
/** Name of the HTTP header the crawler sets on its own page requests. */
export declare const CRAWLER_FLAG = "x-crawler";
/** True when the request carries the crawler flag header set to 'true' or its user agent contains "headless". */
export declare const isSelfCrawler: (req: any) => boolean;
/** Builds `<origin>/robots.txt?nocache=1` for the origin of `url`. */
export declare const getDefaultRobotsUrl: (url: string) => string;
/** Fetches and parses robots.txt for the origin of `url`; resolves to `null` when nothing could be fetched. */
export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
/** Builds `<origin>/sitemap.xml?nocache=1` for the origin of `url`. */
export declare const getDefaultSitemapUrl: (url: string) => string;
/** Resolves to the robots.txt verdict for `url`, or `true` when no robots.txt is available. */
export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
/** Collects the entries of every sitemap listed in robots.txt, or of the default sitemap URL when none is listed. */
export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
/** True when the request's user agent matches a known bot pattern and the request path is not a static asset. */
export declare const isBotUserAgent: (req: any) => boolean;
/** Returns the component whose `did` equals `env.componentDid`, or an empty object when none matches. */
export declare const getComponentInfo: () => {};
/** Joins the app URL, the optional `x-path-prefix` header and the request's original URL. */
export declare const getFullUrl: (req: any) => string;
/** Returns the pathname of `url`, or `url` unchanged when it cannot be parsed as a URL. */
export declare const getRelativePath: (url: string) => string;
/** Normalizes a URL string by dropping a trailing slash and surrounding whitespace. */
export declare const formatUrl: (url: string) => string;
/** Returns the hex-encoded MD5 digest of `content`. */
export declare function md5(content: string | Uint8Array): string;
|
package/dist/utils.js
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
|
+
};
|
|
14
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
+
exports.formatUrl = exports.getRelativePath = exports.getFullUrl = exports.getComponentInfo = exports.isBotUserAgent = exports.getSitemapList = exports.isAcceptCrawler = exports.getDefaultSitemapUrl = exports.getDefaultRobotsUrl = exports.isSelfCrawler = exports.CRAWLER_FLAG = exports.sleep = exports.api = void 0;
|
|
16
|
+
exports.getRobots = getRobots;
|
|
17
|
+
exports.md5 = md5;
|
|
18
|
+
const config_1 = require("@blocklet/sdk/lib/config");
|
|
19
|
+
const axios_1 = __importDefault(require("axios"));
|
|
20
|
+
const flattenDeep_1 = __importDefault(require("lodash/flattenDeep"));
|
|
21
|
+
const uniq_1 = __importDefault(require("lodash/uniq"));
|
|
22
|
+
const node_crypto_1 = require("node:crypto");
|
|
23
|
+
const robots_parser_1 = __importDefault(require("robots-parser"));
|
|
24
|
+
const sitemap_1 = require("sitemap");
|
|
25
|
+
const stream_1 = require("stream");
|
|
26
|
+
const ufo_1 = require("ufo");
|
|
27
|
+
exports.api = axios_1.default.create({
|
|
28
|
+
timeout: 1000 * 10,
|
|
29
|
+
headers: {
|
|
30
|
+
'Content-Type': 'application/json',
|
|
31
|
+
},
|
|
32
|
+
});
|
|
33
|
+
/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms delay passed to setTimeout.
 * @returns {Promise<undefined>} resolves once the timer fires.
 */
const sleep = (ms) => new Promise((done) => {
    setTimeout(done, ms);
});
|
|
38
|
+
exports.sleep = sleep;
|
|
39
|
+
// Name of the HTTP header checked by isSelfCrawler() to recognize the crawler's own requests.
exports.CRAWLER_FLAG = 'x-crawler';
|
|
40
|
+
/**
 * Tell whether a request was issued by this crawler itself.
 *
 * @param {{get: (name: string) => (string|undefined)}} req request exposing a
 *   header getter.
 * @returns {boolean} true when the crawler flag header equals 'true' or the
 *   user agent contains "headless" (case-insensitive).
 */
const isSelfCrawler = (req) => {
    const userAgent = req.get('user-agent') || '';
    if (req.get(exports.CRAWLER_FLAG) === 'true') {
        return true;
    }
    return `${userAgent}`.toLowerCase().includes('headless');
};
|
|
44
|
+
exports.isSelfCrawler = isSelfCrawler;
|
|
45
|
+
/**
 * A default set of user agent patterns for bots/crawlers that do not perform
 * well with pages that require JavaScript.
 *
 * Every pattern is case-insensitive and matched anywhere in the user agent.
 */
const botUserAgents = [
    /bot/i,
    /spider/i,
    /facebookexternalhit/i,
    /simplepie/i,
    /yahooseeker/i,
    /embedly/i,
    /quora link preview/i,
    /outbrain/i,
    /vkshare/i,
    /monit/i,
    /Pingability/i,
    /Monitoring/i,
    /WinHttpRequest/i,
    /Apache-HttpClient/i,
    // The dot is escaped so that only a literal "getprismatic.com" matches.
    /getprismatic\.com/i,
    /python-requests/i,
    /Twurly/i,
    /yandex/i,
    /browserproxy/i,
    /crawler/i,
    /Qwantify/i,
    /Yahoo/i,
    /pinterest/i,
    /Tumblr/i,
    /Tumblr Agent/i,
    /WhatsApp/i,
    /Google-Structured-Data-Testing-Tool/i,
    /Google-InspectionTool/i,
    /Googlebot/i,
    /GPTBot/i,
    /Applebot/i,
    // AI bots
    /Anthropic-ai/i,
    /Claude-Web/i,
    /anthropic-ai-scraper/i,
    /Google-Extended/i,
    /GoogleOther/i,
    /CCBot\/\d/i,
    /Bytespider/i,
    /BingBot/i,
    /Baiduspider/i,
    /Sogou/i,
    /Perplexity/i,
    /Cohere-ai/i,
    /xlts-bot/i,
    /THAAS/i,
    /YisouSpider/i,
    /AlibabaGroup/i,
    /adaptive-edge-crawler/i,
];
/**
 * Check a user agent string against the bot patterns.
 *
 * @param {string} ua user agent to test.
 * @returns {boolean} true when at least one pattern matches.
 */
const isSpider = (ua) => botUserAgents.some((pattern) => pattern.test(ua));
|
|
103
|
+
/**
 * A default set of file extensions for static assets that do not need to be
 * proxied. Entries are unique and listed without the leading dot.
 */
const staticFileExtensions = [
    'ai',
    'avi',
    'css',
    'dat',
    'dmg',
    'doc',
    'exe',
    'flv',
    'gif',
    'ico',
    'iso',
    'jpeg',
    'jpg',
    'js',
    'less',
    'm4a',
    'm4v',
    'mov',
    'mp3',
    'mp4',
    'mpeg',
    'mpg',
    'pdf',
    'png',
    'ppt',
    'psd',
    'rar',
    'rss',
    'svg',
    'swf',
    'tif',
    'torrent',
    'ttf',
    'txt',
    'wav',
    'wmv',
    'woff',
    'xls',
    'xml',
    'zip',
];
|
|
150
|
+
/**
 * Build the cache-busting robots.txt URL for the origin of `url`.
 *
 * @param {string} url any absolute URL on the target site.
 * @returns {string} the origin joined with `robots.txt?nocache=1`.
 * @throws {TypeError} when `url` cannot be parsed by the URL constructor.
 */
const getDefaultRobotsUrl = (url) => {
    const siteOrigin = new URL(url).origin;
    return (0, ufo_1.joinURL)(siteOrigin, 'robots.txt?nocache=1');
};
|
|
154
|
+
exports.getDefaultRobotsUrl = getDefaultRobotsUrl;
|
|
155
|
+
/**
 * Download and parse robots.txt for the origin of `url`.
 *
 * The request is best-effort: a failed request is treated like an empty
 * response. Only a non-empty string body is handed to the parser; any other
 * body type is ignored.
 *
 * @param {string} url any absolute URL on the target site.
 * @returns {Promise<object|null>} the parsed robots object, or `null` when no
 *   usable robots.txt body was obtained.
 * @throws {TypeError} (as a rejection) when `url` cannot be parsed.
 */
function getRobots(url) {
    return __awaiter(this, void 0, void 0, function* () {
        // Reuse the shared helper so both places agree on the robots.txt URL.
        const robotsUrl = getDefaultRobotsUrl(url);
        const { data } = yield exports.api.get(robotsUrl).catch(() => ({
            data: '',
        }));
        if (typeof data !== 'string' || data === '') {
            return null;
        }
        return (0, robots_parser_1.default)(robotsUrl, data);
    });
}
|
|
165
|
+
/**
 * Build the cache-busting sitemap.xml URL for the origin of `url`.
 *
 * @param {string} url any absolute URL on the target site.
 * @returns {string} the origin joined with `sitemap.xml?nocache=1`.
 * @throws {TypeError} when `url` cannot be parsed by the URL constructor.
 */
const getDefaultSitemapUrl = (url) => {
    const siteOrigin = new URL(url).origin;
    return (0, ufo_1.joinURL)(siteOrigin, 'sitemap.xml?nocache=1');
};
|
|
169
|
+
exports.getDefaultSitemapUrl = getDefaultSitemapUrl;
|
|
170
|
+
/**
 * Ask the site's robots.txt whether `url` may be crawled.
 *
 * @param {string} url absolute URL to check.
 * @returns {Promise<boolean|undefined>} the value of `robots.isAllowed(url)`,
 *   or `true` when no robots.txt could be obtained.
 */
const isAcceptCrawler = (url) => __awaiter(void 0, void 0, void 0, function* () {
    const robots = yield getRobots(url);
    if (!robots) {
        return true;
    }
    return yield robots.isAllowed(url);
});
|
|
175
|
+
exports.isAcceptCrawler = isAcceptCrawler;
|
|
176
|
+
/**
 * Collect the entries of the site's sitemaps.
 *
 * The sitemap URLs come from robots.txt when it lists any; otherwise the
 * default `<origin>/sitemap.xml` is used. Every sitemap is requested with
 * `nocache=1`; a failed or empty response contributes no entries. The parsed
 * results are flattened and passed through lodash `uniq`.
 *
 * @param {string} url any absolute URL on the target site.
 * @returns {Promise<object[]>} the combined sitemap entries.
 */
const getSitemapList = (url) => __awaiter(void 0, void 0, void 0, function* () {
    const fallbackList = [(0, exports.getDefaultSitemapUrl)(url)];
    const robots = yield getRobots(url);
    const declaredList = robots ? ((yield robots.getSitemaps()) || []) : [];
    const sitemapUrlList = declaredList.length > 0 ? declaredList : fallbackList;
    // Fetch and parse one sitemap; resolves to [] when nothing was downloaded.
    const loadSitemap = (sitemapUrl) => __awaiter(void 0, void 0, void 0, function* () {
        const target = new URL(sitemapUrl);
        target.searchParams.set('nocache', '1');
        const { data: sitemapTxt } = yield exports.api.get(target.toString()).catch(() => ({
            data: '',
        }));
        if (!sitemapTxt) {
            return [];
        }
        const source = stream_1.Readable.from([sitemapTxt]);
        return yield (0, sitemap_1.parseSitemap)(source);
    });
    // loop site map url list
    const sitemapList = yield Promise.all(sitemapUrlList.map((sitemapUrl) => loadSitemap(sitemapUrl)));
    const entries = (0, flattenDeep_1.default)(sitemapList.filter(Boolean));
    return (0, uniq_1.default)(entries);
});
|
|
202
|
+
exports.getSitemapList = getSitemapList;
|
|
203
|
+
/**
 * Decide whether a request comes from a bot and targets a non-static path.
 *
 * @param {{get: (name: string) => (string|undefined), path: string}} req
 *   request exposing a header getter and the request path.
 * @returns {boolean} true only when a user agent is present, it matches a bot
 *   pattern, and the path does not end in a static file extension.
 */
const isBotUserAgent = (req) => {
    const userAgent = req.get('user-agent');
    const staticAssetPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
    return userAgent !== undefined && isSpider(userAgent) && !staticAssetPattern.test(req.path);
};
|
|
211
|
+
exports.isBotUserAgent = isBotUserAgent;
|
|
212
|
+
/**
 * Look up the component whose `did` equals `env.componentDid`.
 *
 * @returns {object} the matching component, or an empty object when none matches.
 */
const getComponentInfo = () => {
    const current = config_1.components.find((component) => component.did === config_1.env.componentDid);
    return current || {};
};
|
|
215
|
+
exports.getComponentInfo = getComponentInfo;
|
|
216
|
+
/**
 * Rebuild the public URL of a request.
 *
 * @param {{headers: object, originalUrl: string}} req incoming request.
 * @returns {string} `env.appUrl` joined with the request's original URL,
 *   prefixed by the `x-path-prefix` header when that header is set.
 */
const getFullUrl = (req) => {
    const pathPrefix = req.headers['x-path-prefix'];
    let blockletPathname = req.originalUrl;
    if (pathPrefix) {
        blockletPathname = (0, ufo_1.joinURL)(pathPrefix, req.originalUrl);
    }
    return (0, ufo_1.joinURL)(config_1.env.appUrl, blockletPathname);
};
|
|
222
|
+
exports.getFullUrl = getFullUrl;
|
|
223
|
+
/**
 * Extract the pathname from a URL string.
 *
 * @param {string} url value to parse.
 * @returns {string} the pathname of `url`, or `url` itself when the URL
 *   constructor rejects it.
 */
const getRelativePath = (url) => {
    let result = url;
    try {
        result = new URL(url).pathname;
    }
    catch (error) {
        // Not parseable as a URL: fall through and return the input as-is.
    }
    return result;
};
|
|
232
|
+
exports.getRelativePath = getRelativePath;
|
|
233
|
+
/**
 * Normalize a URL string: remove surrounding whitespace, then drop a single
 * trailing slash.
 *
 * Trimming happens first so that a slash followed by whitespace is still
 * recognized as trailing.
 *
 * @param {string} url value to normalize.
 * @returns {string} the normalized string.
 */
const formatUrl = (url) => {
    return url.trim().replace(/\/$/, '');
};
|
|
236
|
+
exports.formatUrl = formatUrl;
|
|
237
|
+
/**
 * Compute the MD5 digest of the given content.
 *
 * @param {string|Uint8Array} content data to hash.
 * @returns {string} the digest as a lowercase hexadecimal string.
 */
function md5(content) {
    const hasher = (0, node_crypto_1.createHash)('md5');
    hasher.update(content);
    return hasher.digest('hex');
}
|