@arcblock/crawler 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/dist/blocklet.d.ts +6 -0
  2. package/dist/blocklet.js +199 -0
  3. package/dist/cache.d.ts +10 -0
  4. package/dist/cache.js +119 -0
  5. package/dist/config.d.ts +10 -0
  6. package/dist/config.js +17 -0
  7. package/dist/crawler.d.ts +28 -0
  8. package/dist/crawler.js +314 -0
  9. package/dist/db/index.d.ts +1 -0
  10. package/dist/db/index.js +41 -0
  11. package/dist/db/job.d.ts +33 -0
  12. package/dist/db/job.js +54 -0
  13. package/dist/db/snapshot.d.ts +31 -0
  14. package/dist/db/snapshot.js +52 -0
  15. package/dist/index.d.ts +6 -0
  16. package/dist/index.js +45 -0
  17. package/dist/middleware.d.ts +4 -0
  18. package/dist/middleware.js +44 -0
  19. package/dist/puppeteer.d.ts +16 -0
  20. package/dist/puppeteer.js +318 -0
  21. package/dist/utils.d.ts +15 -0
  22. package/dist/utils.js +239 -0
  23. package/esm/blocklet.d.ts +6 -0
  24. package/esm/blocklet.js +190 -0
  25. package/esm/cache.d.ts +10 -0
  26. package/esm/cache.js +114 -0
  27. package/esm/config.d.ts +10 -0
  28. package/esm/config.js +11 -0
  29. package/esm/crawler.d.ts +28 -0
  30. package/esm/crawler.js +301 -0
  31. package/esm/db/index.d.ts +1 -0
  32. package/esm/db/index.js +35 -0
  33. package/esm/db/job.d.ts +33 -0
  34. package/esm/db/job.js +50 -0
  35. package/esm/db/snapshot.d.ts +31 -0
  36. package/esm/db/snapshot.js +48 -0
  37. package/esm/index.d.ts +6 -0
  38. package/esm/index.js +26 -0
  39. package/esm/middleware.d.ts +4 -0
  40. package/esm/middleware.js +41 -0
  41. package/esm/puppeteer.d.ts +16 -0
  42. package/esm/puppeteer.js +272 -0
  43. package/esm/utils.d.ts +15 -0
  44. package/esm/utils.js +220 -0
  45. package/package.json +10 -3
  46. package/src/blocklet.ts +0 -223
  47. package/src/cache.ts +0 -117
  48. package/src/config.ts +0 -13
  49. package/src/crawler.ts +0 -364
  50. package/src/db/index.ts +0 -27
  51. package/src/db/job.ts +0 -93
  52. package/src/db/snapshot.ts +0 -89
  53. package/src/index.ts +0 -19
  54. package/src/middleware.ts +0 -46
  55. package/src/puppeteer.ts +0 -296
  56. package/src/utils.ts +0 -240
  57. package/third.d.ts +0 -1
  58. package/tsconfig.json +0 -9
@@ -0,0 +1,318 @@
1
+ "use strict";
2
// TypeScript-emitted interop helper (reuses an existing `this.__createBinding` when one is present).
// Exposes property `k` of module `m` on object `o` under the name `k2` (which defaults to `k`).
// Where Object.create exists it defines the property from m's own descriptor, replacing that
// descriptor with an enumerable getter returning m[k] in the cases tested below; otherwise it
// falls back to a plain assignment.
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
// TypeScript-emitted interop helper: stores `v` as the enumerable "default" property of `o`,
// via Object.defineProperty when Object.create is available and by plain assignment otherwise.
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
// TypeScript-emitted interop helper used for the lazy `require` further down in this file.
// A module flagged `__esModule` is returned untouched. Anything else is wrapped in a new object:
// every own key except "default" is re-bound through __createBinding, and the module itself is
// attached as `default` through __setModuleDefault.
// `ownKeys` replaces itself on first use with Object.getOwnPropertyNames (or a for-in fallback).
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
// TypeScript-emitted helper that runs a generator function as an asynchronous routine.
// It returns a promise (constructor `P`, defaulting to Promise). Each yielded value is adopted as
// a promise; its fulfilment is fed back through generator.next() and its rejection through
// generator.throw(). The returned promise resolves with the generator's final value and rejects
// if advancing the generator throws.
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
36
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
37
+ return new (P || (P = Promise))(function (resolve, reject) {
38
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
39
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
40
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
41
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
42
+ });
43
+ };
44
// TypeScript-emitted interop helper: returns `mod` unchanged when it is flagged `__esModule`,
// otherwise wraps it as `{ default: mod }` so callers can always read `.default`.
+ var __importDefault = (this && this.__importDefault) || function (mod) {
45
+ return (mod && mod.__esModule) ? mod : { "default": mod };
46
+ };
47
+ Object.defineProperty(exports, "__esModule", { value: true });
48
+ exports.closeBrowser = exports.getBrowser = exports.puppeteer = void 0;
49
+ exports.ensurePuppeteerrc = ensurePuppeteerrc;
50
+ exports.ensureBrowser = ensureBrowser;
51
+ exports.connectBrowser = connectBrowser;
52
+ exports.launchBrowser = launchBrowser;
53
+ exports.initPage = initPage;
54
+ // import fs from 'fs-extra';
55
+ // import path from 'path';
56
+ const puppeteer_1 = __importDefault(require("@blocklet/puppeteer"));
57
+ exports.puppeteer = puppeteer_1.default;
58
+ const config_1 = require("@blocklet/sdk/lib/config");
59
+ const fs_extra_1 = __importDefault(require("fs-extra"));
60
+ const path_1 = __importDefault(require("path"));
61
+ const timers_1 = require("timers");
62
+ const cache_1 = require("./cache");
63
+ const config_2 = require("./config");
64
+ const utils_1 = require("./utils");
65
+ // let puppeteerConfig: {
66
+ // cacheDirectory: string;
67
+ // temporaryDirectory: string;
68
+ // };
69
// Cache key under which the browser's WebSocket endpoint record is stored; it embeds the
// application id from the SDK env, or the literal 'unknown' when that id is falsy.
+ const BROWSER_WS_ENDPOINT_KEY = `browserWSEndpoint-${config_1.env.appId || 'unknown'}`;
70
// Values written to the `status` field of the cached endpoint record.
+ const BrowserStatus = {
71
+ Launching: 'Launching',
72
+ Ready: 'Ready',
73
+ };
74
// Module-level browser handle, assigned by connectBrowser/launchBrowser and reset to null by closeBrowser.
+ let browser;
75
// Handle of the inactivity-check interval, or null/undefined when no interval is running.
+ let browserActivatedTimer;
76
/**
 * Creates the Puppeteer cache and temp directories under `config.cacheDir` and
 * (re)writes `.puppeteerrc.js` in `config.appDir` so that it exports both paths.
 *
 * @returns {Promise<{cacheDirectory: string, temporaryDirectory: string}>} the
 *   settings object that was serialized into the rc file.
 */
function ensurePuppeteerrc() {
    return __awaiter(this, void 0, void 0, function* () {
        const settings = {
            cacheDirectory: path_1.default.join(config_2.config.cacheDir, 'puppeteer', 'cache'),
            temporaryDirectory: path_1.default.join(config_2.config.cacheDir, 'puppeteer', 'tmp'),
        };
        const rcFile = path_1.default.join(config_2.config.appDir, '.puppeteerrc.js');
        // Both directories and the rc file must exist before the file is written.
        yield Promise.all([
            fs_extra_1.default.ensureDir(settings.cacheDirectory),
            fs_extra_1.default.ensureDir(settings.temporaryDirectory),
            fs_extra_1.default.ensureFile(rcFile),
        ]);
        const serialized = JSON.stringify(settings, null, 2);
        yield fs_extra_1.default.writeFile(rcFile, `module.exports = ${serialized}`);
        config_2.logger.debug(`Puppeteerrc file created at ${rcFile}`, settings);
        return settings;
    });
}
93
/**
 * Prepares Puppeteer for use: writes the rc file, downloads a browser when no
 * executable exists at the expected path, and (when `config.testOnInitialize`
 * is set) performs a launch/close smoke test.
 *
 * The executable path is `PUPPETEER_EXECUTABLE_PATH` or `/usr/bin/chromium`.
 *
 * @returns {Promise<void>}
 * @throws {Error} when the smoke-test launch yields no browser; errors from
 *   `launchBrowser` and `downloadBrowser` propagate as well.
 */
function ensureBrowser() {
    return __awaiter(this, void 0, void 0, function* () {
        const puppeteerConfig = yield ensurePuppeteerrc();
        const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium';
        config_2.logger.info('executablePath', executablePath);
        if (!fs_extra_1.default.existsSync(executablePath)) {
            config_2.logger.info('start download browser', puppeteerConfig);
            // The installer is optional. If it cannot be loaded we skip the download.
            // `installer` then stays null, so nothing is destructured from `undefined`.
            let installer = null;
            try {
                // eslint-disable-next-line import/extensions
                installer = yield Promise.resolve().then(() => __importStar(require('@blocklet/puppeteer/internal/node/install.js')));
            }
            catch (err) {
                config_2.logger.warn('Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer.', err);
            }
            const downloadBrowser = installer ? installer.downloadBrowser : undefined;
            if (downloadBrowser) {
                yield downloadBrowser();
                config_2.logger.info('Browser download completed successfully');
            }
        }
        // Optional smoke test: launch once, then shut the browser down again.
        if (config_2.config.testOnInitialize) {
            const probe = yield launchBrowser();
            if (!probe) {
                throw new Error('Failed to launch browser');
            }
            yield (0, exports.closeBrowser)();
        }
        config_2.logger.info('Puppeteer is ready');
    });
}
126
/**
 * Attaches to an already running browser whose WebSocket endpoint is stored in
 * the cache.
 *
 * While the cached record says `Launching`, the cache is polled again after a
 * random pause of up to one second. Polling is bounded: once
 * `launchWaitTimeoutMs` has elapsed the record is treated as stale (for example
 * left behind by a launcher that died) and `null` is returned, instead of
 * retrying forever.
 *
 * @param {number} [launchWaitTimeoutMs=30000] maximum time to wait for another
 *   launcher to publish its endpoint; the default is chosen to line up with
 *   Puppeteer's documented default launch timeout.
 * @returns {Promise<object|null>} the connected browser, or `null` when there
 *   is no endpoint, the wait timed out, or the connection attempt failed (in
 *   which case the cached endpoint is removed).
 */
function connectBrowser(launchWaitTimeoutMs = 30 * 1000) {
    return __awaiter(this, void 0, void 0, function* () {
        const waitDeadline = Date.now() + launchWaitTimeoutMs;
        let cached = yield cache_1.useCache.get(BROWSER_WS_ENDPOINT_KEY);
        // A loop rather than recursion: no unbounded call chain, and the deadline is checked per attempt.
        while (cached && cached.status === BrowserStatus.Launching) {
            if (Date.now() >= waitDeadline) {
                config_2.logger.warn('Browser is still launching after waiting, give up connecting');
                return null;
            }
            yield (0, utils_1.sleep)(Math.floor(Math.random() * 1000));
            cached = yield cache_1.useCache.get(BROWSER_WS_ENDPOINT_KEY);
        }
        if (!cached) {
            return null;
        }
        try {
            browser = yield puppeteer_1.default.connect({
                browserWSEndpoint: cached.endpoint,
            });
            config_2.logger.info('Connect browser success');
        }
        catch (err) {
            // The endpoint is unusable: drop it so later callers launch a new browser.
            config_2.logger.warn('Connect browser failed, clear endpoint', err);
            yield cache_1.useCache.remove(BROWSER_WS_ENDPOINT_KEY);
            return null;
        }
        return browser;
    });
}
151
/**
 * Starts a headless browser and publishes its WebSocket endpoint in the cache.
 *
 * The cache record is first written with status `Launching`, then rewritten
 * with the real endpoint and status `Ready`. If the launch throws, the record
 * is removed and the error is rethrown.
 *
 * @returns {Promise<object>} the launched browser (also stored in the
 *   module-level `browser` variable).
 */
function launchBrowser() {
    return __awaiter(this, void 0, void 0, function* () {
        // Switch reference: https://peter.sh/experiments/chromium-command-line-switches/
        const chromiumArgs = [
            '--no-first-run',
            '--hide-scrollbars',
            '--no-sandbox',
            '--no-zygote',
            '--disable-setuid-sandbox',
            '--disable-gpu',
            '--disable-dev-shm-usage',
            '--disable-site-isolation-trials',
            '--disable-accelerated-2d-canvas',
            '--disable-extensions',
            '--js-flags=--max_old_space_size=512', // limit V8 memory
            '--disable-background-networking',
            '--disable-default-apps',
            // '--disable-web-security', // allow cross-origin requests
            '--disable-software-rasterizer',
            '--disable-crash-reporter',
            '--disable-service-workers',
            '--disable-notifications',
            '--disable-infobars',
            '--font-render-hinting=none',
        ];
        // Mark the launch as in progress before starting the browser.
        yield cache_1.useCache.set(BROWSER_WS_ENDPOINT_KEY, {
            endpoint: null,
            status: BrowserStatus.Launching,
        });
        try {
            browser = yield puppeteer_1.default.launch({ headless: true, args: chromiumArgs });
            config_2.logger.info('Launch browser success');
        }
        catch (error) {
            config_2.logger.error('launch browser failed: ', error);
            // Remove the in-progress marker so it does not linger after a failed launch.
            yield cache_1.useCache.remove(BROWSER_WS_ENDPOINT_KEY);
            throw error;
        }
        // Publish the endpoint of the running browser.
        const endpoint = yield browser.wsEndpoint();
        yield cache_1.useCache.set(BROWSER_WS_ENDPOINT_KEY, {
            endpoint,
            status: BrowserStatus.Ready,
        });
        return browser;
    });
}
202
/**
 * (Re)starts the once-a-minute idle watchdog for the module-level browser.
 *
 * A tick counts as idle when the browser has exactly one page and that page is
 * `about:blank`; any other state resets the counter. After three consecutive
 * idle ticks the browser is closed with `trimCache: true`.
 *
 * The whole tick is wrapped in try/catch: the callback is asynchronous, so an
 * error escaping it (for example from `closeBrowser`) would otherwise surface
 * as an unhandled promise rejection.
 */
function checkBrowserActivated() {
    clearBrowserActivatedTimer();
    let count = 0;
    browserActivatedTimer = (0, timers_1.setInterval)(() => __awaiter(this, void 0, void 0, function* () {
        if (!browser) {
            return;
        }
        try {
            // A failing pages() call is treated as "no pages", which resets the counter below.
            const pages = yield browser.pages().catch(() => []);
            const isIdle = pages.length === 1 && Boolean(pages[0]) && pages[0].url() === 'about:blank';
            if (isIdle) {
                count++;
                config_2.logger.debug(`Browser inactive count: ${count}/3`);
            }
            else {
                count = 0; // reset the counter
            }
            if (count >= 3) {
                config_2.logger.info('Browser inactive for 3 minutes, closing...');
                yield (0, exports.closeBrowser)({
                    trimCache: true,
                });
            }
        }
        catch (err) {
            config_2.logger.error('Failed to check browser activity:', err);
        }
    }), 1000 * 60);
}
225
/**
 * Stops the idle watchdog interval, if one is running, and forgets its handle.
 */
function clearBrowserActivatedTimer() {
    if (!browserActivatedTimer) {
        return;
    }
    (0, timers_1.clearInterval)(browserActivatedTimer);
    browserActivatedTimer = null;
}
231
/**
 * Returns the module-level browser, obtaining one first when none is set:
 * after a random pause of up to five seconds it tries to attach to a cached
 * endpoint and, failing that, launches a new browser and starts the idle
 * watchdog.
 *
 * @returns {Promise<object>} the browser instance.
 * @throws {Error} when neither connecting nor launching produced a browser.
 */
const getBrowser = () => __awaiter(void 0, void 0, void 0, function* () {
    if (!browser) {
        // Random 0-5 s pause, to avoid concurrent blocklets.
        yield (0, utils_1.sleep)(Math.floor(Math.random() * 1000 * 5));
        const attached = yield connectBrowser();
        if (attached) {
            config_2.logger.debug('getBrowser.connectedBrowser');
            browser = attached;
        }
        else {
            const spawned = yield launchBrowser();
            if (!spawned) {
                throw new Error('No browser to use, should install redis or browser');
            }
            config_2.logger.debug('getBrowser.launchedBrowser');
            browser = spawned;
            // The watchdog is started only for a browser this call launched.
            checkBrowserActivated();
        }
    }
    return browser;
});
253
+ exports.getBrowser = getBrowser;
254
/**
 * Shuts down the module-level browser. Does nothing when no browser is set.
 *
 * Closing the pages, closing the browser, and trimming the cache are each
 * attempted independently; a failure in one is logged and the next step still
 * runs. Afterwards the browser variable is cleared, the idle watchdog is
 * stopped, and the cached endpoint is removed.
 *
 * @param {{trimCache?: boolean}} [options] `trimCache` (default `true`)
 *   controls whether `puppeteer.trimCache()` is called.
 * @returns {Promise<void>}
 */
const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0, function* ({ trimCache = true } = {}) {
    if (!browser) {
        return;
    }
    // 1. Close every open page.
    try {
        const openTabs = yield browser.pages();
        yield Promise.all(openTabs.map((tab) => tab.close()));
    }
    catch (err) {
        config_2.logger.error('Failed to close all pages:', err);
    }
    // 2. Close the browser itself.
    try {
        yield browser.close();
    }
    catch (err) {
        config_2.logger.error('Failed to close browser:', err);
    }
    // 3. Optionally trim Puppeteer's cache, then call the garbage collector when it is exposed.
    try {
        if (trimCache) {
            yield puppeteer_1.default.trimCache();
            config_2.logger.info('Trim cache success');
        }
        if (global.gc) {
            global.gc();
        }
    }
    catch (err) {
        config_2.logger.error('Failed to clear browser cache:', err);
    }
    // 4. Reset module state and drop the published endpoint.
    browser = null;
    clearBrowserActivatedTimer();
    yield cache_1.useCache.remove(BROWSER_WS_ENDPOINT_KEY);
    config_2.logger.info('Close browser success');
});
294
+ exports.closeBrowser = closeBrowser;
295
/**
 * Opens a new page on the shared browser with a 1440x900 viewport and the
 * crawler marker header set to 'true' on every request.
 *
 * @param {{abortResourceTypes?: string[]}} [options] when the list is
 *   non-empty, request interception is enabled and requests whose resource
 *   type is in the list are aborted; all others continue.
 * @returns {Promise<object>} the configured page.
 */
function initPage() {
    return __awaiter(this, arguments, void 0, function* ({ abortResourceTypes = [] } = {}) {
        const activeBrowser = yield (0, exports.getBrowser)();
        const page = yield activeBrowser.newPage();
        yield page.setViewport({ width: 1440, height: 900 });
        yield page.setExtraHTTPHeaders({
            [utils_1.CRAWLER_FLAG]: 'true',
        });
        if (abortResourceTypes.length === 0) {
            return page;
        }
        yield page.setRequestInterception(true);
        page.on('request', (request) => (abortResourceTypes.includes(request.resourceType()) ? request.abort() : request.continue()));
        return page;
    });
}
@@ -0,0 +1,15 @@
1
/** Shared axios instance (created in utils.js with a 10 second timeout and a JSON Content-Type header). */
+ export declare const api: import("axios").AxiosInstance;
2
/** Resolves after `ms` milliseconds. */
+ export declare const sleep: (ms: number) => Promise<unknown>;
3
/** Name of the HTTP header used to mark requests made by this crawler. */
+ export declare const CRAWLER_FLAG = "x-crawler";
4
/** True when the request carries the crawler header set to 'true' or its user agent contains "headless". */
+ export declare const isSelfCrawler: (req: any) => boolean;
5
/** Builds the `robots.txt?nocache=1` URL at the origin of `url`. */
+ export declare const getDefaultRobotsUrl: (url: string) => string;
6
/** Fetches and parses robots.txt for the origin of `url`; resolves to null when nothing could be fetched. */
+ export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
7
/** Builds the `sitemap.xml?nocache=1` URL at the origin of `url`. */
+ export declare const getDefaultSitemapUrl: (url: string) => string;
8
/** Asks the site's robots.txt whether `url` may be crawled; resolves to true when no robots.txt is available. */
+ export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
9
/** Fetches and parses the site's sitemaps (those listed in robots.txt, otherwise the default sitemap URL) into one flat list. */
+ export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
10
/** True when the user agent matches a known bot pattern and the request path does not end in a static-asset extension. */
+ export declare const isBotUserAgent: (req: any) => boolean;
11
/** Returns the entry of the SDK's component list whose `did` equals the current component DID, or an empty object. */
+ export declare const getComponentInfo: () => {};
12
/** Joins the app URL, the optional `x-path-prefix` header and the request's original URL into one URL string. */
+ export declare const getFullUrl: (req: any) => string;
13
/** Returns the pathname of `url`, or `url` itself when it cannot be parsed as a URL. */
+ export declare const getRelativePath: (url: string) => string;
14
/** Normalizes a URL string by trimming whitespace and dropping a trailing slash. */
+ export declare const formatUrl: (url: string) => string;
15
/** Returns the hex-encoded MD5 digest of `content`. */
+ export declare function md5(content: string | Uint8Array): string;
package/dist/utils.js ADDED
@@ -0,0 +1,239 @@
1
+ "use strict";
2
// TypeScript-emitted helper that runs a generator function as an asynchronous routine.
// It returns a promise (constructor `P`, defaulting to Promise). Each yielded value is adopted as
// a promise; its fulfilment is fed back through generator.next() and its rejection through
// generator.throw(). The returned promise resolves with the generator's final value and rejects
// if advancing the generator throws.
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
// TypeScript-emitted interop helper: returns `mod` unchanged when it is flagged `__esModule`,
// otherwise wraps it as `{ default: mod }` so callers can always read `.default`.
+ var __importDefault = (this && this.__importDefault) || function (mod) {
12
+ return (mod && mod.__esModule) ? mod : { "default": mod };
13
+ };
14
+ Object.defineProperty(exports, "__esModule", { value: true });
15
+ exports.formatUrl = exports.getRelativePath = exports.getFullUrl = exports.getComponentInfo = exports.isBotUserAgent = exports.getSitemapList = exports.isAcceptCrawler = exports.getDefaultSitemapUrl = exports.getDefaultRobotsUrl = exports.isSelfCrawler = exports.CRAWLER_FLAG = exports.sleep = exports.api = void 0;
16
+ exports.getRobots = getRobots;
17
+ exports.md5 = md5;
18
+ const config_1 = require("@blocklet/sdk/lib/config");
19
+ const axios_1 = __importDefault(require("axios"));
20
+ const flattenDeep_1 = __importDefault(require("lodash/flattenDeep"));
21
+ const uniq_1 = __importDefault(require("lodash/uniq"));
22
+ const node_crypto_1 = require("node:crypto");
23
+ const robots_parser_1 = __importDefault(require("robots-parser"));
24
+ const sitemap_1 = require("sitemap");
25
+ const stream_1 = require("stream");
26
+ const ufo_1 = require("ufo");
27
// Shared axios instance for this module's HTTP requests: 10 second timeout, JSON Content-Type header.
+ exports.api = axios_1.default.create({
28
+ timeout: 1000 * 10,
29
+ headers: {
30
+ 'Content-Type': 'application/json',
31
+ },
32
+ });
33
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms delay passed to `setTimeout`.
 * @returns {Promise<unknown>}
 */
const sleep = (ms) => new Promise((wake) => {
    setTimeout(wake, ms);
});
38
+ exports.sleep = sleep;
39
+ exports.CRAWLER_FLAG = 'x-crawler';
40
/**
 * Tells whether a request was issued by this crawler itself: either the
 * crawler marker header equals 'true' or the user agent contains "headless"
 * (case-insensitive).
 *
 * @param {object} req request exposing `get(headerName)`.
 * @returns {boolean}
 */
const isSelfCrawler = (req) => {
    const userAgent = `${req.get('user-agent') || ''}`.toLowerCase();
    if (req.get(exports.CRAWLER_FLAG) === 'true') {
        return true;
    }
    return userAgent.includes('headless');
};
44
+ exports.isSelfCrawler = isSelfCrawler;
45
/**
 * A default set of user agent patterns for bots/crawlers that do not perform
 * well with pages that require JavaScript.
 *
 * All patterns are case-insensitive and unanchored, so broad entries such as
 * /bot/i already cover many of the more specific ones; the specific entries
 * are kept as an explicit record of the agents being targeted.
 */
const botUserAgents = [
    /bot/i,
    /spider/i,
    /facebookexternalhit/i,
    /simplepie/i,
    /yahooseeker/i,
    /embedly/i,
    /quora link preview/i,
    /outbrain/i,
    /vkshare/i,
    /monit/i,
    /Pingability/i,
    /Monitoring/i,
    /WinHttpRequest/i,
    /Apache-HttpClient/i,
    // The dot is escaped so that only the literal host name matches.
    /getprismatic\.com/i,
    /python-requests/i,
    /Twurly/i,
    /yandex/i,
    /browserproxy/i,
    /crawler/i,
    /Qwantify/i,
    /Yahoo/i,
    /pinterest/i,
    /Tumblr/i,
    /Tumblr Agent/i,
    /WhatsApp/i,
    /Google-Structured-Data-Testing-Tool/i,
    /Google-InspectionTool/i,
    /Googlebot/i,
    /GPTBot/i,
    /Applebot/i,
    // AI bots
    /Anthropic-ai/i,
    /Claude-Web/i,
    /anthropic-ai-scraper/i,
    /Google-Extended/i,
    /GoogleOther/i,
    /CCBot\/\d/i,
    /Bytespider/i,
    /BingBot/i,
    /Baiduspider/i,
    /Sogou/i,
    /Perplexity/i,
    /Cohere-ai/i,
    /xlts-bot/i,
    /THAAS/i,
    /YisouSpider/i,
    /AlibabaGroup/i,
    /adaptive-edge-crawler/i,
];
/**
 * Returns true when `ua` matches at least one pattern in `botUserAgents`.
 *
 * @param {string} ua user agent string to test.
 * @returns {boolean}
 */
const isSpider = (ua) => botUserAgents.some((spider) => spider.test(ua));
103
/**
 * A default set of file extensions for static assets that do not need to be
 * proxied. Entries are lower-case, without the leading dot, and unique (the
 * duplicated 'doc' entry has been removed).
 */
const staticFileExtensions = [
    'ai',
    'avi',
    'css',
    'dat',
    'dmg',
    'doc',
    'exe',
    'flv',
    'gif',
    'ico',
    'iso',
    'jpeg',
    'jpg',
    'js',
    'less',
    'm4a',
    'm4v',
    'mov',
    'mp3',
    'mp4',
    'mpeg',
    'mpg',
    'pdf',
    'png',
    'ppt',
    'psd',
    'rar',
    'rss',
    'svg',
    'swf',
    'tif',
    'torrent',
    'ttf',
    'txt',
    'wav',
    'wmv',
    'woff',
    'xls',
    'xml',
    'zip',
];
150
+ const getDefaultRobotsUrl = (url) => {
151
+ const { origin } = new URL(url);
152
+ return (0, ufo_1.joinURL)(origin, 'robots.txt?nocache=1');
153
+ };
154
+ exports.getDefaultRobotsUrl = getDefaultRobotsUrl;
155
/**
 * Downloads robots.txt from the origin of `url` and parses it.
 *
 * A failed request is treated as an empty body.
 *
 * @param {string} url any absolute URL on the target site.
 * @returns {Promise<object|null>} the parsed robots object, or `null` when the
 *   body is empty or the request failed.
 */
function getRobots(url) {
    return __awaiter(this, void 0, void 0, function* () {
        const robotsUrl = (0, ufo_1.joinURL)(new URL(url).origin, 'robots.txt?nocache=1');
        const emptyResponse = { data: '' };
        const response = yield exports.api.get(robotsUrl).catch(() => emptyResponse);
        const body = response.data;
        if (!body) {
            return null;
        }
        return (0, robots_parser_1.default)(robotsUrl, body);
    });
}
165
+ const getDefaultSitemapUrl = (url) => {
166
+ const { origin } = new URL(url);
167
+ return (0, ufo_1.joinURL)(origin, 'sitemap.xml?nocache=1');
168
+ };
169
+ exports.getDefaultSitemapUrl = getDefaultSitemapUrl;
170
/**
 * Checks the site's robots.txt to see whether `url` may be crawled.
 *
 * @param {string} url absolute URL to check.
 * @returns {Promise<boolean|undefined>} `true` when no robots.txt could be
 *   obtained; otherwise whatever `robots.isAllowed(url)` reports.
 */
const isAcceptCrawler = (url) => __awaiter(void 0, void 0, void 0, function* () {
    const robots = yield getRobots(url);
    if (!robots) {
        return true;
    }
    const verdict = yield robots.isAllowed(url);
    return verdict;
});
175
+ exports.isAcceptCrawler = isAcceptCrawler;
176
/**
 * Collects the sitemap entries of the site that `url` belongs to.
 *
 * Sitemap locations come from robots.txt when it lists any, otherwise the
 * default sitemap URL is used. Every sitemap is requested with `nocache=1`; a
 * failed request contributes no entries. Entries from all sitemaps are
 * flattened and de-duplicated by their `url` field, keeping the first
 * occurrence. (The previous `uniq` call compared objects by identity, so
 * separately parsed entries for the same URL were never merged.)
 *
 * @param {string} url any absolute URL on the target site.
 * @returns {Promise<object[]>} flat list of parsed sitemap items.
 */
const getSitemapList = (url) => __awaiter(void 0, void 0, void 0, function* () {
    let sitemapUrlList = [(0, exports.getDefaultSitemapUrl)(url)];
    const robots = yield getRobots(url);
    if (robots) {
        const robotsTxtSitemapUrlList = (yield robots.getSitemaps()) || [];
        if (robotsTxtSitemapUrlList.length > 0) {
            sitemapUrlList = robotsTxtSitemapUrlList;
        }
    }
    // Fetch and parse all sitemaps in parallel.
    const sitemapList = yield Promise.all(sitemapUrlList.map((sitemapUrl) => __awaiter(void 0, void 0, void 0, function* () {
        const requestUrl = new URL(sitemapUrl);
        requestUrl.searchParams.set('nocache', '1');
        const { data: sitemapTxt } = yield exports.api.get(requestUrl.toString()).catch(() => ({
            data: '',
        }));
        if (!sitemapTxt) {
            return [];
        }
        return yield (0, sitemap_1.parseSitemap)(stream_1.Readable.from([sitemapTxt]));
    })));
    // De-duplicate by URL; entries without a string `url` are kept untouched.
    const seenUrls = new Set();
    return (0, flattenDeep_1.default)(sitemapList).filter((item) => {
        if (!item) {
            return false;
        }
        if (typeof item.url !== 'string') {
            return true;
        }
        if (seenUrls.has(item.url)) {
            return false;
        }
        seenUrls.add(item.url);
        return true;
    });
});
202
+ exports.getSitemapList = getSitemapList;
203
+ const isBotUserAgent = (req) => {
204
+ const ua = req.get('user-agent');
205
+ const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
206
+ if (ua === undefined || !isSpider(ua) || excludeUrlPattern.test(req.path)) {
207
+ return false;
208
+ }
209
+ return true;
210
+ };
211
+ exports.isBotUserAgent = isBotUserAgent;
212
/**
 * Looks up the current component in the SDK's component list.
 *
 * @returns {object} the entry whose `did` equals `env.componentDid`, or an
 *   empty object when there is no such entry.
 */
const getComponentInfo = () => {
    const current = config_1.components.find((item) => item.did === config_1.env.componentDid);
    if (current) {
        return current;
    }
    return {};
};
215
+ exports.getComponentInfo = getComponentInfo;
216
/**
 * Reconstructs the full URL of a request by joining the app URL from the SDK
 * env, the `x-path-prefix` header (when present), and the original URL.
 *
 * @param {object} req request exposing `headers` and `originalUrl`.
 * @returns {string}
 */
const getFullUrl = (req) => {
    const pathPrefix = req.headers['x-path-prefix'];
    let blockletPathname = req.originalUrl;
    if (pathPrefix) {
        blockletPathname = (0, ufo_1.joinURL)(pathPrefix, req.originalUrl);
    }
    return (0, ufo_1.joinURL)(config_1.env.appUrl, blockletPathname);
};
222
+ exports.getFullUrl = getFullUrl;
223
/**
 * Extracts the pathname from an absolute URL.
 *
 * @param {string} url URL string to inspect.
 * @returns {string} the pathname, or `url` unchanged when it cannot be parsed
 *   as an absolute URL.
 */
const getRelativePath = (url) => {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch (error) {
        // Not an absolute URL: hand the input back as-is.
        return url;
    }
    return parsed.pathname;
};
232
+ exports.getRelativePath = getRelativePath;
233
/**
 * Normalizes a URL string: trims surrounding whitespace and removes one
 * trailing slash.
 *
 * Whitespace is trimmed before the slash is stripped, so that input such as
 * "https://a.com/ " loses its slash as well, and once more afterwards, so that
 * whitespace in front of the slash does not survive either.
 *
 * @param {string} url URL string to normalize.
 * @returns {string}
 */
const formatUrl = (url) => url.trim().replace(/\/$/, '').trim();
236
+ exports.formatUrl = formatUrl;
237
/**
 * Computes the MD5 digest of `content`.
 *
 * @param {string|Uint8Array} content data to hash.
 * @returns {string} the digest as a lower-case hex string.
 */
function md5(content) {
    const hasher = (0, node_crypto_1.createHash)('md5');
    hasher.update(content);
    return hasher.digest('hex');
}
@@ -0,0 +1,6 @@
1
/**
 * Declared as taking no arguments and resolving with no value.
 * NOTE(review): the implementation (blocklet.js) is not visible here; presumably crawls the blocklet's own pages. Confirm.
 */
+ export declare const crawlBlocklet: () => Promise<void>;
2
/**
 * Accepts an optional `{ time, options }` object, where `time` is a string.
 * NOTE(review): presumably `time` is a cron expression and the return value is the scheduled job. Confirm against blocklet.js.
 */
+ export declare const initCronCrawlBlocklet: ({ time, options, }?: {
3
+ time: string;
4
+ options: any;
5
+ }) => any;
6
/** NOTE(review): presumably cancels the job created by initCronCrawlBlocklet. Confirm against blocklet.js. */
+ export declare const cancelCronCrawlBlocklet: () => void;