@arcblock/crawler 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/dist/blocklet.d.ts +6 -0
  2. package/dist/blocklet.js +199 -0
  3. package/dist/cache.d.ts +10 -0
  4. package/dist/cache.js +119 -0
  5. package/dist/config.d.ts +10 -0
  6. package/dist/config.js +17 -0
  7. package/dist/crawler.d.ts +28 -0
  8. package/dist/crawler.js +314 -0
  9. package/dist/db/index.d.ts +1 -0
  10. package/dist/db/index.js +41 -0
  11. package/dist/db/job.d.ts +33 -0
  12. package/dist/db/job.js +54 -0
  13. package/dist/db/snapshot.d.ts +31 -0
  14. package/dist/db/snapshot.js +52 -0
  15. package/dist/index.d.ts +6 -0
  16. package/dist/index.js +45 -0
  17. package/dist/middleware.d.ts +4 -0
  18. package/dist/middleware.js +44 -0
  19. package/dist/puppeteer.d.ts +16 -0
  20. package/dist/puppeteer.js +318 -0
  21. package/dist/utils.d.ts +15 -0
  22. package/dist/utils.js +239 -0
  23. package/esm/blocklet.d.ts +6 -0
  24. package/esm/blocklet.js +190 -0
  25. package/esm/cache.d.ts +10 -0
  26. package/esm/cache.js +114 -0
  27. package/esm/config.d.ts +10 -0
  28. package/esm/config.js +11 -0
  29. package/esm/crawler.d.ts +28 -0
  30. package/esm/crawler.js +301 -0
  31. package/esm/db/index.d.ts +1 -0
  32. package/esm/db/index.js +35 -0
  33. package/esm/db/job.d.ts +33 -0
  34. package/esm/db/job.js +50 -0
  35. package/esm/db/snapshot.d.ts +31 -0
  36. package/esm/db/snapshot.js +48 -0
  37. package/esm/index.d.ts +6 -0
  38. package/esm/index.js +26 -0
  39. package/esm/middleware.d.ts +4 -0
  40. package/esm/middleware.js +41 -0
  41. package/esm/puppeteer.d.ts +16 -0
  42. package/esm/puppeteer.js +272 -0
  43. package/esm/utils.d.ts +15 -0
  44. package/esm/utils.js +220 -0
  45. package/package.json +11 -3
  46. package/src/blocklet.ts +0 -223
  47. package/src/cache.ts +0 -117
  48. package/src/config.ts +0 -13
  49. package/src/crawler.ts +0 -364
  50. package/src/db/index.ts +0 -27
  51. package/src/db/job.ts +0 -93
  52. package/src/db/snapshot.ts +0 -89
  53. package/src/index.ts +0 -19
  54. package/src/middleware.ts +0 -46
  55. package/src/puppeteer.ts +0 -296
  56. package/src/utils.ts +0 -240
  57. package/third.d.ts +0 -1
  58. package/tsconfig.json +0 -9
@@ -0,0 +1,35 @@
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ import { Sequelize } from '@sequelize/core';
11
+ import { SqliteDialect } from '@sequelize/sqlite3';
12
+ import path from 'path';
13
+ import { config, logger } from '../config';
14
+ import { initJobModel } from './job';
15
+ import { initSnapshotModel } from './snapshot';
16
/**
 * Opens the crawler's SQLite database (`snap-kit.db` inside `config.dataDir`),
 * registers the snapshot and job models on it, verifies the connection and
 * synchronises the schema.
 *
 * @returns a promise that resolves once `authenticate()` and `sync()` succeed.
 * @throws rethrows any error raised by `authenticate()` or `sync()` after
 *   logging it; the Sequelize instance is closed first so a failed start-up
 *   does not leave a database handle open.
 */
export function ensureDatabase() {
    return __awaiter(this, void 0, void 0, function* () {
        const sequelize = new Sequelize({
            dialect: SqliteDialect,
            storage: path.join(config.dataDir, 'snap-kit.db'),
            // Route SQL statements to the debug log.
            logging: (msg) => logger.debug(msg),
        });
        yield initSnapshotModel(sequelize);
        yield initJobModel(sequelize);
        try {
            yield sequelize.authenticate();
            yield sequelize.sync();
            logger.info('Successfully connected to database');
        }
        catch (error) {
            logger.error('Failed to connect to database:', error);
            // Release the handle; a close failure must not mask the original error.
            try {
                yield sequelize.close();
            }
            catch (closeError) {
                logger.warn('Failed to close database after connection error:', closeError);
            }
            throw error;
        }
    });
}
@@ -0,0 +1,33 @@
1
+ import { Model, Sequelize } from '@sequelize/core';
2
/**
 * Payload stored for a single job (see the `job` field of `JobModel`).
 * NOTE(review): the per-field meanings are inferred from the field names only;
 * confirm against the crawler implementation.
 */
export interface JobState {
    /** Required job identifier. */
    jobId: string;
    /** Required URL the job refers to. */
    url: string;
    /** Optional record id. */
    id?: string;
    /** Presumably toggles screenshot capture. */
    includeScreenshot?: boolean;
    /** Presumably toggles HTML capture. */
    includeHtml?: boolean;
    /** Presumably the viewport size in pixels. */
    width?: number;
    height?: number;
    /** Presumably the screenshot quality. */
    quality?: number;
    /** Presumably a time limit; unit not visible here. */
    timeout?: number;
    /** Presumably requests a full-page screenshot. */
    fullPage?: boolean;
}
14
/**
 * Attribute shape of a row handled by the `Job` model.
 * NOTE(review): units of `willRunAt` and `delay` are not visible in this file; confirm with the queue code.
 */
export interface JobModel {
    /** Primary identifier of the row. */
    id: string;
    /** Name of the queue the job belongs to. */
    queue: string;
    /** Job payload. */
    job: JobState;
    /** Number of retries recorded for the job. */
    retryCount: number;
    /** Presumably the scheduled run time. */
    willRunAt: number;
    /** Presumably the scheduling delay. */
    delay: number;
    /** Whether the job has been cancelled. */
    cancelled: boolean;
}
23
/** Sequelize model class whose attributes mirror `JobModel`. */
export declare class Job extends Model<JobModel> implements JobModel {
    id: JobModel['id'];
    queue: JobModel['queue'];
    job: JobModel['job'];
    retryCount: JobModel['retryCount'];
    willRunAt: JobModel['willRunAt'];
    delay: JobModel['delay'];
    cancelled: JobModel['cancelled'];
}
/**
 * Registers the `Job` model on the given Sequelize instance.
 *
 * @param sequelize instance to attach the model to.
 * @returns the `Job` class.
 */
export declare function initJobModel(sequelize: Sequelize): typeof Job;
package/esm/db/job.js ADDED
@@ -0,0 +1,50 @@
1
+ import { DataTypes, Model } from '@sequelize/core';
2
/** Sequelize model class for jobs; its attributes are attached by `initJobModel`. */
export class Job extends Model {
}
5
/**
 * Defines the columns and table options of the `Job` model on `sequelize`.
 *
 * @param sequelize Sequelize instance the model is bound to.
 * @returns the initialised `Job` class.
 */
export function initJobModel(sequelize) {
    // createdAt / updatedAt share the same definition: an indexed DATE defaulting to "now".
    const timestampColumn = () => ({
        type: DataTypes.DATE,
        defaultValue: DataTypes.NOW,
        index: true,
    });
    const attributes = {
        id: { type: DataTypes.STRING(40), primaryKey: true },
        queue: { type: DataTypes.STRING(32), allowNull: false },
        job: { type: DataTypes.JSON, allowNull: false },
        retryCount: { type: DataTypes.INTEGER },
        delay: { type: DataTypes.INTEGER },
        willRunAt: { type: DataTypes.INTEGER },
        cancelled: { type: DataTypes.BOOLEAN, defaultValue: false },
        createdAt: timestampColumn(),
        updatedAt: timestampColumn(),
    };
    const options = {
        sequelize,
        // Jobs are indexed by queue name.
        indexes: [{ fields: ['queue'] }],
        modelName: 'job',
        tableName: 'jobs',
        timestamps: true,
    };
    Job.init(attributes, options);
    return Job;
}
@@ -0,0 +1,31 @@
1
+ import { Model, Sequelize } from '@sequelize/core';
2
/**
 * Attribute shape of a stored snapshot.
 * NOTE(review): field meanings are inferred from names; confirm against the crawler implementation.
 */
interface SnapshotModel {
    /** Identifier of the job that produced the snapshot. */
    jobId: string;
    /** URL the snapshot belongs to. */
    url: string;
    /** Outcome marker of the snapshot. */
    status: 'success' | 'failed' | 'pending';
    /** Captured HTML, if any. */
    html?: string | null;
    /** Presumably a reference to the stored screenshot rather than the image bytes. */
    screenshot?: string | null;
    /** Error text, if any. */
    error?: string;
    /** Last-modified marker, kept as a string. */
    lastModified?: string;
    /** Capture options recorded with the snapshot. */
    options?: {
        width?: number;
        height?: number;
        includeScreenshot?: boolean;
        includeHtml?: boolean;
        quality?: number;
        fullPage?: boolean;
    };
}
19
/** Sequelize model class whose attributes mirror `SnapshotModel`. */
export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
    jobId: SnapshotModel['jobId'];
    url: SnapshotModel['url'];
    status: SnapshotModel['status'];
    html?: SnapshotModel['html'];
    screenshot?: SnapshotModel['screenshot'];
    error?: SnapshotModel['error'];
    lastModified?: SnapshotModel['lastModified'];
    options: SnapshotModel['options'];
}
export type { SnapshotModel };
/**
 * Registers the `Snapshot` model on the given Sequelize instance.
 *
 * @param sequelize instance to attach the model to.
 * @returns the `Snapshot` class.
 */
export declare function initSnapshotModel(sequelize: Sequelize): typeof Snapshot;
@@ -0,0 +1,48 @@
1
+ import { DataTypes, Model } from '@sequelize/core';
2
/** Sequelize model class for snapshots; its attributes are attached by `initSnapshotModel`. */
export class Snapshot extends Model {
}
5
/**
 * Defines the columns and table options of the `Snapshot` model on `sequelize`.
 * Rows live in the `snap` table, keyed by `jobId`.
 *
 * @param sequelize Sequelize instance the model is bound to.
 * @returns the initialised `Snapshot` class.
 */
export function initSnapshotModel(sequelize) {
    Snapshot.init({
        jobId: {
            type: DataTypes.STRING,
            primaryKey: true,
            allowNull: false,
        },
        url: {
            type: DataTypes.STRING,
            allowNull: false,
            index: true,
        },
        status: {
            // Must list every value of the `SnapshotModel['status']` union,
            // including 'pending', so that such rows are accepted by the model.
            type: DataTypes.ENUM('success', 'failed', 'pending'),
            allowNull: false,
        },
        html: {
            type: DataTypes.TEXT,
            allowNull: true,
        },
        screenshot: {
            type: DataTypes.STRING,
            allowNull: true,
        },
        error: {
            type: DataTypes.STRING,
            allowNull: true,
        },
        lastModified: {
            type: DataTypes.STRING,
            allowNull: true,
        },
        options: {
            type: DataTypes.JSON,
            allowNull: true,
        },
    }, {
        sequelize,
        modelName: 'snapshot',
        tableName: 'snap',
        timestamps: true,
    });
    return Snapshot;
}
package/esm/index.d.ts ADDED
@@ -0,0 +1,6 @@
1
+ import { config } from './config';
2
+ export * from './blocklet';
3
+ export * from './crawler';
4
+ export * from './middleware';
5
+ export { Snapshot } from './db/snapshot';
6
/**
 * Initialises the crawler with the given configuration overrides.
 *
 * @param _config partial set of `config` fields to override.
 * @returns a promise that resolves when initialisation has finished.
 */
export declare function initCrawler(_config: Partial<typeof config>): Promise<void>;
package/esm/index.js ADDED
@@ -0,0 +1,26 @@
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ import { config, logger } from './config';
11
+ import { createCrawlQueue } from './crawler';
12
+ import { ensureDatabase } from './db';
13
+ import { ensureBrowser } from './puppeteer';
14
+ export * from './blocklet';
15
+ export * from './crawler';
16
+ export * from './middleware';
17
+ export { Snapshot } from './db/snapshot';
18
/**
 * Applies the caller's overrides to the shared `config` object, then runs the
 * start-up steps one after another: database, crawl queue, browser.
 *
 * @param _config fields to copy onto `config`.
 * @returns a promise that resolves once every step has completed; a failing
 *   step rejects it and the later steps are not run.
 */
export function initCrawler(_config) {
    return __awaiter(this, void 0, void 0, function* () {
        Object.assign(config, _config);
        logger.debug('init crawler', config);
        // Each step is awaited before the next one starts.
        const startupSteps = [ensureDatabase, createCrawlQueue, ensureBrowser];
        for (const runStep of startupSteps) {
            yield runStep();
        }
    });
}
@@ -0,0 +1,4 @@
1
/**
 * Creates the SEO middleware.
 *
 * @param autoReturnHtml when true (the runtime default), a cached page is sent
 *   directly as the response.
 * @param allowCrawler either a flag or a per-request predicate; its result is
 *   only tested for truthiness.
 * @returns a request handler taking `(req, res, next)`.
 */
export declare function initSEOMiddleware({ autoReturnHtml, allowCrawler, }: {
    // Primitive `boolean` instead of the boxed `Boolean` wrapper type.
    autoReturnHtml?: boolean;
    // A concrete call signature instead of the untyped `Function`.
    allowCrawler?: boolean | ((req: any) => unknown);
}): (req: any, res: any, next: Function) => Promise<any>;
@@ -0,0 +1,41 @@
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ import { useCache } from './cache';
11
+ import { getFullUrl, isAcceptCrawler, isBotUserAgent, isSelfCrawler } from './utils';
12
+ export function initSEOMiddleware({ autoReturnHtml = true, allowCrawler = true, }) {
13
+ return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
14
+ const isBot = isBotUserAgent(req);
15
+ const isSelf = isSelfCrawler(req);
16
+ if (!isBot || isSelf) {
17
+ return next();
18
+ }
19
+ const fullUrl = getFullUrl(req);
20
+ const canCrawl = yield isAcceptCrawler(fullUrl);
21
+ const allowCrawlerResult = typeof allowCrawler === 'function' ? allowCrawler(req) : allowCrawler;
22
+ // can not crawl, skip
23
+ if (!canCrawl || !allowCrawlerResult) {
24
+ return next();
25
+ }
26
+ const cacheData = yield useCache.get(fullUrl);
27
+ // add cached html to req
28
+ req.cachedHtml = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.content) || cacheData || null;
29
+ // add cached lastModified to req, ISO string to GMT string
30
+ req.cachedLastmod = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified) ? new Date(cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified).toUTCString() : null;
31
+ if (req.cachedLastmod) {
32
+ res.setHeader('Last-Modified', req.cachedLastmod);
33
+ }
34
+ if (autoReturnHtml && req.cachedHtml) {
35
+ res.send(req.cachedHtml);
36
+ return;
37
+ }
38
+ // missing cache
39
+ next();
40
+ });
41
+ }
@@ -0,0 +1,16 @@
1
+ import puppeteer, { Browser, Page } from '@blocklet/puppeteer';
2
+ export { puppeteer };
3
/**
 * Prepares the Puppeteer configuration.
 *
 * @returns the cache and temporary directory paths in use.
 */
export declare function ensurePuppeteerrc(): Promise<{
    cacheDirectory: string;
    temporaryDirectory: string;
}>;
/** Makes sure a browser is available for Puppeteer. */
export declare function ensureBrowser(): Promise<void>;
/**
 * Tries to attach to an existing browser.
 *
 * @returns the browser, or `null` when none could be connected.
 */
export declare function connectBrowser(): Promise<Browser | null>;
/** Launches a new browser and resolves with it. */
export declare function launchBrowser(): Promise<Browser>;
/** Resolves with a browser instance to use. */
export declare const getBrowser: () => Promise<Browser>;
/**
 * Closes the current browser.
 *
 * @param trimCache optional flag controlling whether the Puppeteer cache is trimmed as well.
 */
export declare const closeBrowser: ({ trimCache }?: {
    trimCache?: boolean;
}) => Promise<void>;
14
/**
 * Opens a new page.
 *
 * @param abortResourceTypes resource type names whose requests should be
 *   aborted. Declared as `string[]`; the previously inferred `never[]` made it
 *   impossible to pass any value from TypeScript.
 * @returns the new page.
 */
export declare function initPage({ abortResourceTypes }?: {
    abortResourceTypes?: string[] | undefined;
}): Promise<Page>;
@@ -0,0 +1,272 @@
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ // import fs from 'fs-extra';
11
+ // import path from 'path';
12
+ import puppeteer from '@blocklet/puppeteer';
13
+ import { env } from '@blocklet/sdk/lib/config';
14
+ import fs from 'fs-extra';
15
+ import path from 'path';
16
+ import { clearInterval, setInterval } from 'timers';
17
+ import { useCache } from './cache';
18
+ import { config, logger } from './config';
19
+ import { CRAWLER_FLAG, sleep } from './utils';
20
+ // let puppeteerConfig: {
21
+ // cacheDirectory: string;
22
+ // temporaryDirectory: string;
23
+ // };
24
+ const BROWSER_WS_ENDPOINT_KEY = `browserWSEndpoint-${env.appId || 'unknown'}`;
25
+ const BrowserStatus = {
26
+ Launching: 'Launching',
27
+ Ready: 'Ready',
28
+ };
29
+ let browser;
30
+ let browserActivatedTimer;
31
+ export { puppeteer };
32
/**
 * Creates Puppeteer's cache and temporary directories under `config.cacheDir`
 * and writes a `.puppeteerrc.js` file in `config.appDir` that exports both
 * paths.
 *
 * @returns the `{ cacheDirectory, temporaryDirectory }` object that was written.
 */
export function ensurePuppeteerrc() {
    return __awaiter(this, void 0, void 0, function* () {
        const cacheDirectory = path.join(config.cacheDir, 'puppeteer', 'cache');
        const temporaryDirectory = path.join(config.cacheDir, 'puppeteer', 'tmp');
        const puppeteerrcPath = path.join(config.appDir, '.puppeteerrc.js');
        // Create both directories and the rc file concurrently.
        const preparations = [cacheDirectory, temporaryDirectory].map((dir) => fs.ensureDir(dir));
        preparations.push(fs.ensureFile(puppeteerrcPath));
        yield Promise.all(preparations);
        const puppeteerConfig = { cacheDirectory, temporaryDirectory };
        const serialized = JSON.stringify(puppeteerConfig, null, 2);
        yield fs.writeFile(puppeteerrcPath, `module.exports = ${serialized}`);
        logger.debug(`Puppeteerrc file created at ${puppeteerrcPath}`, puppeteerConfig);
        return puppeteerConfig;
    });
}
49
+ export function ensureBrowser() {
50
+ return __awaiter(this, void 0, void 0, function* () {
51
+ const puppeteerConfig = yield ensurePuppeteerrc();
52
+ const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium';
53
+ logger.info('executablePath', executablePath);
54
+ if (!fs.existsSync(executablePath)) {
55
+ logger.info('start download browser', puppeteerConfig);
56
+ const { downloadBrowser } = yield (() => __awaiter(this, void 0, void 0, function* () {
57
+ try {
58
+ // @ts-ignore
59
+ // eslint-disable-next-line import/extensions
60
+ return yield import('@blocklet/puppeteer/internal/node/install.js');
61
+ }
62
+ catch (err) {
63
+ logger.warn('Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer.');
64
+ }
65
+ }))();
66
+ if (downloadBrowser) {
67
+ yield downloadBrowser();
68
+ logger.info('Browser download completed successfully');
69
+ }
70
+ }
71
+ // try to launch browser
72
+ if (config.testOnInitialize) {
73
+ const browser = yield launchBrowser();
74
+ if (!browser) {
75
+ throw new Error('Failed to launch browser');
76
+ }
77
+ yield closeBrowser();
78
+ }
79
+ logger.info('Puppeteer is ready');
80
+ });
81
+ }
82
+ export function connectBrowser() {
83
+ return __awaiter(this, void 0, void 0, function* () {
84
+ const browserWSEndpoint = yield useCache.get(BROWSER_WS_ENDPOINT_KEY);
85
+ if (!browserWSEndpoint) {
86
+ return null;
87
+ }
88
+ // retry if browser is launching
89
+ if (browserWSEndpoint.status === BrowserStatus.Launching) {
90
+ yield sleep(Math.floor(Math.random() * 1000));
91
+ return connectBrowser();
92
+ }
93
+ try {
94
+ browser = yield puppeteer.connect({
95
+ browserWSEndpoint: browserWSEndpoint.endpoint,
96
+ });
97
+ logger.info('Connect browser success');
98
+ }
99
+ catch (err) {
100
+ logger.warn('Connect browser failed, clear endpoint', err);
101
+ yield useCache.remove(BROWSER_WS_ENDPOINT_KEY);
102
+ return null;
103
+ }
104
+ return browser;
105
+ });
106
+ }
107
/**
 * Launches a headless browser and publishes its WebSocket endpoint in the cache.
 *
 * The cache record is first set to "Launching" with no endpoint. After a
 * successful launch it is replaced by the real endpoint with status "Ready".
 * If the launch throws, the record is removed and the error is rethrown.
 *
 * @returns the launched browser (also stored in the module-level `browser`).
 */
export function launchBrowser() {
    return __awaiter(this, void 0, void 0, function* () {
        // Announce the pending launch before starting it.
        const launchingRecord = {
            endpoint: null,
            status: BrowserStatus.Launching,
        };
        yield useCache.set(BROWSER_WS_ENDPOINT_KEY, launchingRecord);
        // docs: https://peter.sh/experiments/chromium-command-line-switches/
        const chromiumArgs = [
            '--no-first-run',
            '--hide-scrollbars',
            '--no-sandbox',
            '--no-zygote',
            '--disable-setuid-sandbox',
            '--disable-gpu',
            '--disable-dev-shm-usage',
            '--disable-site-isolation-trials',
            '--disable-accelerated-2d-canvas',
            '--disable-extensions',
            '--js-flags=--max_old_space_size=512', // limit V8 memory
            '--disable-background-networking',
            '--disable-default-apps',
            // '--disable-web-security', // allow cross-origin requests
            '--disable-software-rasterizer',
            '--disable-crash-reporter',
            '--disable-service-workers',
            '--disable-notifications',
            '--disable-infobars',
            '--font-render-hinting=none',
        ];
        try {
            // @ts-ignore
            browser = yield puppeteer.launch({ headless: true, args: chromiumArgs });
            logger.info('Launch browser success');
        }
        catch (error) {
            logger.error('launch browser failed: ', error);
            // cleanup browser endpoint
            yield useCache.remove(BROWSER_WS_ENDPOINT_KEY);
            throw error;
        }
        // save browserWSEndpoint to cache
        const endpoint = yield browser.wsEndpoint();
        const readyRecord = { endpoint, status: BrowserStatus.Ready };
        yield useCache.set(BROWSER_WS_ENDPOINT_KEY, readyRecord);
        return browser;
    });
}
158
/**
 * Starts (or restarts) a once-a-minute check that closes the browser after it
 * has looked idle three checks in a row. "Idle" means the browser has exactly
 * one page and that page is at `about:blank`. Any other state resets the count.
 */
function checkBrowserActivated() {
    clearBrowserActivatedTimer();
    let idleChecks = 0;
    const inspectBrowser = () => __awaiter(this, void 0, void 0, function* () {
        var _a;
        if (!browser) {
            return;
        }
        // A failing page listing counts as "no pages", which resets the counter below.
        const openPages = yield browser.pages().catch(() => []);
        const looksIdle = openPages.length === 1 && ((_a = openPages[0]) === null || _a === void 0 ? void 0 : _a.url()) === 'about:blank';
        if (looksIdle) {
            idleChecks++;
            logger.debug(`Browser inactive count: ${idleChecks}/3`);
        }
        else {
            idleChecks = 0; // reset the counter
        }
        if (idleChecks >= 3) {
            logger.info('Browser inactive for 3 minutes, closing...');
            yield closeBrowser({
                trimCache: true,
            });
        }
    });
    browserActivatedTimer = setInterval(inspectBrowser, 1000 * 60);
}
181
+ function clearBrowserActivatedTimer() {
182
+ if (browserActivatedTimer) {
183
+ clearInterval(browserActivatedTimer);
184
+ browserActivatedTimer = null;
185
+ }
186
+ }
187
+ export const getBrowser = () => __awaiter(void 0, void 0, void 0, function* () {
188
+ if (browser)
189
+ return browser;
190
+ // sleep random time (0 ~ 5s),to avoid concurrent blocklet
191
+ yield sleep(Math.floor(Math.random() * 1000 * 5));
192
+ // try to connect browser
193
+ const connectedBrowser = yield connectBrowser();
194
+ if (connectedBrowser) {
195
+ logger.debug('getBrowser.connectedBrowser');
196
+ browser = connectedBrowser;
197
+ return browser;
198
+ }
199
+ // try to launch browser
200
+ const launchedBrowser = yield launchBrowser();
201
+ if (launchedBrowser) {
202
+ logger.debug('getBrowser.launchedBrowser');
203
+ browser = launchedBrowser;
204
+ checkBrowserActivated();
205
+ return browser;
206
+ }
207
+ throw new Error('No browser to use, should install redis or browser');
208
+ });
209
/**
 * Shuts down the browser held by this module; does nothing when none is held.
 *
 * Each stage is best-effort: failures are logged and the next stage still
 * runs. Stages: close every page, close the browser, optionally trim
 * Puppeteer's cache and trigger a garbage collection when `global.gc` is
 * exposed. Afterwards the module-level `browser` is cleared, the inactivity
 * check is stopped and the cached WebSocket endpoint is removed.
 *
 * @param trimCache whether to call `puppeteer.trimCache()` (default `true`).
 */
export const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0, function* ({ trimCache = true } = {}) {
    if (!browser) {
        return;
    }
    // close all pages
    try {
        const openPages = yield browser.pages();
        const closing = openPages.map((page) => page.close());
        yield Promise.all(closing);
    }
    catch (err) {
        logger.error('Failed to close all pages:', err);
    }
    // close browser
    try {
        yield browser.close();
    }
    catch (err) {
        logger.error('Failed to close browser:', err);
    }
    // clear cache
    try {
        if (trimCache) {
            yield puppeteer.trimCache();
            logger.info('Trim cache success');
        }
        // try to clear temporary directory
        // if (puppeteerConfig) {
        //   await fs.emptyDir(puppeteerConfig.temporaryDirectory);
        // }
        if (global.gc) {
            global.gc();
        }
    }
    catch (err) {
        logger.error('Failed to clear browser cache:', err);
    }
    // forget the instance and its bookkeeping
    browser = null;
    clearBrowserActivatedTimer();
    yield useCache.remove(BROWSER_WS_ENDPOINT_KEY);
    logger.info('Close browser success');
});
249
+ export function initPage() {
250
+ return __awaiter(this, arguments, void 0, function* ({ abortResourceTypes = [] } = {}) {
251
+ const browser = yield getBrowser();
252
+ const page = yield browser.newPage();
253
+ yield page.setViewport({ width: 1440, height: 900 });
254
+ // page setting
255
+ // add custom headers
256
+ yield page.setExtraHTTPHeaders({
257
+ [CRAWLER_FLAG]: 'true',
258
+ });
259
+ // abort resource types
260
+ if (abortResourceTypes.length > 0) {
261
+ yield page.setRequestInterception(true);
262
+ page.on('request', (req) => {
263
+ // @ts-ignore
264
+ if (abortResourceTypes.includes(req.resourceType())) {
265
+ return req.abort();
266
+ }
267
+ return req.continue();
268
+ });
269
+ }
270
+ return page;
271
+ });
272
+ }
package/esm/utils.d.ts ADDED
@@ -0,0 +1,15 @@
1
// NOTE(review): only the declarations are visible here; the descriptions below
// follow the signatures and names and should be confirmed against the implementation.

/** Shared axios instance. */
export declare const api: import("axios").AxiosInstance;
/** Returns a promise; presumably it resolves after `ms` milliseconds. */
export declare const sleep: (ms: number) => Promise<unknown>;
/** The constant string "x-crawler". */
export declare const CRAWLER_FLAG = "x-crawler";
/** Presumably tells whether `req` was issued by this crawler itself. */
export declare const isSelfCrawler: (req: any) => boolean;
/** Presumably derives the default robots.txt URL for `url`. */
export declare const getDefaultRobotsUrl: (url: string) => string;
/** Resolves with a robots-parser `Robot` for `url`, or `null`. */
export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
/** Presumably derives the default sitemap URL for `url`. */
export declare const getDefaultSitemapUrl: (url: string) => string;
/** Resolves with whether `url` may be crawled; the result can be `undefined`. */
export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
/** Resolves with the sitemap items found for `url`. */
export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
/** Presumably tells whether the request's user agent is a bot. */
export declare const isBotUserAgent: (req: any) => boolean;
/** Returns an object describing the component; its shape is not declared. */
export declare const getComponentInfo: () => {};
/** Presumably builds the absolute URL of `req`. */
export declare const getFullUrl: (req: any) => string;
/** Presumably reduces `url` to its relative path. */
export declare const getRelativePath: (url: string) => string;
/** Presumably normalises `url`. */
export declare const formatUrl: (url: string) => string;
/** Presumably returns the MD5 digest of `content` as a string. */
export declare function md5(content: string | Uint8Array): string;