@arcblock/crawler 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/blocklet.d.ts +6 -0
- package/dist/blocklet.js +199 -0
- package/dist/cache.d.ts +10 -0
- package/dist/cache.js +119 -0
- package/dist/config.d.ts +10 -0
- package/dist/config.js +17 -0
- package/dist/crawler.d.ts +28 -0
- package/dist/crawler.js +314 -0
- package/dist/db/index.d.ts +1 -0
- package/dist/db/index.js +41 -0
- package/dist/db/job.d.ts +33 -0
- package/dist/db/job.js +54 -0
- package/dist/db/snapshot.d.ts +31 -0
- package/dist/db/snapshot.js +52 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +45 -0
- package/dist/middleware.d.ts +4 -0
- package/dist/middleware.js +44 -0
- package/dist/puppeteer.d.ts +16 -0
- package/dist/puppeteer.js +318 -0
- package/dist/utils.d.ts +15 -0
- package/dist/utils.js +239 -0
- package/esm/blocklet.d.ts +6 -0
- package/esm/blocklet.js +190 -0
- package/esm/cache.d.ts +10 -0
- package/esm/cache.js +114 -0
- package/esm/config.d.ts +10 -0
- package/esm/config.js +11 -0
- package/esm/crawler.d.ts +28 -0
- package/esm/crawler.js +301 -0
- package/esm/db/index.d.ts +1 -0
- package/esm/db/index.js +35 -0
- package/esm/db/job.d.ts +33 -0
- package/esm/db/job.js +50 -0
- package/esm/db/snapshot.d.ts +31 -0
- package/esm/db/snapshot.js +48 -0
- package/esm/index.d.ts +6 -0
- package/esm/index.js +26 -0
- package/esm/middleware.d.ts +4 -0
- package/esm/middleware.js +41 -0
- package/esm/puppeteer.d.ts +16 -0
- package/esm/puppeteer.js +272 -0
- package/esm/utils.d.ts +15 -0
- package/esm/utils.js +220 -0
- package/package.json +10 -3
- package/src/blocklet.ts +0 -223
- package/src/cache.ts +0 -117
- package/src/config.ts +0 -13
- package/src/crawler.ts +0 -364
- package/src/db/index.ts +0 -27
- package/src/db/job.ts +0 -93
- package/src/db/snapshot.ts +0 -89
- package/src/index.ts +0 -19
- package/src/middleware.ts +0 -46
- package/src/puppeteer.ts +0 -296
- package/src/utils.ts +0 -240
- package/third.d.ts +0 -1
- package/tsconfig.json +0 -9
package/esm/db/index.js
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
// Compiler-emitted helper for down-levelled async functions: it runs `generator`, wraps every yielded value in a promise (P, defaulting to Promise), feeds the settled result back into the generator, and resolves the returned promise with the generator's final value.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import { Sequelize } from '@sequelize/core';
|
|
11
|
+
import { SqliteDialect } from '@sequelize/sqlite3';
|
|
12
|
+
import path from 'path';
|
|
13
|
+
import { config, logger } from '../config';
|
|
14
|
+
import { initJobModel } from './job';
|
|
15
|
+
import { initSnapshotModel } from './snapshot';
|
|
16
|
+
/**
 * Opens the SQLite database stored at `<config.dataDir>/snap-kit.db`,
 * registers the snapshot and job models, then authenticates and syncs the schema.
 *
 * Connection failures are logged and re-thrown to the caller.
 */
export function ensureDatabase() {
    return __awaiter(this, void 0, void 0, function* () {
        const databaseFile = path.join(config.dataDir, 'snap-kit.db');
        // Route every SQL statement Sequelize reports to the debug log.
        const logSql = (msg) => logger.debug(msg);
        const db = new Sequelize({
            dialect: SqliteDialect,
            storage: databaseFile,
            logging: logSql,
        });
        // Models are registered before sync() so their tables get created.
        yield initSnapshotModel(db);
        yield initJobModel(db);
        try {
            yield db.authenticate();
            yield db.sync();
            logger.info('Successfully connected to database');
        }
        catch (failure) {
            logger.error('Failed to connect to database:', failure);
            throw failure;
        }
    });
}
|
package/esm/db/job.d.ts
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { Model, Sequelize } from '@sequelize/core';
|
|
2
|
+
/**
 * Payload stored with a queued job (see `JobModel.job`).
 * Only `jobId` and `url` are required; everything else is an optional setting.
 */
export interface JobState {
    // --- identity ---
    id?: string;
    jobId: string;
    url: string;

    // --- what to capture ---
    includeScreenshot?: boolean;
    includeHtml?: boolean;
    fullPage?: boolean;

    // --- capture settings (NOTE(review): units are not visible here — confirm against the crawler) ---
    width?: number;
    height?: number;
    quality?: number;
    timeout?: number;
}
|
|
14
|
+
/**
 * Attribute shape of a row managed by the `Job` model.
 */
export interface JobModel {
    /** Primary key of the row. */
    id: string;
    /** Name of the queue the job belongs to. */
    queue: string;
    /** Job payload. */
    job: JobState;

    // Scheduling bookkeeping — NOTE(review): presumably timestamps/durations in milliseconds; confirm against the queue implementation.
    retryCount: number;
    delay: number;
    willRunAt: number;

    /** Whether the job has been cancelled. */
    cancelled: boolean;
}
|
|
23
|
+
/**
 * Sequelize model class whose instances expose the `JobModel` attributes.
 */
export declare class Job extends Model<JobModel> implements JobModel {
    id: string;
    queue: string;
    job: JobState;
    retryCount: number;
    willRunAt: number;
    delay: number;
    cancelled: boolean;
}
|
|
33
|
+
/** Initialises the `Job` model on the given Sequelize instance and returns the model class. */
export declare function initJobModel(sequelize: Sequelize): typeof Job;
|
package/esm/db/job.js
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { DataTypes, Model } from '@sequelize/core';
|
|
2
|
+
/** Sequelize model for queued jobs; its attributes are attached by `initJobModel`. */
export class Job extends Model {
}
|
|
5
|
+
/**
 * Registers the `Job` model on the given Sequelize instance.
 * Rows are stored in the `jobs` table, which gets an extra index on `queue`.
 *
 * @param sequelize Sequelize instance the model is bound to.
 * @returns The initialised `Job` model class.
 */
export function initJobModel(sequelize) {
    // createdAt and updatedAt share one definition; a fresh object is built for each column.
    const timestampColumn = () => ({
        type: DataTypes.DATE,
        defaultValue: DataTypes.NOW,
        index: true,
    });
    const attributes = {
        id: { type: DataTypes.STRING(40), primaryKey: true },
        queue: { type: DataTypes.STRING(32), allowNull: false },
        job: { type: DataTypes.JSON, allowNull: false },
        retryCount: { type: DataTypes.INTEGER },
        delay: { type: DataTypes.INTEGER },
        willRunAt: { type: DataTypes.INTEGER },
        cancelled: { type: DataTypes.BOOLEAN, defaultValue: false },
        createdAt: timestampColumn(),
        updatedAt: timestampColumn(),
    };
    const modelOptions = {
        sequelize,
        indexes: [{ fields: ['queue'] }],
        modelName: 'job',
        tableName: 'jobs',
        timestamps: true,
    };
    Job.init(attributes, modelOptions);
    return Job;
}
|
package/esm/db/snapshot.d.ts
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { Model, Sequelize } from '@sequelize/core';
|
|
2
|
+
/**
 * Attribute shape of a stored snapshot.
 */
interface SnapshotModel {
    // --- identity ---
    jobId: string;
    url: string;

    // --- outcome ---
    status: 'success' | 'failed' | 'pending';
    error?: string;

    // --- captured content (either may be absent or null) ---
    html?: string | null;
    screenshot?: string | null;
    lastModified?: string;

    /** Capture options associated with the snapshot. */
    options?: {
        includeScreenshot?: boolean;
        includeHtml?: boolean;
        fullPage?: boolean;
        width?: number;
        height?: number;
        quality?: number;
    };
}
|
|
19
|
+
/**
 * Sequelize model class whose instances expose the `SnapshotModel` attributes.
 */
export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
    jobId: string;
    url: string;
    status: SnapshotModel['status'];
    html?: string | null;
    screenshot?: string | null;
    error?: string;
    lastModified?: string;
    options: SnapshotModel['options'];
}
export type { SnapshotModel };
|
|
31
|
+
/** Initialises the `Snapshot` model on the given Sequelize instance and returns the model class. */
export declare function initSnapshotModel(sequelize: Sequelize): typeof Snapshot;
|
package/esm/db/snapshot.js
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { DataTypes, Model } from '@sequelize/core';
|
|
2
|
+
/** Sequelize model for stored snapshots; its attributes are attached by `initSnapshotModel`. */
export class Snapshot extends Model {
}
|
|
5
|
+
/**
 * Registers the `Snapshot` model on the given Sequelize instance.
 * Rows are stored in the `snap` table, keyed by `jobId`.
 *
 * @param sequelize Sequelize instance the model is bound to.
 * @returns The initialised `Snapshot` model class.
 */
export function initSnapshotModel(sequelize) {
    Snapshot.init({
        jobId: {
            type: DataTypes.STRING,
            primaryKey: true,
            allowNull: false,
        },
        url: {
            type: DataTypes.STRING,
            allowNull: false,
            index: true,
        },
        status: {
            // Must list every value of SnapshotModel['status']; 'pending' was missing,
            // so a pending snapshot would have been rejected by ENUM validation.
            type: DataTypes.ENUM('success', 'failed', 'pending'),
            allowNull: false,
        },
        html: {
            type: DataTypes.TEXT,
            allowNull: true,
        },
        screenshot: {
            type: DataTypes.STRING,
            allowNull: true,
        },
        error: {
            type: DataTypes.STRING,
            allowNull: true,
        },
        lastModified: {
            type: DataTypes.STRING,
            allowNull: true,
        },
        options: {
            type: DataTypes.JSON,
            allowNull: true,
        },
    }, {
        sequelize,
        modelName: 'snapshot',
        tableName: 'snap',
        timestamps: true,
    });
    return Snapshot;
}
|
package/esm/index.d.ts
ADDED
package/esm/index.js
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
// Compiler-emitted helper for down-levelled async functions: it runs `generator`, wraps every yielded value in a promise (P, defaulting to Promise), feeds the settled result back into the generator, and resolves the returned promise with the generator's final value.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import { config, logger } from './config';
|
|
11
|
+
import { createCrawlQueue } from './crawler';
|
|
12
|
+
import { ensureDatabase } from './db';
|
|
13
|
+
import { ensureBrowser } from './puppeteer';
|
|
14
|
+
export * from './blocklet';
|
|
15
|
+
export * from './crawler';
|
|
16
|
+
export * from './middleware';
|
|
17
|
+
export { Snapshot } from './db/snapshot';
|
|
18
|
+
/**
 * Boots the crawler: merges `_config` into the shared `config` object, then
 * prepares the database, the crawl queue and the browser, strictly in that order.
 *
 * @param _config Overrides copied onto the shared configuration.
 */
export function initCrawler(_config) {
    return __awaiter(this, void 0, void 0, function* () {
        Object.assign(config, _config);
        logger.debug('init crawler', config);
        // Each step must finish before the next one starts.
        const bootSteps = [ensureDatabase, createCrawlQueue, ensureBrowser];
        for (const runStep of bootSteps) {
            yield runStep();
        }
    });
}
|
package/esm/middleware.js
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
// Compiler-emitted helper for down-levelled async functions: it runs `generator`, wraps every yielded value in a promise (P, defaulting to Promise), feeds the settled result back into the generator, and resolves the returned promise with the generator's final value.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import { useCache } from './cache';
|
|
11
|
+
import { getFullUrl, isAcceptCrawler, isBotUserAgent, isSelfCrawler } from './utils';
|
|
12
|
+
/**
 * Builds a middleware that serves cached, pre-rendered HTML to bot user agents.
 *
 * Requests that are not from a bot, that come from this crawler itself, that are
 * not accepted for crawling, or that `allowCrawler` rejects are passed straight
 * to `next()`. Otherwise the cache entry for the full URL is attached to the request as
 * `req.cachedHtml` / `req.cachedLastmod`, and a `Last-Modified` header is set
 * when a valid timestamp is cached.
 *
 * @param options                Optional settings; may be omitted entirely.
 * @param options.autoReturnHtml When true (default), respond with the cached HTML
 *                               directly; otherwise only annotate `req` and continue.
 * @param options.allowCrawler   Boolean, or a predicate `(req) => boolean`, deciding
 *                               whether the cached page may be served (default true).
 * @returns A `(req, res, next)` middleware function.
 */
export function initSEOMiddleware({ autoReturnHtml = true, allowCrawler = true, } = {}) {
    return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
        let shouldServe = false;
        let cacheData = null;
        try {
            const isBot = isBotUserAgent(req);
            const isSelf = isSelfCrawler(req);
            if (isBot && !isSelf) {
                const fullUrl = getFullUrl(req);
                const canCrawl = yield isAcceptCrawler(fullUrl);
                const allowCrawlerResult = typeof allowCrawler === 'function' ? allowCrawler(req) : allowCrawler;
                if (canCrawl && allowCrawlerResult) {
                    shouldServe = true;
                    cacheData = yield useCache.get(fullUrl);
                }
            }
        }
        catch (err) {
            // Without this, a rejected lookup would leave the request hanging,
            // because nothing awaits the promise returned by this middleware.
            return next(err);
        }
        // not a bot, self crawler, or can not crawl: skip
        if (!shouldServe) {
            return next();
        }
        const hasCache = cacheData !== null && cacheData !== undefined;
        // add cached html to req (the cache may hold `{ content }` or the raw value)
        req.cachedHtml = (hasCache ? cacheData.content : undefined) || cacheData || null;
        // add cached lastModified to req, ISO string to GMT string
        const lastModified = hasCache ? cacheData.lastModified : undefined;
        const lastModifiedDate = lastModified ? new Date(lastModified) : null;
        // Drop unparsable timestamps instead of emitting "Invalid Date" as a header value.
        req.cachedLastmod = lastModifiedDate && !Number.isNaN(lastModifiedDate.getTime())
            ? lastModifiedDate.toUTCString()
            : null;
        if (req.cachedLastmod) {
            res.setHeader('Last-Modified', req.cachedLastmod);
        }
        if (autoReturnHtml && req.cachedHtml) {
            res.send(req.cachedHtml);
            return;
        }
        // missing cache
        next();
    });
}
|
package/esm/puppeteer.d.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import puppeteer, { Browser, Page } from '@blocklet/puppeteer';
|
|
2
|
+
export { puppeteer };
|
|
3
|
+
/**
 * Ensures the Puppeteer cache and temporary directories exist, writes the
 * `.puppeteerrc.js` file, and resolves with the directories that were configured.
 */
export declare function ensurePuppeteerrc(): Promise<{ cacheDirectory: string; temporaryDirectory: string }>;
|
|
7
|
+
/** Writes the Puppeteer rc file, downloads a browser when the configured executable is missing, and optionally test-launches it. */
export declare function ensureBrowser(): Promise<void>;
|
|
8
|
+
/** Connects to the browser whose WebSocket endpoint is stored in the cache; resolves with null when no endpoint is cached or the connection fails. */
export declare function connectBrowser(): Promise<Browser | null>;
|
|
9
|
+
/** Launches a new headless browser and stores its WebSocket endpoint in the cache; rejects when the launch fails. */
export declare function launchBrowser(): Promise<Browser>;
|
|
10
|
+
/** Returns the current browser, first trying to connect to a cached endpoint and otherwise launching a new one. */
export declare const getBrowser: () => Promise<Browser>;
|
|
11
|
+
/**
 * Closes all pages and the current browser, then clears the cached endpoint.
 * `trimCache` (default true) additionally trims Puppeteer's browser cache.
 */
export declare const closeBrowser: ({ trimCache }?: { trimCache?: boolean }) => Promise<void>;
|
|
14
|
+
/**
 * Opens a new page on the shared browser with a 1440x900 viewport and the
 * crawler flag header set.
 *
 * @param options.abortResourceTypes Resource types (as reported by the request's
 *   `resourceType()`) whose requests are aborted. Previously typed as `never[]`
 *   (inferred from the `[]` default), which made it impossible for typed callers
 *   to pass any value.
 */
export declare function initPage({ abortResourceTypes }?: {
    abortResourceTypes?: string[] | undefined;
}): Promise<Page>;
|
package/esm/puppeteer.js
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
// Compiler-emitted helper for down-levelled async functions: it runs `generator`, wraps every yielded value in a promise (P, defaulting to Promise), feeds the settled result back into the generator, and resolves the returned promise with the generator's final value.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
// import fs from 'fs-extra';
|
|
11
|
+
// import path from 'path';
|
|
12
|
+
import puppeteer from '@blocklet/puppeteer';
|
|
13
|
+
import { env } from '@blocklet/sdk/lib/config';
|
|
14
|
+
import fs from 'fs-extra';
|
|
15
|
+
import path from 'path';
|
|
16
|
+
import { clearInterval, setInterval } from 'timers';
|
|
17
|
+
import { useCache } from './cache';
|
|
18
|
+
import { config, logger } from './config';
|
|
19
|
+
import { CRAWLER_FLAG, sleep } from './utils';
|
|
20
|
+
// let puppeteerConfig: {
|
|
21
|
+
// cacheDirectory: string;
|
|
22
|
+
// temporaryDirectory: string;
|
|
23
|
+
// };
|
|
24
|
+
const BROWSER_WS_ENDPOINT_KEY = `browserWSEndpoint-${env.appId || 'unknown'}`;
|
|
25
|
+
const BrowserStatus = {
|
|
26
|
+
Launching: 'Launching',
|
|
27
|
+
Ready: 'Ready',
|
|
28
|
+
};
|
|
29
|
+
let browser;
|
|
30
|
+
let browserActivatedTimer;
|
|
31
|
+
export { puppeteer };
|
|
32
|
+
/**
 * Creates the Puppeteer cache/tmp directories under `config.cacheDir` and writes
 * a `.puppeteerrc.js` file into `config.appDir` that points Puppeteer at them.
 *
 * @returns The `{ cacheDirectory, temporaryDirectory }` object that was written.
 */
export function ensurePuppeteerrc() {
    return __awaiter(this, void 0, void 0, function* () {
        const puppeteerRoot = path.join(config.cacheDir, 'puppeteer');
        const puppeteerConfig = {
            cacheDirectory: path.join(puppeteerRoot, 'cache'),
            temporaryDirectory: path.join(puppeteerRoot, 'tmp'),
        };
        const puppeteerrcPath = path.join(config.appDir, '.puppeteerrc.js');
        // Make sure both directories and the rc file exist before writing.
        yield Promise.all([
            fs.ensureDir(puppeteerConfig.cacheDirectory),
            fs.ensureDir(puppeteerConfig.temporaryDirectory),
            fs.ensureFile(puppeteerrcPath),
        ]);
        yield fs.writeFile(puppeteerrcPath, `module.exports = ${JSON.stringify(puppeteerConfig, null, 2)}`);
        logger.debug(`Puppeteerrc file created at ${puppeteerrcPath}`, puppeteerConfig);
        return puppeteerConfig;
    });
}
|
|
49
|
+
/**
 * Prepares Puppeteer for use.
 *
 * 1. Writes the Puppeteer rc file (see `ensurePuppeteerrc`).
 * 2. If no executable exists at `PUPPETEER_EXECUTABLE_PATH` (default
 *    `/usr/bin/chromium`), tries to download a browser through Puppeteer's
 *    installer. When the installer module cannot be loaded the download is
 *    skipped with a warning.
 * 3. When `config.testOnInitialize` is set, launches and closes a browser once.
 *
 * @throws When the test launch does not yield a browser.
 */
export function ensureBrowser() {
    return __awaiter(this, void 0, void 0, function* () {
        const puppeteerConfig = yield ensurePuppeteerrc();
        const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium';
        logger.info('executablePath', executablePath);
        if (!fs.existsSync(executablePath)) {
            logger.info('start download browser', puppeteerConfig);
            let installer = null;
            try {
                // @ts-ignore
                // eslint-disable-next-line import/extensions
                installer = yield import('@blocklet/puppeteer/internal/node/install.js');
            }
            catch (err) {
                logger.warn('Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer.', err);
            }
            // The installer may be missing: destructuring it directly would throw
            // a TypeError and defeat the graceful skip above.
            const downloadBrowser = installer ? installer.downloadBrowser : undefined;
            if (downloadBrowser) {
                yield downloadBrowser();
                logger.info('Browser download completed successfully');
            }
        }
        // try to launch browser
        if (config.testOnInitialize) {
            const launched = yield launchBrowser();
            if (!launched) {
                throw new Error('Failed to launch browser');
            }
            yield closeBrowser();
        }
        logger.info('Puppeteer is ready');
    });
}
|
|
82
|
+
/**
 * Connects to the browser whose WebSocket endpoint is stored in the cache.
 *
 * While the cached entry is still marked `Launching`, the cache is polled again
 * after a random delay of up to one second. Polling is bounded: if the entry is
 * still `Launching` after the maximum number of polls it is treated as stale
 * (e.g. the launching process died), removed, and `null` is returned so the
 * caller can launch a browser itself.
 *
 * @returns The connected browser, or `null` when there is no usable endpoint
 *          or the connection attempt fails (the cached endpoint is then cleared).
 */
export function connectBrowser() {
    return __awaiter(this, void 0, void 0, function* () {
        // Upper bound on "still launching" polls (each waits up to 1s).
        const MAX_LAUNCHING_POLLS = 60;
        let browserWSEndpoint = yield useCache.get(BROWSER_WS_ENDPOINT_KEY);
        // retry if browser is launching
        for (let polls = 0; browserWSEndpoint && browserWSEndpoint.status === BrowserStatus.Launching; polls++) {
            if (polls >= MAX_LAUNCHING_POLLS) {
                logger.warn('Browser still launching after max retries, clear endpoint');
                yield useCache.remove(BROWSER_WS_ENDPOINT_KEY);
                return null;
            }
            yield sleep(Math.floor(Math.random() * 1000));
            browserWSEndpoint = yield useCache.get(BROWSER_WS_ENDPOINT_KEY);
        }
        if (!browserWSEndpoint) {
            return null;
        }
        try {
            browser = yield puppeteer.connect({
                browserWSEndpoint: browserWSEndpoint.endpoint,
            });
            logger.info('Connect browser success');
        }
        catch (err) {
            logger.warn('Connect browser failed, clear endpoint', err);
            yield useCache.remove(BROWSER_WS_ENDPOINT_KEY);
            return null;
        }
        return browser;
    });
}
|
|
107
|
+
/**
 * Launches a headless browser and publishes its WebSocket endpoint in the cache.
 *
 * The cache entry is first marked `Launching`, then replaced with the real
 * endpoint and `Ready` once the browser is up. If the launch fails the entry
 * is removed and the error is re-thrown.
 *
 * @returns The launched browser (also stored in the module-level `browser`).
 */
export function launchBrowser() {
    return __awaiter(this, void 0, void 0, function* () {
        // docs: https://peter.sh/experiments/chromium-command-line-switches/
        const chromiumArgs = [
            '--no-first-run',
            '--hide-scrollbars',
            '--no-sandbox',
            '--no-zygote',
            '--disable-setuid-sandbox',
            '--disable-gpu',
            '--disable-dev-shm-usage',
            '--disable-site-isolation-trials',
            '--disable-accelerated-2d-canvas',
            '--disable-extensions',
            '--js-flags=--max_old_space_size=512', // limit V8 memory
            '--disable-background-networking',
            '--disable-default-apps',
            // '--disable-web-security', // allow cross-origin requests
            '--disable-software-rasterizer',
            '--disable-crash-reporter',
            '--disable-service-workers',
            '--disable-notifications',
            '--disable-infobars',
            '--font-render-hinting=none',
        ];
        // Record that a launch is in progress before starting it.
        const launchingMarker = { endpoint: null, status: BrowserStatus.Launching };
        yield useCache.set(BROWSER_WS_ENDPOINT_KEY, launchingMarker);
        try {
            // @ts-ignore
            browser = yield puppeteer.launch({ headless: true, args: chromiumArgs });
            logger.info('Launch browser success');
        }
        catch (launchError) {
            logger.error('launch browser failed: ', launchError);
            // cleanup browser endpoint
            yield useCache.remove(BROWSER_WS_ENDPOINT_KEY);
            throw launchError;
        }
        // save browserWSEndpoint to cache
        const endpoint = yield browser.wsEndpoint();
        const readyMarker = { endpoint, status: BrowserStatus.Ready };
        yield useCache.set(BROWSER_WS_ENDPOINT_KEY, readyMarker);
        return browser;
    });
}
|
|
158
|
+
/**
 * (Re)starts the inactivity watchdog.
 *
 * Once a minute it inspects the browser's pages; the browser counts as inactive
 * when its only page is `about:blank`. After three consecutive inactive checks
 * the browser is closed (with cache trimming). Any other page state resets the
 * counter.
 */
function checkBrowserActivated() {
    clearBrowserActivatedTimer();
    const CHECK_INTERVAL_MS = 1000 * 60;
    let count = 0;
    const inspect = () => __awaiter(this, void 0, void 0, function* () {
        if (!browser) {
            return;
        }
        // A failing pages() call is treated as "no pages".
        const pages = yield browser.pages().catch(() => []);
        let inactive = false;
        if (pages.length === 1) {
            const solePage = pages[0];
            const soleUrl = solePage === null || solePage === void 0 ? void 0 : solePage.url();
            inactive = soleUrl === 'about:blank';
        }
        if (inactive) {
            count++;
            logger.debug(`Browser inactive count: ${count}/3`);
        }
        else {
            count = 0; // reset the counter!
        }
        if (count >= 3) {
            logger.info('Browser inactive for 3 minutes, closing...');
            yield closeBrowser({
                trimCache: true,
            });
        }
    });
    browserActivatedTimer = setInterval(inspect, CHECK_INTERVAL_MS);
}
|
|
181
|
+
/** Stops the inactivity watchdog, if one is running, and forgets its handle. */
function clearBrowserActivatedTimer() {
    if (!browserActivatedTimer) {
        return;
    }
    clearInterval(browserActivatedTimer);
    browserActivatedTimer = null;
}
|
|
187
|
+
/**
 * Returns a usable browser.
 *
 * Resolution order: the browser already held by this module; a connection to
 * the endpoint stored in the cache; a freshly launched browser (which also
 * starts the inactivity watchdog).
 *
 * @throws When neither connecting nor launching produces a browser.
 */
export const getBrowser = () => __awaiter(void 0, void 0, void 0, function* () {
    if (browser) {
        return browser;
    }
    // sleep random time (0 ~ 5s), to avoid concurrent blocklet
    const jitterMs = Math.floor(Math.random() * 1000 * 5);
    yield sleep(jitterMs);
    // try to connect browser
    const existing = yield connectBrowser();
    if (existing) {
        logger.debug('getBrowser.connectedBrowser');
        browser = existing;
        return browser;
    }
    // try to launch browser
    const fresh = yield launchBrowser();
    if (!fresh) {
        throw new Error('No browser to use, should install redis or browser');
    }
    logger.debug('getBrowser.launchedBrowser');
    browser = fresh;
    checkBrowserActivated();
    return browser;
});
|
|
209
|
+
/**
 * Shuts down the browser held by this module; a no-op when there is none.
 *
 * Pages are closed first, then the browser, then (optionally) Puppeteer's
 * cache is trimmed. Each stage logs its own failure and lets the following
 * stages run. Finally the module state, watchdog timer and cached endpoint
 * are cleared.
 *
 * @param options.trimCache Trim Puppeteer's browser cache after closing (default true).
 */
export const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0, function* ({ trimCache = true } = {}) {
    if (!browser) {
        return;
    }
    // stage 1: close all pages
    try {
        const openPages = yield browser.pages();
        const closing = openPages.map((page) => page.close());
        yield Promise.all(closing);
    }
    catch (pagesError) {
        logger.error('Failed to close all pages:', pagesError);
    }
    // stage 2: close browser
    try {
        yield browser.close();
    }
    catch (closeError) {
        logger.error('Failed to close browser:', closeError);
    }
    // stage 3: clear cache
    try {
        if (trimCache) {
            yield puppeteer.trimCache();
            logger.info('Trim cache success');
        }
        // try to clear temporary directory
        // if (puppeteerConfig) {
        //   await fs.emptyDir(puppeteerConfig.temporaryDirectory);
        // }
        // Run a garbage collection when the runtime exposes one.
        if (global.gc) {
            global.gc();
        }
    }
    catch (cacheError) {
        logger.error('Failed to clear browser cache:', cacheError);
    }
    browser = null;
    clearBrowserActivatedTimer();
    yield useCache.remove(BROWSER_WS_ENDPOINT_KEY);
    logger.info('Close browser success');
});
|
|
249
|
+
/**
 * Opens a new page on the shared browser, sized 1440x900 and tagged with the
 * crawler flag header on every request.
 *
 * @param options.abortResourceTypes Resource types whose requests are aborted;
 *   request interception is only enabled when this list is non-empty.
 * @returns The configured page.
 */
export function initPage() {
    return __awaiter(this, arguments, void 0, function* ({ abortResourceTypes = [] } = {}) {
        const activeBrowser = yield getBrowser();
        const page = yield activeBrowser.newPage();
        const viewport = { width: 1440, height: 900 };
        yield page.setViewport(viewport);
        // add custom headers so requests made by this page are recognisable
        const crawlerHeaders = { [CRAWLER_FLAG]: 'true' };
        yield page.setExtraHTTPHeaders(crawlerHeaders);
        // nothing to abort: no interception needed
        if (!(abortResourceTypes.length > 0)) {
            return page;
        }
        yield page.setRequestInterception(true);
        const filterRequest = (req) => 
        // @ts-ignore
        abortResourceTypes.includes(req.resourceType()) ? req.abort() : req.continue();
        page.on('request', filterRequest);
        return page;
    });
}
|
package/esm/utils.d.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/** Shared axios instance. */
export declare const api: import("axios").AxiosInstance;
|
|
2
|
+
/** Returns a promise used to pause async flows; presumably settles after `ms` milliseconds — TODO confirm against the implementation. */
export declare const sleep: (ms: number) => Promise<unknown>;
|
|
3
|
+
/** Name of the HTTP header that pages opened by this crawler attach to their requests. */
export declare const CRAWLER_FLAG = "x-crawler";
|
|
4
|
+
/** Tells whether the request was issued by this crawler itself; presumably by checking the CRAWLER_FLAG header — TODO confirm. */
export declare const isSelfCrawler: (req: any) => boolean;
|
|
5
|
+
/** NOTE(review): presumably derives the robots.txt URL for the site of `url` — confirm against the implementation. */
export declare const getDefaultRobotsUrl: (url: string) => string;
|
|
6
|
+
/** Resolves with a `robots-parser` Robot for `url`, or null; NOTE(review): when null is returned is not visible here — confirm. */
export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
|
|
7
|
+
/** NOTE(review): presumably derives the default sitemap URL for the site of `url` — confirm against the implementation. */
export declare const getDefaultSitemapUrl: (url: string) => string;
|
|
8
|
+
/** Resolves with whether `url` may be crawled; may also resolve with undefined. NOTE(review): presumably based on robots rules — confirm. */
export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
|
|
9
|
+
/** Resolves with the `sitemap` package's SitemapItem entries for `url`. NOTE(review): how the sitemap is located is not visible here. */
export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
|
|
10
|
+
/** Tells whether the request comes from a bot; presumably decided from its User-Agent header — TODO confirm. */
export declare const isBotUserAgent: (req: any) => boolean;
|
|
11
|
+
/** NOTE(review): declared as returning `{}`, so the actual shape of the component info is not expressed by this type — consider tightening it. */
export declare const getComponentInfo: () => {};
|
|
12
|
+
/** Returns the full URL string for a request; used as the cache key by the SEO middleware. */
export declare const getFullUrl: (req: any) => string;
|
|
13
|
+
/** NOTE(review): presumably returns the path portion of `url` — confirm against the implementation. */
export declare const getRelativePath: (url: string) => string;
|
|
14
|
+
/** NOTE(review): returns a formatted variant of `url`; the exact formatting rules are not visible here. */
export declare const formatUrl: (url: string) => string;
|
|
15
|
+
/** Returns a string digest of `content`; presumably a hex-encoded MD5 hash — TODO confirm the encoding. */
export declare function md5(content: string | Uint8Array): string;
|