@arcblock/crawler 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/lib/cjs/config.d.ts +9 -3
- package/lib/cjs/config.js +2 -10
- package/lib/cjs/crawler.d.ts +1 -1
- package/lib/cjs/crawler.js +24 -9
- package/lib/cjs/cron.js +5 -0
- package/lib/cjs/index.d.ts +1 -4
- package/lib/cjs/index.js +3 -1
- package/lib/cjs/services/snapshot.js +8 -1
- package/lib/cjs/site.js +2 -1
- package/lib/cjs/store/job.d.ts +6 -0
- package/lib/cjs/store/snapshot.d.ts +6 -0
- package/lib/cjs/utils.js +5 -0
- package/lib/esm/config.d.ts +9 -3
- package/lib/esm/config.js +2 -10
- package/lib/esm/crawler.d.ts +1 -1
- package/lib/esm/crawler.js +24 -9
- package/lib/esm/cron.js +5 -0
- package/lib/esm/index.d.ts +1 -4
- package/lib/esm/index.js +3 -1
- package/lib/esm/services/snapshot.js +8 -1
- package/lib/esm/site.js +2 -1
- package/lib/esm/store/job.d.ts +6 -0
- package/lib/esm/store/snapshot.d.ts +6 -0
- package/lib/esm/utils.js +5 -0
- package/package.json +9 -10
package/README.md
CHANGED
@@ -43,8 +43,7 @@ await initCrawler({
     immediate: !!env.preferences.cronImmediate,
     sites: env.preferences.cronSites,
     time: env.preferences.cronTime,
-
-    sitemapConcurrency: env.preferences.sitemapConcurrency,
+    concurrency: env.preferences.concurrency,
   },
 });
 ```
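The README hunk above reflects the renamed cron option: the per-cron `sitemapConcurrency` preference is replaced by `concurrency`, and a top-level `concurrency` now controls the crawl queue (see config.d.ts below). A minimal sketch of an updated call with illustrative values; only the option names shown in this diff are confirmed:

```ts
import { initCrawler } from '@arcblock/crawler';

// Sketch only: the values are placeholders, not package defaults.
await initCrawler({
  concurrency: 2, // crawl-queue workers (new top-level option in 1.1.6)
  siteCron: {
    enabled: true,
    immediate: false,
    sites: [], // Site entries ({ url, pathname, ... }) to crawl on the schedule
    time: '0 0 0 * * *',
    concurrency: 30, // sitemap fan-out, formerly `sitemapConcurrency`
  },
});
```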
package/lib/cjs/config.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 export type Site = {
     url: string;
     pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
     appUrl: string;
     cacheDir: string;
     puppeteerPath?: string;
-
+    concurrency: number;
+    siteCron?: {
         sites: Site[];
         time: string;
         enabled: boolean;
         immediate: boolean;
-
-        sitemapConcurrency: number;
+        concurrency: number;
     };
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 };
 export declare const logger: any;
 export declare const config: Config;
package/lib/cjs/config.js
CHANGED
@@ -9,17 +9,9 @@ exports.logger = (0, logger_1.default)('@arcblock/crawler', { level: process.env
 exports.config = {
     isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
-    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     appUrl: process.env.BLOCKLET_APP_URL || '/',
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-
-    siteCron: {
-        sites: [],
-        enabled: true,
-        time: '0 0 0 * * *',
-        immediate: false,
-        crawlConcurrency: 2,
-        sitemapConcurrency: 30,
-    },
+    concurrency: 2,
 };
package/lib/cjs/crawler.d.ts
CHANGED
@@ -5,7 +5,7 @@ export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
package/lib/cjs/crawler.js
CHANGED
@@ -33,7 +33,7 @@ function createCrawlQueue() {
     const db = new BaseState(job_1.Job);
     crawlQueue = (0, queue_1.default)({
         store: new sequelize_1.default(db, 'crawler'),
-        concurrency: config_1.config.
+        concurrency: config_1.config.concurrency,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
             config_1.logger.info('Starting to execute crawl job', job);
             const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
@@ -57,13 +57,14 @@ function createCrawlQueue() {
            // } catch (error) {
            // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
            // }
+            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config_1.config.cookies || []).concat(job.cookies || []), localStorage: (config_1.config.localStorage || []).concat(job.localStorage || []), url: (0, utils_1.formatUrl)(job.url) });
             try {
                 // get page content later
-                const result = yield (0, exports.getPageContent)(
+                const result = yield (0, exports.getPageContent)(formattedJob);
                 if (!result || (!result.html && !result.screenshot)) {
-                    config_1.logger.error(`failed to crawl ${
+                    config_1.logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                     const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                        job,
+                        job: formattedJob,
                         snapshot: {
                             status: 'failed',
                             error: 'Failed to crawl content',
@@ -79,7 +80,7 @@ function createCrawlQueue() {
                 });
                 // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'success',
                         screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
@@ -91,9 +92,9 @@ function createCrawlQueue() {
                 return snapshot;
             }
             catch (error) {
-                config_1.logger.error(`Failed to crawl ${
+                config_1.logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'failed',
                         error: 'Internal error',
@@ -137,7 +138,7 @@ function saveSnapshotToLocal(_a) {
         };
     });
 }
-const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
+const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
     const page = yield (0, puppeteer_1.initPage)();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -145,6 +146,21 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
     if (headers) {
         yield page.setExtraHTTPHeaders(headers);
     }
+    // handle cookies
+    if (cookies) {
+        const { hostname } = new URL(url);
+        const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
+        yield page.setCookie(...cookieParams);
+    }
+    // handle localStorage
+    if (localStorage) {
+        yield page.evaluateOnNewDocument((items) => {
+            items.forEach((item) => {
+                const value = item.value === 'now()' ? new Date().toISOString() : item.value;
+                window.localStorage.setItem(item.key, value);
+            });
+        }, localStorage);
+    }
     let html = null;
     let screenshot = null;
     const meta = {};
@@ -240,7 +256,6 @@ exports.getPageContent = getPageContent;
 // eslint-disable-next-line require-await
 function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
-        params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
         // skip duplicate job
         const existsJob = yield job_1.Job.isExists(params);
         if (existsJob) {
package/lib/cjs/cron.js
CHANGED
@@ -20,6 +20,8 @@ let cron = null;
 function initCron() {
     if (cron)
         return;
+    if (!config_1.config.siteCron)
+        return;
     config_1.logger.info('Init cron', { config: config_1.config.siteCron });
     cron = cron_1.default.init({
         context: {},
@@ -29,6 +31,9 @@ function initCron() {
                 time: config_1.config.siteCron.time,
                 options: { runOnInit: config_1.config.siteCron.immediate },
                 fn: () => __awaiter(this, void 0, void 0, function* () {
+                    var _a;
+                    if (!((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
+                        return;
                     config_1.logger.info('Start cron to crawl site', { sites: config_1.config.siteCron.sites });
                     for (const site of config_1.config.siteCron.sites) {
                         try {
package/lib/cjs/index.d.ts
CHANGED
@@ -3,7 +3,4 @@ export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-
-    [P in keyof T]?: DeepPartial<T[P]>;
-} : T;
-export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
+export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
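The `initCrawler` signature drops the old `DeepPartial` helper and now accepts the new `cookies`, `localStorage`, and `concurrency` options directly; they are merged into the shared config and, per the crawler.js hunks above, concatenated onto every job. A hedged usage sketch with illustrative values:

```ts
import { initCrawler } from '@arcblock/crawler';

await initCrawler({
  concurrency: 4, // size of the crawl queue
  cookies: [{ name: 'session', value: 'example-token' }], // applied to every crawled page
  localStorage: [{ key: 'visited_at', value: 'now()' }], // 'now()' expands to the current ISO timestamp
});
```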
package/lib/cjs/index.js
CHANGED
@@ -50,6 +50,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.utils = void 0;
 exports.initCrawler = initCrawler;
+/* eslint-disable @typescript-eslint/indent */
 const merge_1 = __importDefault(require("lodash/merge"));
 const config_1 = require("./config");
 const crawler_1 = require("./crawler");
@@ -62,13 +63,14 @@ __exportStar(require("./services/snapshot"), exports);
 exports.utils = __importStar(require("./utils"));
 function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
+        var _a;
        (0, merge_1.default)(config_1.config, params);
        config_1.logger.info('Init crawler', { params, config: config_1.config });
        try {
            yield (0, store_1.initDatabase)();
            yield (0, puppeteer_1.ensureBrowser)();
            yield (0, crawler_1.createCrawlQueue)();
-            if (config_1.config.siteCron.enabled) {
+            if ((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                yield (0, cron_1.initCron)();
            }
        }
package/lib/cjs/services/snapshot.js
CHANGED
@@ -16,6 +16,7 @@ exports.convertJobToSnapshot = convertJobToSnapshot;
 exports.formatSnapshot = formatSnapshot;
 exports.getSnapshot = getSnapshot;
 exports.getLatestSnapshot = getLatestSnapshot;
+const cloneDeep_1 = __importDefault(require("lodash/cloneDeep"));
 const pick_1 = __importDefault(require("lodash/pick"));
 const promises_1 = __importDefault(require("node:fs/promises"));
 const node_path_1 = __importDefault(require("node:path"));
@@ -36,7 +37,7 @@ function convertJobToSnapshot({ job, snapshot }) {
 }
 function formatSnapshot(snapshot, columns) {
     return __awaiter(this, void 0, void 0, function* () {
-        let data =
+        let data = (0, cloneDeep_1.default)(snapshot);
        // format screenshot path to full url
        if (data.screenshot) {
            data.screenshot = (0, ufo_1.joinURL)(config_1.config.appUrl, data.screenshot);
@@ -46,6 +47,12 @@ function formatSnapshot(snapshot, columns) {
            const html = yield promises_1.default.readFile(node_path_1.default.join(config_1.config.dataDir, data.html));
            data.html = html.toString();
        }
+        // remove sensitive options that should not be returned
+        if (data.options) {
+            delete data.options.cookies;
+            delete data.options.localStorage;
+            delete data.options.headers;
+        }
        if (columns === null || columns === void 0 ? void 0 : columns.length) {
            data = (0, pick_1.default)(data, columns);
        }
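The snapshot service change has a privacy angle: because jobs can now carry cookies, localStorage seeds, and headers, `formatSnapshot` deep-clones the record and strips those fields from the returned `options` before handing data to callers, without mutating the stored row. A rough TypeScript sketch of that addition (the `scrubSnapshot` helper name is hypothetical; `options` is the field the compiled hunk touches):

```ts
import cloneDeep from 'lodash/cloneDeep';

// Hypothetical helper illustrating the new clone-then-scrub step in formatSnapshot.
function scrubSnapshot<T extends { options?: Record<string, unknown> }>(snapshot: T): T {
  const data = cloneDeep(snapshot); // work on a copy so the persisted snapshot stays untouched
  if (data.options) {
    delete data.options.cookies;
    delete data.options.localStorage;
    delete data.options.headers;
  }
  return data;
}
```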
package/lib/cjs/site.js
CHANGED
@@ -27,6 +27,7 @@ function parseSitemapUrl(sitemapItem) {
     return urls.map((url) => ({ url, sitemapItem }));
 }
 const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
+    var _b;
     config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
     const key = `${url}-${pathname}`;
     if (crawlBlockletRunningMap.has(key)) {
@@ -72,7 +73,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
            includeScreenshot: false,
            includeHtml: true,
        });
-    }), { concurrency: config_1.config.siteCron.
+    }), { concurrency: ((_b = config_1.config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
     config_1.logger.info('Enqueued jobs from sitemap finished', {
        url,
        pathname,
package/lib/cjs/store/job.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { Model, Sequelize } from '@sequelize/core';
 export interface JobState {
     id?: string;
@@ -12,6 +13,11 @@ export interface JobState {
     fullPage?: boolean;
     lastModified?: string;
     headers?: Record<string, string>;
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 }
 export interface JobModel {
     id: string;
package/lib/cjs/store/snapshot.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { FindOptions, Model, Sequelize } from '@sequelize/core';
 export interface SnapshotModel {
     jobId: string;
@@ -19,6 +20,11 @@ export interface SnapshotModel {
        quality?: number;
        fullPage?: boolean;
        headers?: Record<string, string>;
+        cookies?: CookieParam[];
+        localStorage?: {
+            key: string;
+            value: string;
+        }[];
     };
 }
 export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
package/lib/cjs/utils.js
CHANGED
package/lib/esm/config.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 export type Site = {
     url: string;
     pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
     appUrl: string;
     cacheDir: string;
     puppeteerPath?: string;
-
+    concurrency: number;
+    siteCron?: {
         sites: Site[];
         time: string;
         enabled: boolean;
         immediate: boolean;
-
-        sitemapConcurrency: number;
+        concurrency: number;
     };
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 };
 export declare const logger: any;
 export declare const config: Config;
package/lib/esm/config.js
CHANGED
@@ -3,17 +3,9 @@ export const logger = createLogger('@arcblock/crawler', { level: process.env.LOG
 export const config = {
     isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
-    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     appUrl: process.env.BLOCKLET_APP_URL || '/',
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-
-    siteCron: {
-        sites: [],
-        enabled: true,
-        time: '0 0 0 * * *',
-        immediate: false,
-        crawlConcurrency: 2,
-        sitemapConcurrency: 30,
-    },
+    concurrency: 2,
 };
package/lib/esm/crawler.d.ts
CHANGED
@@ -5,7 +5,7 @@ export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
package/lib/esm/crawler.js
CHANGED
@@ -24,7 +24,7 @@ export function createCrawlQueue() {
     const db = new BaseState(Job);
     crawlQueue = createQueue({
         store: new SequelizeStore(db, 'crawler'),
-        concurrency: config.
+        concurrency: config.concurrency,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
             logger.info('Starting to execute crawl job', job);
             const canCrawl = yield isAcceptCrawler(job.url);
@@ -48,13 +48,14 @@ export function createCrawlQueue() {
            // } catch (error) {
            // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
            // }
+            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
             try {
                 // get page content later
-                const result = yield getPageContent(
+                const result = yield getPageContent(formattedJob);
                 if (!result || (!result.html && !result.screenshot)) {
-                    logger.error(`failed to crawl ${
+                    logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                     const snapshot = convertJobToSnapshot({
-                        job,
+                        job: formattedJob,
                         snapshot: {
                             status: 'failed',
                             error: 'Failed to crawl content',
@@ -70,7 +71,7 @@ export function createCrawlQueue() {
                 });
                 // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
                 const snapshot = convertJobToSnapshot({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'success',
                         screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
@@ -82,9 +83,9 @@ export function createCrawlQueue() {
                 return snapshot;
             }
             catch (error) {
-                logger.error(`Failed to crawl ${
+                logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
                 const snapshot = convertJobToSnapshot({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'failed',
                         error: 'Internal error',
@@ -128,7 +129,7 @@ function saveSnapshotToLocal(_a) {
         };
     });
 }
-export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
+export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
     const page = yield initPage();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -136,6 +137,21 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
     if (headers) {
         yield page.setExtraHTTPHeaders(headers);
     }
+    // handle cookies
+    if (cookies) {
+        const { hostname } = new URL(url);
+        const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
+        yield page.setCookie(...cookieParams);
+    }
+    // handle localStorage
+    if (localStorage) {
+        yield page.evaluateOnNewDocument((items) => {
+            items.forEach((item) => {
+                const value = item.value === 'now()' ? new Date().toISOString() : item.value;
+                window.localStorage.setItem(item.key, value);
+            });
+        }, localStorage);
+    }
     let html = null;
     let screenshot = null;
     const meta = {};
@@ -230,7 +246,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
 // eslint-disable-next-line require-await
 export function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
-        params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
        // skip duplicate job
        const existsJob = yield Job.isExists(params);
        if (existsJob) {
package/lib/esm/cron.js
CHANGED
@@ -14,6 +14,8 @@ let cron = null;
 export function initCron() {
     if (cron)
         return;
+    if (!config.siteCron)
+        return;
     logger.info('Init cron', { config: config.siteCron });
     cron = Cron.init({
         context: {},
@@ -23,6 +25,9 @@ export function initCron() {
                 time: config.siteCron.time,
                 options: { runOnInit: config.siteCron.immediate },
                 fn: () => __awaiter(this, void 0, void 0, function* () {
+                    var _a;
+                    if (!((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
+                        return;
                     logger.info('Start cron to crawl site', { sites: config.siteCron.sites });
                     for (const site of config.siteCron.sites) {
                         try {
package/lib/esm/index.d.ts
CHANGED
@@ -3,7 +3,4 @@ export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-
-    [P in keyof T]?: DeepPartial<T[P]>;
-} : T;
-export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
+export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/esm/index.js
CHANGED
@@ -7,6 +7,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
         step((generator = generator.apply(thisArg, _arguments || [])).next());
     });
 };
+/* eslint-disable @typescript-eslint/indent */
 import merge from 'lodash/merge';
 import { config, logger } from './config';
 import { createCrawlQueue } from './crawler';
@@ -19,13 +20,14 @@ export * from './services/snapshot';
 export * as utils from './utils';
 export function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
+        var _a;
        merge(config, params);
        logger.info('Init crawler', { params, config });
        try {
            yield initDatabase();
            yield ensureBrowser();
            yield createCrawlQueue();
-            if (config.siteCron.enabled) {
+            if ((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                yield initCron();
            }
        }
package/lib/esm/services/snapshot.js
CHANGED
@@ -7,6 +7,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
         step((generator = generator.apply(thisArg, _arguments || [])).next());
     });
 };
+import cloneDeep from 'lodash/cloneDeep';
 import pick from 'lodash/pick';
 import fs from 'node:fs/promises';
 import path from 'node:path';
@@ -27,7 +28,7 @@ export function convertJobToSnapshot({ job, snapshot }) {
 }
 export function formatSnapshot(snapshot, columns) {
     return __awaiter(this, void 0, void 0, function* () {
-        let data =
+        let data = cloneDeep(snapshot);
        // format screenshot path to full url
        if (data.screenshot) {
            data.screenshot = joinURL(config.appUrl, data.screenshot);
@@ -37,6 +38,12 @@ export function formatSnapshot(snapshot, columns) {
            const html = yield fs.readFile(path.join(config.dataDir, data.html));
            data.html = html.toString();
        }
+        // remove sensitive options that should not be returned
+        if (data.options) {
+            delete data.options.cookies;
+            delete data.options.localStorage;
+            delete data.options.headers;
+        }
        if (columns === null || columns === void 0 ? void 0 : columns.length) {
            data = pick(data, columns);
        }
package/lib/esm/site.js
CHANGED
@@ -21,6 +21,7 @@ function parseSitemapUrl(sitemapItem) {
     return urls.map((url) => ({ url, sitemapItem }));
 }
 export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
+    var _b;
     logger.info(`Start crawl from sitemap ${url}`, { pathname });
     const key = `${url}-${pathname}`;
     if (crawlBlockletRunningMap.has(key)) {
@@ -66,7 +67,7 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
            includeScreenshot: false,
            includeHtml: true,
        });
-    }), { concurrency: config.siteCron.
+    }), { concurrency: ((_b = config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
     logger.info('Enqueued jobs from sitemap finished', {
        url,
        pathname,
package/lib/esm/store/job.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { Model, Sequelize } from '@sequelize/core';
 export interface JobState {
     id?: string;
@@ -12,6 +13,11 @@ export interface JobState {
     fullPage?: boolean;
     lastModified?: string;
     headers?: Record<string, string>;
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 }
 export interface JobModel {
     id: string;
package/lib/esm/store/snapshot.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { FindOptions, Model, Sequelize } from '@sequelize/core';
 export interface SnapshotModel {
     jobId: string;
@@ -19,6 +20,11 @@ export interface SnapshotModel {
        quality?: number;
        fullPage?: boolean;
        headers?: Record<string, string>;
+        cookies?: CookieParam[];
+        localStorage?: {
+            key: string;
+            value: string;
+        }[];
     };
 }
 export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
package/lib/esm/utils.js
CHANGED
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@arcblock/crawler",
-  "version": "1.1.4",
+  "version": "1.1.6",
   "main": "lib/cjs/index.js",
   "module": "lib/esm/index.js",
   "types": "lib/cjs/index.d.ts",
@@ -45,33 +45,32 @@
     ]
   },
   "dependencies": {
-    "@abtnode/cron": "^1.16.
-    "@abtnode/models": "^1.16.
-    "@abtnode/queue": "^1.16.
-    "@blocklet/logger": "^1.16.
+    "@abtnode/cron": "^1.16.44",
+    "@abtnode/models": "^1.16.44",
+    "@abtnode/queue": "^1.16.44",
+    "@blocklet/logger": "^1.16.44",
     "@blocklet/puppeteer": "^22.11.3",
-    "@blocklet/sdk": "^1.16.
+    "@blocklet/sdk": "^1.16.44",
     "@sequelize/core": "7.0.0-alpha.46",
     "@sequelize/sqlite3": "7.0.0-alpha.46",
     "axios": "^1.7.9",
     "fs-extra": "^11.2.0",
     "lodash": "^4.17.21",
     "lru-cache": "^10.4.3",
+    "p-map": "^7.0.3",
     "robots-parser": "^3.0.1",
     "sitemap": "^7.1.2",
     "sqlite3": "^5.1.7",
-    "ufo": "^1.5.4"
-    "p-map": "^7.0.3"
+    "ufo": "^1.5.4"
   },
   "devDependencies": {
-    "@blocklet/js-sdk": "^1.16.39",
     "@types/dotenv-flow": "^3.3.3",
     "@types/express": "^4.17.21",
     "@types/fs-extra": "^11.0.4",
     "@types/lodash": "^4.17.16",
     "@types/node": "^20.17.19",
-    "express": "^4.21.2",
     "bumpp": "^9.11.1",
+    "express": "^4.21.2",
     "nodemon": "^3.1.9",
     "npm-run-all": "^4.1.5",
     "puppeteer": "^24.8.2",