@arcblock/crawler 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/lib/cjs/config.d.ts +9 -3
- package/lib/cjs/config.js +2 -10
- package/lib/cjs/crawler.js +18 -18
- package/lib/cjs/cron.js +5 -0
- package/lib/cjs/index.d.ts +1 -4
- package/lib/cjs/index.js +3 -1
- package/lib/cjs/services/snapshot.js +8 -1
- package/lib/cjs/site.js +2 -1
- package/lib/cjs/store/job.d.ts +4 -1
- package/lib/cjs/store/snapshot.d.ts +6 -0
- package/lib/esm/config.d.ts +9 -3
- package/lib/esm/config.js +2 -10
- package/lib/esm/crawler.js +18 -18
- package/lib/esm/cron.js +5 -0
- package/lib/esm/index.d.ts +1 -4
- package/lib/esm/index.js +3 -1
- package/lib/esm/services/snapshot.js +8 -1
- package/lib/esm/site.js +2 -1
- package/lib/esm/store/job.d.ts +4 -1
- package/lib/esm/store/snapshot.d.ts +6 -0
- package/package.json +1 -1
package/README.md
CHANGED
@@ -43,8 +43,7 @@ await initCrawler({
 immediate: !!env.preferences.cronImmediate,
 sites: env.preferences.cronSites,
 time: env.preferences.cronTime,
-
-sitemapConcurrency: env.preferences.sitemapConcurrency,
+concurrency: env.preferences.concurrency,
 },
 });
 ```
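Taken together with the Config changes below, a 1.1.6 setup could look like the following sketch. This is illustrative only: the option values, the example site, and the cookie/localStorage entries are placeholders inferred from the new type declarations, not values taken from the package.

```ts
import { initCrawler } from '@arcblock/crawler';

await initCrawler({
  puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
  // crawl-queue concurrency now lives at the top level (default is 2)
  concurrency: 2,
  siteCron: {
    enabled: true,
    immediate: false,
    time: '0 0 0 * * *',
    sites: [{ url: 'https://example.com', pathname: '/' }], // placeholder site
    // sitemap-enqueue concurrency, formerly `sitemapConcurrency` (falls back to 30)
    concurrency: 30,
  },
  // new in 1.1.6: merged into every crawl job before the page is loaded
  cookies: [{ name: 'example_cookie', value: '1' }],
  localStorage: [{ key: 'example-flag', value: 'now()' }], // 'now()' becomes the current ISO timestamp
});
```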
package/lib/cjs/config.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 export type Site = {
 url: string;
 pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
 appUrl: string;
 cacheDir: string;
 puppeteerPath?: string;
-
+concurrency: number;
+siteCron?: {
 sites: Site[];
 time: string;
 enabled: boolean;
 immediate: boolean;
-
-sitemapConcurrency: number;
+concurrency: number;
 };
+cookies?: CookieParam[];
+localStorage?: {
+key: string;
+value: string;
+}[];
 };
 export declare const logger: any;
 export declare const config: Config;
package/lib/cjs/config.js
CHANGED
@@ -9,17 +9,9 @@ exports.logger = (0, logger_1.default)('@arcblock/crawler', { level: process.env
 exports.config = {
 isProd: process.env.NODE_ENV === 'production',
 dataDir: process.env.BLOCKLET_DATA_DIR,
-appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
 cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
 appUrl: process.env.BLOCKLET_APP_URL || '/',
 puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-
-siteCron: {
-sites: [],
-enabled: true,
-time: '0 0 0 * * *',
-immediate: false,
-crawlConcurrency: 2,
-sitemapConcurrency: 30,
-},
+concurrency: 2,
 };
package/lib/cjs/crawler.js
CHANGED
@@ -33,7 +33,7 @@ function createCrawlQueue() {
 const db = new BaseState(job_1.Job);
 crawlQueue = (0, queue_1.default)({
 store: new sequelize_1.default(db, 'crawler'),
-concurrency: config_1.config.
+concurrency: config_1.config.concurrency,
 onJob: (job) => __awaiter(this, void 0, void 0, function* () {
 config_1.logger.info('Starting to execute crawl job', job);
 const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
@@ -57,18 +57,14 @@ function createCrawlQueue() {
 // } catch (error) {
 // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
 // }
+const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config_1.config.cookies || []).concat(job.cookies || []), localStorage: (config_1.config.localStorage || []).concat(job.localStorage || []), url: (0, utils_1.formatUrl)(job.url) });
 try {
 // get page content later
-const result = yield (0, exports.getPageContent)(
-// for blocklet theme
-blocklet_theme_prefer: 'light',
-// for blocklet domain warning
-'domain-warning-skip': Date.now().toString(),
-} }, job));
+const result = yield (0, exports.getPageContent)(formattedJob);
 if (!result || (!result.html && !result.screenshot)) {
-config_1.logger.error(`failed to crawl ${
+config_1.logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-job,
+job: formattedJob,
 snapshot: {
 status: 'failed',
 error: 'Failed to crawl content',
@@ -84,7 +80,7 @@ function createCrawlQueue() {
 });
 // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-job,
+job: formattedJob,
 snapshot: {
 status: 'success',
 screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
@@ -96,9 +92,9 @@ function createCrawlQueue() {
 return snapshot;
 }
 catch (error) {
-config_1.logger.error(`Failed to crawl ${
+config_1.logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-job,
+job: formattedJob,
 snapshot: {
 status: 'failed',
 error: 'Internal error',
@@ -142,7 +138,7 @@ function saveSnapshotToLocal(_a) {
 };
 });
 }
-const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies
+const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
 const page = yield (0, puppeteer_1.initPage)();
 if (width && height) {
 yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -150,13 +146,18 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
 if (headers) {
 yield page.setExtraHTTPHeaders(headers);
 }
-
-
+// handle cookies
+if (cookies) {
+const { hostname } = new URL(url);
+const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
+yield page.setCookie(...cookieParams);
 }
+// handle localStorage
 if (localStorage) {
 yield page.evaluateOnNewDocument((items) => {
-
-
+items.forEach((item) => {
+const value = item.value === 'now()' ? new Date().toISOString() : item.value;
+window.localStorage.setItem(item.key, value);
 });
 }, localStorage);
 }
@@ -255,7 +256,6 @@ exports.getPageContent = getPageContent;
 // eslint-disable-next-line require-await
 function crawlUrl(params, callback) {
 return __awaiter(this, void 0, void 0, function* () {
-params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
 // skip duplicate job
 const existsJob = yield job_1.Job.isExists(params);
 if (existsJob) {
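The compiled output above is hard to scan, so here is a hand-written TypeScript sketch of what the new cookie and localStorage handling inside getPageContent amounts to. The function name and the PageLike type are assumptions for illustration; the logic mirrors the compiled lines above. Note that config-level cookies and localStorage entries are concatenated with any per-job values (the formattedJob object) before this runs.

```ts
import { CookieParam } from '@blocklet/puppeteer';

type LocalStorageItem = { key: string; value: string };

// Structural stand-in for the Puppeteer page methods used here (illustrative only).
type PageLike = {
  setCookie(...cookies: CookieParam[]): Promise<void>;
  evaluateOnNewDocument(fn: (items: LocalStorageItem[]) => void, items: LocalStorageItem[]): Promise<unknown>;
};

// Roughly what lib/cjs/crawler.js now does inside getPageContent before navigation.
async function applyJobState(page: PageLike, url: string, cookies?: CookieParam[], localStorage?: LocalStorageItem[]) {
  if (cookies) {
    const { hostname } = new URL(url);
    // bare { name, value } cookies get a default domain (the target host) and path
    const cookieParams = cookies.map((item) => ({
      ...item,
      expires: item.expires ? new Date(item.expires).getTime() : undefined,
      domain: item.domain || hostname,
      path: item.path || '/',
    }));
    await page.setCookie(...cookieParams);
  }
  if (localStorage) {
    // runs in the browser context before any page script executes
    await page.evaluateOnNewDocument((items) => {
      items.forEach((item) => {
        // the literal value 'now()' is replaced with the current ISO timestamp
        const value = item.value === 'now()' ? new Date().toISOString() : item.value;
        window.localStorage.setItem(item.key, value);
      });
    }, localStorage);
  }
}
```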
package/lib/cjs/cron.js
CHANGED
@@ -20,6 +20,8 @@ let cron = null;
 function initCron() {
 if (cron)
 return;
+if (!config_1.config.siteCron)
+return;
 config_1.logger.info('Init cron', { config: config_1.config.siteCron });
 cron = cron_1.default.init({
 context: {},
@@ -29,6 +31,9 @@ function initCron() {
 time: config_1.config.siteCron.time,
 options: { runOnInit: config_1.config.siteCron.immediate },
 fn: () => __awaiter(this, void 0, void 0, function* () {
+var _a;
+if (!((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
+return;
 config_1.logger.info('Start cron to crawl site', { sites: config_1.config.siteCron.sites });
 for (const site of config_1.config.siteCron.sites) {
 try {
package/lib/cjs/index.d.ts
CHANGED
@@ -3,7 +3,4 @@ export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-
-[P in keyof T]?: DeepPartial<T[P]>;
-} : T;
-export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
+export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/cjs/index.js
CHANGED
@@ -50,6 +50,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.utils = void 0;
 exports.initCrawler = initCrawler;
+/* eslint-disable @typescript-eslint/indent */
 const merge_1 = __importDefault(require("lodash/merge"));
 const config_1 = require("./config");
 const crawler_1 = require("./crawler");
@@ -62,13 +63,14 @@ __exportStar(require("./services/snapshot"), exports);
 exports.utils = __importStar(require("./utils"));
 function initCrawler(params) {
 return __awaiter(this, void 0, void 0, function* () {
+var _a;
 (0, merge_1.default)(config_1.config, params);
 config_1.logger.info('Init crawler', { params, config: config_1.config });
 try {
 yield (0, store_1.initDatabase)();
 yield (0, puppeteer_1.ensureBrowser)();
 yield (0, crawler_1.createCrawlQueue)();
-if (config_1.config.siteCron.enabled) {
+if ((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
 yield (0, cron_1.initCron)();
 }
 }
package/lib/cjs/services/snapshot.js
CHANGED
@@ -16,6 +16,7 @@ exports.convertJobToSnapshot = convertJobToSnapshot;
 exports.formatSnapshot = formatSnapshot;
 exports.getSnapshot = getSnapshot;
 exports.getLatestSnapshot = getLatestSnapshot;
+const cloneDeep_1 = __importDefault(require("lodash/cloneDeep"));
 const pick_1 = __importDefault(require("lodash/pick"));
 const promises_1 = __importDefault(require("node:fs/promises"));
 const node_path_1 = __importDefault(require("node:path"));
@@ -36,7 +37,7 @@ function convertJobToSnapshot({ job, snapshot }) {
 }
 function formatSnapshot(snapshot, columns) {
 return __awaiter(this, void 0, void 0, function* () {
-let data =
+let data = (0, cloneDeep_1.default)(snapshot);
 // format screenshot path to full url
 if (data.screenshot) {
 data.screenshot = (0, ufo_1.joinURL)(config_1.config.appUrl, data.screenshot);
@@ -46,6 +47,12 @@ function formatSnapshot(snapshot, columns) {
 const html = yield promises_1.default.readFile(node_path_1.default.join(config_1.config.dataDir, data.html));
 data.html = html.toString();
 }
+// remove sensitive options that should not be returned
+if (data.options) {
+delete data.options.cookies;
+delete data.options.localStorage;
+delete data.options.headers;
+}
 if (columns === null || columns === void 0 ? void 0 : columns.length) {
 data = (0, pick_1.default)(data, columns);
 }
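The formatSnapshot change is worth calling out: the stored record is now deep-cloned before it is formatted, and request state is stripped from the returned options. A minimal stand-alone sketch of that sanitization step follows; the function name and the options shape are illustrative (compare the SnapshotModel type further down).

```ts
import cloneDeep from 'lodash/cloneDeep';

type SnapshotLike = {
  options?: { cookies?: unknown; localStorage?: unknown; headers?: unknown; [key: string]: unknown };
};

function sanitizeSnapshot<T extends SnapshotLike>(snapshot: T): T {
  // clone first so the record held by the store is never mutated
  const data = cloneDeep(snapshot);
  if (data.options) {
    // cookies, localStorage and headers can carry credentials; never expose them to callers
    delete data.options.cookies;
    delete data.options.localStorage;
    delete data.options.headers;
  }
  return data;
}
```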
package/lib/cjs/site.js
CHANGED
@@ -27,6 +27,7 @@ function parseSitemapUrl(sitemapItem) {
 return urls.map((url) => ({ url, sitemapItem }));
 }
 const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
+var _b;
 config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
 const key = `${url}-${pathname}`;
 if (crawlBlockletRunningMap.has(key)) {
@@ -72,7 +73,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
 includeScreenshot: false,
 includeHtml: true,
 });
-}), { concurrency: config_1.config.siteCron.
+}), { concurrency: ((_b = config_1.config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
 config_1.logger.info('Enqueued jobs from sitemap finished', {
 url,
 pathname,
package/lib/cjs/store/job.d.ts
CHANGED
@@ -14,7 +14,10 @@ export interface JobState {
 lastModified?: string;
 headers?: Record<string, string>;
 cookies?: CookieParam[];
-localStorage?:
+localStorage?: {
+key: string;
+value: string;
+}[];
 }
 export interface JobModel {
 id: string;
package/lib/cjs/store/snapshot.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { FindOptions, Model, Sequelize } from '@sequelize/core';
 export interface SnapshotModel {
 jobId: string;
@@ -19,6 +20,11 @@ export interface SnapshotModel {
 quality?: number;
 fullPage?: boolean;
 headers?: Record<string, string>;
+cookies?: CookieParam[];
+localStorage?: {
+key: string;
+value: string;
+}[];
 };
 }
 export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
package/lib/esm/config.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 export type Site = {
 url: string;
 pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
 appUrl: string;
 cacheDir: string;
 puppeteerPath?: string;
-
+concurrency: number;
+siteCron?: {
 sites: Site[];
 time: string;
 enabled: boolean;
 immediate: boolean;
-
-sitemapConcurrency: number;
+concurrency: number;
 };
+cookies?: CookieParam[];
+localStorage?: {
+key: string;
+value: string;
+}[];
 };
 export declare const logger: any;
 export declare const config: Config;
package/lib/esm/config.js
CHANGED
@@ -3,17 +3,9 @@ export const logger = createLogger('@arcblock/crawler', { level: process.env.LOG
 export const config = {
 isProd: process.env.NODE_ENV === 'production',
 dataDir: process.env.BLOCKLET_DATA_DIR,
-appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
 cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
 appUrl: process.env.BLOCKLET_APP_URL || '/',
 puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-
-siteCron: {
-sites: [],
-enabled: true,
-time: '0 0 0 * * *',
-immediate: false,
-crawlConcurrency: 2,
-sitemapConcurrency: 30,
-},
+concurrency: 2,
 };
package/lib/esm/crawler.js
CHANGED
@@ -24,7 +24,7 @@ export function createCrawlQueue() {
 const db = new BaseState(Job);
 crawlQueue = createQueue({
 store: new SequelizeStore(db, 'crawler'),
-concurrency: config.
+concurrency: config.concurrency,
 onJob: (job) => __awaiter(this, void 0, void 0, function* () {
 logger.info('Starting to execute crawl job', job);
 const canCrawl = yield isAcceptCrawler(job.url);
@@ -48,18 +48,14 @@ export function createCrawlQueue() {
 // } catch (error) {
 // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
 // }
+const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
 try {
 // get page content later
-const result = yield getPageContent(
-// for blocklet theme
-blocklet_theme_prefer: 'light',
-// for blocklet domain warning
-'domain-warning-skip': Date.now().toString(),
-} }, job));
+const result = yield getPageContent(formattedJob);
 if (!result || (!result.html && !result.screenshot)) {
-logger.error(`failed to crawl ${
+logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
 const snapshot = convertJobToSnapshot({
-job,
+job: formattedJob,
 snapshot: {
 status: 'failed',
 error: 'Failed to crawl content',
@@ -75,7 +71,7 @@ export function createCrawlQueue() {
 });
 // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
 const snapshot = convertJobToSnapshot({
-job,
+job: formattedJob,
 snapshot: {
 status: 'success',
 screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
@@ -87,9 +83,9 @@ export function createCrawlQueue() {
 return snapshot;
 }
 catch (error) {
-logger.error(`Failed to crawl ${
+logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
 const snapshot = convertJobToSnapshot({
-job,
+job: formattedJob,
 snapshot: {
 status: 'failed',
 error: 'Internal error',
@@ -133,7 +129,7 @@ function saveSnapshotToLocal(_a) {
 };
 });
 }
-export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies
+export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
 const page = yield initPage();
 if (width && height) {
 yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -141,13 +137,18 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
 if (headers) {
 yield page.setExtraHTTPHeaders(headers);
 }
-
-
+// handle cookies
+if (cookies) {
+const { hostname } = new URL(url);
+const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
+yield page.setCookie(...cookieParams);
 }
+// handle localStorage
 if (localStorage) {
 yield page.evaluateOnNewDocument((items) => {
-
-
+items.forEach((item) => {
+const value = item.value === 'now()' ? new Date().toISOString() : item.value;
+window.localStorage.setItem(item.key, value);
 });
 }, localStorage);
 }
@@ -245,7 +246,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
 // eslint-disable-next-line require-await
 export function crawlUrl(params, callback) {
 return __awaiter(this, void 0, void 0, function* () {
-params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
 // skip duplicate job
 const existsJob = yield Job.isExists(params);
 if (existsJob) {
package/lib/esm/cron.js
CHANGED
@@ -14,6 +14,8 @@ let cron = null;
 export function initCron() {
 if (cron)
 return;
+if (!config.siteCron)
+return;
 logger.info('Init cron', { config: config.siteCron });
 cron = Cron.init({
 context: {},
@@ -23,6 +25,9 @@ export function initCron() {
 time: config.siteCron.time,
 options: { runOnInit: config.siteCron.immediate },
 fn: () => __awaiter(this, void 0, void 0, function* () {
+var _a;
+if (!((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
+return;
 logger.info('Start cron to crawl site', { sites: config.siteCron.sites });
 for (const site of config.siteCron.sites) {
 try {
package/lib/esm/index.d.ts
CHANGED
@@ -3,7 +3,4 @@ export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-
-[P in keyof T]?: DeepPartial<T[P]>;
-} : T;
-export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
+export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/esm/index.js
CHANGED
@@ -7,6 +7,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 step((generator = generator.apply(thisArg, _arguments || [])).next());
 });
 };
+/* eslint-disable @typescript-eslint/indent */
 import merge from 'lodash/merge';
 import { config, logger } from './config';
 import { createCrawlQueue } from './crawler';
@@ -19,13 +20,14 @@ export * from './services/snapshot';
 export * as utils from './utils';
 export function initCrawler(params) {
 return __awaiter(this, void 0, void 0, function* () {
+var _a;
 merge(config, params);
 logger.info('Init crawler', { params, config });
 try {
 yield initDatabase();
 yield ensureBrowser();
 yield createCrawlQueue();
-if (config.siteCron.enabled) {
+if ((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
 yield initCron();
 }
 }
package/lib/esm/services/snapshot.js
CHANGED
@@ -7,6 +7,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 step((generator = generator.apply(thisArg, _arguments || [])).next());
 });
 };
+import cloneDeep from 'lodash/cloneDeep';
 import pick from 'lodash/pick';
 import fs from 'node:fs/promises';
 import path from 'node:path';
@@ -27,7 +28,7 @@ export function convertJobToSnapshot({ job, snapshot }) {
 }
 export function formatSnapshot(snapshot, columns) {
 return __awaiter(this, void 0, void 0, function* () {
-let data =
+let data = cloneDeep(snapshot);
 // format screenshot path to full url
 if (data.screenshot) {
 data.screenshot = joinURL(config.appUrl, data.screenshot);
@@ -37,6 +38,12 @@ export function formatSnapshot(snapshot, columns) {
 const html = yield fs.readFile(path.join(config.dataDir, data.html));
 data.html = html.toString();
 }
+// remove sensitive options that should not be returned
+if (data.options) {
+delete data.options.cookies;
+delete data.options.localStorage;
+delete data.options.headers;
+}
 if (columns === null || columns === void 0 ? void 0 : columns.length) {
 data = pick(data, columns);
 }
package/lib/esm/site.js
CHANGED
@@ -21,6 +21,7 @@ function parseSitemapUrl(sitemapItem) {
 return urls.map((url) => ({ url, sitemapItem }));
 }
 export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
+var _b;
 logger.info(`Start crawl from sitemap ${url}`, { pathname });
 const key = `${url}-${pathname}`;
 if (crawlBlockletRunningMap.has(key)) {
@@ -66,7 +67,7 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
 includeScreenshot: false,
 includeHtml: true,
 });
-}), { concurrency: config.siteCron.
+}), { concurrency: ((_b = config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
 logger.info('Enqueued jobs from sitemap finished', {
 url,
 pathname,
package/lib/esm/store/job.d.ts
CHANGED
@@ -14,7 +14,10 @@ export interface JobState {
 lastModified?: string;
 headers?: Record<string, string>;
 cookies?: CookieParam[];
-localStorage?:
+localStorage?: {
+key: string;
+value: string;
+}[];
 }
 export interface JobModel {
 id: string;
package/lib/esm/store/snapshot.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { FindOptions, Model, Sequelize } from '@sequelize/core';
 export interface SnapshotModel {
 jobId: string;
@@ -19,6 +20,11 @@ export interface SnapshotModel {
 quality?: number;
 fullPage?: boolean;
 headers?: Record<string, string>;
+cookies?: CookieParam[];
+localStorage?: {
+key: string;
+value: string;
+}[];
 };
 }
 export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {