@arcblock/crawler 1.0.6 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/lib/cjs/config.d.ts +22 -0
- package/{dist → lib/cjs}/config.js +9 -3
- package/lib/cjs/crawler.d.ts +26 -0
- package/{dist → lib/cjs}/crawler.js +44 -112
- package/lib/cjs/cron.d.ts +1 -0
- package/lib/cjs/cron.js +49 -0
- package/lib/cjs/index.d.ts +9 -0
- package/lib/cjs/index.js +78 -0
- package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
- package/{dist → lib/cjs}/puppeteer.js +43 -54
- package/lib/cjs/services/snapshot.d.ts +12 -0
- package/lib/cjs/services/snapshot.js +84 -0
- package/lib/cjs/site.d.ts +2 -0
- package/lib/cjs/site.js +76 -0
- package/lib/cjs/store/index.d.ts +3 -0
- package/{dist/db → lib/cjs/store}/index.js +21 -5
- package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
- package/lib/cjs/store/job.js +110 -0
- package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
- package/lib/cjs/store/snapshot.js +68 -0
- package/lib/cjs/utils.d.ts +32 -0
- package/{dist → lib/cjs}/utils.js +67 -78
- package/lib/esm/config.d.ts +22 -0
- package/{esm → lib/esm}/config.js +9 -3
- package/lib/esm/crawler.d.ts +26 -0
- package/{esm → lib/esm}/crawler.js +35 -100
- package/lib/esm/cron.d.ts +1 -0
- package/lib/esm/cron.js +43 -0
- package/lib/esm/index.d.ts +9 -0
- package/{esm → lib/esm}/index.js +19 -10
- package/{dist → lib/esm}/puppeteer.d.ts +2 -2
- package/{esm → lib/esm}/puppeteer.js +21 -32
- package/lib/esm/services/snapshot.d.ts +12 -0
- package/lib/esm/services/snapshot.js +75 -0
- package/lib/esm/site.d.ts +2 -0
- package/lib/esm/site.js +69 -0
- package/lib/esm/store/index.d.ts +3 -0
- package/{esm/db → lib/esm/store}/index.js +22 -6
- package/{esm/db → lib/esm/store}/job.d.ts +4 -3
- package/lib/esm/store/job.js +73 -0
- package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
- package/lib/esm/store/snapshot.js +64 -0
- package/lib/esm/utils.d.ts +32 -0
- package/{esm → lib/esm}/utils.js +64 -71
- package/package.json +20 -32
- package/third.d.ts +0 -0
- package/dist/blocklet.d.ts +0 -6
- package/dist/blocklet.js +0 -199
- package/dist/cache.d.ts +0 -10
- package/dist/cache.js +0 -119
- package/dist/config.d.ts +0 -10
- package/dist/crawler.d.ts +0 -28
- package/dist/db/index.d.ts +0 -1
- package/dist/db/job.js +0 -54
- package/dist/db/snapshot.js +0 -52
- package/dist/index.d.ts +0 -6
- package/dist/index.js +0 -45
- package/dist/middleware.d.ts +0 -4
- package/dist/middleware.js +0 -44
- package/dist/utils.d.ts +0 -17
- package/esm/blocklet.d.ts +0 -6
- package/esm/blocklet.js +0 -190
- package/esm/cache.d.ts +0 -10
- package/esm/cache.js +0 -114
- package/esm/config.d.ts +0 -10
- package/esm/crawler.d.ts +0 -28
- package/esm/db/index.d.ts +0 -1
- package/esm/db/job.js +0 -50
- package/esm/db/snapshot.js +0 -48
- package/esm/index.d.ts +0 -6
- package/esm/middleware.d.ts +0 -4
- package/esm/middleware.js +0 -41
- package/esm/utils.d.ts +0 -17
package/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# @arcblock/crawler
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
export type Site = {
|
|
2
|
+
url: string;
|
|
3
|
+
pathname: string;
|
|
4
|
+
/** Minimum crawl interval to avoid frequent crawling by scheduled tasks, in milliseconds */
|
|
5
|
+
interval?: number;
|
|
6
|
+
};
|
|
7
|
+
export type Config = {
|
|
8
|
+
isProd: boolean;
|
|
9
|
+
dataDir: string;
|
|
10
|
+
appDir: string;
|
|
11
|
+
appUrl: string;
|
|
12
|
+
cacheDir: string;
|
|
13
|
+
puppeteerPath?: string;
|
|
14
|
+
siteCron: {
|
|
15
|
+
sites: Site[];
|
|
16
|
+
time: string;
|
|
17
|
+
runOnInit: boolean;
|
|
18
|
+
concurrency: number;
|
|
19
|
+
};
|
|
20
|
+
};
|
|
21
|
+
export declare const logger: any;
|
|
22
|
+
export declare const config: Config;
|
|
@@ -5,13 +5,19 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
5
5
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
6
|
exports.config = exports.logger = void 0;
|
|
7
7
|
const logger_1 = __importDefault(require("@blocklet/logger"));
|
|
8
|
-
exports.logger = (0, logger_1.default)('crawler', { level: process.env.LOG_LEVEL || 'info' });
|
|
8
|
+
exports.logger = (0, logger_1.default)('@arcblock/crawler', { level: process.env.LOG_LEVEL || 'info' });
|
|
9
9
|
exports.config = {
|
|
10
|
-
|
|
10
|
+
isProd: process.env.NODE_ENV === 'production',
|
|
11
11
|
dataDir: process.env.BLOCKLET_DATA_DIR,
|
|
12
12
|
appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
|
|
13
13
|
appUrl: process.env.BLOCKLET_APP_URL,
|
|
14
14
|
puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
|
|
15
15
|
cacheDir: process.env.BLOCKLET_CACHE_DIR,
|
|
16
|
-
|
|
16
|
+
// cron
|
|
17
|
+
siteCron: {
|
|
18
|
+
sites: [],
|
|
19
|
+
time: '0 0 */12 * * *',
|
|
20
|
+
runOnInit: false,
|
|
21
|
+
concurrency: 5,
|
|
22
|
+
},
|
|
17
23
|
};
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { JobState } from './store/job';
|
|
2
|
+
import { SnapshotModel } from './store/snapshot';
|
|
3
|
+
export declare function createCrawlQueue(): void;
|
|
4
|
+
export declare function getDataDir(): Promise<{
|
|
5
|
+
htmlDir: string;
|
|
6
|
+
screenshotDir: string;
|
|
7
|
+
}>;
|
|
8
|
+
export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
|
|
9
|
+
url: string;
|
|
10
|
+
includeScreenshot?: boolean;
|
|
11
|
+
includeHtml?: boolean;
|
|
12
|
+
width?: number;
|
|
13
|
+
height?: number;
|
|
14
|
+
quality?: number;
|
|
15
|
+
timeout?: number;
|
|
16
|
+
fullPage?: boolean;
|
|
17
|
+
}) => Promise<{
|
|
18
|
+
html: string;
|
|
19
|
+
screenshot: Uint8Array<ArrayBufferLike> | null;
|
|
20
|
+
}>;
|
|
21
|
+
/**
|
|
22
|
+
* crawl url and return job id
|
|
23
|
+
* @param params
|
|
24
|
+
* @param callback callback when job finished
|
|
25
|
+
*/
|
|
26
|
+
export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
|
|
@@ -15,22 +15,17 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
15
15
|
exports.getPageContent = void 0;
|
|
16
16
|
exports.createCrawlQueue = createCrawlQueue;
|
|
17
17
|
exports.getDataDir = getDataDir;
|
|
18
|
-
exports.
|
|
19
|
-
exports.getJob = getJob;
|
|
20
|
-
exports.formatSnapshot = formatSnapshot;
|
|
21
|
-
exports.getSnapshot = getSnapshot;
|
|
18
|
+
exports.crawlUrl = crawlUrl;
|
|
22
19
|
const queue_1 = __importDefault(require("@abtnode/queue"));
|
|
23
20
|
const sequelize_1 = __importDefault(require("@abtnode/queue/lib/store/sequelize"));
|
|
24
|
-
const core_1 = __importDefault(require("@sequelize/core"));
|
|
25
21
|
const crypto_1 = require("crypto");
|
|
26
22
|
const fs_extra_1 = __importDefault(require("fs-extra"));
|
|
27
|
-
const pick_1 = __importDefault(require("lodash/pick"));
|
|
28
23
|
const path_1 = __importDefault(require("path"));
|
|
29
|
-
const ufo_1 = require("ufo");
|
|
30
24
|
const config_1 = require("./config");
|
|
31
|
-
const job_1 = require("./db/job");
|
|
32
|
-
const snapshot_1 = require("./db/snapshot");
|
|
33
25
|
const puppeteer_1 = require("./puppeteer");
|
|
26
|
+
const snapshot_1 = require("./services/snapshot");
|
|
27
|
+
const job_1 = require("./store/job");
|
|
28
|
+
const snapshot_2 = require("./store/snapshot");
|
|
34
29
|
const utils_1 = require("./utils");
|
|
35
30
|
const { BaseState } = require('@abtnode/models');
|
|
36
31
|
let crawlQueue;
|
|
@@ -44,14 +39,14 @@ function createCrawlQueue() {
|
|
|
44
39
|
const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
|
|
45
40
|
if (!canCrawl) {
|
|
46
41
|
config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
|
|
47
|
-
const snapshot = convertJobToSnapshot({
|
|
42
|
+
const snapshot = (0, snapshot_1.convertJobToSnapshot)({
|
|
48
43
|
job,
|
|
49
44
|
snapshot: {
|
|
50
45
|
status: 'failed',
|
|
51
46
|
error: 'Denied by robots.txt',
|
|
52
47
|
},
|
|
53
48
|
});
|
|
54
|
-
yield
|
|
49
|
+
yield snapshot_2.Snapshot.upsert(snapshot);
|
|
55
50
|
return snapshot;
|
|
56
51
|
}
|
|
57
52
|
// if index reach autoCloseBrowserCount, close browser
|
|
@@ -67,14 +62,14 @@ function createCrawlQueue() {
|
|
|
67
62
|
const result = yield (0, exports.getPageContent)(job);
|
|
68
63
|
if (!result || (!result.html && !result.screenshot)) {
|
|
69
64
|
config_1.logger.error(`failed to crawl ${job.url}, empty content`, job);
|
|
70
|
-
const snapshot = convertJobToSnapshot({
|
|
65
|
+
const snapshot = (0, snapshot_1.convertJobToSnapshot)({
|
|
71
66
|
job,
|
|
72
67
|
snapshot: {
|
|
73
68
|
status: 'failed',
|
|
74
69
|
error: 'Failed to crawl content',
|
|
75
70
|
},
|
|
76
71
|
});
|
|
77
|
-
yield
|
|
72
|
+
yield snapshot_2.Snapshot.upsert(snapshot);
|
|
78
73
|
return snapshot;
|
|
79
74
|
}
|
|
80
75
|
// save html and screenshot to data dir
|
|
@@ -83,7 +78,7 @@ function createCrawlQueue() {
|
|
|
83
78
|
html: result.html,
|
|
84
79
|
});
|
|
85
80
|
// const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
|
|
86
|
-
const snapshot = convertJobToSnapshot({
|
|
81
|
+
const snapshot = (0, snapshot_1.convertJobToSnapshot)({
|
|
87
82
|
job,
|
|
88
83
|
snapshot: {
|
|
89
84
|
status: 'success',
|
|
@@ -91,29 +86,19 @@ function createCrawlQueue() {
|
|
|
91
86
|
html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config_1.config.dataDir, ''),
|
|
92
87
|
},
|
|
93
88
|
});
|
|
94
|
-
yield
|
|
89
|
+
yield snapshot_2.Snapshot.upsert(snapshot);
|
|
95
90
|
return snapshot;
|
|
96
|
-
// save to redis
|
|
97
|
-
// if (saveToRedis) {
|
|
98
|
-
// useCache.set(url, {
|
|
99
|
-
// html: result.html || '',
|
|
100
|
-
// lastModified,
|
|
101
|
-
// });
|
|
102
|
-
// logger.info(`success to crawl ${url}`, job);
|
|
103
|
-
// return result;
|
|
104
|
-
// }
|
|
105
91
|
}
|
|
106
92
|
catch (error) {
|
|
107
93
|
config_1.logger.error(`Failed to crawl ${job.url}`, { error, job });
|
|
108
|
-
|
|
109
|
-
const snapshot = convertJobToSnapshot({
|
|
94
|
+
const snapshot = (0, snapshot_1.convertJobToSnapshot)({
|
|
110
95
|
job,
|
|
111
96
|
snapshot: {
|
|
112
97
|
status: 'failed',
|
|
113
98
|
error: 'Internal error',
|
|
114
99
|
},
|
|
115
100
|
});
|
|
116
|
-
yield
|
|
101
|
+
yield snapshot_2.Snapshot.upsert(snapshot);
|
|
117
102
|
return snapshot;
|
|
118
103
|
}
|
|
119
104
|
}),
|
|
@@ -157,7 +142,7 @@ function formatHtml(htmlString) {
|
|
|
157
142
|
}
|
|
158
143
|
return htmlString;
|
|
159
144
|
}
|
|
160
|
-
const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
|
|
145
|
+
const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
|
|
161
146
|
config_1.logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
|
|
162
147
|
const page = yield (0, puppeteer_1.initPage)();
|
|
163
148
|
if (width && height) {
|
|
@@ -199,15 +184,24 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
|
|
|
199
184
|
}
|
|
200
185
|
catch (err) {
|
|
201
186
|
config_1.logger.error('Failed to get screenshot:', err);
|
|
187
|
+
throw err;
|
|
202
188
|
}
|
|
203
189
|
}
|
|
204
190
|
// get html
|
|
205
191
|
if (includeHtml) {
|
|
206
|
-
|
|
207
|
-
html = yield
|
|
192
|
+
try {
|
|
193
|
+
html = yield page.evaluate(() => {
|
|
194
|
+
// add meta tag to record crawler
|
|
195
|
+
const meta = document.createElement('meta');
|
|
196
|
+
meta.name = 'arcblock-crawler';
|
|
197
|
+
meta.content = 'true';
|
|
198
|
+
document.head.appendChild(meta);
|
|
199
|
+
return document.documentElement.outerHTML;
|
|
200
|
+
});
|
|
208
201
|
}
|
|
209
|
-
|
|
210
|
-
|
|
202
|
+
catch (err) {
|
|
203
|
+
config_1.logger.error('Failed to get html:', err);
|
|
204
|
+
throw err;
|
|
211
205
|
}
|
|
212
206
|
}
|
|
213
207
|
}
|
|
@@ -225,11 +219,16 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
|
|
|
225
219
|
};
|
|
226
220
|
});
|
|
227
221
|
exports.getPageContent = getPageContent;
|
|
228
|
-
|
|
222
|
+
/**
|
|
223
|
+
* crawl url and return job id
|
|
224
|
+
* @param params
|
|
225
|
+
* @param callback callback when job finished
|
|
226
|
+
*/
|
|
227
|
+
function crawlUrl(params, callback) {
|
|
229
228
|
return __awaiter(this, void 0, void 0, function* () {
|
|
230
|
-
params = Object.assign(Object.assign({}, params), {
|
|
229
|
+
params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
|
|
231
230
|
// skip duplicate job
|
|
232
|
-
const
|
|
231
|
+
const { job: duplicateJob } = (yield job_1.Job.findJob({
|
|
233
232
|
url: params.url,
|
|
234
233
|
includeScreenshot: params.includeScreenshot,
|
|
235
234
|
includeHtml: params.includeHtml,
|
|
@@ -237,89 +236,22 @@ function createCrawlJob(params, callback) {
|
|
|
237
236
|
width: params.width,
|
|
238
237
|
height: params.height,
|
|
239
238
|
fullPage: params.fullPage,
|
|
240
|
-
});
|
|
241
|
-
if (
|
|
239
|
+
})) || {};
|
|
240
|
+
if (duplicateJob) {
|
|
242
241
|
config_1.logger.warn(`Crawl job already exists for ${params.url}, skip`);
|
|
243
|
-
return
|
|
242
|
+
return duplicateJob.id;
|
|
244
243
|
}
|
|
245
244
|
config_1.logger.info('create crawl job', params);
|
|
246
|
-
const
|
|
247
|
-
job.
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
245
|
+
const jobId = (0, crypto_1.randomUUID)();
|
|
246
|
+
const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
|
|
247
|
+
job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
|
|
248
|
+
config_1.logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
|
|
249
|
+
callback === null || callback === void 0 ? void 0 : callback(result ? yield (0, snapshot_1.formatSnapshot)(result) : null);
|
|
250
|
+
}));
|
|
251
251
|
job.on('failed', ({ error }) => {
|
|
252
252
|
config_1.logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
|
|
253
253
|
callback === null || callback === void 0 ? void 0 : callback(null);
|
|
254
254
|
});
|
|
255
|
-
return
|
|
256
|
-
});
|
|
257
|
-
}
|
|
258
|
-
// @ts-ignore
|
|
259
|
-
function getJob(condition) {
|
|
260
|
-
return __awaiter(this, void 0, void 0, function* () {
|
|
261
|
-
const where = Object.keys(condition)
|
|
262
|
-
.filter((key) => condition[key] !== undefined)
|
|
263
|
-
.map((key) => {
|
|
264
|
-
return core_1.default.where(core_1.default.fn('json_extract', core_1.default.col('job'), `$.${key}`), condition[key]);
|
|
265
|
-
});
|
|
266
|
-
const job = yield crawlQueue.store.db.findOne({
|
|
267
|
-
where: {
|
|
268
|
-
[core_1.default.Op.and]: where,
|
|
269
|
-
},
|
|
270
|
-
});
|
|
271
|
-
if (job) {
|
|
272
|
-
return job.job;
|
|
273
|
-
}
|
|
274
|
-
return null;
|
|
275
|
-
});
|
|
276
|
-
}
|
|
277
|
-
function convertJobToSnapshot({ job, snapshot }) {
|
|
278
|
-
return Object.assign({
|
|
279
|
-
// @ts-ignore
|
|
280
|
-
jobId: job.jobId || job.id, url: job.url, options: {
|
|
281
|
-
width: job.width,
|
|
282
|
-
height: job.height,
|
|
283
|
-
includeScreenshot: job.includeScreenshot,
|
|
284
|
-
includeHtml: job.includeHtml,
|
|
285
|
-
quality: job.quality,
|
|
286
|
-
fullPage: job.fullPage,
|
|
287
|
-
} }, snapshot);
|
|
288
|
-
}
|
|
289
|
-
function formatSnapshot(snapshot, columns) {
|
|
290
|
-
return __awaiter(this, void 0, void 0, function* () {
|
|
291
|
-
let data = Object.assign({}, snapshot);
|
|
292
|
-
// format screenshot path to full url
|
|
293
|
-
if (data.screenshot) {
|
|
294
|
-
data.screenshot = (0, ufo_1.joinURL)(config_1.config.appUrl, data.screenshot);
|
|
295
|
-
}
|
|
296
|
-
// format html path to string
|
|
297
|
-
if (data.html) {
|
|
298
|
-
const html = yield fs_extra_1.default.readFile(path_1.default.join(config_1.config.dataDir, data.html));
|
|
299
|
-
data.html = html.toString();
|
|
300
|
-
}
|
|
301
|
-
if (columns === null || columns === void 0 ? void 0 : columns.length) {
|
|
302
|
-
data = (0, pick_1.default)(data, columns);
|
|
303
|
-
}
|
|
304
|
-
return data;
|
|
305
|
-
});
|
|
306
|
-
}
|
|
307
|
-
/**
|
|
308
|
-
* get snapshot from db or crawl queue
|
|
309
|
-
*/
|
|
310
|
-
function getSnapshot(jobId) {
|
|
311
|
-
return __awaiter(this, void 0, void 0, function* () {
|
|
312
|
-
const snapshotModel = yield snapshot_1.Snapshot.findByPk(jobId);
|
|
313
|
-
if (snapshotModel) {
|
|
314
|
-
return snapshotModel.toJSON();
|
|
315
|
-
}
|
|
316
|
-
const job = yield getJob({ id: jobId });
|
|
317
|
-
if (job) {
|
|
318
|
-
return {
|
|
319
|
-
jobId,
|
|
320
|
-
status: 'pending',
|
|
321
|
-
};
|
|
322
|
-
}
|
|
323
|
-
return null;
|
|
255
|
+
return jobId;
|
|
324
256
|
});
|
|
325
257
|
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare function initCron(): any;
|
package/lib/cjs/cron.js
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
|
+
};
|
|
14
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
+
exports.initCron = initCron;
|
|
16
|
+
const cron_1 = __importDefault(require("@abtnode/cron"));
|
|
17
|
+
const config_1 = require("./config");
|
|
18
|
+
const site_1 = require("./site");
|
|
19
|
+
let cron = null;
|
|
20
|
+
function initCron() {
|
|
21
|
+
if (cron)
|
|
22
|
+
return;
|
|
23
|
+
config_1.logger.info('Init cron', { config: config_1.config.siteCron });
|
|
24
|
+
cron = cron_1.default.init({
|
|
25
|
+
context: {},
|
|
26
|
+
jobs: [
|
|
27
|
+
{
|
|
28
|
+
name: 'crawl-site',
|
|
29
|
+
time: config_1.config.siteCron.time,
|
|
30
|
+
options: { runOnInit: config_1.config.siteCron.runOnInit },
|
|
31
|
+
fn: () => __awaiter(this, void 0, void 0, function* () {
|
|
32
|
+
config_1.logger.info('Start cron to crawl site', { sites: config_1.config.siteCron.sites });
|
|
33
|
+
for (const site of config_1.config.siteCron.sites) {
|
|
34
|
+
try {
|
|
35
|
+
yield (0, site_1.crawlSite)(site);
|
|
36
|
+
}
|
|
37
|
+
catch (err) {
|
|
38
|
+
config_1.logger.error('Cron task error', { err, site });
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
}),
|
|
42
|
+
},
|
|
43
|
+
],
|
|
44
|
+
onError: (err) => {
|
|
45
|
+
config_1.logger.error('Cron error', err);
|
|
46
|
+
},
|
|
47
|
+
});
|
|
48
|
+
return cron;
|
|
49
|
+
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { Config } from './config';
|
|
2
|
+
export * from './crawler';
|
|
3
|
+
export * from './site';
|
|
4
|
+
export * from './services/snapshot';
|
|
5
|
+
export * as utils from './utils';
|
|
6
|
+
type DeepPartial<T> = T extends object ? {
|
|
7
|
+
[P in keyof T]?: DeepPartial<T[P]>;
|
|
8
|
+
} : T;
|
|
9
|
+
export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
|
package/lib/cjs/index.js
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
19
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
20
|
+
};
|
|
21
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
22
|
+
var ownKeys = function(o) {
|
|
23
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
24
|
+
var ar = [];
|
|
25
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
26
|
+
return ar;
|
|
27
|
+
};
|
|
28
|
+
return ownKeys(o);
|
|
29
|
+
};
|
|
30
|
+
return function (mod) {
|
|
31
|
+
if (mod && mod.__esModule) return mod;
|
|
32
|
+
var result = {};
|
|
33
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
34
|
+
__setModuleDefault(result, mod);
|
|
35
|
+
return result;
|
|
36
|
+
};
|
|
37
|
+
})();
|
|
38
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
39
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
40
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
41
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
42
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
43
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
44
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
45
|
+
});
|
|
46
|
+
};
|
|
47
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
48
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
49
|
+
};
|
|
50
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
51
|
+
exports.utils = void 0;
|
|
52
|
+
exports.initCrawler = initCrawler;
|
|
53
|
+
const merge_1 = __importDefault(require("lodash/merge"));
|
|
54
|
+
const config_1 = require("./config");
|
|
55
|
+
const crawler_1 = require("./crawler");
|
|
56
|
+
const cron_1 = require("./cron");
|
|
57
|
+
const puppeteer_1 = require("./puppeteer");
|
|
58
|
+
const store_1 = require("./store");
|
|
59
|
+
__exportStar(require("./crawler"), exports);
|
|
60
|
+
__exportStar(require("./site"), exports);
|
|
61
|
+
__exportStar(require("./services/snapshot"), exports);
|
|
62
|
+
exports.utils = __importStar(require("./utils"));
|
|
63
|
+
function initCrawler(params) {
|
|
64
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
65
|
+
config_1.logger.info('Init crawler', { params });
|
|
66
|
+
(0, merge_1.default)(config_1.config, params);
|
|
67
|
+
try {
|
|
68
|
+
yield (0, store_1.initDatabase)();
|
|
69
|
+
yield (0, puppeteer_1.ensureBrowser)();
|
|
70
|
+
yield (0, crawler_1.createCrawlQueue)();
|
|
71
|
+
yield (0, cron_1.initCron)();
|
|
72
|
+
}
|
|
73
|
+
catch (err) {
|
|
74
|
+
config_1.logger.error('Init crawler error', { err });
|
|
75
|
+
throw err;
|
|
76
|
+
}
|
|
77
|
+
});
|
|
78
|
+
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import puppeteer, { Browser, Page } from '@blocklet/puppeteer';
|
|
1
|
+
import puppeteer, { Browser, Page, ResourceType } from '@blocklet/puppeteer';
|
|
2
2
|
export { puppeteer };
|
|
3
3
|
export declare function ensurePuppeteerrc(): Promise<{
|
|
4
4
|
cacheDirectory: string;
|
|
@@ -12,5 +12,5 @@ export declare const closeBrowser: ({ trimCache }?: {
|
|
|
12
12
|
trimCache?: boolean;
|
|
13
13
|
}) => Promise<void>;
|
|
14
14
|
export declare function initPage({ abortResourceTypes }?: {
|
|
15
|
-
abortResourceTypes?:
|
|
15
|
+
abortResourceTypes?: ResourceType[];
|
|
16
16
|
}): Promise<Page>;
|