@arcblock/crawler 1.0.5 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/README.md +1 -0
  2. package/lib/cjs/config.d.ts +22 -0
  3. package/{dist → lib/cjs}/config.js +9 -3
  4. package/lib/cjs/crawler.d.ts +26 -0
  5. package/{dist → lib/cjs}/crawler.js +56 -113
  6. package/lib/cjs/cron.d.ts +1 -0
  7. package/lib/cjs/cron.js +49 -0
  8. package/lib/cjs/index.d.ts +9 -0
  9. package/lib/cjs/index.js +78 -0
  10. package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
  11. package/{dist → lib/cjs}/puppeteer.js +43 -54
  12. package/lib/cjs/services/snapshot.d.ts +12 -0
  13. package/lib/cjs/services/snapshot.js +84 -0
  14. package/lib/cjs/site.d.ts +2 -0
  15. package/lib/cjs/site.js +76 -0
  16. package/lib/cjs/store/index.d.ts +3 -0
  17. package/{dist/db → lib/cjs/store}/index.js +21 -5
  18. package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
  19. package/lib/cjs/store/job.js +110 -0
  20. package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
  21. package/lib/cjs/store/snapshot.js +68 -0
  22. package/lib/cjs/utils.d.ts +32 -0
  23. package/{dist → lib/cjs}/utils.js +88 -78
  24. package/lib/esm/config.d.ts +22 -0
  25. package/{esm → lib/esm}/config.js +9 -3
  26. package/lib/esm/crawler.d.ts +26 -0
  27. package/{esm → lib/esm}/crawler.js +48 -102
  28. package/lib/esm/cron.d.ts +1 -0
  29. package/lib/esm/cron.js +43 -0
  30. package/lib/esm/index.d.ts +9 -0
  31. package/{esm → lib/esm}/index.js +19 -10
  32. package/{dist → lib/esm}/puppeteer.d.ts +2 -2
  33. package/{esm → lib/esm}/puppeteer.js +26 -37
  34. package/lib/esm/services/snapshot.d.ts +12 -0
  35. package/lib/esm/services/snapshot.js +75 -0
  36. package/lib/esm/site.d.ts +2 -0
  37. package/lib/esm/site.js +69 -0
  38. package/lib/esm/store/index.d.ts +3 -0
  39. package/{esm/db → lib/esm/store}/index.js +22 -6
  40. package/{esm/db → lib/esm/store}/job.d.ts +4 -3
  41. package/lib/esm/store/job.js +73 -0
  42. package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
  43. package/lib/esm/store/snapshot.js +64 -0
  44. package/lib/esm/utils.d.ts +32 -0
  45. package/{esm → lib/esm}/utils.js +84 -71
  46. package/package.json +22 -33
  47. package/third.d.ts +0 -0
  48. package/dist/blocklet.d.ts +0 -6
  49. package/dist/blocklet.js +0 -199
  50. package/dist/cache.d.ts +0 -10
  51. package/dist/cache.js +0 -119
  52. package/dist/config.d.ts +0 -10
  53. package/dist/crawler.d.ts +0 -28
  54. package/dist/db/index.d.ts +0 -1
  55. package/dist/db/job.js +0 -54
  56. package/dist/db/snapshot.js +0 -52
  57. package/dist/index.d.ts +0 -6
  58. package/dist/index.js +0 -45
  59. package/dist/middleware.d.ts +0 -4
  60. package/dist/middleware.js +0 -44
  61. package/dist/utils.d.ts +0 -15
  62. package/esm/blocklet.d.ts +0 -6
  63. package/esm/blocklet.js +0 -190
  64. package/esm/cache.d.ts +0 -10
  65. package/esm/cache.js +0 -114
  66. package/esm/config.d.ts +0 -10
  67. package/esm/crawler.d.ts +0 -28
  68. package/esm/db/index.d.ts +0 -1
  69. package/esm/db/job.js +0 -50
  70. package/esm/db/snapshot.js +0 -48
  71. package/esm/index.d.ts +0 -6
  72. package/esm/middleware.d.ts +0 -4
  73. package/esm/middleware.js +0 -41
  74. package/esm/utils.d.ts +0 -15
package/README.md ADDED
@@ -0,0 +1 @@
1
+ # @arcblock/crawler
@@ -0,0 +1,22 @@
1
+ export type Site = {
2
+ url: string;
3
+ pathname: string;
4
+ /** Minimum crawl interval to avoid frequent crawling by scheduled tasks, in milliseconds */
5
+ interval?: number;
6
+ };
7
+ export type Config = {
8
+ isProd: boolean;
9
+ dataDir: string;
10
+ appDir: string;
11
+ appUrl: string;
12
+ cacheDir: string;
13
+ puppeteerPath?: string;
14
+ siteCron: {
15
+ sites: Site[];
16
+ time: string;
17
+ runOnInit: boolean;
18
+ concurrency: number;
19
+ };
20
+ };
21
+ export declare const logger: any;
22
+ export declare const config: Config;
@@ -5,13 +5,19 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
5
5
  Object.defineProperty(exports, "__esModule", { value: true });
6
6
  exports.config = exports.logger = void 0;
7
7
  const logger_1 = __importDefault(require("@blocklet/logger"));
8
- exports.logger = (0, logger_1.default)('crawler', { level: process.env.LOG_LEVEL || 'info' });
8
+ exports.logger = (0, logger_1.default)('@arcblock/crawler', { level: process.env.LOG_LEVEL || 'info' });
9
9
  exports.config = {
10
- redisUrl: process.env.REDIS_URL,
10
+ isProd: process.env.NODE_ENV === 'production',
11
11
  dataDir: process.env.BLOCKLET_DATA_DIR,
12
12
  appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
13
13
  appUrl: process.env.BLOCKLET_APP_URL,
14
14
  puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
15
15
  cacheDir: process.env.BLOCKLET_CACHE_DIR,
16
- testOnInitialize: process.env.NODE_ENV === 'production',
16
+ // cron
17
+ siteCron: {
18
+ sites: [],
19
+ time: '0 0 */12 * * *',
20
+ runOnInit: false,
21
+ concurrency: 5,
22
+ },
17
23
  };
@@ -0,0 +1,26 @@
1
+ import { JobState } from './store/job';
2
+ import { SnapshotModel } from './store/snapshot';
3
+ export declare function createCrawlQueue(): void;
4
+ export declare function getDataDir(): Promise<{
5
+ htmlDir: string;
6
+ screenshotDir: string;
7
+ }>;
8
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
9
+ url: string;
10
+ includeScreenshot?: boolean;
11
+ includeHtml?: boolean;
12
+ width?: number;
13
+ height?: number;
14
+ quality?: number;
15
+ timeout?: number;
16
+ fullPage?: boolean;
17
+ }) => Promise<{
18
+ html: string;
19
+ screenshot: Uint8Array<ArrayBufferLike> | null;
20
+ }>;
21
+ /**
22
+ * crawl url and return job id
23
+ * @param params
24
+ * @param callback callback when job finished
25
+ */
26
+ export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
@@ -15,22 +15,17 @@ Object.defineProperty(exports, "__esModule", { value: true });
15
15
  exports.getPageContent = void 0;
16
16
  exports.createCrawlQueue = createCrawlQueue;
17
17
  exports.getDataDir = getDataDir;
18
- exports.createCrawlJob = createCrawlJob;
19
- exports.getJob = getJob;
20
- exports.formatSnapshot = formatSnapshot;
21
- exports.getSnapshot = getSnapshot;
18
+ exports.crawlUrl = crawlUrl;
22
19
  const queue_1 = __importDefault(require("@abtnode/queue"));
23
20
  const sequelize_1 = __importDefault(require("@abtnode/queue/lib/store/sequelize"));
24
- const core_1 = __importDefault(require("@sequelize/core"));
25
21
  const crypto_1 = require("crypto");
26
22
  const fs_extra_1 = __importDefault(require("fs-extra"));
27
- const pick_1 = __importDefault(require("lodash/pick"));
28
23
  const path_1 = __importDefault(require("path"));
29
- const ufo_1 = require("ufo");
30
24
  const config_1 = require("./config");
31
- const job_1 = require("./db/job");
32
- const snapshot_1 = require("./db/snapshot");
33
25
  const puppeteer_1 = require("./puppeteer");
26
+ const snapshot_1 = require("./services/snapshot");
27
+ const job_1 = require("./store/job");
28
+ const snapshot_2 = require("./store/snapshot");
34
29
  const utils_1 = require("./utils");
35
30
  const { BaseState } = require('@abtnode/models');
36
31
  let crawlQueue;
@@ -40,18 +35,18 @@ function createCrawlQueue() {
40
35
  store: new sequelize_1.default(db, 'crawler'),
41
36
  concurrency: 1,
42
37
  onJob: (job) => __awaiter(this, void 0, void 0, function* () {
43
- config_1.logger.debug('job start:', job);
38
+ config_1.logger.info('Starting to execute crawl job', job);
44
39
  const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
45
40
  if (!canCrawl) {
46
41
  config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
47
- const snapshot = convertJobToSnapshot({
42
+ const snapshot = (0, snapshot_1.convertJobToSnapshot)({
48
43
  job,
49
44
  snapshot: {
50
45
  status: 'failed',
51
46
  error: 'Denied by robots.txt',
52
47
  },
53
48
  });
54
- yield snapshot_1.Snapshot.upsert(snapshot);
49
+ yield snapshot_2.Snapshot.upsert(snapshot);
55
50
  return snapshot;
56
51
  }
57
52
  // if index reach autoCloseBrowserCount, close browser
@@ -67,14 +62,14 @@ function createCrawlQueue() {
67
62
  const result = yield (0, exports.getPageContent)(job);
68
63
  if (!result || (!result.html && !result.screenshot)) {
69
64
  config_1.logger.error(`failed to crawl ${job.url}, empty content`, job);
70
- const snapshot = convertJobToSnapshot({
65
+ const snapshot = (0, snapshot_1.convertJobToSnapshot)({
71
66
  job,
72
67
  snapshot: {
73
68
  status: 'failed',
74
69
  error: 'Failed to crawl content',
75
70
  },
76
71
  });
77
- yield snapshot_1.Snapshot.upsert(snapshot);
72
+ yield snapshot_2.Snapshot.upsert(snapshot);
78
73
  return snapshot;
79
74
  }
80
75
  // save html and screenshot to data dir
@@ -83,7 +78,7 @@ function createCrawlQueue() {
83
78
  html: result.html,
84
79
  });
85
80
  // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
86
- const snapshot = convertJobToSnapshot({
81
+ const snapshot = (0, snapshot_1.convertJobToSnapshot)({
87
82
  job,
88
83
  snapshot: {
89
84
  status: 'success',
@@ -91,29 +86,19 @@ function createCrawlQueue() {
91
86
  html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config_1.config.dataDir, ''),
92
87
  },
93
88
  });
94
- yield snapshot_1.Snapshot.upsert(snapshot);
89
+ yield snapshot_2.Snapshot.upsert(snapshot);
95
90
  return snapshot;
96
- // save to redis
97
- // if (saveToRedis) {
98
- // useCache.set(url, {
99
- // html: result.html || '',
100
- // lastModified,
101
- // });
102
- // logger.info(`success to crawl ${url}`, job);
103
- // return result;
104
- // }
105
91
  }
106
92
  catch (error) {
107
93
  config_1.logger.error(`Failed to crawl ${job.url}`, { error, job });
108
- console.error(error.stack);
109
- const snapshot = convertJobToSnapshot({
94
+ const snapshot = (0, snapshot_1.convertJobToSnapshot)({
110
95
  job,
111
96
  snapshot: {
112
97
  status: 'failed',
113
98
  error: 'Internal error',
114
99
  },
115
100
  });
116
- yield snapshot_1.Snapshot.upsert(snapshot);
101
+ yield snapshot_2.Snapshot.upsert(snapshot);
117
102
  return snapshot;
118
103
  }
119
104
  }),
@@ -157,11 +142,11 @@ function formatHtml(htmlString) {
157
142
  }
158
143
  return htmlString;
159
144
  }
160
- const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, formatPageContent, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 60 * 1000, fullPage = false, }) {
145
+ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
161
146
  config_1.logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
162
147
  const page = yield (0, puppeteer_1.initPage)();
163
148
  if (width && height) {
164
- yield page.setViewport({ width, height });
149
+ yield page.setViewport({ width, height, deviceScaleFactor: 2 });
165
150
  }
166
151
  let html = null;
167
152
  let screenshot = null;
@@ -176,26 +161,47 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
176
161
  throw new Error(`Request failed with status ${statusCode}, in ${url}`);
177
162
  }
178
163
  // await for networkidle0
179
- // https://pptr.dev/api/puppeteer.page.goforward/#remarks
164
+ // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
180
165
  yield page.waitForNetworkIdle({
181
- idleTime: 2 * 1000,
166
+ idleTime: 1.5 * 1000,
182
167
  });
183
168
  // get screenshot
184
169
  if (includeScreenshot) {
170
+ // Try to find the tallest element and set the browser to the same height
171
+ if (fullPage) {
172
+ const maxScrollHeight = yield (0, utils_1.findMaxScrollHeight)(page);
173
+ config_1.logger.info('findMaxScrollHeight', { maxScrollHeight });
174
+ if (maxScrollHeight) {
175
+ yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
176
+ yield page.evaluate((scrollHeight) => {
177
+ window.scrollTo(0, scrollHeight || 0);
178
+ document.documentElement.scrollTo(0, scrollHeight || 0);
179
+ }, maxScrollHeight);
180
+ }
181
+ }
185
182
  try {
186
183
  screenshot = yield page.screenshot({ fullPage, quality, type: 'webp' });
187
184
  }
188
185
  catch (err) {
189
186
  config_1.logger.error('Failed to get screenshot:', err);
187
+ throw err;
190
188
  }
191
189
  }
192
190
  // get html
193
191
  if (includeHtml) {
194
- if (formatPageContent) {
195
- html = yield formatPageContent({ page, url });
192
+ try {
193
+ html = yield page.evaluate(() => {
194
+ // add meta tag to record crawler
195
+ const meta = document.createElement('meta');
196
+ meta.name = 'arcblock-crawler';
197
+ meta.content = 'true';
198
+ document.head.appendChild(meta);
199
+ return document.documentElement.outerHTML;
200
+ });
196
201
  }
197
- else {
198
- html = yield page.content();
202
+ catch (err) {
203
+ config_1.logger.error('Failed to get html:', err);
204
+ throw err;
199
205
  }
200
206
  }
201
207
  }
@@ -213,11 +219,16 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
213
219
  };
214
220
  });
215
221
  exports.getPageContent = getPageContent;
216
- function createCrawlJob(params, callback) {
222
+ /**
223
+ * crawl url and return job id
224
+ * @param params
225
+ * @param callback callback when job finished
226
+ */
227
+ function crawlUrl(params, callback) {
217
228
  return __awaiter(this, void 0, void 0, function* () {
218
229
  params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
219
230
  // skip duplicate job
220
- const existsJob = yield getJob({
231
+ const { job: duplicateJob } = (yield job_1.Job.findJob({
221
232
  url: params.url,
222
233
  includeScreenshot: params.includeScreenshot,
223
234
  includeHtml: params.includeHtml,
@@ -225,18 +236,18 @@ function createCrawlJob(params, callback) {
225
236
  width: params.width,
226
237
  height: params.height,
227
238
  fullPage: params.fullPage,
228
- });
229
- config_1.logger.info('create crawl job', params);
230
- if (existsJob) {
239
+ })) || {};
240
+ if (duplicateJob) {
231
241
  config_1.logger.warn(`Crawl job already exists for ${params.url}, skip`);
232
- return existsJob.id;
242
+ return duplicateJob.id;
233
243
  }
244
+ config_1.logger.info('create crawl job', params);
234
245
  const jobId = (0, crypto_1.randomUUID)();
235
246
  const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
236
- job.on('finished', ({ result }) => {
247
+ job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
237
248
  config_1.logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
238
- callback === null || callback === void 0 ? void 0 : callback(result);
239
- });
249
+ callback === null || callback === void 0 ? void 0 : callback(result ? yield (0, snapshot_1.formatSnapshot)(result) : null);
250
+ }));
240
251
  job.on('failed', ({ error }) => {
241
252
  config_1.logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
242
253
  callback === null || callback === void 0 ? void 0 : callback(null);
@@ -244,71 +255,3 @@ function createCrawlJob(params, callback) {
244
255
  return jobId;
245
256
  });
246
257
  }
247
- // @ts-ignore
248
- function getJob(condition) {
249
- return __awaiter(this, void 0, void 0, function* () {
250
- const where = Object.keys(condition)
251
- .filter((key) => condition[key] !== undefined)
252
- .map((key) => {
253
- return core_1.default.where(core_1.default.fn('json_extract', core_1.default.col('job'), `$.${key}`), condition[key]);
254
- });
255
- const job = yield crawlQueue.store.db.findOne({
256
- where: {
257
- [core_1.default.Op.and]: where,
258
- },
259
- });
260
- if (job) {
261
- return job.job;
262
- }
263
- return null;
264
- });
265
- }
266
- function convertJobToSnapshot({ job, snapshot }) {
267
- return Object.assign({
268
- // @ts-ignore
269
- jobId: job.jobId || job.id, url: job.url, options: {
270
- width: job.width,
271
- height: job.height,
272
- includeScreenshot: job.includeScreenshot,
273
- includeHtml: job.includeHtml,
274
- quality: job.quality,
275
- fullPage: job.fullPage,
276
- } }, snapshot);
277
- }
278
- function formatSnapshot(snapshot, columns) {
279
- return __awaiter(this, void 0, void 0, function* () {
280
- let data = Object.assign({}, snapshot);
281
- // format screenshot path to full url
282
- if (data.screenshot) {
283
- data.screenshot = (0, ufo_1.joinURL)(config_1.config.appUrl, data.screenshot);
284
- }
285
- // format html path to string
286
- if (data.html) {
287
- const html = yield fs_extra_1.default.readFile(path_1.default.join(config_1.config.dataDir, data.html));
288
- data.html = html.toString();
289
- }
290
- if (columns === null || columns === void 0 ? void 0 : columns.length) {
291
- data = (0, pick_1.default)(data, columns);
292
- }
293
- return data;
294
- });
295
- }
296
- /**
297
- * get snapshot from db or crawl queue
298
- */
299
- function getSnapshot(jobId) {
300
- return __awaiter(this, void 0, void 0, function* () {
301
- const snapshotModel = yield snapshot_1.Snapshot.findByPk(jobId);
302
- if (snapshotModel) {
303
- return snapshotModel.toJSON();
304
- }
305
- const job = yield getJob({ id: jobId });
306
- if (job) {
307
- return {
308
- jobId,
309
- status: 'pending',
310
- };
311
- }
312
- return null;
313
- });
314
- }
@@ -0,0 +1 @@
1
+ export declare function initCron(): any;
@@ -0,0 +1,49 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ var __importDefault = (this && this.__importDefault) || function (mod) {
12
+ return (mod && mod.__esModule) ? mod : { "default": mod };
13
+ };
14
+ Object.defineProperty(exports, "__esModule", { value: true });
15
+ exports.initCron = initCron;
16
+ const cron_1 = __importDefault(require("@abtnode/cron"));
17
+ const config_1 = require("./config");
18
+ const site_1 = require("./site");
19
+ let cron = null;
20
+ function initCron() {
21
+ if (cron)
22
+ return;
23
+ config_1.logger.info('Init cron', { config: config_1.config.siteCron });
24
+ cron = cron_1.default.init({
25
+ context: {},
26
+ jobs: [
27
+ {
28
+ name: 'crawl-site',
29
+ time: config_1.config.siteCron.time,
30
+ options: { runOnInit: config_1.config.siteCron.runOnInit },
31
+ fn: () => __awaiter(this, void 0, void 0, function* () {
32
+ config_1.logger.info('Start cron to crawl site', { sites: config_1.config.siteCron.sites });
33
+ for (const site of config_1.config.siteCron.sites) {
34
+ try {
35
+ yield (0, site_1.crawlSite)(site);
36
+ }
37
+ catch (err) {
38
+ config_1.logger.error('Cron task error', { err, site });
39
+ }
40
+ }
41
+ }),
42
+ },
43
+ ],
44
+ onError: (err) => {
45
+ config_1.logger.error('Cron error', err);
46
+ },
47
+ });
48
+ return cron;
49
+ }
@@ -0,0 +1,9 @@
1
+ import { Config } from './config';
2
+ export * from './crawler';
3
+ export * from './site';
4
+ export * from './services/snapshot';
5
+ export * as utils from './utils';
6
+ type DeepPartial<T> = T extends object ? {
7
+ [P in keyof T]?: DeepPartial<T[P]>;
8
+ } : T;
9
+ export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
@@ -0,0 +1,78 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
19
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
20
+ };
21
+ var __importStar = (this && this.__importStar) || (function () {
22
+ var ownKeys = function(o) {
23
+ ownKeys = Object.getOwnPropertyNames || function (o) {
24
+ var ar = [];
25
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
26
+ return ar;
27
+ };
28
+ return ownKeys(o);
29
+ };
30
+ return function (mod) {
31
+ if (mod && mod.__esModule) return mod;
32
+ var result = {};
33
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
34
+ __setModuleDefault(result, mod);
35
+ return result;
36
+ };
37
+ })();
38
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
39
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
40
+ return new (P || (P = Promise))(function (resolve, reject) {
41
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
42
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
43
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
44
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
45
+ });
46
+ };
47
+ var __importDefault = (this && this.__importDefault) || function (mod) {
48
+ return (mod && mod.__esModule) ? mod : { "default": mod };
49
+ };
50
+ Object.defineProperty(exports, "__esModule", { value: true });
51
+ exports.utils = void 0;
52
+ exports.initCrawler = initCrawler;
53
+ const merge_1 = __importDefault(require("lodash/merge"));
54
+ const config_1 = require("./config");
55
+ const crawler_1 = require("./crawler");
56
+ const cron_1 = require("./cron");
57
+ const puppeteer_1 = require("./puppeteer");
58
+ const store_1 = require("./store");
59
+ __exportStar(require("./crawler"), exports);
60
+ __exportStar(require("./site"), exports);
61
+ __exportStar(require("./services/snapshot"), exports);
62
+ exports.utils = __importStar(require("./utils"));
63
+ function initCrawler(params) {
64
+ return __awaiter(this, void 0, void 0, function* () {
65
+ config_1.logger.info('Init crawler', { params });
66
+ (0, merge_1.default)(config_1.config, params);
67
+ try {
68
+ yield (0, store_1.initDatabase)();
69
+ yield (0, puppeteer_1.ensureBrowser)();
70
+ yield (0, crawler_1.createCrawlQueue)();
71
+ yield (0, cron_1.initCron)();
72
+ }
73
+ catch (err) {
74
+ config_1.logger.error('Init crawler error', { err });
75
+ throw err;
76
+ }
77
+ });
78
+ }
@@ -1,4 +1,4 @@
1
- import puppeteer, { Browser, Page } from '@blocklet/puppeteer';
1
+ import puppeteer, { Browser, Page, ResourceType } from '@blocklet/puppeteer';
2
2
  export { puppeteer };
3
3
  export declare function ensurePuppeteerrc(): Promise<{
4
4
  cacheDirectory: string;
@@ -12,5 +12,5 @@ export declare const closeBrowser: ({ trimCache }?: {
12
12
  trimCache?: boolean;
13
13
  }) => Promise<void>;
14
14
  export declare function initPage({ abortResourceTypes }?: {
15
- abortResourceTypes?: never[] | undefined;
15
+ abortResourceTypes?: ResourceType[];
16
16
  }): Promise<Page>;