@arcblock/crawler 1.0.6 → 1.1.2

Files changed (75)
  1. package/README.md +66 -0
  2. package/lib/cjs/config.d.ts +24 -0
  3. package/{dist → lib/cjs}/config.js +13 -5
  4. package/lib/cjs/crawler.d.ts +30 -0
  5. package/{dist → lib/cjs}/crawler.js +63 -117
  6. package/lib/cjs/cron.d.ts +1 -0
  7. package/lib/cjs/cron.js +49 -0
  8. package/lib/cjs/index.d.ts +9 -0
  9. package/lib/cjs/index.js +80 -0
  10. package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
  11. package/{dist → lib/cjs}/puppeteer.js +43 -54
  12. package/lib/cjs/services/snapshot.d.ts +12 -0
  13. package/lib/cjs/services/snapshot.js +84 -0
  14. package/lib/cjs/site.d.ts +2 -0
  15. package/lib/cjs/site.js +79 -0
  16. package/lib/cjs/store/index.d.ts +3 -0
  17. package/{dist/db → lib/cjs/store}/index.js +22 -6
  18. package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
  19. package/lib/cjs/store/job.js +110 -0
  20. package/{dist/db → lib/cjs/store}/snapshot.d.ts +10 -6
  21. package/lib/cjs/store/snapshot.js +72 -0
  22. package/lib/cjs/utils.d.ts +32 -0
  23. package/{dist → lib/cjs}/utils.js +67 -78
  24. package/lib/esm/config.d.ts +24 -0
  25. package/lib/esm/config.js +19 -0
  26. package/lib/esm/crawler.d.ts +30 -0
  27. package/{esm → lib/esm}/crawler.js +54 -105
  28. package/lib/esm/cron.d.ts +1 -0
  29. package/lib/esm/cron.js +43 -0
  30. package/lib/esm/index.d.ts +9 -0
  31. package/{esm → lib/esm}/index.js +21 -10
  32. package/{dist → lib/esm}/puppeteer.d.ts +2 -2
  33. package/{esm → lib/esm}/puppeteer.js +21 -32
  34. package/lib/esm/services/snapshot.d.ts +12 -0
  35. package/lib/esm/services/snapshot.js +75 -0
  36. package/lib/esm/site.d.ts +2 -0
  37. package/lib/esm/site.js +72 -0
  38. package/lib/esm/store/index.d.ts +3 -0
  39. package/{esm/db → lib/esm/store}/index.js +23 -7
  40. package/{esm/db → lib/esm/store}/job.d.ts +4 -3
  41. package/lib/esm/store/job.js +73 -0
  42. package/{esm/db → lib/esm/store}/snapshot.d.ts +10 -6
  43. package/lib/esm/store/snapshot.js +68 -0
  44. package/lib/esm/utils.d.ts +32 -0
  45. package/{esm → lib/esm}/utils.js +64 -71
  46. package/package.json +20 -32
  47. package/third.d.ts +0 -0
  48. package/dist/blocklet.d.ts +0 -6
  49. package/dist/blocklet.js +0 -199
  50. package/dist/cache.d.ts +0 -10
  51. package/dist/cache.js +0 -119
  52. package/dist/config.d.ts +0 -10
  53. package/dist/crawler.d.ts +0 -28
  54. package/dist/db/index.d.ts +0 -1
  55. package/dist/db/job.js +0 -54
  56. package/dist/db/snapshot.js +0 -52
  57. package/dist/index.d.ts +0 -6
  58. package/dist/index.js +0 -45
  59. package/dist/middleware.d.ts +0 -4
  60. package/dist/middleware.js +0 -44
  61. package/dist/utils.d.ts +0 -17
  62. package/esm/blocklet.d.ts +0 -6
  63. package/esm/blocklet.js +0 -190
  64. package/esm/cache.d.ts +0 -10
  65. package/esm/cache.js +0 -114
  66. package/esm/config.d.ts +0 -10
  67. package/esm/config.js +0 -11
  68. package/esm/crawler.d.ts +0 -28
  69. package/esm/db/index.d.ts +0 -1
  70. package/esm/db/job.js +0 -50
  71. package/esm/db/snapshot.js +0 -48
  72. package/esm/index.d.ts +0 -6
  73. package/esm/middleware.d.ts +0 -4
  74. package/esm/middleware.js +0 -41
  75. package/esm/utils.d.ts +0 -17
package/README.md ADDED
@@ -0,0 +1,66 @@
+ # @arcblock/crawler
+
+ A crawler module designed for Blocklets. It supports batch crawling of HTML, webpage screenshots, titles, descriptions, and more, based on a URL or sitemap.
+
+ ## Usage
+
+ ```typescript
+ import { crawlUrl, getSnapshot, initCrawler } from '@arcblock/crawler';
+
+ await initCrawler();
+
+ // Asynchronously crawl a page
+ const jobId = await crawlUrl({ url: 'https://www.arcblock.io', includeScreenshot: true, includeHtml: true });
+
+ // Get the crawl result (wait for the crawler to finish)
+ const snapshot = await getSnapshot(jobId);
+ ```
+
+ ### initCrawler
+
+ Initializes the crawler.
+
+ ### crawlUrl
+
+ Crawls the specified page.
+
+ ### getSnapshot
+
+ Gets the crawl result by jobId.
+
+ ### getLatestSnapshot
+
+ Gets the latest crawl result by URL.
+
+ ## Schedule
+
+ Passing siteCron to initCrawler enables a scheduled task that periodically crawls all pages of the specified websites based on their sitemaps.
+
+ ```typescript
+ await initCrawler({
+   siteCron: {
+     enabled: !!env.preferences.cronEnabled,
+     immediate: !!env.preferences.cronImmediate,
+     sites: env.preferences.cronSites,
+     time: env.preferences.cronTime,
+     crawlConcurrency: env.preferences.crawlConcurrency,
+     sitemapConcurrency: env.preferences.sitemapConcurrency,
+   },
+ });
+ ```
+
+ ## Environment Variables
+
+ - `PUPPETEER_EXECUTABLE_PATH`: The executable path for Puppeteer. This variable is not required when running inside the `arcblock/snap-kit` Docker image. For local development, you can set it to the Chrome path: `/Applications/Google Chrome.app/Contents/MacOS/Google Chrome`
+
+ If the package is not used inside a Blocklet, a few Blocklet environment variables must be provided manually:
+
+ - `BLOCKLET_CACHE_DIR`: (Optional) The directory used for the automatic Puppeteer installation when `PUPPETEER_EXECUTABLE_PATH` is not provided. Defaults to `process.cwd()`.
+
+ - `BLOCKLET_APP_URL`: (Optional) The domain prefix for screenshot URLs. Defaults to `/`.
+
+ - `BLOCKLET_DATA_DIR`: (Required) The directory where webpage screenshots and HTML source files produced by the crawler are saved.
+
+ ## SQLite
+
+ When `initCrawler` is called, it attempts to create an SQLite database in `BLOCKLET_DATA_DIR`. This database is used to cache HTML content and screenshots. Please ensure that the deployment environment supports SQLite.
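
The exports above support both a fire-and-forget callback style and polling by job id. A rough usage sketch follows (not part of the package README; the callback signature comes from crawler.d.ts further down in this diff, while the exact shapes returned by getSnapshot/getLatestSnapshot and the snapshot.meta field are assumptions):

```typescript
import { crawlUrl, getSnapshot, getLatestSnapshot, initCrawler } from '@arcblock/crawler';

await initCrawler();

// Callback style: fires once the queue finishes (or fails) the job.
await crawlUrl({ url: 'https://www.arcblock.io', includeScreenshot: true, includeHtml: true }, (snapshot) => {
  // snapshot is null if the queue job itself failed
  console.log('crawl finished:', snapshot?.status);
});

// Polling style: assumes the snapshot carries a status field ('pending' | 'success' | 'failed').
const jobId = await crawlUrl({ url: 'https://www.arcblock.io/blog', includeHtml: true });
let snapshot = jobId ? await getSnapshot(jobId) : null;
while (snapshot && snapshot.status === 'pending') {
  await new Promise((resolve) => setTimeout(resolve, 1000));
  snapshot = await getSnapshot(jobId!);
}

// Reuse the most recent crawl of a URL (assumed to take the page URL).
const latest = await getLatestSnapshot('https://www.arcblock.io');
console.log(latest?.meta?.title);
```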
package/lib/cjs/config.d.ts ADDED
@@ -0,0 +1,24 @@
+ export type Site = {
+   url: string;
+   pathname: string;
+   /** Minimum crawl interval to avoid frequent crawling by scheduled tasks, in milliseconds */
+   interval?: number;
+ };
+ export type Config = {
+   isProd: boolean;
+   dataDir: string;
+   appDir: string;
+   appUrl: string;
+   cacheDir: string;
+   puppeteerPath?: string;
+   siteCron: {
+     sites: Site[];
+     time: string;
+     enabled: boolean;
+     immediate: boolean;
+     crawlConcurrency: number;
+     sitemapConcurrency: number;
+   };
+ };
+ export declare const logger: any;
+ export declare const config: Config;
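
For reference, a hypothetical `sites` list matching the `Site` shape above. The values are illustrative only: the role of `pathname` is not spelled out in this diff (shown here as a path filter), and `interval` is the minimum re-crawl gap per the comment in the declaration:

```typescript
// Shape matches the Site type declared above; adjust the import path to wherever Site is exported from.
const sites: Array<{ url: string; pathname: string; interval?: number }> = [
  { url: 'https://www.arcblock.io', pathname: '/' },
  // Only re-crawl these pages at most once every 24 hours.
  { url: 'https://www.arcblock.io', pathname: '/blog', interval: 24 * 60 * 60 * 1000 },
];
```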
package/{dist → lib/cjs}/config.js
@@ -5,13 +5,21 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
  Object.defineProperty(exports, "__esModule", { value: true });
  exports.config = exports.logger = void 0;
  const logger_1 = __importDefault(require("@blocklet/logger"));
- exports.logger = (0, logger_1.default)('crawler', { level: process.env.LOG_LEVEL || 'info' });
+ exports.logger = (0, logger_1.default)('@arcblock/crawler', { level: process.env.LOG_LEVEL || 'info' });
  exports.config = {
- redisUrl: process.env.REDIS_URL,
+ isProd: process.env.NODE_ENV === 'production',
  dataDir: process.env.BLOCKLET_DATA_DIR,
  appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
- appUrl: process.env.BLOCKLET_APP_URL,
+ cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+ appUrl: process.env.BLOCKLET_APP_URL || '/',
  puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
- cacheDir: process.env.BLOCKLET_CACHE_DIR,
- testOnInitialize: process.env.NODE_ENV === 'production',
+ // cron
+ siteCron: {
+ sites: [],
+ enabled: true,
+ time: '0 0 0 * * *',
+ immediate: false,
+ crawlConcurrency: 2,
+ sitemapConcurrency: 30,
+ },
  };
package/lib/cjs/crawler.d.ts ADDED
@@ -0,0 +1,30 @@
+ import { JobState } from './store/job';
+ import { SnapshotModel } from './store/snapshot';
+ export declare function createCrawlQueue(): void;
+ export declare function getDataDir(): Promise<{
+   htmlDir: string;
+   screenshotDir: string;
+ }>;
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
+   url: string;
+   includeScreenshot?: boolean;
+   includeHtml?: boolean;
+   width?: number;
+   height?: number;
+   quality?: number;
+   timeout?: number;
+   fullPage?: boolean;
+ }) => Promise<{
+   html: string;
+   screenshot: Uint8Array<ArrayBufferLike> | null;
+   meta: {
+     title?: string;
+     description?: string;
+   };
+ }>;
+ /**
+  * crawl url and return job id
+  * @param params
+  * @param callback callback when job finished
+  */
+ export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
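
Beyond the queue, `getPageContent` can be awaited directly for a one-off capture. A minimal sketch against the signature above, assuming the browser has already been initialized via `initCrawler` (the values in the comments match the defaults in the compiled source later in this diff):

```typescript
import { getPageContent, initCrawler } from '@arcblock/crawler';

await initCrawler();

const { html, screenshot, meta } = await getPageContent({
  url: 'https://www.arcblock.io',
  includeHtml: true,
  includeScreenshot: true,
  width: 1440,        // default viewport width
  height: 900,        // default viewport height
  quality: 80,        // default screenshot quality
  fullPage: true,     // grow the viewport to the tallest element before capturing
  timeout: 90 * 1000, // default navigation timeout in ms
});

console.log(meta.title, meta.description);
console.log('html length:', html?.length, 'screenshot bytes:', screenshot?.byteLength);
```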
package/{dist → lib/cjs}/crawler.js
@@ -15,22 +15,17 @@ Object.defineProperty(exports, "__esModule", { value: true });
  exports.getPageContent = void 0;
  exports.createCrawlQueue = createCrawlQueue;
  exports.getDataDir = getDataDir;
- exports.createCrawlJob = createCrawlJob;
- exports.getJob = getJob;
- exports.formatSnapshot = formatSnapshot;
- exports.getSnapshot = getSnapshot;
+ exports.crawlUrl = crawlUrl;
  const queue_1 = __importDefault(require("@abtnode/queue"));
  const sequelize_1 = __importDefault(require("@abtnode/queue/lib/store/sequelize"));
- const core_1 = __importDefault(require("@sequelize/core"));
  const crypto_1 = require("crypto");
  const fs_extra_1 = __importDefault(require("fs-extra"));
- const pick_1 = __importDefault(require("lodash/pick"));
  const path_1 = __importDefault(require("path"));
- const ufo_1 = require("ufo");
  const config_1 = require("./config");
- const job_1 = require("./db/job");
- const snapshot_1 = require("./db/snapshot");
  const puppeteer_1 = require("./puppeteer");
+ const snapshot_1 = require("./services/snapshot");
+ const job_1 = require("./store/job");
+ const snapshot_2 = require("./store/snapshot");
  const utils_1 = require("./utils");
  const { BaseState } = require('@abtnode/models');
  let crawlQueue;
@@ -38,20 +33,20 @@ function createCrawlQueue() {
  const db = new BaseState(job_1.Job);
  crawlQueue = (0, queue_1.default)({
  store: new sequelize_1.default(db, 'crawler'),
- concurrency: 1,
+ concurrency: config_1.config.siteCron.crawlConcurrency,
  onJob: (job) => __awaiter(this, void 0, void 0, function* () {
  config_1.logger.info('Starting to execute crawl job', job);
  const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
  if (!canCrawl) {
  config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
- const snapshot = convertJobToSnapshot({
+ const snapshot = (0, snapshot_1.convertJobToSnapshot)({
  job,
  snapshot: {
  status: 'failed',
  error: 'Denied by robots.txt',
  },
  });
- yield snapshot_1.Snapshot.upsert(snapshot);
+ yield snapshot_2.Snapshot.upsert(snapshot);
  return snapshot;
  }
  // if index reach autoCloseBrowserCount, close browser
@@ -67,14 +62,14 @@ function createCrawlQueue() {
  const result = yield (0, exports.getPageContent)(job);
  if (!result || (!result.html && !result.screenshot)) {
  config_1.logger.error(`failed to crawl ${job.url}, empty content`, job);
- const snapshot = convertJobToSnapshot({
+ const snapshot = (0, snapshot_1.convertJobToSnapshot)({
  job,
  snapshot: {
  status: 'failed',
  error: 'Failed to crawl content',
  },
  });
- yield snapshot_1.Snapshot.upsert(snapshot);
+ yield snapshot_2.Snapshot.upsert(snapshot);
  return snapshot;
  }
  // save html and screenshot to data dir
@@ -83,37 +78,28 @@ function createCrawlQueue() {
  html: result.html,
  });
  // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
- const snapshot = convertJobToSnapshot({
+ const snapshot = (0, snapshot_1.convertJobToSnapshot)({
  job,
  snapshot: {
  status: 'success',
  screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
  html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config_1.config.dataDir, ''),
+ meta: result.meta,
  },
  });
- yield snapshot_1.Snapshot.upsert(snapshot);
+ yield snapshot_2.Snapshot.upsert(snapshot);
  return snapshot;
- // save to redis
- // if (saveToRedis) {
- // useCache.set(url, {
- // html: result.html || '',
- // lastModified,
- // });
- // logger.info(`success to crawl ${url}`, job);
- // return result;
- // }
  }
  catch (error) {
  config_1.logger.error(`Failed to crawl ${job.url}`, { error, job });
- console.error(error.stack);
- const snapshot = convertJobToSnapshot({
+ const snapshot = (0, snapshot_1.convertJobToSnapshot)({
  job,
  snapshot: {
  status: 'failed',
  error: 'Internal error',
  },
  });
- yield snapshot_1.Snapshot.upsert(snapshot);
+ yield snapshot_2.Snapshot.upsert(snapshot);
  return snapshot;
  }
  }),
@@ -157,7 +143,7 @@ function formatHtml(htmlString) {
  }
  return htmlString;
  }
- const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, formatPageContent, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
+ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
  config_1.logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
  const page = yield (0, puppeteer_1.initPage)();
  if (width && height) {
@@ -165,6 +151,7 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
  }
  let html = null;
  let screenshot = null;
+ const meta = {};
  try {
  const response = yield page.goto(url, { timeout });
  if (!response) {
@@ -185,7 +172,7 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
  // Try to find the tallest element and set the browser to the same height
  if (fullPage) {
  const maxScrollHeight = yield (0, utils_1.findMaxScrollHeight)(page);
- config_1.logger.info('findMaxScrollHeight', { maxScrollHeight });
+ config_1.logger.debug('findMaxScrollHeight', { maxScrollHeight });
  if (maxScrollHeight) {
  yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
  yield page.evaluate((scrollHeight) => {
@@ -199,17 +186,37 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
  }
  catch (err) {
  config_1.logger.error('Failed to get screenshot:', err);
+ throw err;
  }
  }
  // get html
- if (includeHtml) {
- if (formatPageContent) {
- html = yield formatPageContent({ page, url });
- }
- else {
- html = yield page.content();
+ try {
+ const data = yield page.evaluate(() => {
+ var _a;
+ // add meta tag to record crawler
+ const meta = document.createElement('meta');
+ meta.name = 'arcblock-crawler';
+ meta.content = 'true';
+ document.head.appendChild(meta);
+ // get title and meta description
+ const title = document.title || '';
+ const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
+ return {
+ html: document.documentElement.outerHTML,
+ title,
+ description,
+ };
+ });
+ meta.title = data.title;
+ meta.description = data.description;
+ if (includeHtml) {
+ html = data.html;
  }
  }
+ catch (err) {
+ config_1.logger.error('Failed to get html:', err);
+ throw err;
+ }
  }
  catch (error) {
  config_1.logger.error('Failed to get page content:', error);
@@ -222,14 +229,20 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
  return {
  html,
  screenshot,
+ meta,
  };
  });
  exports.getPageContent = getPageContent;
- function createCrawlJob(params, callback) {
+ /**
+ * crawl url and return job id
+ * @param params
+ * @param callback callback when job finished
+ */
+ function crawlUrl(params, callback) {
  return __awaiter(this, void 0, void 0, function* () {
- params = Object.assign(Object.assign({}, params), { id: (0, crypto_1.randomUUID)(), url: (0, utils_1.formatUrl)(params.url) });
+ params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
  // skip duplicate job
- const existsJob = yield getJob({
+ const { job: duplicateJob } = (yield job_1.Job.findJob({
  url: params.url,
  includeScreenshot: params.includeScreenshot,
  includeHtml: params.includeHtml,
@@ -237,89 +250,22 @@ function createCrawlJob(params, callback) {
  width: params.width,
  height: params.height,
  fullPage: params.fullPage,
- });
- if (existsJob) {
- config_1.logger.warn(`Crawl job already exists for ${params.url}, skip`);
- return existsJob.id;
+ })) || {};
+ if (duplicateJob) {
+ config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
+ return duplicateJob.id;
  }
  config_1.logger.info('create crawl job', params);
- const job = crawlQueue.push(params);
- job.on('finished', ({ result }) => {
- config_1.logger.info(`Crawl completed ${params.url}`, { job: params, result });
- callback === null || callback === void 0 ? void 0 : callback(result);
- });
+ const jobId = (0, crypto_1.randomUUID)();
+ const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
+ job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
+ config_1.logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
+ callback === null || callback === void 0 ? void 0 : callback(result ? yield (0, snapshot_1.formatSnapshot)(result) : null);
+ }));
  job.on('failed', ({ error }) => {
  config_1.logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
  callback === null || callback === void 0 ? void 0 : callback(null);
  });
- return params.id;
- });
- }
- // @ts-ignore
- function getJob(condition) {
- return __awaiter(this, void 0, void 0, function* () {
- const where = Object.keys(condition)
- .filter((key) => condition[key] !== undefined)
- .map((key) => {
- return core_1.default.where(core_1.default.fn('json_extract', core_1.default.col('job'), `$.${key}`), condition[key]);
- });
- const job = yield crawlQueue.store.db.findOne({
- where: {
- [core_1.default.Op.and]: where,
- },
- });
- if (job) {
- return job.job;
- }
- return null;
- });
- }
- function convertJobToSnapshot({ job, snapshot }) {
- return Object.assign({
- // @ts-ignore
- jobId: job.jobId || job.id, url: job.url, options: {
- width: job.width,
- height: job.height,
- includeScreenshot: job.includeScreenshot,
- includeHtml: job.includeHtml,
- quality: job.quality,
- fullPage: job.fullPage,
- } }, snapshot);
- }
- function formatSnapshot(snapshot, columns) {
- return __awaiter(this, void 0, void 0, function* () {
- let data = Object.assign({}, snapshot);
- // format screenshot path to full url
- if (data.screenshot) {
- data.screenshot = (0, ufo_1.joinURL)(config_1.config.appUrl, data.screenshot);
- }
- // format html path to string
- if (data.html) {
- const html = yield fs_extra_1.default.readFile(path_1.default.join(config_1.config.dataDir, data.html));
- data.html = html.toString();
- }
- if (columns === null || columns === void 0 ? void 0 : columns.length) {
- data = (0, pick_1.default)(data, columns);
- }
- return data;
- });
- }
- /**
- * get snapshot from db or crawl queue
- */
- function getSnapshot(jobId) {
- return __awaiter(this, void 0, void 0, function* () {
- const snapshotModel = yield snapshot_1.Snapshot.findByPk(jobId);
- if (snapshotModel) {
- return snapshotModel.toJSON();
- }
- const job = yield getJob({ id: jobId });
- if (job) {
- return {
- jobId,
- status: 'pending',
- };
- }
- return null;
+ return jobId;
  });
  }
package/lib/cjs/cron.d.ts ADDED
@@ -0,0 +1 @@
+ export declare function initCron(): any;
package/lib/cjs/cron.js ADDED
@@ -0,0 +1,49 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.initCron = initCron;
+ const cron_1 = __importDefault(require("@abtnode/cron"));
+ const config_1 = require("./config");
+ const site_1 = require("./site");
+ let cron = null;
+ function initCron() {
+ if (cron)
+ return;
+ config_1.logger.info('Init cron', { config: config_1.config.siteCron });
+ cron = cron_1.default.init({
+ context: {},
+ jobs: [
+ {
+ name: 'crawl-site',
+ time: config_1.config.siteCron.time,
+ options: { runOnInit: config_1.config.siteCron.immediate },
+ fn: () => __awaiter(this, void 0, void 0, function* () {
+ config_1.logger.info('Start cron to crawl site', { sites: config_1.config.siteCron.sites });
+ for (const site of config_1.config.siteCron.sites) {
+ try {
+ yield (0, site_1.crawlSite)(site);
+ }
+ catch (err) {
+ config_1.logger.error('Cron task error', { err, site });
+ }
+ }
+ }),
+ },
+ ],
+ onError: (err) => {
+ config_1.logger.error('Cron error', err);
+ },
+ });
+ return cron;
+ }
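
The default `time` of `'0 0 0 * * *'` (set in config.js above) is a six-field cron expression with a leading seconds field, so the site crawl runs once a day at midnight. A sketch of overriding it to every six hours, assuming `@abtnode/cron` accepts the same six-field format:

```typescript
import { initCrawler } from '@arcblock/crawler';

await initCrawler({
  siteCron: {
    enabled: true,
    immediate: false,
    // second minute hour day-of-month month day-of-week
    time: '0 0 */6 * * *',
    sites: [{ url: 'https://www.arcblock.io', pathname: '/' }],
  },
});
```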
package/lib/cjs/index.d.ts ADDED
@@ -0,0 +1,9 @@
+ import { Config } from './config';
+ export * from './crawler';
+ export * from './site';
+ export * from './services/snapshot';
+ export * as utils from './utils';
+ type DeepPartial<T> = T extends object ? {
+   [P in keyof T]?: DeepPartial<T[P]>;
+ } : T;
+ export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
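
Because the parameter is a DeepPartial, callers can override only the pieces they need and keep the defaults from config.js for everything else; for example, pointing Puppeteer at a local Chrome during development (the path comes from the README above):

```typescript
import { initCrawler } from '@arcblock/crawler';

// Only override the Puppeteer executable; siteCron keeps its defaults.
await initCrawler({
  puppeteerPath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
});
```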
package/lib/cjs/index.js ADDED
@@ -0,0 +1,80 @@
+ "use strict";
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ var desc = Object.getOwnPropertyDescriptor(m, k);
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+ desc = { enumerable: true, get: function() { return m[k]; } };
+ }
+ Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ o[k2] = m[k];
+ }));
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
+ }) : function(o, v) {
+ o["default"] = v;
+ });
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
+ };
+ var __importStar = (this && this.__importStar) || (function () {
+ var ownKeys = function(o) {
+ ownKeys = Object.getOwnPropertyNames || function (o) {
+ var ar = [];
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+ return ar;
+ };
+ return ownKeys(o);
+ };
+ return function (mod) {
+ if (mod && mod.__esModule) return mod;
+ var result = {};
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+ __setModuleDefault(result, mod);
+ return result;
+ };
+ })();
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.utils = void 0;
+ exports.initCrawler = initCrawler;
+ const merge_1 = __importDefault(require("lodash/merge"));
+ const config_1 = require("./config");
+ const crawler_1 = require("./crawler");
+ const cron_1 = require("./cron");
+ const puppeteer_1 = require("./puppeteer");
+ const store_1 = require("./store");
+ __exportStar(require("./crawler"), exports);
+ __exportStar(require("./site"), exports);
+ __exportStar(require("./services/snapshot"), exports);
+ exports.utils = __importStar(require("./utils"));
+ function initCrawler(params) {
+ return __awaiter(this, void 0, void 0, function* () {
+ (0, merge_1.default)(config_1.config, params);
+ config_1.logger.info('Init crawler', { params, config: config_1.config });
+ try {
+ yield (0, store_1.initDatabase)();
+ yield (0, puppeteer_1.ensureBrowser)();
+ yield (0, crawler_1.createCrawlQueue)();
+ if (config_1.config.siteCron.enabled) {
+ yield (0, cron_1.initCron)();
+ }
+ }
+ catch (err) {
+ config_1.logger.error('Init crawler error', { err });
+ throw err;
+ }
+ });
+ }
package/{esm → lib/cjs}/puppeteer.d.ts
@@ -1,4 +1,4 @@
- import puppeteer, { Browser, Page } from '@blocklet/puppeteer';
+ import puppeteer, { Browser, Page, ResourceType } from '@blocklet/puppeteer';
  export { puppeteer };
  export declare function ensurePuppeteerrc(): Promise<{
  cacheDirectory: string;
@@ -12,5 +12,5 @@ export declare const closeBrowser: ({ trimCache }?: {
  trimCache?: boolean;
  }) => Promise<void>;
  export declare function initPage({ abortResourceTypes }?: {
- abortResourceTypes?: never[] | undefined;
+ abortResourceTypes?: ResourceType[];
  }): Promise<Page>;
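
With `abortResourceTypes` now typed as Puppeteer `ResourceType[]`, callers can block heavy resources when only the DOM matters. A hedged sketch; whether the puppeteer module is reachable at this subpath depends on the package's exports map:

```typescript
import { initPage, closeBrowser } from '@arcblock/crawler/lib/cjs/puppeteer';

// Abort image, font and media requests so HTML-only crawls load faster.
const page = await initPage({ abortResourceTypes: ['image', 'font', 'media'] });
await page.goto('https://www.arcblock.io');
console.log(await page.title());
await closeBrowser();
```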