@arcblock/crawler 1.1.1 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1 +1,66 @@
  # @arcblock/crawler
+
+ A crawler module designed for Blocklets. It supports batch crawling of HTML, webpage screenshots, titles, descriptions, and more, based on a URL or a sitemap.
+
+ ## Usage
+
+ ```typescript
+ import { crawlUrl, getSnapshot, initCrawler } from '@arcblock/crawler';
+
+ await initCrawler();
+
+ // Asynchronously crawl a page
+ const jobId = await crawlUrl({ url: 'https://www.arcblock.io', includeScreenshot: true, includeHtml: true });
+
+ // Get the crawling result (you need to wait for the crawl job to finish)
+ const snapshot = await getSnapshot(jobId);
+ ```
+
+ ### initCrawler
+
+ Initializes the crawler.
+
+ ### crawlUrl
+
+ Crawls the specified page.
+
+ ### getSnapshot
+
+ Gets the crawling result by jobId.
+
+ ### getLatestSnapshot
+
+ Gets the latest crawling result by URL.
+
+ ## Schedule
+
+ Passing `siteCron` to `initCrawler` enables scheduled tasks that periodically crawl all pages of the specified websites based on their sitemaps.
+
+ ```typescript
+ await initCrawler({
+   siteCron: {
+     enabled: !!env.preferences.cronEnabled,
+     immediate: !!env.preferences.cronImmediate,
+     sites: env.preferences.cronSites,
+     time: env.preferences.cronTime,
+     crawlConcurrency: env.preferences.crawlConcurrency,
+     sitemapConcurrency: env.preferences.sitemapConcurrency,
+   },
+ });
+ ```
+
+ ## Environment Variables
+
+ - `PUPPETEER_EXECUTABLE_PATH`: The executable path for Puppeteer. This variable is not required when running inside the `arcblock/snap-kit` Docker image. For local development, you can point it at Chrome, e.g. `/Applications/Google Chrome.app/Contents/MacOS/Google Chrome`.
+
+ If the module is not used inside a Blocklet, the following Blocklet environment variables must be provided manually:
+
+ - `BLOCKLET_CACHE_DIR`: (Optional) The directory used for the automatic Puppeteer installation when `PUPPETEER_EXECUTABLE_PATH` is not provided. Defaults to `process.cwd()`.
+
+ - `BLOCKLET_APP_URL`: (Optional) The domain prefix for screenshot URLs. Defaults to `/`.
+
+ - `BLOCKLET_DATA_DIR`: (Required) The directory where webpage screenshots and HTML source files obtained by the crawler are saved.
+
+ ## SQLite
+
+ When `initCrawler` is called, it attempts to create an SQLite database in `BLOCKLET_DATA_DIR`. This database is used to cache HTML content and screenshots. Please ensure that the deployment environment supports SQLite.
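The README documents `getLatestSnapshot` but the usage snippet above does not exercise it. A minimal sketch of how it might be called; the single-URL signature is an assumption based on the description "Gets the latest crawling result by URL" and is not confirmed by the package types:

```typescript
import { getLatestSnapshot, initCrawler } from '@arcblock/crawler';

await initCrawler();

// Hypothetical call shape: look up the most recent snapshot for a URL that
// has already been crawled. Fields such as `meta` come from the SnapshotModel
// changes shown further down in this diff.
const latest = await getLatestSnapshot('https://www.arcblock.io');
console.log(latest?.meta?.title, latest?.screenshot);
```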
@@ -14,8 +14,10 @@ export type Config = {
  siteCron: {
  sites: Site[];
  time: string;
- runOnInit: boolean;
- concurrency: number;
+ enabled: boolean;
+ immediate: boolean;
+ crawlConcurrency: number;
+ sitemapConcurrency: number;
  };
  };
  export declare const logger: any;
package/lib/cjs/config.js CHANGED
@@ -10,14 +10,16 @@ exports.config = {
  isProd: process.env.NODE_ENV === 'production',
  dataDir: process.env.BLOCKLET_DATA_DIR,
  appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
- appUrl: process.env.BLOCKLET_APP_URL,
+ cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+ appUrl: process.env.BLOCKLET_APP_URL || '/',
  puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
- cacheDir: process.env.BLOCKLET_CACHE_DIR,
  // cron
  siteCron: {
  sites: [],
- time: '0 0 */12 * * *',
- runOnInit: false,
- concurrency: 5,
+ enabled: true,
+ time: '0 0 0 * * *',
+ immediate: false,
+ crawlConcurrency: 2,
+ sitemapConcurrency: 30,
  },
  };
@@ -17,6 +17,10 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
  }) => Promise<{
  html: string;
  screenshot: Uint8Array<ArrayBufferLike> | null;
+ meta: {
+ title?: string;
+ description?: string;
+ };
  }>;
  /**
  * crawl url and return job id
@@ -33,7 +33,7 @@ function createCrawlQueue() {
  const db = new BaseState(job_1.Job);
  crawlQueue = (0, queue_1.default)({
  store: new sequelize_1.default(db, 'crawler'),
- concurrency: 1,
+ concurrency: config_1.config.siteCron.crawlConcurrency,
  onJob: (job) => __awaiter(this, void 0, void 0, function* () {
  config_1.logger.info('Starting to execute crawl job', job);
  const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
@@ -84,6 +84,7 @@ function createCrawlQueue() {
  status: 'success',
  screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
  html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config_1.config.dataDir, ''),
+ meta: result.meta,
  },
  });
  yield snapshot_2.Snapshot.upsert(snapshot);
@@ -150,6 +151,7 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
  }
  let html = null;
  let screenshot = null;
+ const meta = {};
  try {
  const response = yield page.goto(url, { timeout });
  if (!response) {
@@ -170,7 +172,7 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
  // Try to find the tallest element and set the browser to the same height
  if (fullPage) {
  const maxScrollHeight = yield (0, utils_1.findMaxScrollHeight)(page);
- config_1.logger.info('findMaxScrollHeight', { maxScrollHeight });
+ config_1.logger.debug('findMaxScrollHeight', { maxScrollHeight });
  if (maxScrollHeight) {
  yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
  yield page.evaluate((scrollHeight) => {
@@ -188,22 +190,33 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
  }
  }
  // get html
- if (includeHtml) {
- try {
- html = yield page.evaluate(() => {
- // add meta tag to record crawler
- const meta = document.createElement('meta');
- meta.name = 'arcblock-crawler';
- meta.content = 'true';
- document.head.appendChild(meta);
- return document.documentElement.outerHTML;
- });
- }
- catch (err) {
- config_1.logger.error('Failed to get html:', err);
- throw err;
+ try {
+ const data = yield page.evaluate(() => {
+ var _a;
+ // add meta tag to record crawler
+ const meta = document.createElement('meta');
+ meta.name = 'arcblock-crawler';
+ meta.content = 'true';
+ document.head.appendChild(meta);
+ // get title and meta description
+ const title = document.title || '';
+ const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
+ return {
+ html: document.documentElement.outerHTML,
+ title,
+ description,
+ };
+ });
+ meta.title = data.title;
+ meta.description = data.description;
+ if (includeHtml) {
+ html = data.html;
  }
  }
+ catch (err) {
+ config_1.logger.error('Failed to get html:', err);
+ throw err;
+ }
  }
  catch (error) {
  config_1.logger.error('Failed to get page content:', error);
@@ -216,6 +229,7 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
  return {
  html,
  screenshot,
+ meta,
  };
  });
  exports.getPageContent = getPageContent;
@@ -238,7 +252,7 @@ function crawlUrl(params, callback) {
  fullPage: params.fullPage,
  })) || {};
  if (duplicateJob) {
- config_1.logger.warn(`Crawl job already exists for ${params.url}, skip`);
+ config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
  return duplicateJob.id;
  }
  config_1.logger.info('create crawl job', params);
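The crawler hunks above capture the page `title` and meta `description` inside `getPageContent` and persist them on the snapshot as `meta`. A short sketch of reading that data back, assuming the job has already been processed by the queue and that the object returned by `getSnapshot` mirrors `SnapshotModel`:

```typescript
import { crawlUrl, getSnapshot, initCrawler } from '@arcblock/crawler';

await initCrawler();

const jobId = await crawlUrl({ url: 'https://www.arcblock.io', includeHtml: true });

// Once the crawl queue has finished the job, the stored snapshot should carry
// the captured metadata alongside the html/screenshot paths.
const snapshot = await getSnapshot(jobId);
console.log(snapshot?.meta?.title, snapshot?.meta?.description);
```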
package/lib/cjs/cron.js CHANGED
@@ -27,7 +27,7 @@ function initCron() {
  {
  name: 'crawl-site',
  time: config_1.config.siteCron.time,
- options: { runOnInit: config_1.config.siteCron.runOnInit },
+ options: { runOnInit: config_1.config.siteCron.immediate },
  fn: () => __awaiter(this, void 0, void 0, function* () {
  config_1.logger.info('Start cron to crawl site', { sites: config_1.config.siteCron.sites });
  for (const site of config_1.config.siteCron.sites) {
package/lib/cjs/index.js CHANGED
@@ -62,13 +62,15 @@ __exportStar(require("./services/snapshot"), exports);
  exports.utils = __importStar(require("./utils"));
  function initCrawler(params) {
  return __awaiter(this, void 0, void 0, function* () {
- config_1.logger.info('Init crawler', { params });
  (0, merge_1.default)(config_1.config, params);
+ config_1.logger.info('Init crawler', { params, config: config_1.config });
  try {
  yield (0, store_1.initDatabase)();
  yield (0, puppeteer_1.ensureBrowser)();
  yield (0, crawler_1.createCrawlQueue)();
- yield (0, cron_1.initCron)();
+ if (config_1.config.siteCron.enabled) {
+ yield (0, cron_1.initCron)();
+ }
  }
  catch (err) {
  config_1.logger.error('Init crawler error', { err });
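With `initCron()` now gated on `config.siteCron.enabled` (default `true` per config.js above), a caller can initialize the queue and browser without scheduling any sitemap crawls. A sketch, assuming `initCrawler` accepts a partial `siteCron` override as implied by the lodash `merge` call:

```typescript
import { initCrawler } from '@arcblock/crawler';

// Skip the scheduled sitemap crawls entirely; on-demand crawlUrl() calls
// still work because the queue and browser are initialized as before.
await initCrawler({ siteCron: { enabled: false } });
```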
package/lib/cjs/site.d.ts CHANGED
@@ -1,2 +1,2 @@
  import { Site } from './config';
- export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | undefined)[]>;
+ export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null | undefined)[]>;
package/lib/cjs/site.js CHANGED
@@ -28,6 +28,11 @@ function parseSitemapUrl(sitemapItem) {
  }
  const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
  config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
+ const key = `${url}-${pathname}`;
+ if (crawlBlockletRunningMap.has(key)) {
+ config_1.logger.info(`Crawl from sitemap ${url} ${pathname} is already running, skip`);
+ return [];
+ }
  const sitemapList = yield (0, utils_1.getSitemapList)(url);
  const pathnameRegex = new RegExp(pathname);
  const sitemapItems = sitemapList
@@ -36,33 +41,31 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
  return parseSitemapUrl(sitemapItem);
  });
  config_1.logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
- const crawlableItems = (yield Promise.all(sitemapItems.map((_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
- const snapshot = yield snapshot_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
- if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
- const lastModified = new Date(snapshot.lastModified);
- // skip if snapshot lastModified is greater than sitemap lastmod
- if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
- return null;
- }
- // skip if interval time has not been reached
- if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
- return null;
- }
- }
- return { url, sitemapItem };
- })))).filter(Boolean);
- config_1.logger.info(`Found ${crawlableItems.length} pages to crawl from sitemap ${url}`, { pathname });
- const key = `${url}-${pathname}`;
- crawlBlockletRunningMap.set(key, crawlableItems);
+ let processCount = 0;
+ crawlBlockletRunningMap.set(key, true);
  try {
- const jobIds = yield (0, p_map_1.default)(crawlableItems, ({ url, sitemapItem }) => {
+ const jobIds = yield (0, p_map_1.default)(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
+ processCount++;
+ const snapshot = yield snapshot_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
+ if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
+ const lastModified = new Date(snapshot.lastModified);
+ // skip if snapshot lastModified is greater than sitemap lastmod
+ if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
+ return null;
+ }
+ // skip if interval time has not been reached
+ if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
+ return null;
+ }
+ }
+ config_1.logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`);
  return (0, crawler_1.crawlUrl)({
  url,
  lastModified: sitemapItem.lastmod,
  includeScreenshot: false,
  includeHtml: true,
  });
- }, { concurrency: config_1.config.siteCron.concurrency });
+ }), { concurrency: config_1.config.siteCron.sitemapConcurrency });
  return jobIds;
  }
  catch (error) {
@@ -45,7 +45,7 @@ function initDatabase() {
  sequelize.query('pragma journal_size_limit = 67108864;'),
  ]);
  yield sequelize.authenticate();
- yield sequelize.sync();
+ yield sequelize.sync({ alter: process.env.ALTER_SQLITE === 'true' });
  config_1.logger.info('Successfully connected to database');
  }
  catch (error) {
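`initDatabase` now passes `{ alter: process.env.ALTER_SQLITE === 'true' }` to `sequelize.sync`, which is presumably how an existing SQLite file picks up newly added columns such as `meta`. A hedged sketch of opting in; whether altering is safe for your data should be verified before enabling it:

```typescript
// Launch with the flag set in the environment, e.g.
//   ALTER_SQLITE=true node server.js
// so that sequelize.sync({ alter: true }) runs during initCrawler().
import { initCrawler } from '@arcblock/crawler';

await initCrawler();
```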
@@ -7,6 +7,10 @@ export interface SnapshotModel {
  screenshot?: string | null;
  error?: string;
  lastModified?: string;
+ meta?: {
+ title?: string;
+ description?: string;
+ };
  options?: {
  width?: number;
  height?: number;
@@ -24,6 +28,7 @@ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotMo
  screenshot?: SnapshotModel['screenshot'];
  error?: SnapshotModel['error'];
  lastModified?: SnapshotModel['lastModified'];
+ meta?: SnapshotModel['meta'];
  options: SnapshotModel['options'];
  static initModel(sequelize: Sequelize): typeof Snapshot;
  static findSnapshot(condition: FindOptions<SnapshotModel>): Promise<SnapshotModel | null>;
@@ -44,6 +44,10 @@ class Snapshot extends core_1.Model {
  type: core_1.DataTypes.STRING,
  allowNull: true,
  },
+ meta: {
+ type: core_1.DataTypes.JSON,
+ allowNull: true,
+ },
  options: {
  type: core_1.DataTypes.JSON,
  allowNull: true,
@@ -14,8 +14,10 @@ export type Config = {
  siteCron: {
  sites: Site[];
  time: string;
- runOnInit: boolean;
- concurrency: number;
+ enabled: boolean;
+ immediate: boolean;
+ crawlConcurrency: number;
+ sitemapConcurrency: number;
  };
  };
  export declare const logger: any;
package/lib/esm/config.js CHANGED
@@ -4,14 +4,16 @@ export const config = {
  isProd: process.env.NODE_ENV === 'production',
  dataDir: process.env.BLOCKLET_DATA_DIR,
  appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
- appUrl: process.env.BLOCKLET_APP_URL,
+ cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+ appUrl: process.env.BLOCKLET_APP_URL || '/',
  puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
- cacheDir: process.env.BLOCKLET_CACHE_DIR,
  // cron
  siteCron: {
  sites: [],
- time: '0 0 */12 * * *',
- runOnInit: false,
- concurrency: 5,
+ enabled: true,
+ time: '0 0 0 * * *',
+ immediate: false,
+ crawlConcurrency: 2,
+ sitemapConcurrency: 30,
  },
  };
@@ -17,6 +17,10 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
  }) => Promise<{
  html: string;
  screenshot: Uint8Array<ArrayBufferLike> | null;
+ meta: {
+ title?: string;
+ description?: string;
+ };
  }>;
  /**
  * crawl url and return job id
@@ -24,7 +24,7 @@ export function createCrawlQueue() {
  const db = new BaseState(Job);
  crawlQueue = createQueue({
  store: new SequelizeStore(db, 'crawler'),
- concurrency: 1,
+ concurrency: config.siteCron.crawlConcurrency,
  onJob: (job) => __awaiter(this, void 0, void 0, function* () {
  logger.info('Starting to execute crawl job', job);
  const canCrawl = yield isAcceptCrawler(job.url);
@@ -75,6 +75,7 @@ export function createCrawlQueue() {
  status: 'success',
  screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
  html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config.dataDir, ''),
+ meta: result.meta,
  },
  });
  yield Snapshot.upsert(snapshot);
@@ -141,6 +142,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
  }
  let html = null;
  let screenshot = null;
+ const meta = {};
  try {
  const response = yield page.goto(url, { timeout });
  if (!response) {
@@ -161,7 +163,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
  // Try to find the tallest element and set the browser to the same height
  if (fullPage) {
  const maxScrollHeight = yield findMaxScrollHeight(page);
- logger.info('findMaxScrollHeight', { maxScrollHeight });
+ logger.debug('findMaxScrollHeight', { maxScrollHeight });
  if (maxScrollHeight) {
  yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
  yield page.evaluate((scrollHeight) => {
@@ -179,22 +181,33 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
  }
  }
  // get html
- if (includeHtml) {
- try {
- html = yield page.evaluate(() => {
- // add meta tag to record crawler
- const meta = document.createElement('meta');
- meta.name = 'arcblock-crawler';
- meta.content = 'true';
- document.head.appendChild(meta);
- return document.documentElement.outerHTML;
- });
- }
- catch (err) {
- logger.error('Failed to get html:', err);
- throw err;
+ try {
+ const data = yield page.evaluate(() => {
+ var _a;
+ // add meta tag to record crawler
+ const meta = document.createElement('meta');
+ meta.name = 'arcblock-crawler';
+ meta.content = 'true';
+ document.head.appendChild(meta);
+ // get title and meta description
+ const title = document.title || '';
+ const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
+ return {
+ html: document.documentElement.outerHTML,
+ title,
+ description,
+ };
+ });
+ meta.title = data.title;
+ meta.description = data.description;
+ if (includeHtml) {
+ html = data.html;
  }
  }
+ catch (err) {
+ logger.error('Failed to get html:', err);
+ throw err;
+ }
  }
  catch (error) {
  logger.error('Failed to get page content:', error);
@@ -207,6 +220,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
  return {
  html,
  screenshot,
+ meta,
  };
  });
  /**
@@ -228,7 +242,7 @@ export function crawlUrl(params, callback) {
  fullPage: params.fullPage,
  })) || {};
  if (duplicateJob) {
- logger.warn(`Crawl job already exists for ${params.url}, skip`);
+ logger.info(`Crawl job already exists for ${params.url}, skip`);
  return duplicateJob.id;
  }
  logger.info('create crawl job', params);
package/lib/esm/cron.js CHANGED
@@ -21,7 +21,7 @@ export function initCron() {
  {
  name: 'crawl-site',
  time: config.siteCron.time,
- options: { runOnInit: config.siteCron.runOnInit },
+ options: { runOnInit: config.siteCron.immediate },
  fn: () => __awaiter(this, void 0, void 0, function* () {
  logger.info('Start cron to crawl site', { sites: config.siteCron.sites });
  for (const site of config.siteCron.sites) {
package/lib/esm/index.js CHANGED
@@ -19,13 +19,15 @@ export * from './services/snapshot';
  export * as utils from './utils';
  export function initCrawler(params) {
  return __awaiter(this, void 0, void 0, function* () {
- logger.info('Init crawler', { params });
  merge(config, params);
+ logger.info('Init crawler', { params, config });
  try {
  yield initDatabase();
  yield ensureBrowser();
  yield createCrawlQueue();
- yield initCron();
+ if (config.siteCron.enabled) {
+ yield initCron();
+ }
  }
  catch (err) {
  logger.error('Init crawler error', { err });
package/lib/esm/site.d.ts CHANGED
@@ -1,2 +1,2 @@
  import { Site } from './config';
- export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | undefined)[]>;
+ export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null | undefined)[]>;
package/lib/esm/site.js CHANGED
@@ -22,6 +22,11 @@ function parseSitemapUrl(sitemapItem) {
  }
  export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
  logger.info(`Start crawl from sitemap ${url}`, { pathname });
+ const key = `${url}-${pathname}`;
+ if (crawlBlockletRunningMap.has(key)) {
+ logger.info(`Crawl from sitemap ${url} ${pathname} is already running, skip`);
+ return [];
+ }
  const sitemapList = yield getSitemapList(url);
  const pathnameRegex = new RegExp(pathname);
  const sitemapItems = sitemapList
@@ -30,33 +35,31 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
  return parseSitemapUrl(sitemapItem);
  });
  logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
- const crawlableItems = (yield Promise.all(sitemapItems.map((_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
- const snapshot = yield Snapshot.findOne({ where: { url: formatUrl(url) } });
- if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
- const lastModified = new Date(snapshot.lastModified);
- // skip if snapshot lastModified is greater than sitemap lastmod
- if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
- return null;
- }
- // skip if interval time has not been reached
- if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
- return null;
- }
- }
- return { url, sitemapItem };
- })))).filter(Boolean);
- logger.info(`Found ${crawlableItems.length} pages to crawl from sitemap ${url}`, { pathname });
- const key = `${url}-${pathname}`;
- crawlBlockletRunningMap.set(key, crawlableItems);
+ let processCount = 0;
+ crawlBlockletRunningMap.set(key, true);
  try {
- const jobIds = yield pMap(crawlableItems, ({ url, sitemapItem }) => {
+ const jobIds = yield pMap(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
+ processCount++;
+ const snapshot = yield Snapshot.findOne({ where: { url: formatUrl(url) } });
+ if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
+ const lastModified = new Date(snapshot.lastModified);
+ // skip if snapshot lastModified is greater than sitemap lastmod
+ if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
+ return null;
+ }
+ // skip if interval time has not been reached
+ if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
+ return null;
+ }
+ }
+ logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`);
  return crawlUrl({
  url,
  lastModified: sitemapItem.lastmod,
  includeScreenshot: false,
  includeHtml: true,
  });
- }, { concurrency: config.siteCron.concurrency });
+ }), { concurrency: config.siteCron.sitemapConcurrency });
  return jobIds;
  }
  catch (error) {
@@ -39,7 +39,7 @@ export function initDatabase() {
  sequelize.query('pragma journal_size_limit = 67108864;'),
  ]);
  yield sequelize.authenticate();
- yield sequelize.sync();
+ yield sequelize.sync({ alter: process.env.ALTER_SQLITE === 'true' });
  logger.info('Successfully connected to database');
  }
  catch (error) {
@@ -7,6 +7,10 @@ export interface SnapshotModel {
  screenshot?: string | null;
  error?: string;
  lastModified?: string;
+ meta?: {
+ title?: string;
+ description?: string;
+ };
  options?: {
  width?: number;
  height?: number;
@@ -24,6 +28,7 @@ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotMo
  screenshot?: SnapshotModel['screenshot'];
  error?: SnapshotModel['error'];
  lastModified?: SnapshotModel['lastModified'];
+ meta?: SnapshotModel['meta'];
  options: SnapshotModel['options'];
  static initModel(sequelize: Sequelize): typeof Snapshot;
  static findSnapshot(condition: FindOptions<SnapshotModel>): Promise<SnapshotModel | null>;
@@ -41,6 +41,10 @@ export class Snapshot extends Model {
  type: DataTypes.STRING,
  allowNull: true,
  },
+ meta: {
+ type: DataTypes.JSON,
+ allowNull: true,
+ },
  options: {
  type: DataTypes.JSON,
  allowNull: true,
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@arcblock/crawler",
- "version": "1.1.1",
+ "version": "1.1.2",
  "main": "lib/cjs/index.js",
  "module": "lib/esm/index.js",
  "types": "lib/cjs/index.d.ts",