@arcblock/crawler 1.0.6 → 1.1.1

This diff shows the contents of publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (74)
  1. package/README.md +1 -0
  2. package/lib/cjs/config.d.ts +22 -0
  3. package/{dist → lib/cjs}/config.js +9 -3
  4. package/lib/cjs/crawler.d.ts +26 -0
  5. package/{dist → lib/cjs}/crawler.js +44 -112
  6. package/lib/cjs/cron.d.ts +1 -0
  7. package/lib/cjs/cron.js +49 -0
  8. package/lib/cjs/index.d.ts +9 -0
  9. package/lib/cjs/index.js +78 -0
  10. package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
  11. package/{dist → lib/cjs}/puppeteer.js +43 -54
  12. package/lib/cjs/services/snapshot.d.ts +12 -0
  13. package/lib/cjs/services/snapshot.js +84 -0
  14. package/lib/cjs/site.d.ts +2 -0
  15. package/lib/cjs/site.js +76 -0
  16. package/lib/cjs/store/index.d.ts +3 -0
  17. package/{dist/db → lib/cjs/store}/index.js +21 -5
  18. package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
  19. package/lib/cjs/store/job.js +110 -0
  20. package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
  21. package/lib/cjs/store/snapshot.js +68 -0
  22. package/lib/cjs/utils.d.ts +32 -0
  23. package/{dist → lib/cjs}/utils.js +67 -78
  24. package/lib/esm/config.d.ts +22 -0
  25. package/{esm → lib/esm}/config.js +9 -3
  26. package/lib/esm/crawler.d.ts +26 -0
  27. package/{esm → lib/esm}/crawler.js +35 -100
  28. package/lib/esm/cron.d.ts +1 -0
  29. package/lib/esm/cron.js +43 -0
  30. package/lib/esm/index.d.ts +9 -0
  31. package/{esm → lib/esm}/index.js +19 -10
  32. package/{dist → lib/esm}/puppeteer.d.ts +2 -2
  33. package/{esm → lib/esm}/puppeteer.js +21 -32
  34. package/lib/esm/services/snapshot.d.ts +12 -0
  35. package/lib/esm/services/snapshot.js +75 -0
  36. package/lib/esm/site.d.ts +2 -0
  37. package/lib/esm/site.js +69 -0
  38. package/lib/esm/store/index.d.ts +3 -0
  39. package/{esm/db → lib/esm/store}/index.js +22 -6
  40. package/{esm/db → lib/esm/store}/job.d.ts +4 -3
  41. package/lib/esm/store/job.js +73 -0
  42. package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
  43. package/lib/esm/store/snapshot.js +64 -0
  44. package/lib/esm/utils.d.ts +32 -0
  45. package/{esm → lib/esm}/utils.js +64 -71
  46. package/package.json +20 -32
  47. package/third.d.ts +0 -0
  48. package/dist/blocklet.d.ts +0 -6
  49. package/dist/blocklet.js +0 -199
  50. package/dist/cache.d.ts +0 -10
  51. package/dist/cache.js +0 -119
  52. package/dist/config.d.ts +0 -10
  53. package/dist/crawler.d.ts +0 -28
  54. package/dist/db/index.d.ts +0 -1
  55. package/dist/db/job.js +0 -54
  56. package/dist/db/snapshot.js +0 -52
  57. package/dist/index.d.ts +0 -6
  58. package/dist/index.js +0 -45
  59. package/dist/middleware.d.ts +0 -4
  60. package/dist/middleware.js +0 -44
  61. package/dist/utils.d.ts +0 -17
  62. package/esm/blocklet.d.ts +0 -6
  63. package/esm/blocklet.js +0 -190
  64. package/esm/cache.d.ts +0 -10
  65. package/esm/cache.js +0 -114
  66. package/esm/config.d.ts +0 -10
  67. package/esm/crawler.d.ts +0 -28
  68. package/esm/db/index.d.ts +0 -1
  69. package/esm/db/job.js +0 -50
  70. package/esm/db/snapshot.js +0 -48
  71. package/esm/index.d.ts +0 -6
  72. package/esm/middleware.d.ts +0 -4
  73. package/esm/middleware.js +0 -41
  74. package/esm/utils.d.ts +0 -17
package/lib/cjs/store/snapshot.js
@@ -0,0 +1,68 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.Snapshot = void 0;
+ const core_1 = require("@sequelize/core");
+ class Snapshot extends core_1.Model {
+ static initModel(sequelize) {
+ return Snapshot.init({
+ jobId: {
+ type: core_1.DataTypes.STRING,
+ primaryKey: true,
+ allowNull: false,
+ },
+ url: {
+ type: core_1.DataTypes.STRING,
+ allowNull: false,
+ index: true,
+ },
+ status: {
+ type: core_1.DataTypes.ENUM('success', 'failed', 'pending'),
+ allowNull: false,
+ },
+ html: {
+ type: core_1.DataTypes.TEXT,
+ allowNull: true,
+ },
+ screenshot: {
+ type: core_1.DataTypes.STRING,
+ allowNull: true,
+ },
+ error: {
+ type: core_1.DataTypes.STRING,
+ allowNull: true,
+ },
+ lastModified: {
+ type: core_1.DataTypes.STRING,
+ allowNull: true,
+ },
+ options: {
+ type: core_1.DataTypes.JSON,
+ allowNull: true,
+ },
+ }, {
+ sequelize,
+ modelName: 'snapshot',
+ tableName: 'snap',
+ timestamps: true,
+ });
+ }
+ static findSnapshot(condition) {
+ return __awaiter(this, void 0, void 0, function* () {
+ const snapshot = yield Snapshot.findOne(Object.assign({ order: [
+ ['lastModified', 'DESC'],
+ ['updatedAt', 'DESC'],
+ ] }, condition));
+ return (snapshot === null || snapshot === void 0 ? void 0 : snapshot.toJSON()) || null;
+ });
+ }
+ }
+ exports.Snapshot = Snapshot;
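For orientation, a minimal usage sketch of the new Snapshot store. The deep import path is an assumption (the store module is not re-exported from the package index), and it presumes the Sequelize instance has already been initialized by the package's store setup:

```ts
// Hypothetical lookup of the most recent snapshot for a URL.
import { Snapshot } from '@arcblock/crawler/lib/cjs/store/snapshot';

async function latestSnapshot(url: string) {
  // findSnapshot merges the given find options with an order by lastModified/updatedAt DESC
  return Snapshot.findSnapshot({ where: { url } });
}
```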
package/lib/cjs/utils.d.ts
@@ -0,0 +1,32 @@
+ import { Page } from '@blocklet/puppeteer';
+ import { Request } from 'express';
+ export declare const axios: import("axios").AxiosInstance;
+ export declare const CRAWLER_FLAG = "x-arcblock-crawler";
+ export declare const sleep: (ms: number) => Promise<unknown>;
+ /**
+ * Check if the request is a arcblock crawler
+ */
+ export declare const isSelfCrawler: (req: Request) => boolean;
+ /**
+ * Check if the request is a static file
+ */
+ export declare function isStaticFile(req: Request): boolean;
+ /**
+ * Check if the request is a spider
+ */
+ export declare function isSpider(req: Request): boolean;
+ /**
+ * Get and parse the robots.txt by `robots-parser`
+ */
+ export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
+ /**
+ * Check if the url is allowed to crawl from robots.txt
+ */
+ export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
+ /**
+ * Get and parse the sitemap.xml by `sitemap` package
+ */
+ export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
+ export declare const formatUrl: (url: string) => string;
+ export declare function md5(content: string | Uint8Array): string;
+ export declare function findMaxScrollHeight(page: Page): Promise<number>;
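Based on these declarations and the `export * as utils` re-export in the new index.d.ts, a hedged sketch of how the request helpers might be wired into an Express app; the prerender hand-off itself is illustrative, not part of the package:

```ts
import type { NextFunction, Request, Response } from 'express';
import { utils } from '@arcblock/crawler';

// A request should receive prerendered HTML only if it comes from a known bot,
// is not a static asset, and is not the crawler's own headless visit.
export function shouldPrerender(req: Request): boolean {
  return utils.isSpider(req) && !utils.isStaticFile(req) && !utils.isSelfCrawler(req);
}

// Example middleware wiring (the snapshot lookup is left to the app).
export function prerenderGuard(req: Request, res: Response, next: NextFunction) {
  res.locals.prerender = shouldPrerender(req);
  next();
}
```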
package/{dist → lib/cjs}/utils.js
@@ -12,11 +12,12 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
  return (mod && mod.__esModule) ? mod : { "default": mod };
  };
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.formatUrl = exports.getRelativePath = exports.getFullUrl = exports.getComponentInfo = exports.isBotUserAgent = exports.getSitemapList = exports.isAcceptCrawler = exports.getDefaultSitemapUrl = exports.getDefaultRobotsUrl = exports.isSelfCrawler = exports.CRAWLER_FLAG = exports.sleep = exports.api = void 0;
+ exports.formatUrl = exports.getSitemapList = exports.isAcceptCrawler = exports.isSelfCrawler = exports.sleep = exports.CRAWLER_FLAG = exports.axios = void 0;
+ exports.isStaticFile = isStaticFile;
+ exports.isSpider = isSpider;
  exports.getRobots = getRobots;
  exports.md5 = md5;
  exports.findMaxScrollHeight = findMaxScrollHeight;
- const config_1 = require("@blocklet/sdk/lib/config");
  const axios_1 = __importDefault(require("axios"));
  const flattenDeep_1 = __importDefault(require("lodash/flattenDeep"));
  const uniq_1 = __importDefault(require("lodash/uniq"));
@@ -25,24 +26,14 @@ const robots_parser_1 = __importDefault(require("robots-parser"));
  const sitemap_1 = require("sitemap");
  const stream_1 = require("stream");
  const ufo_1 = require("ufo");
- exports.api = axios_1.default.create({
- timeout: 1000 * 10,
+ const config_1 = require("./config");
+ exports.axios = axios_1.default.create({
+ timeout: 1000 * 30,
  headers: {
  'Content-Type': 'application/json',
  },
  });
- const sleep = (ms) => {
- return new Promise((resolve) => {
- setTimeout(resolve, ms);
- });
- };
- exports.sleep = sleep;
- exports.CRAWLER_FLAG = 'x-crawler';
- const isSelfCrawler = (req) => {
- const ua = req.get('user-agent') || '';
- return req.get(exports.CRAWLER_FLAG) === 'true' || `${ua}`.toLowerCase().indexOf('headless') !== -1;
- };
- exports.isSelfCrawler = isSelfCrawler;
+ exports.CRAWLER_FLAG = 'x-arcblock-crawler';
  /**
  * A default set of user agent patterns for bots/crawlers that do not perform
  * well with pages that require JavaScript.
@@ -98,12 +89,8 @@ const botUserAgents = [
  /AlibabaGroup/i,
  /adaptive-edge-crawler/i,
  ];
- const isSpider = (ua) => botUserAgents.some((spider) => {
- return spider.test(ua);
- });
  /**
- * A default set of file extensions for static assets that do not need to be
- * proxied.
+ * A default set of file extensions for static assets that do not need to be proxied.
  */
  const staticFileExtensions = [
  'ai',
@@ -148,89 +135,91 @@ const staticFileExtensions = [
  'xml',
  'zip',
  ];
- const getDefaultRobotsUrl = (url) => {
- const { origin } = new URL(url);
- return (0, ufo_1.joinURL)(origin, 'robots.txt?nocache=1');
+ const sleep = (ms) => {
+ return new Promise((resolve) => {
+ setTimeout(resolve, ms);
+ });
  };
- exports.getDefaultRobotsUrl = getDefaultRobotsUrl;
+ exports.sleep = sleep;
+ /**
+ * Check if the request is a arcblock crawler
+ */
+ const isSelfCrawler = (req) => {
+ const ua = req.get('user-agent') || '';
+ return req.get(exports.CRAWLER_FLAG) === 'true' || ua.toLowerCase().indexOf('headless') !== -1;
+ };
+ exports.isSelfCrawler = isSelfCrawler;
+ /**
+ * Check if the request is a static file
+ */
+ function isStaticFile(req) {
+ const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
+ return excludeUrlPattern.test(req.path);
+ }
+ /**
+ * Check if the request is a spider
+ */
+ function isSpider(req) {
+ const ua = req.get('user-agent') || '';
+ return botUserAgents.some((spider) => spider.test(ua));
+ }
+ /**
+ * Get and parse the robots.txt by `robots-parser`
+ */
  function getRobots(url) {
  return __awaiter(this, void 0, void 0, function* () {
  const { origin } = new URL(url);
  const robotsUrl = (0, ufo_1.joinURL)(origin, 'robots.txt?nocache=1');
- const { data } = yield exports.api.get(robotsUrl).catch(() => ({
- data: '',
- }));
+ const { data } = yield exports.axios.get(robotsUrl).catch((error) => {
+ config_1.logger.warn(`Failed to fetch robots.txt from ${robotsUrl}:`, { error });
+ return { data: null };
+ });
  return data ? (0, robots_parser_1.default)(robotsUrl, data) : null;
  });
  }
- const getDefaultSitemapUrl = (url) => {
- const { origin } = new URL(url);
- return (0, ufo_1.joinURL)(origin, 'sitemap.xml?nocache=1');
- };
- exports.getDefaultSitemapUrl = getDefaultSitemapUrl;
+ /**
+ * Check if the url is allowed to crawl from robots.txt
+ */
  const isAcceptCrawler = (url) => __awaiter(void 0, void 0, void 0, function* () {
  const robots = yield getRobots(url);
  const isAllowed = robots ? yield robots.isAllowed(url) : true;
  return isAllowed;
  });
  exports.isAcceptCrawler = isAcceptCrawler;
+ /**
+ * Get and parse the sitemap.xml by `sitemap` package
+ */
  const getSitemapList = (url) => __awaiter(void 0, void 0, void 0, function* () {
- let sitemapUrlList = [(0, exports.getDefaultSitemapUrl)(url)];
+ let sitemapUrlList = [];
  const robots = yield getRobots(url);
  if (robots) {
- const robotsTxtSitemapUrlList = (yield robots.getSitemaps()) || [];
- if (robotsTxtSitemapUrlList.length > 0) {
- sitemapUrlList = robotsTxtSitemapUrlList;
- }
+ sitemapUrlList = (yield robots.getSitemaps()) || [];
+ }
+ if (!sitemapUrlList.length) {
+ const { origin } = new URL(url);
+ sitemapUrlList.push((0, ufo_1.joinURL)(origin, 'sitemap.xml?nocache=1'));
  }
  // loop site map url list
  const sitemapList = yield Promise.all(sitemapUrlList.map((sitemapUrl) => __awaiter(void 0, void 0, void 0, function* () {
- const newUrl = new URL(sitemapUrl);
- newUrl.searchParams.set('nocache', '1');
- sitemapUrl = newUrl.toString();
- const { data: sitemapTxt } = yield exports.api.get(sitemapUrl).catch(() => ({
- data: '',
- }));
- if (sitemapTxt) {
- const stream = stream_1.Readable.from([sitemapTxt]);
- const sitemapJson = yield (0, sitemap_1.parseSitemap)(stream);
- return sitemapJson;
+ sitemapUrl = (0, ufo_1.withQuery)(sitemapUrl, { nocache: '1' });
+ try {
+ const { data: sitemapTxt } = yield exports.axios.get(sitemapUrl).catch(() => ({
+ data: '',
+ }));
+ if (sitemapTxt) {
+ const stream = stream_1.Readable.from([sitemapTxt]);
+ const sitemapJson = yield (0, sitemap_1.parseSitemap)(stream);
+ return sitemapJson;
+ }
+ }
+ catch (error) {
+ config_1.logger.error(`Could not get sitemap from ${sitemapUrl}`, { error });
  }
  return [];
  })));
  return (0, uniq_1.default)((0, flattenDeep_1.default)(sitemapList.filter(Boolean)));
  });
  exports.getSitemapList = getSitemapList;
- const isBotUserAgent = (req) => {
- const ua = req.get('user-agent');
- const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
- if (ua === undefined || !isSpider(ua) || excludeUrlPattern.test(req.path)) {
- return false;
- }
- return true;
- };
- exports.isBotUserAgent = isBotUserAgent;
- const getComponentInfo = () => {
- return config_1.components.find((item) => item.did === config_1.env.componentDid) || {};
- };
- exports.getComponentInfo = getComponentInfo;
- const getFullUrl = (req) => {
- const blockletPathname = req.headers['x-path-prefix']
- ? (0, ufo_1.joinURL)(req.headers['x-path-prefix'], req.originalUrl)
- : req.originalUrl;
- return (0, ufo_1.joinURL)(config_1.env.appUrl, blockletPathname);
- };
- exports.getFullUrl = getFullUrl;
- const getRelativePath = (url) => {
- try {
- return new URL(url).pathname;
- }
- catch (error) {
- // ignore error
- }
- return url;
- };
- exports.getRelativePath = getRelativePath;
  const formatUrl = (url) => {
  return url.replace(/\/$/, '').trim();
  };
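The sitemap lookup now prefers sitemaps declared in robots.txt and only falls back to `<origin>/sitemap.xml` when none are listed. A hedged sketch of calling it through the `utils` re-export (the URL is a placeholder):

```ts
import { utils } from '@arcblock/crawler';

async function listSitemapEntries(siteUrl: string) {
  // Returns a de-duplicated, flattened list of SitemapItem entries;
  // sitemaps from robots.txt win, otherwise <origin>/sitemap.xml is tried.
  const items = await utils.getSitemapList(siteUrl);
  console.log(`found ${items.length} sitemap entries for ${siteUrl}`);
  return items;
}
```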
package/lib/esm/config.d.ts
@@ -0,0 +1,22 @@
+ export type Site = {
+ url: string;
+ pathname: string;
+ /** Minimum crawl interval to avoid frequent crawling by scheduled tasks, in milliseconds */
+ interval?: number;
+ };
+ export type Config = {
+ isProd: boolean;
+ dataDir: string;
+ appDir: string;
+ appUrl: string;
+ cacheDir: string;
+ puppeteerPath?: string;
+ siteCron: {
+ sites: Site[];
+ time: string;
+ runOnInit: boolean;
+ concurrency: number;
+ };
+ };
+ export declare const logger: any;
+ export declare const config: Config;
package/{esm → lib/esm}/config.js
@@ -1,11 +1,17 @@
  import createLogger from '@blocklet/logger';
- export const logger = createLogger('crawler', { level: process.env.LOG_LEVEL || 'info' });
+ export const logger = createLogger('@arcblock/crawler', { level: process.env.LOG_LEVEL || 'info' });
  export const config = {
- redisUrl: process.env.REDIS_URL,
+ isProd: process.env.NODE_ENV === 'production',
  dataDir: process.env.BLOCKLET_DATA_DIR,
  appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
  appUrl: process.env.BLOCKLET_APP_URL,
  puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
  cacheDir: process.env.BLOCKLET_CACHE_DIR,
- testOnInitialize: process.env.NODE_ENV === 'production',
+ // cron
+ siteCron: {
+ sites: [],
+ time: '0 0 */12 * * *',
+ runOnInit: false,
+ concurrency: 5,
+ },
  };
package/lib/esm/crawler.d.ts
@@ -0,0 +1,26 @@
+ import { JobState } from './store/job';
+ import { SnapshotModel } from './store/snapshot';
+ export declare function createCrawlQueue(): void;
+ export declare function getDataDir(): Promise<{
+ htmlDir: string;
+ screenshotDir: string;
+ }>;
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
+ url: string;
+ includeScreenshot?: boolean;
+ includeHtml?: boolean;
+ width?: number;
+ height?: number;
+ quality?: number;
+ timeout?: number;
+ fullPage?: boolean;
+ }) => Promise<{
+ html: string;
+ screenshot: Uint8Array<ArrayBufferLike> | null;
+ }>;
+ /**
+ * crawl url and return job id
+ * @param params
+ * @param callback callback when job finished
+ */
+ export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
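A hedged usage sketch of the new `crawlUrl` API, based on the declaration above and the queue implementation further down; the option names beyond `url` mirror the job fields used for deduplication there, and the concrete values are placeholders:

```ts
import { crawlUrl, getPageContent } from '@arcblock/crawler';

async function snapshotHomePage() {
  // Enqueue a crawl and get back the job id.
  const jobId = await crawlUrl(
    { url: 'https://example.com', includeHtml: true, includeScreenshot: false },
    (snapshot) => {
      // Called once the job finishes; null means the crawl failed.
      console.log('crawl finished', snapshot?.status);
    },
  );
  return jobId;
}

// One-off rendering without the queue:
// const { html, screenshot } = await getPageContent({ url: 'https://example.com', fullPage: true });
```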
package/{esm → lib/esm}/crawler.js
@@ -9,16 +9,14 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
  };
  import createQueue from '@abtnode/queue';
  import SequelizeStore from '@abtnode/queue/lib/store/sequelize';
- import sequelize from '@sequelize/core';
  import { randomUUID } from 'crypto';
  import fs from 'fs-extra';
- import pick from 'lodash/pick';
  import path from 'path';
- import { joinURL } from 'ufo';
  import { config, logger } from './config';
- import { Job } from './db/job';
- import { Snapshot } from './db/snapshot';
  import { initPage } from './puppeteer';
+ import { convertJobToSnapshot, formatSnapshot } from './services/snapshot';
+ import { Job } from './store/job';
+ import { Snapshot } from './store/snapshot';
  import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5 } from './utils';
  const { BaseState } = require('@abtnode/models');
  let crawlQueue;
@@ -81,19 +79,9 @@ export function createCrawlQueue() {
  });
  yield Snapshot.upsert(snapshot);
  return snapshot;
- // save to redis
- // if (saveToRedis) {
- // useCache.set(url, {
- // html: result.html || '',
- // lastModified,
- // });
- // logger.info(`success to crawl ${url}`, job);
- // return result;
- // }
  }
  catch (error) {
  logger.error(`Failed to crawl ${job.url}`, { error, job });
- console.error(error.stack);
  const snapshot = convertJobToSnapshot({
  job,
  snapshot: {
@@ -145,7 +133,7 @@ function formatHtml(htmlString) {
  }
  return htmlString;
  }
- export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, formatPageContent, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
+ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
  logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
  const page = yield initPage();
  if (width && height) {
@@ -187,15 +175,24 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
  }
  catch (err) {
  logger.error('Failed to get screenshot:', err);
+ throw err;
  }
  }
  // get html
  if (includeHtml) {
- if (formatPageContent) {
- html = yield formatPageContent({ page, url });
+ try {
+ html = yield page.evaluate(() => {
+ // add meta tag to record crawler
+ const meta = document.createElement('meta');
+ meta.name = 'arcblock-crawler';
+ meta.content = 'true';
+ document.head.appendChild(meta);
+ return document.documentElement.outerHTML;
+ });
  }
- else {
- html = yield page.content();
+ catch (err) {
+ logger.error('Failed to get html:', err);
+ throw err;
  }
  }
  }
@@ -212,11 +209,16 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
  screenshot,
  };
  });
- export function createCrawlJob(params, callback) {
+ /**
+ * crawl url and return job id
+ * @param params
+ * @param callback callback when job finished
+ */
+ export function crawlUrl(params, callback) {
  return __awaiter(this, void 0, void 0, function* () {
- params = Object.assign(Object.assign({}, params), { id: randomUUID(), url: formatUrl(params.url) });
+ params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
  // skip duplicate job
- const existsJob = yield getJob({
+ const { job: duplicateJob } = (yield Job.findJob({
  url: params.url,
  includeScreenshot: params.includeScreenshot,
  includeHtml: params.includeHtml,
@@ -224,89 +226,22 @@ export function createCrawlJob(params, callback) {
  width: params.width,
  height: params.height,
  fullPage: params.fullPage,
- });
- if (existsJob) {
+ })) || {};
+ if (duplicateJob) {
  logger.warn(`Crawl job already exists for ${params.url}, skip`);
- return existsJob.id;
+ return duplicateJob.id;
  }
  logger.info('create crawl job', params);
- const job = crawlQueue.push(params);
- job.on('finished', ({ result }) => {
- logger.info(`Crawl completed ${params.url}`, { job: params, result });
- callback === null || callback === void 0 ? void 0 : callback(result);
- });
+ const jobId = randomUUID();
+ const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
+ job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
+ logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
+ callback === null || callback === void 0 ? void 0 : callback(result ? yield formatSnapshot(result) : null);
+ }));
  job.on('failed', ({ error }) => {
  logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
  callback === null || callback === void 0 ? void 0 : callback(null);
  });
- return params.id;
- });
- }
- // @ts-ignore
- export function getJob(condition) {
- return __awaiter(this, void 0, void 0, function* () {
- const where = Object.keys(condition)
- .filter((key) => condition[key] !== undefined)
- .map((key) => {
- return sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), `$.${key}`), condition[key]);
- });
- const job = yield crawlQueue.store.db.findOne({
- where: {
- [sequelize.Op.and]: where,
- },
- });
- if (job) {
- return job.job;
- }
- return null;
- });
- }
- function convertJobToSnapshot({ job, snapshot }) {
- return Object.assign({
- // @ts-ignore
- jobId: job.jobId || job.id, url: job.url, options: {
- width: job.width,
- height: job.height,
- includeScreenshot: job.includeScreenshot,
- includeHtml: job.includeHtml,
- quality: job.quality,
- fullPage: job.fullPage,
- } }, snapshot);
- }
- export function formatSnapshot(snapshot, columns) {
- return __awaiter(this, void 0, void 0, function* () {
- let data = Object.assign({}, snapshot);
- // format screenshot path to full url
- if (data.screenshot) {
- data.screenshot = joinURL(config.appUrl, data.screenshot);
- }
- // format html path to string
- if (data.html) {
- const html = yield fs.readFile(path.join(config.dataDir, data.html));
- data.html = html.toString();
- }
- if (columns === null || columns === void 0 ? void 0 : columns.length) {
- data = pick(data, columns);
- }
- return data;
- });
- }
- /**
- * get snapshot from db or crawl queue
- */
- export function getSnapshot(jobId) {
- return __awaiter(this, void 0, void 0, function* () {
- const snapshotModel = yield Snapshot.findByPk(jobId);
- if (snapshotModel) {
- return snapshotModel.toJSON();
- }
- const job = yield getJob({ id: jobId });
- if (job) {
- return {
- jobId,
- status: 'pending',
- };
- }
- return null;
+ return jobId;
  });
  }
package/lib/esm/cron.d.ts
@@ -0,0 +1 @@
+ export declare function initCron(): any;
package/lib/esm/cron.js
@@ -0,0 +1,43 @@
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ import Cron from '@abtnode/cron';
+ import { config, logger } from './config';
+ import { crawlSite } from './site';
+ let cron = null;
+ export function initCron() {
+ if (cron)
+ return;
+ logger.info('Init cron', { config: config.siteCron });
+ cron = Cron.init({
+ context: {},
+ jobs: [
+ {
+ name: 'crawl-site',
+ time: config.siteCron.time,
+ options: { runOnInit: config.siteCron.runOnInit },
+ fn: () => __awaiter(this, void 0, void 0, function* () {
+ logger.info('Start cron to crawl site', { sites: config.siteCron.sites });
+ for (const site of config.siteCron.sites) {
+ try {
+ yield crawlSite(site);
+ }
+ catch (err) {
+ logger.error('Cron task error', { err, site });
+ }
+ }
+ }),
+ },
+ ],
+ onError: (err) => {
+ logger.error('Cron error', err);
+ },
+ });
+ return cron;
+ }
package/lib/esm/index.d.ts
@@ -0,0 +1,9 @@
+ import { Config } from './config';
+ export * from './crawler';
+ export * from './site';
+ export * from './services/snapshot';
+ export * as utils from './utils';
+ type DeepPartial<T> = T extends object ? {
+ [P in keyof T]?: DeepPartial<T[P]>;
+ } : T;
+ export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
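To close, a hedged initialization sketch based on the `initCrawler` signature and the `Config`/`Site` types above; the URL, schedule, and option values are placeholders, not documented defaults:

```ts
import { initCrawler } from '@arcblock/crawler';

async function main() {
  // Only puppeteerPath and siteCron are configurable here; the remaining Config
  // fields come from the BLOCKLET_* environment variables read in config.js.
  await initCrawler({
    puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
    siteCron: {
      sites: [{ url: 'https://example.com', pathname: '/', interval: 60 * 60 * 1000 }],
      time: '0 0 */12 * * *', // six-field cron expression: every 12 hours
      runOnInit: true,
      concurrency: 2,
    },
  });
}

main().catch((err) => {
  console.error('failed to init crawler', err);
  process.exit(1);
});
```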