@arcblock/crawler 1.0.5 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/README.md +1 -0
  2. package/lib/cjs/config.d.ts +22 -0
  3. package/{dist → lib/cjs}/config.js +9 -3
  4. package/lib/cjs/crawler.d.ts +26 -0
  5. package/{dist → lib/cjs}/crawler.js +56 -113
  6. package/lib/cjs/cron.d.ts +1 -0
  7. package/lib/cjs/cron.js +49 -0
  8. package/lib/cjs/index.d.ts +9 -0
  9. package/lib/cjs/index.js +78 -0
  10. package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
  11. package/{dist → lib/cjs}/puppeteer.js +43 -54
  12. package/lib/cjs/services/snapshot.d.ts +12 -0
  13. package/lib/cjs/services/snapshot.js +84 -0
  14. package/lib/cjs/site.d.ts +2 -0
  15. package/lib/cjs/site.js +76 -0
  16. package/lib/cjs/store/index.d.ts +3 -0
  17. package/{dist/db → lib/cjs/store}/index.js +21 -5
  18. package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
  19. package/lib/cjs/store/job.js +110 -0
  20. package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
  21. package/lib/cjs/store/snapshot.js +68 -0
  22. package/lib/cjs/utils.d.ts +32 -0
  23. package/{dist → lib/cjs}/utils.js +88 -78
  24. package/lib/esm/config.d.ts +22 -0
  25. package/{esm → lib/esm}/config.js +9 -3
  26. package/lib/esm/crawler.d.ts +26 -0
  27. package/{esm → lib/esm}/crawler.js +48 -102
  28. package/lib/esm/cron.d.ts +1 -0
  29. package/lib/esm/cron.js +43 -0
  30. package/lib/esm/index.d.ts +9 -0
  31. package/{esm → lib/esm}/index.js +19 -10
  32. package/{dist → lib/esm}/puppeteer.d.ts +2 -2
  33. package/{esm → lib/esm}/puppeteer.js +26 -37
  34. package/lib/esm/services/snapshot.d.ts +12 -0
  35. package/lib/esm/services/snapshot.js +75 -0
  36. package/lib/esm/site.d.ts +2 -0
  37. package/lib/esm/site.js +69 -0
  38. package/lib/esm/store/index.d.ts +3 -0
  39. package/{esm/db → lib/esm/store}/index.js +22 -6
  40. package/{esm/db → lib/esm/store}/job.d.ts +4 -3
  41. package/lib/esm/store/job.js +73 -0
  42. package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
  43. package/lib/esm/store/snapshot.js +64 -0
  44. package/lib/esm/utils.d.ts +32 -0
  45. package/{esm → lib/esm}/utils.js +84 -71
  46. package/package.json +22 -33
  47. package/third.d.ts +0 -0
  48. package/dist/blocklet.d.ts +0 -6
  49. package/dist/blocklet.js +0 -199
  50. package/dist/cache.d.ts +0 -10
  51. package/dist/cache.js +0 -119
  52. package/dist/config.d.ts +0 -10
  53. package/dist/crawler.d.ts +0 -28
  54. package/dist/db/index.d.ts +0 -1
  55. package/dist/db/job.js +0 -54
  56. package/dist/db/snapshot.js +0 -52
  57. package/dist/index.d.ts +0 -6
  58. package/dist/index.js +0 -45
  59. package/dist/middleware.d.ts +0 -4
  60. package/dist/middleware.js +0 -44
  61. package/dist/utils.d.ts +0 -15
  62. package/esm/blocklet.d.ts +0 -6
  63. package/esm/blocklet.js +0 -190
  64. package/esm/cache.d.ts +0 -10
  65. package/esm/cache.js +0 -114
  66. package/esm/config.d.ts +0 -10
  67. package/esm/crawler.d.ts +0 -28
  68. package/esm/db/index.d.ts +0 -1
  69. package/esm/db/job.js +0 -50
  70. package/esm/db/snapshot.js +0 -48
  71. package/esm/index.d.ts +0 -6
  72. package/esm/middleware.d.ts +0 -4
  73. package/esm/middleware.js +0 -41
  74. package/esm/utils.d.ts +0 -15
@@ -0,0 +1,68 @@
+ "use strict";
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+ return new (P || (P = Promise))(function (resolve, reject) {
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
+ });
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.Snapshot = void 0;
+ const core_1 = require("@sequelize/core");
+ class Snapshot extends core_1.Model {
+ static initModel(sequelize) {
+ return Snapshot.init({
+ jobId: {
+ type: core_1.DataTypes.STRING,
+ primaryKey: true,
+ allowNull: false,
+ },
+ url: {
+ type: core_1.DataTypes.STRING,
+ allowNull: false,
+ index: true,
+ },
+ status: {
+ type: core_1.DataTypes.ENUM('success', 'failed', 'pending'),
+ allowNull: false,
+ },
+ html: {
+ type: core_1.DataTypes.TEXT,
+ allowNull: true,
+ },
+ screenshot: {
+ type: core_1.DataTypes.STRING,
+ allowNull: true,
+ },
+ error: {
+ type: core_1.DataTypes.STRING,
+ allowNull: true,
+ },
+ lastModified: {
+ type: core_1.DataTypes.STRING,
+ allowNull: true,
+ },
+ options: {
+ type: core_1.DataTypes.JSON,
+ allowNull: true,
+ },
+ }, {
+ sequelize,
+ modelName: 'snapshot',
+ tableName: 'snap',
+ timestamps: true,
+ });
+ }
+ static findSnapshot(condition) {
+ return __awaiter(this, void 0, void 0, function* () {
+ const snapshot = yield Snapshot.findOne(Object.assign({ order: [
+ ['lastModified', 'DESC'],
+ ['updatedAt', 'DESC'],
+ ] }, condition));
+ return (snapshot === null || snapshot === void 0 ? void 0 : snapshot.toJSON()) || null;
+ });
+ }
+ }
+ exports.Snapshot = Snapshot;
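The new snapshot store defines the Sequelize model and a findSnapshot helper that merges a caller-supplied query with its own ordering (lastModified DESC, then updatedAt DESC) and returns a plain object or null. A minimal usage sketch; the import path is illustrative and depends on the package's "exports" map:

// Hypothetical consumer code: look up the latest successful snapshot for a URL.
// Assumes Snapshot.initModel(sequelize) has already been called by the store setup.
import { Snapshot } from '@arcblock/crawler'; // illustrative import path

async function latestSuccessfulSnapshot(url: string) {
  // The condition object is passed straight into Sequelize's findOne options.
  return Snapshot.findSnapshot({ where: { url, status: 'success' } });
}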
@@ -0,0 +1,32 @@
1
+ import { Page } from '@blocklet/puppeteer';
2
+ import { Request } from 'express';
3
+ export declare const axios: import("axios").AxiosInstance;
4
+ export declare const CRAWLER_FLAG = "x-arcblock-crawler";
5
+ export declare const sleep: (ms: number) => Promise<unknown>;
6
+ /**
7
+ * Check if the request is a arcblock crawler
8
+ */
9
+ export declare const isSelfCrawler: (req: Request) => boolean;
10
+ /**
11
+ * Check if the request is a static file
12
+ */
13
+ export declare function isStaticFile(req: Request): boolean;
14
+ /**
15
+ * Check if the request is a spider
16
+ */
17
+ export declare function isSpider(req: Request): boolean;
18
+ /**
19
+ * Get and parse the robots.txt by `robots-parser`
20
+ */
21
+ export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
22
+ /**
23
+ * Check if the url is allowed to crawl from robots.txt
24
+ */
25
+ export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
26
+ /**
27
+ * Get and parse the sitemap.xml by `sitemap` package
28
+ */
29
+ export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
30
+ export declare const formatUrl: (url: string) => string;
31
+ export declare function md5(content: string | Uint8Array): string;
32
+ export declare function findMaxScrollHeight(page: Page): Promise<number>;
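These declarations now take an Express Request directly, which suggests the intended pre-render flow: render only for known bot user agents, skip static assets, and never respond to the crawler's own requests. A rough middleware sketch under those assumptions; the import path and wiring are illustrative, not part of the package:

import type { NextFunction, Request, Response } from 'express';
// Illustrative import path; the real export location depends on package.json "exports".
import { isSelfCrawler, isSpider, isStaticFile } from '@arcblock/crawler';

// Hand a request to the crawler only when it comes from a bot and targets a page,
// and never when it was issued by the crawler itself (CRAWLER_FLAG header or a
// headless user agent), which would otherwise cause recursive crawling.
function shouldPrerender(req: Request): boolean {
  return isSpider(req) && !isStaticFile(req) && !isSelfCrawler(req);
}

export function prerenderGate(req: Request, res: Response, next: NextFunction) {
  if (!shouldPrerender(req)) return next();
  // ...serve a cached snapshot or enqueue a crawl job here (application-specific)...
  return next();
}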
@@ -12,10 +12,12 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
  return (mod && mod.__esModule) ? mod : { "default": mod };
  };
  Object.defineProperty(exports, "__esModule", { value: true });
- exports.formatUrl = exports.getRelativePath = exports.getFullUrl = exports.getComponentInfo = exports.isBotUserAgent = exports.getSitemapList = exports.isAcceptCrawler = exports.getDefaultSitemapUrl = exports.getDefaultRobotsUrl = exports.isSelfCrawler = exports.CRAWLER_FLAG = exports.sleep = exports.api = void 0;
+ exports.formatUrl = exports.getSitemapList = exports.isAcceptCrawler = exports.isSelfCrawler = exports.sleep = exports.CRAWLER_FLAG = exports.axios = void 0;
+ exports.isStaticFile = isStaticFile;
+ exports.isSpider = isSpider;
  exports.getRobots = getRobots;
  exports.md5 = md5;
- const config_1 = require("@blocklet/sdk/lib/config");
+ exports.findMaxScrollHeight = findMaxScrollHeight;
  const axios_1 = __importDefault(require("axios"));
  const flattenDeep_1 = __importDefault(require("lodash/flattenDeep"));
  const uniq_1 = __importDefault(require("lodash/uniq"));
@@ -24,24 +26,14 @@ const robots_parser_1 = __importDefault(require("robots-parser"));
  const sitemap_1 = require("sitemap");
  const stream_1 = require("stream");
  const ufo_1 = require("ufo");
- exports.api = axios_1.default.create({
- timeout: 1000 * 10,
+ const config_1 = require("./config");
+ exports.axios = axios_1.default.create({
+ timeout: 1000 * 30,
  headers: {
  'Content-Type': 'application/json',
  },
  });
- const sleep = (ms) => {
- return new Promise((resolve) => {
- setTimeout(resolve, ms);
- });
- };
- exports.sleep = sleep;
- exports.CRAWLER_FLAG = 'x-crawler';
- const isSelfCrawler = (req) => {
- const ua = req.get('user-agent') || '';
- return req.get(exports.CRAWLER_FLAG) === 'true' || `${ua}`.toLowerCase().indexOf('headless') !== -1;
- };
- exports.isSelfCrawler = isSelfCrawler;
+ exports.CRAWLER_FLAG = 'x-arcblock-crawler';
  /**
  * A default set of user agent patterns for bots/crawlers that do not perform
  * well with pages that require JavaScript.
@@ -97,12 +89,8 @@ const botUserAgents = [
  /AlibabaGroup/i,
  /adaptive-edge-crawler/i,
  ];
- const isSpider = (ua) => botUserAgents.some((spider) => {
- return spider.test(ua);
- });
  /**
- * A default set of file extensions for static assets that do not need to be
- * proxied.
+ * A default set of file extensions for static assets that do not need to be proxied.
  */
  const staticFileExtensions = [
  'ai',
@@ -147,89 +135,91 @@ const staticFileExtensions = [
  'xml',
  'zip',
  ];
- const getDefaultRobotsUrl = (url) => {
- const { origin } = new URL(url);
- return (0, ufo_1.joinURL)(origin, 'robots.txt?nocache=1');
+ const sleep = (ms) => {
+ return new Promise((resolve) => {
+ setTimeout(resolve, ms);
+ });
  };
- exports.getDefaultRobotsUrl = getDefaultRobotsUrl;
+ exports.sleep = sleep;
+ /**
+ * Check if the request is a arcblock crawler
+ */
+ const isSelfCrawler = (req) => {
+ const ua = req.get('user-agent') || '';
+ return req.get(exports.CRAWLER_FLAG) === 'true' || ua.toLowerCase().indexOf('headless') !== -1;
+ };
+ exports.isSelfCrawler = isSelfCrawler;
+ /**
+ * Check if the request is a static file
+ */
+ function isStaticFile(req) {
+ const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
+ return excludeUrlPattern.test(req.path);
+ }
+ /**
+ * Check if the request is a spider
+ */
+ function isSpider(req) {
+ const ua = req.get('user-agent') || '';
+ return botUserAgents.some((spider) => spider.test(ua));
+ }
+ /**
+ * Get and parse the robots.txt by `robots-parser`
+ */
  function getRobots(url) {
  return __awaiter(this, void 0, void 0, function* () {
  const { origin } = new URL(url);
  const robotsUrl = (0, ufo_1.joinURL)(origin, 'robots.txt?nocache=1');
- const { data } = yield exports.api.get(robotsUrl).catch(() => ({
- data: '',
- }));
+ const { data } = yield exports.axios.get(robotsUrl).catch((error) => {
+ config_1.logger.warn(`Failed to fetch robots.txt from ${robotsUrl}:`, { error });
+ return { data: null };
+ });
  return data ? (0, robots_parser_1.default)(robotsUrl, data) : null;
  });
  }
- const getDefaultSitemapUrl = (url) => {
- const { origin } = new URL(url);
- return (0, ufo_1.joinURL)(origin, 'sitemap.xml?nocache=1');
- };
- exports.getDefaultSitemapUrl = getDefaultSitemapUrl;
+ /**
+ * Check if the url is allowed to crawl from robots.txt
+ */
  const isAcceptCrawler = (url) => __awaiter(void 0, void 0, void 0, function* () {
  const robots = yield getRobots(url);
  const isAllowed = robots ? yield robots.isAllowed(url) : true;
  return isAllowed;
  });
  exports.isAcceptCrawler = isAcceptCrawler;
+ /**
+ * Get and parse the sitemap.xml by `sitemap` package
+ */
  const getSitemapList = (url) => __awaiter(void 0, void 0, void 0, function* () {
- let sitemapUrlList = [(0, exports.getDefaultSitemapUrl)(url)];
+ let sitemapUrlList = [];
  const robots = yield getRobots(url);
  if (robots) {
- const robotsTxtSitemapUrlList = (yield robots.getSitemaps()) || [];
- if (robotsTxtSitemapUrlList.length > 0) {
- sitemapUrlList = robotsTxtSitemapUrlList;
- }
+ sitemapUrlList = (yield robots.getSitemaps()) || [];
+ }
+ if (!sitemapUrlList.length) {
+ const { origin } = new URL(url);
+ sitemapUrlList.push((0, ufo_1.joinURL)(origin, 'sitemap.xml?nocache=1'));
  }
  // loop site map url list
  const sitemapList = yield Promise.all(sitemapUrlList.map((sitemapUrl) => __awaiter(void 0, void 0, void 0, function* () {
- const newUrl = new URL(sitemapUrl);
- newUrl.searchParams.set('nocache', '1');
- sitemapUrl = newUrl.toString();
- const { data: sitemapTxt } = yield exports.api.get(sitemapUrl).catch(() => ({
- data: '',
- }));
- if (sitemapTxt) {
- const stream = stream_1.Readable.from([sitemapTxt]);
- const sitemapJson = yield (0, sitemap_1.parseSitemap)(stream);
- return sitemapJson;
+ sitemapUrl = (0, ufo_1.withQuery)(sitemapUrl, { nocache: '1' });
+ try {
+ const { data: sitemapTxt } = yield exports.axios.get(sitemapUrl).catch(() => ({
+ data: '',
+ }));
+ if (sitemapTxt) {
+ const stream = stream_1.Readable.from([sitemapTxt]);
+ const sitemapJson = yield (0, sitemap_1.parseSitemap)(stream);
+ return sitemapJson;
+ }
+ }
+ catch (error) {
+ config_1.logger.error(`Could not get sitemap from ${sitemapUrl}`, { error });
  }
  return [];
  })));
  return (0, uniq_1.default)((0, flattenDeep_1.default)(sitemapList.filter(Boolean)));
  });
  exports.getSitemapList = getSitemapList;
- const isBotUserAgent = (req) => {
- const ua = req.get('user-agent');
- const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
- if (ua === undefined || !isSpider(ua) || excludeUrlPattern.test(req.path)) {
- return false;
- }
- return true;
- };
- exports.isBotUserAgent = isBotUserAgent;
- const getComponentInfo = () => {
- return config_1.components.find((item) => item.did === config_1.env.componentDid) || {};
- };
- exports.getComponentInfo = getComponentInfo;
- const getFullUrl = (req) => {
- const blockletPathname = req.headers['x-path-prefix']
- ? (0, ufo_1.joinURL)(req.headers['x-path-prefix'], req.originalUrl)
- : req.originalUrl;
- return (0, ufo_1.joinURL)(config_1.env.appUrl, blockletPathname);
- };
- exports.getFullUrl = getFullUrl;
- const getRelativePath = (url) => {
- try {
- return new URL(url).pathname;
- }
- catch (error) {
- // ignore error
- }
- return url;
- };
- exports.getRelativePath = getRelativePath;
  const formatUrl = (url) => {
  return url.replace(/\/$/, '').trim();
  };
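The rewritten getSitemapList now prefers the sitemap URLs advertised in robots.txt and only falls back to /sitemap.xml when none are listed, and per-sitemap failures are logged instead of silently swallowed. A hedged usage sketch, assuming the helpers are re-exported from the package root:

// Illustrative: enumerate crawlable URLs for a site, respecting robots.txt.
import { getSitemapList, isAcceptCrawler } from '@arcblock/crawler'; // assumed import path

async function listCrawlableUrls(site: string): Promise<string[]> {
  const items = await getSitemapList(site); // SitemapItem[] parsed via the `sitemap` package
  const urls = items.map((item) => item.url);
  // isAcceptCrawler defaults to "allowed" when robots.txt is missing or unreadable.
  const allowed = await Promise.all(urls.map((u) => isAcceptCrawler(u)));
  return urls.filter((_, i) => allowed[i] !== false);
}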
@@ -237,3 +227,23 @@ exports.formatUrl = formatUrl;
  function md5(content) {
  return (0, node_crypto_1.createHash)('md5').update(content).digest('hex');
  }
+ function findMaxScrollHeight(page) {
+ return __awaiter(this, void 0, void 0, function* () {
+ const maxHeightHandler = yield page.evaluateHandle(() => {
+ const elements = Array.from(document.querySelectorAll('*'));
+ let maxHeight = document.body.scrollHeight;
+ for (const el of elements) {
+ const style = window.getComputedStyle(el);
+ if (style.overflowY === 'auto' || style.overflowY === 'scroll') {
+ if (el.scrollHeight > el.clientHeight && el.scrollHeight > maxHeight) {
+ maxHeight = el.scrollHeight;
+ }
+ }
+ }
+ return maxHeight;
+ });
+ const maxHeight = yield maxHeightHandler.jsonValue();
+ maxHeightHandler.dispose();
+ return maxHeight;
+ });
+ }
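findMaxScrollHeight scans every element's computed overflow-y to find the tallest scrollable container, which getPageContent later uses to size the viewport before a full-page screenshot. A standalone sketch of calling it against a Puppeteer page; the browser setup and import paths are illustrative (the package itself uses @blocklet/puppeteer):

import puppeteer from 'puppeteer'; // illustrative; substitute the page source you already have
import { findMaxScrollHeight } from '@arcblock/crawler'; // assumed re-export

async function measure(url: string) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(url, { waitUntil: 'networkidle0' });
  // Returns the larger of document.body.scrollHeight and the tallest
  // overflow-y: auto/scroll element's scrollHeight.
  const height = await findMaxScrollHeight(page);
  await browser.close();
  return height;
}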
@@ -0,0 +1,22 @@
+ export type Site = {
+ url: string;
+ pathname: string;
+ /** Minimum crawl interval to avoid frequent crawling by scheduled tasks, in milliseconds */
+ interval?: number;
+ };
+ export type Config = {
+ isProd: boolean;
+ dataDir: string;
+ appDir: string;
+ appUrl: string;
+ cacheDir: string;
+ puppeteerPath?: string;
+ siteCron: {
+ sites: Site[];
+ time: string;
+ runOnInit: boolean;
+ concurrency: number;
+ };
+ };
+ export declare const logger: any;
+ export declare const config: Config;
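The new Config type centralizes the cron-driven site crawling settings. A hypothetical literal matching the declared shape; the values and the import path are examples only, not defaults taken from the package:

import type { Config } from '@arcblock/crawler'; // illustrative import path

const exampleConfig: Config = {
  isProd: process.env.NODE_ENV === 'production',
  dataDir: '/data',
  appDir: process.cwd(),
  appUrl: 'https://example.com',
  cacheDir: '/data/cache',
  siteCron: {
    // Each site is re-crawled by the scheduled task, at most once per `interval` ms.
    sites: [{ url: 'https://example.com', pathname: '/', interval: 12 * 60 * 60 * 1000 }],
    time: '0 0 */12 * * *', // 6-field cron expression: every 12 hours
    runOnInit: false,
    concurrency: 5,
  },
};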
@@ -1,11 +1,17 @@
  import createLogger from '@blocklet/logger';
- export const logger = createLogger('crawler', { level: process.env.LOG_LEVEL || 'info' });
+ export const logger = createLogger('@arcblock/crawler', { level: process.env.LOG_LEVEL || 'info' });
  export const config = {
- redisUrl: process.env.REDIS_URL,
+ isProd: process.env.NODE_ENV === 'production',
  dataDir: process.env.BLOCKLET_DATA_DIR,
  appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
  appUrl: process.env.BLOCKLET_APP_URL,
  puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
  cacheDir: process.env.BLOCKLET_CACHE_DIR,
- testOnInitialize: process.env.NODE_ENV === 'production',
+ // cron
+ siteCron: {
+ sites: [],
+ time: '0 0 */12 * * *',
+ runOnInit: false,
+ concurrency: 5,
+ },
  };
@@ -0,0 +1,26 @@
+ import { JobState } from './store/job';
+ import { SnapshotModel } from './store/snapshot';
+ export declare function createCrawlQueue(): void;
+ export declare function getDataDir(): Promise<{
+ htmlDir: string;
+ screenshotDir: string;
+ }>;
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
+ url: string;
+ includeScreenshot?: boolean;
+ includeHtml?: boolean;
+ width?: number;
+ height?: number;
+ quality?: number;
+ timeout?: number;
+ fullPage?: boolean;
+ }) => Promise<{
+ html: string;
+ screenshot: Uint8Array<ArrayBufferLike> | null;
+ }>;
+ /**
+ * crawl url and return job id
+ * @param params
+ * @param callback callback when job finished
+ */
+ export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
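crawlUrl replaces the old createCrawlJob: it enqueues a crawl, returns a job id, and delivers the finished snapshot through an optional callback. A hedged usage sketch; the import path is illustrative, and any JobState fields beyond those visible in this diff are assumptions:

import { crawlUrl } from '@arcblock/crawler'; // illustrative import path

async function requestSnapshot(url: string) {
  const jobId = await crawlUrl(
    { url, includeHtml: true, includeScreenshot: true, fullPage: true },
    (snapshot) => {
      if (!snapshot) {
        console.error(`crawl failed for ${url}`);
        return;
      }
      // snapshot.html holds the rendered markup; snapshot.screenshot points at the stored image.
      console.log(`crawl finished for ${url}, job ${snapshot.jobId}`);
    },
  );
  return jobId; // may be undefined if no job was created
}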
@@ -9,17 +9,15 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
  };
  import createQueue from '@abtnode/queue';
  import SequelizeStore from '@abtnode/queue/lib/store/sequelize';
- import sequelize from '@sequelize/core';
  import { randomUUID } from 'crypto';
  import fs from 'fs-extra';
- import pick from 'lodash/pick';
  import path from 'path';
- import { joinURL } from 'ufo';
  import { config, logger } from './config';
- import { Job } from './db/job';
- import { Snapshot } from './db/snapshot';
  import { initPage } from './puppeteer';
- import { formatUrl, isAcceptCrawler, md5 } from './utils';
+ import { convertJobToSnapshot, formatSnapshot } from './services/snapshot';
+ import { Job } from './store/job';
+ import { Snapshot } from './store/snapshot';
+ import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5 } from './utils';
  const { BaseState } = require('@abtnode/models');
  let crawlQueue;
  export function createCrawlQueue() {
@@ -28,7 +26,7 @@ export function createCrawlQueue() {
  store: new SequelizeStore(db, 'crawler'),
  concurrency: 1,
  onJob: (job) => __awaiter(this, void 0, void 0, function* () {
- logger.debug('job start:', job);
+ logger.info('Starting to execute crawl job', job);
  const canCrawl = yield isAcceptCrawler(job.url);
  if (!canCrawl) {
  logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
@@ -81,19 +79,9 @@ export function createCrawlQueue() {
  });
  yield Snapshot.upsert(snapshot);
  return snapshot;
- // save to redis
- // if (saveToRedis) {
- // useCache.set(url, {
- // html: result.html || '',
- // lastModified,
- // });
- // logger.info(`success to crawl ${url}`, job);
- // return result;
- // }
  }
  catch (error) {
  logger.error(`Failed to crawl ${job.url}`, { error, job });
- console.error(error.stack);
  const snapshot = convertJobToSnapshot({
  job,
  snapshot: {
@@ -145,11 +133,11 @@ function formatHtml(htmlString) {
  }
  return htmlString;
  }
- export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, formatPageContent, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 60 * 1000, fullPage = false, }) {
+ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
  logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
  const page = yield initPage();
  if (width && height) {
- yield page.setViewport({ width, height });
+ yield page.setViewport({ width, height, deviceScaleFactor: 2 });
  }
  let html = null;
  let screenshot = null;
@@ -164,26 +152,47 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
  throw new Error(`Request failed with status ${statusCode}, in ${url}`);
  }
  // await for networkidle0
- // https://pptr.dev/api/puppeteer.page.goforward/#remarks
+ // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
  yield page.waitForNetworkIdle({
- idleTime: 2 * 1000,
+ idleTime: 1.5 * 1000,
  });
  // get screenshot
  if (includeScreenshot) {
+ // Try to find the tallest element and set the browser to the same height
+ if (fullPage) {
+ const maxScrollHeight = yield findMaxScrollHeight(page);
+ logger.info('findMaxScrollHeight', { maxScrollHeight });
+ if (maxScrollHeight) {
+ yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
+ yield page.evaluate((scrollHeight) => {
+ window.scrollTo(0, scrollHeight || 0);
+ document.documentElement.scrollTo(0, scrollHeight || 0);
+ }, maxScrollHeight);
+ }
+ }
  try {
  screenshot = yield page.screenshot({ fullPage, quality, type: 'webp' });
  }
  catch (err) {
  logger.error('Failed to get screenshot:', err);
+ throw err;
  }
  }
  // get html
  if (includeHtml) {
- if (formatPageContent) {
- html = yield formatPageContent({ page, url });
+ try {
+ html = yield page.evaluate(() => {
+ // add meta tag to record crawler
+ const meta = document.createElement('meta');
+ meta.name = 'arcblock-crawler';
+ meta.content = 'true';
+ document.head.appendChild(meta);
+ return document.documentElement.outerHTML;
+ });
  }
- else {
- html = yield page.content();
+ catch (err) {
+ logger.error('Failed to get html:', err);
+ throw err;
  }
  }
  }
@@ -200,11 +209,16 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
  screenshot,
  };
  });
- export function createCrawlJob(params, callback) {
+ /**
+ * crawl url and return job id
+ * @param params
+ * @param callback callback when job finished
+ */
+ export function crawlUrl(params, callback) {
  return __awaiter(this, void 0, void 0, function* () {
  params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
  // skip duplicate job
- const existsJob = yield getJob({
+ const { job: duplicateJob } = (yield Job.findJob({
  url: params.url,
  includeScreenshot: params.includeScreenshot,
  includeHtml: params.includeHtml,
@@ -212,18 +226,18 @@ export function createCrawlJob(params, callback) {
  width: params.width,
  height: params.height,
  fullPage: params.fullPage,
- });
- logger.info('create crawl job', params);
- if (existsJob) {
+ })) || {};
+ if (duplicateJob) {
  logger.warn(`Crawl job already exists for ${params.url}, skip`);
- return existsJob.id;
+ return duplicateJob.id;
  }
+ logger.info('create crawl job', params);
  const jobId = randomUUID();
  const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
- job.on('finished', ({ result }) => {
+ job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
  logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
- callback === null || callback === void 0 ? void 0 : callback(result);
- });
+ callback === null || callback === void 0 ? void 0 : callback(result ? yield formatSnapshot(result) : null);
+ }));
  job.on('failed', ({ error }) => {
  logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
  callback === null || callback === void 0 ? void 0 : callback(null);
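Because the result is delivered through the queue's finished/failed listeners, callers that want a promise can wrap crawlUrl themselves. A small sketch of that pattern; names and the import path are illustrative:

import { crawlUrl } from '@arcblock/crawler'; // illustrative import path

// The callback's snapshot parameter type, extracted so this sketch does not
// depend on knowing the package's exact export path for SnapshotModel.
type CrawlResult = Parameters<NonNullable<Parameters<typeof crawlUrl>[1]>>[0];

// Resolve with the finished snapshot (or null on failure) instead of using the callback API.
// Caveat grounded in the diff above: when an identical job already exists, crawlUrl returns
// that job's id without registering the callback, so pair this wrapper with a timeout or a
// direct snapshot lookup by the returned job id.
function crawlUrlAsync(params: Parameters<typeof crawlUrl>[0]): Promise<CrawlResult> {
  return new Promise((resolve, reject) => {
    crawlUrl(params, resolve).catch(reject);
  });
}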
@@ -231,71 +245,3 @@ export function createCrawlJob(params, callback) {
  return jobId;
  });
  }
- // @ts-ignore
- export function getJob(condition) {
- return __awaiter(this, void 0, void 0, function* () {
- const where = Object.keys(condition)
- .filter((key) => condition[key] !== undefined)
- .map((key) => {
- return sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), `$.${key}`), condition[key]);
- });
- const job = yield crawlQueue.store.db.findOne({
- where: {
- [sequelize.Op.and]: where,
- },
- });
- if (job) {
- return job.job;
- }
- return null;
- });
- }
- function convertJobToSnapshot({ job, snapshot }) {
- return Object.assign({
- // @ts-ignore
- jobId: job.jobId || job.id, url: job.url, options: {
- width: job.width,
- height: job.height,
- includeScreenshot: job.includeScreenshot,
- includeHtml: job.includeHtml,
- quality: job.quality,
- fullPage: job.fullPage,
- } }, snapshot);
- }
- export function formatSnapshot(snapshot, columns) {
- return __awaiter(this, void 0, void 0, function* () {
- let data = Object.assign({}, snapshot);
- // format screenshot path to full url
- if (data.screenshot) {
- data.screenshot = joinURL(config.appUrl, data.screenshot);
- }
- // format html path to string
- if (data.html) {
- const html = yield fs.readFile(path.join(config.dataDir, data.html));
- data.html = html.toString();
- }
- if (columns === null || columns === void 0 ? void 0 : columns.length) {
- data = pick(data, columns);
- }
- return data;
- });
- }
- /**
- * get snapshot from db or crawl queue
- */
- export function getSnapshot(jobId) {
- return __awaiter(this, void 0, void 0, function* () {
- const snapshotModel = yield Snapshot.findByPk(jobId);
- if (snapshotModel) {
- return snapshotModel.toJSON();
- }
- const job = yield getJob({ id: jobId });
- if (job) {
- return {
- jobId,
- status: 'pending',
- };
- }
- return null;
- });
- }
@@ -0,0 +1 @@
+ export declare function initCron(): any;