@arcblock/crawler 1.1.2 → 1.1.4

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
package/README.md CHANGED
@@ -61,6 +61,8 @@ If not referenced by a Blocklet, some dependent Blocklet environment variables n
61
61
 
62
62
  - `BLOCKLET_DATA_DIR`: (Required) The directory to save webpage screenshots and HTML source files obtained by the crawler.
63
63
 
64
+ - `BLOCKLET_LOG_DIR`: (Required) The directory where @blocklet/logger stores its log files.
65
+
64
66
  ## SQLite
65
67
 
66
68
  When `initCrawler` is called, it attempts to create an SQLite database at `BLOCKLET_DATA_DIR`. This database is used to cache HTML content and screenshots. Please ensure that the deployment environment supports SQLite.
@@ -5,17 +5,8 @@ export declare function getDataDir(): Promise<{
5
5
  htmlDir: string;
6
6
  screenshotDir: string;
7
7
  }>;
8
- export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
9
- url: string;
10
- includeScreenshot?: boolean;
11
- includeHtml?: boolean;
12
- width?: number;
13
- height?: number;
14
- quality?: number;
15
- timeout?: number;
16
- fullPage?: boolean;
17
- }) => Promise<{
18
- html: string;
8
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
9
+ html: string | null;
19
10
  screenshot: Uint8Array<ArrayBufferLike> | null;
20
11
  meta: {
21
12
  title?: string;
@@ -27,4 +18,4 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
27
18
  * @param params
28
19
  * @param callback callback when job finished
29
20
  */
30
- export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
21
+ export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
@@ -137,18 +137,14 @@ function saveSnapshotToLocal(_a) {
137
137
  };
138
138
  });
139
139
  }
140
- function formatHtml(htmlString) {
141
- if (htmlString.includes('<h2>Unexpected Application Error!</h2>')) {
142
- return '';
143
- }
144
- return htmlString;
145
- }
146
- const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
147
- config_1.logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
140
+ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
148
141
  const page = yield (0, puppeteer_1.initPage)();
149
142
  if (width && height) {
150
143
  yield page.setViewport({ width, height, deviceScaleFactor: 2 });
151
144
  }
145
+ if (headers) {
146
+ yield page.setExtraHTTPHeaders(headers);
147
+ }
152
148
  let html = null;
153
149
  let screenshot = null;
154
150
  const meta = {};
@@ -158,7 +154,6 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
158
154
  throw new Error(`Failed to load page: response is null for ${url}`);
159
155
  }
160
156
  const statusCode = response.status();
161
- config_1.logger.debug('getPageContent.response', { response, statusCode });
162
157
  if (![200, 304].includes(statusCode)) {
163
158
  throw new Error(`Request failed with status ${statusCode}, in ${url}`);
164
159
  }
@@ -207,6 +202,11 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
207
202
  description,
208
203
  };
209
204
  });
205
+ // check if the page is an error page
206
+ const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
207
+ if (isErrorPage) {
208
+ throw new Error('Page is an error page');
209
+ }
210
210
  meta.title = data.title;
211
211
  meta.description = data.description;
212
212
  if (includeHtml) {
@@ -225,7 +225,6 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
225
225
  finally {
226
226
  yield page.close();
227
227
  }
228
- html = formatHtml(html || '');
229
228
  return {
230
229
  html,
231
230
  screenshot,
@@ -238,24 +237,17 @@ exports.getPageContent = getPageContent;
238
237
  * @param params
239
238
  * @param callback callback when job finished
240
239
  */
240
+ // eslint-disable-next-line require-await
241
241
  function crawlUrl(params, callback) {
242
242
  return __awaiter(this, void 0, void 0, function* () {
243
243
  params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
244
244
  // skip duplicate job
245
- const { job: duplicateJob } = (yield job_1.Job.findJob({
246
- url: params.url,
247
- includeScreenshot: params.includeScreenshot,
248
- includeHtml: params.includeHtml,
249
- quality: params.quality,
250
- width: params.width,
251
- height: params.height,
252
- fullPage: params.fullPage,
253
- })) || {};
254
- if (duplicateJob) {
245
+ const existsJob = yield job_1.Job.isExists(params);
246
+ if (existsJob) {
255
247
  config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
256
- return duplicateJob.id;
248
+ return existsJob.id;
257
249
  }
258
- config_1.logger.info('create crawl job', params);
250
+ config_1.logger.info('enqueue crawl job', params);
259
251
  const jobId = (0, crypto_1.randomUUID)();
260
252
  const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
261
253
  job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
package/lib/cjs/site.d.ts CHANGED
@@ -1,2 +1,2 @@
1
1
  import { Site } from './config';
2
- export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null | undefined)[]>;
2
+ export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
package/lib/cjs/site.js CHANGED
@@ -42,6 +42,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
42
42
  });
43
43
  config_1.logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
44
44
  let processCount = 0;
45
+ let crawlCount = 0;
45
46
  crawlBlockletRunningMap.set(key, true);
46
47
  try {
47
48
  const jobIds = yield (0, p_map_1.default)(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
@@ -58,7 +59,13 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
58
59
  return null;
59
60
  }
60
61
  }
61
- config_1.logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`);
62
+ config_1.logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`, {
63
+ snapshotExists: !!snapshot,
64
+ lastModified: snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified,
65
+ sitemapLastmod: sitemapItem.lastmod,
66
+ url,
67
+ });
68
+ crawlCount++;
62
69
  return (0, crawler_1.crawlUrl)({
63
70
  url,
64
71
  lastModified: sitemapItem.lastmod,
@@ -66,6 +73,12 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
66
73
  includeHtml: true,
67
74
  });
68
75
  }), { concurrency: config_1.config.siteCron.sitemapConcurrency });
76
+ config_1.logger.info('Enqueued jobs from sitemap finished', {
77
+ url,
78
+ pathname,
79
+ processCount,
80
+ crawlCount,
81
+ });
69
82
  return jobIds;
70
83
  }
71
84
  catch (error) {
@@ -11,6 +11,7 @@ export interface JobState {
11
11
  timeout?: number;
12
12
  fullPage?: boolean;
13
13
  lastModified?: string;
14
+ headers?: Record<string, string>;
14
15
  }
15
16
  export interface JobModel {
16
17
  id: string;
@@ -31,4 +32,7 @@ export declare class Job extends Model<JobModel> implements JobModel {
31
32
  cancelled: JobModel['cancelled'];
32
33
  static initModel(sequelize: Sequelize): typeof Job;
33
34
  static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
35
+ static isExists(condition: Partial<JobState> & {
36
+ url: string;
37
+ }): Promise<JobModel | null | undefined>;
34
38
  }
@@ -41,9 +41,13 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
41
41
  step((generator = generator.apply(thisArg, _arguments || [])).next());
42
42
  });
43
43
  };
44
+ var __importDefault = (this && this.__importDefault) || function (mod) {
45
+ return (mod && mod.__esModule) ? mod : { "default": mod };
46
+ };
44
47
  Object.defineProperty(exports, "__esModule", { value: true });
45
48
  exports.Job = void 0;
46
49
  const core_1 = __importStar(require("@sequelize/core"));
50
+ const isEqual_1 = __importDefault(require("lodash/isEqual"));
47
51
  class Job extends core_1.Model {
48
52
  static initModel(sequelize) {
49
53
  return Job.init({
@@ -106,5 +110,22 @@ class Job extends core_1.Model {
106
110
  return (job === null || job === void 0 ? void 0 : job.toJSON()) || null;
107
111
  });
108
112
  }
113
+ static isExists(condition) {
114
+ return __awaiter(this, void 0, void 0, function* () {
115
+ const jobs = yield Job.findAll({
116
+ where: core_1.default.where(core_1.default.fn('json_extract', core_1.default.col('job'), '$.url'), condition.url),
117
+ });
118
+ if (!(jobs === null || jobs === void 0 ? void 0 : jobs.length)) {
119
+ return null;
120
+ }
121
+ const existsJob = jobs.find((job) => {
122
+ const jobModel = job.get('job');
123
+ return Object.keys(condition).every((key) => {
124
+ return (0, isEqual_1.default)(condition[key], jobModel[key]);
125
+ });
126
+ });
127
+ return existsJob === null || existsJob === void 0 ? void 0 : existsJob.get();
128
+ });
129
+ }
109
130
  }
110
131
  exports.Job = Job;
@@ -18,6 +18,7 @@ export interface SnapshotModel {
18
18
  includeHtml?: boolean;
19
19
  quality?: number;
20
20
  fullPage?: boolean;
21
+ headers?: Record<string, string>;
21
22
  };
22
23
  }
23
24
  export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
package/lib/cjs/utils.js CHANGED
@@ -153,7 +153,7 @@ exports.isSelfCrawler = isSelfCrawler;
153
153
  * Check if the request is a static file
154
154
  */
155
155
  function isStaticFile(req) {
156
- const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
156
+ const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})([\\?#]|$)`, 'i');
157
157
  return excludeUrlPattern.test(req.path);
158
158
  }
159
159
  /**
@@ -5,17 +5,8 @@ export declare function getDataDir(): Promise<{
5
5
  htmlDir: string;
6
6
  screenshotDir: string;
7
7
  }>;
8
- export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
9
- url: string;
10
- includeScreenshot?: boolean;
11
- includeHtml?: boolean;
12
- width?: number;
13
- height?: number;
14
- quality?: number;
15
- timeout?: number;
16
- fullPage?: boolean;
17
- }) => Promise<{
18
- html: string;
8
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
9
+ html: string | null;
19
10
  screenshot: Uint8Array<ArrayBufferLike> | null;
20
11
  meta: {
21
12
  title?: string;
@@ -27,4 +18,4 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
27
18
  * @param params
28
19
  * @param callback callback when job finished
29
20
  */
30
- export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
21
+ export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
@@ -128,18 +128,14 @@ function saveSnapshotToLocal(_a) {
128
128
  };
129
129
  });
130
130
  }
131
- function formatHtml(htmlString) {
132
- if (htmlString.includes('<h2>Unexpected Application Error!</h2>')) {
133
- return '';
134
- }
135
- return htmlString;
136
- }
137
- export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
138
- logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
131
+ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
139
132
  const page = yield initPage();
140
133
  if (width && height) {
141
134
  yield page.setViewport({ width, height, deviceScaleFactor: 2 });
142
135
  }
136
+ if (headers) {
137
+ yield page.setExtraHTTPHeaders(headers);
138
+ }
143
139
  let html = null;
144
140
  let screenshot = null;
145
141
  const meta = {};
@@ -149,7 +145,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
149
145
  throw new Error(`Failed to load page: response is null for ${url}`);
150
146
  }
151
147
  const statusCode = response.status();
152
- logger.debug('getPageContent.response', { response, statusCode });
153
148
  if (![200, 304].includes(statusCode)) {
154
149
  throw new Error(`Request failed with status ${statusCode}, in ${url}`);
155
150
  }
@@ -198,6 +193,11 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
198
193
  description,
199
194
  };
200
195
  });
196
+ // check if the page is an error page
197
+ const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
198
+ if (isErrorPage) {
199
+ throw new Error('Page is an error page');
200
+ }
201
201
  meta.title = data.title;
202
202
  meta.description = data.description;
203
203
  if (includeHtml) {
@@ -216,7 +216,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
216
216
  finally {
217
217
  yield page.close();
218
218
  }
219
- html = formatHtml(html || '');
220
219
  return {
221
220
  html,
222
221
  screenshot,
@@ -228,24 +227,17 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
228
227
  * @param params
229
228
  * @param callback callback when job finished
230
229
  */
230
+ // eslint-disable-next-line require-await
231
231
  export function crawlUrl(params, callback) {
232
232
  return __awaiter(this, void 0, void 0, function* () {
233
233
  params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
234
234
  // skip duplicate job
235
- const { job: duplicateJob } = (yield Job.findJob({
236
- url: params.url,
237
- includeScreenshot: params.includeScreenshot,
238
- includeHtml: params.includeHtml,
239
- quality: params.quality,
240
- width: params.width,
241
- height: params.height,
242
- fullPage: params.fullPage,
243
- })) || {};
244
- if (duplicateJob) {
235
+ const existsJob = yield Job.isExists(params);
236
+ if (existsJob) {
245
237
  logger.info(`Crawl job already exists for ${params.url}, skip`);
246
- return duplicateJob.id;
238
+ return existsJob.id;
247
239
  }
248
- logger.info('create crawl job', params);
240
+ logger.info('enqueue crawl job', params);
249
241
  const jobId = randomUUID();
250
242
  const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
251
243
  job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
package/lib/esm/site.d.ts CHANGED
@@ -1,2 +1,2 @@
1
1
  import { Site } from './config';
2
- export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null | undefined)[]>;
2
+ export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
package/lib/esm/site.js CHANGED
@@ -36,6 +36,7 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
36
36
  });
37
37
  logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
38
38
  let processCount = 0;
39
+ let crawlCount = 0;
39
40
  crawlBlockletRunningMap.set(key, true);
40
41
  try {
41
42
  const jobIds = yield pMap(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
@@ -52,7 +53,13 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
52
53
  return null;
53
54
  }
54
55
  }
55
- logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`);
56
+ logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`, {
57
+ snapshotExists: !!snapshot,
58
+ lastModified: snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified,
59
+ sitemapLastmod: sitemapItem.lastmod,
60
+ url,
61
+ });
62
+ crawlCount++;
56
63
  return crawlUrl({
57
64
  url,
58
65
  lastModified: sitemapItem.lastmod,
@@ -60,6 +67,12 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
60
67
  includeHtml: true,
61
68
  });
62
69
  }), { concurrency: config.siteCron.sitemapConcurrency });
70
+ logger.info('Enqueued jobs from sitemap finished', {
71
+ url,
72
+ pathname,
73
+ processCount,
74
+ crawlCount,
75
+ });
63
76
  return jobIds;
64
77
  }
65
78
  catch (error) {
@@ -11,6 +11,7 @@ export interface JobState {
11
11
  timeout?: number;
12
12
  fullPage?: boolean;
13
13
  lastModified?: string;
14
+ headers?: Record<string, string>;
14
15
  }
15
16
  export interface JobModel {
16
17
  id: string;
@@ -31,4 +32,7 @@ export declare class Job extends Model<JobModel> implements JobModel {
31
32
  cancelled: JobModel['cancelled'];
32
33
  static initModel(sequelize: Sequelize): typeof Job;
33
34
  static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
35
+ static isExists(condition: Partial<JobState> & {
36
+ url: string;
37
+ }): Promise<JobModel | null | undefined>;
34
38
  }
@@ -8,6 +8,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
8
8
  });
9
9
  };
10
10
  import sequelize, { DataTypes, Model } from '@sequelize/core';
11
+ import isEqual from 'lodash/isEqual';
11
12
  export class Job extends Model {
12
13
  static initModel(sequelize) {
13
14
  return Job.init({
@@ -70,4 +71,21 @@ export class Job extends Model {
70
71
  return (job === null || job === void 0 ? void 0 : job.toJSON()) || null;
71
72
  });
72
73
  }
74
+ static isExists(condition) {
75
+ return __awaiter(this, void 0, void 0, function* () {
76
+ const jobs = yield Job.findAll({
77
+ where: sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), '$.url'), condition.url),
78
+ });
79
+ if (!(jobs === null || jobs === void 0 ? void 0 : jobs.length)) {
80
+ return null;
81
+ }
82
+ const existsJob = jobs.find((job) => {
83
+ const jobModel = job.get('job');
84
+ return Object.keys(condition).every((key) => {
85
+ return isEqual(condition[key], jobModel[key]);
86
+ });
87
+ });
88
+ return existsJob === null || existsJob === void 0 ? void 0 : existsJob.get();
89
+ });
90
+ }
73
91
  }
@@ -18,6 +18,7 @@ export interface SnapshotModel {
18
18
  includeHtml?: boolean;
19
19
  quality?: number;
20
20
  fullPage?: boolean;
21
+ headers?: Record<string, string>;
21
22
  };
22
23
  }
23
24
  export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
package/lib/esm/utils.js CHANGED
@@ -140,7 +140,7 @@ export const isSelfCrawler = (req) => {
140
140
  * Check if the request is a static file
141
141
  */
142
142
  export function isStaticFile(req) {
143
- const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
143
+ const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})([\\?#]|$)`, 'i');
144
144
  return excludeUrlPattern.test(req.path);
145
145
  }
146
146
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arcblock/crawler",
3
- "version": "1.1.2",
3
+ "version": "1.1.4",
4
4
  "main": "lib/cjs/index.js",
5
5
  "module": "lib/esm/index.js",
6
6
  "types": "lib/cjs/index.d.ts",
@@ -55,12 +55,9 @@
55
55
  "@sequelize/sqlite3": "7.0.0-alpha.46",
56
56
  "axios": "^1.7.9",
57
57
  "fs-extra": "^11.2.0",
58
- "generic-pool": "^3.9.0",
59
58
  "lodash": "^4.17.21",
60
59
  "lru-cache": "^10.4.3",
61
- "redis": "^4.7.0",
62
60
  "robots-parser": "^3.0.1",
63
- "sequelize": "^6.37.7",
64
61
  "sitemap": "^7.1.2",
65
62
  "sqlite3": "^5.1.7",
66
63
  "ufo": "^1.5.4",