@arcblock/crawler 1.1.3 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,17 +5,8 @@ export declare function getDataDir(): Promise<{
5
5
  htmlDir: string;
6
6
  screenshotDir: string;
7
7
  }>;
8
- export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
9
- url: string;
10
- includeScreenshot?: boolean;
11
- includeHtml?: boolean;
12
- width?: number;
13
- height?: number;
14
- quality?: number;
15
- timeout?: number;
16
- fullPage?: boolean;
17
- }) => Promise<{
18
- html: string;
8
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
9
+ html: string | null;
19
10
  screenshot: Uint8Array<ArrayBufferLike> | null;
20
11
  meta: {
21
12
  title?: string;
@@ -27,4 +18,4 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
27
18
  * @param params
28
19
  * @param callback callback when job finished
29
20
  */
30
- export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
21
+ export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
@@ -59,7 +59,12 @@ function createCrawlQueue() {
59
59
  // }
60
60
  try {
61
61
  // get page content later
62
- const result = yield (0, exports.getPageContent)(job);
62
+ const result = yield (0, exports.getPageContent)(Object.assign({ localStorage: {
63
+ // for blocklet theme
64
+ blocklet_theme_prefer: 'light',
65
+ // for blocklet domain warning
66
+ 'domain-warning-skip': Date.now().toString(),
67
+ } }, job));
63
68
  if (!result || (!result.html && !result.screenshot)) {
64
69
  config_1.logger.error(`failed to crawl ${job.url}, empty content`, job);
65
70
  const snapshot = (0, snapshot_1.convertJobToSnapshot)({
@@ -137,18 +142,24 @@ function saveSnapshotToLocal(_a) {
137
142
  };
138
143
  });
139
144
  }
140
- function formatHtml(htmlString) {
141
- if (htmlString.includes('<h2>Unexpected Application Error!</h2>')) {
142
- return '';
143
- }
144
- return htmlString;
145
- }
146
- const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
147
- config_1.logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
145
+ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies = [], localStorage, }) {
148
146
  const page = yield (0, puppeteer_1.initPage)();
149
147
  if (width && height) {
150
148
  yield page.setViewport({ width, height, deviceScaleFactor: 2 });
151
149
  }
150
+ if (headers) {
151
+ yield page.setExtraHTTPHeaders(headers);
152
+ }
153
+ if (cookies === null || cookies === void 0 ? void 0 : cookies.length) {
154
+ yield page.setCookie(...cookies);
155
+ }
156
+ if (localStorage) {
157
+ yield page.evaluateOnNewDocument((items) => {
158
+ Object.entries(items).forEach(([key, value]) => {
159
+ window.localStorage.setItem(key, value);
160
+ });
161
+ }, localStorage);
162
+ }
152
163
  let html = null;
153
164
  let screenshot = null;
154
165
  const meta = {};
@@ -158,7 +169,6 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
158
169
  throw new Error(`Failed to load page: response is null for ${url}`);
159
170
  }
160
171
  const statusCode = response.status();
161
- config_1.logger.debug('getPageContent.response', { response, statusCode });
162
172
  if (![200, 304].includes(statusCode)) {
163
173
  throw new Error(`Request failed with status ${statusCode}, in ${url}`);
164
174
  }
@@ -207,6 +217,11 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
207
217
  description,
208
218
  };
209
219
  });
220
+ // check if the page is an error page
221
+ const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
222
+ if (isErrorPage) {
223
+ throw new Error('Page is an error page');
224
+ }
210
225
  meta.title = data.title;
211
226
  meta.description = data.description;
212
227
  if (includeHtml) {
@@ -225,7 +240,6 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
225
240
  finally {
226
241
  yield page.close();
227
242
  }
228
- html = formatHtml(html || '');
229
243
  return {
230
244
  html,
231
245
  screenshot,
@@ -238,24 +252,17 @@ exports.getPageContent = getPageContent;
238
252
  * @param params
239
253
  * @param callback callback when job finished
240
254
  */
255
+ // eslint-disable-next-line require-await
241
256
  function crawlUrl(params, callback) {
242
257
  return __awaiter(this, void 0, void 0, function* () {
243
258
  params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
244
259
  // skip duplicate job
245
- const { job: duplicateJob } = (yield job_1.Job.findJob({
246
- url: params.url,
247
- includeScreenshot: params.includeScreenshot,
248
- includeHtml: params.includeHtml,
249
- quality: params.quality,
250
- width: params.width,
251
- height: params.height,
252
- fullPage: params.fullPage,
253
- })) || {};
254
- if (duplicateJob) {
260
+ const existsJob = yield job_1.Job.isExists(params);
261
+ if (existsJob) {
255
262
  config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
256
- return duplicateJob.id;
263
+ return existsJob.id;
257
264
  }
258
- config_1.logger.info('create crawl job', params);
265
+ config_1.logger.info('enqueue crawl job', params);
259
266
  const jobId = (0, crypto_1.randomUUID)();
260
267
  const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
261
268
  job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
package/lib/cjs/site.d.ts CHANGED
@@ -1,2 +1,2 @@
1
1
  import { Site } from './config';
2
- export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null | undefined)[]>;
2
+ export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
package/lib/cjs/site.js CHANGED
@@ -42,6 +42,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
42
42
  });
43
43
  config_1.logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
44
44
  let processCount = 0;
45
+ let crawlCount = 0;
45
46
  crawlBlockletRunningMap.set(key, true);
46
47
  try {
47
48
  const jobIds = yield (0, p_map_1.default)(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
@@ -58,7 +59,13 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
58
59
  return null;
59
60
  }
60
61
  }
61
- config_1.logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`);
62
+ config_1.logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`, {
63
+ snapshotExists: !!snapshot,
64
+ lastModified: snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified,
65
+ sitemapLastmod: sitemapItem.lastmod,
66
+ url,
67
+ });
68
+ crawlCount++;
62
69
  return (0, crawler_1.crawlUrl)({
63
70
  url,
64
71
  lastModified: sitemapItem.lastmod,
@@ -66,6 +73,12 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
66
73
  includeHtml: true,
67
74
  });
68
75
  }), { concurrency: config_1.config.siteCron.sitemapConcurrency });
76
+ config_1.logger.info('Enqueued jobs from sitemap finished', {
77
+ url,
78
+ pathname,
79
+ processCount,
80
+ crawlCount,
81
+ });
69
82
  return jobIds;
70
83
  }
71
84
  catch (error) {
@@ -1,3 +1,4 @@
1
+ import { CookieParam } from '@blocklet/puppeteer';
1
2
  import { Model, Sequelize } from '@sequelize/core';
2
3
  export interface JobState {
3
4
  id?: string;
@@ -11,6 +12,9 @@ export interface JobState {
11
12
  timeout?: number;
12
13
  fullPage?: boolean;
13
14
  lastModified?: string;
15
+ headers?: Record<string, string>;
16
+ cookies?: CookieParam[];
17
+ localStorage?: Record<string, string>;
14
18
  }
15
19
  export interface JobModel {
16
20
  id: string;
@@ -31,4 +35,7 @@ export declare class Job extends Model<JobModel> implements JobModel {
31
35
  cancelled: JobModel['cancelled'];
32
36
  static initModel(sequelize: Sequelize): typeof Job;
33
37
  static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
38
+ static isExists(condition: Partial<JobState> & {
39
+ url: string;
40
+ }): Promise<JobModel | null | undefined>;
34
41
  }
@@ -41,9 +41,13 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
41
41
  step((generator = generator.apply(thisArg, _arguments || [])).next());
42
42
  });
43
43
  };
44
+ var __importDefault = (this && this.__importDefault) || function (mod) {
45
+ return (mod && mod.__esModule) ? mod : { "default": mod };
46
+ };
44
47
  Object.defineProperty(exports, "__esModule", { value: true });
45
48
  exports.Job = void 0;
46
49
  const core_1 = __importStar(require("@sequelize/core"));
50
+ const isEqual_1 = __importDefault(require("lodash/isEqual"));
47
51
  class Job extends core_1.Model {
48
52
  static initModel(sequelize) {
49
53
  return Job.init({
@@ -106,5 +110,22 @@ class Job extends core_1.Model {
106
110
  return (job === null || job === void 0 ? void 0 : job.toJSON()) || null;
107
111
  });
108
112
  }
113
+ static isExists(condition) {
114
+ return __awaiter(this, void 0, void 0, function* () {
115
+ const jobs = yield Job.findAll({
116
+ where: core_1.default.where(core_1.default.fn('json_extract', core_1.default.col('job'), '$.url'), condition.url),
117
+ });
118
+ if (!(jobs === null || jobs === void 0 ? void 0 : jobs.length)) {
119
+ return null;
120
+ }
121
+ const existsJob = jobs.find((job) => {
122
+ const jobModel = job.get('job');
123
+ return Object.keys(condition).every((key) => {
124
+ return (0, isEqual_1.default)(condition[key], jobModel[key]);
125
+ });
126
+ });
127
+ return existsJob === null || existsJob === void 0 ? void 0 : existsJob.get();
128
+ });
129
+ }
109
130
  }
110
131
  exports.Job = Job;
@@ -18,6 +18,7 @@ export interface SnapshotModel {
18
18
  includeHtml?: boolean;
19
19
  quality?: number;
20
20
  fullPage?: boolean;
21
+ headers?: Record<string, string>;
21
22
  };
22
23
  }
23
24
  export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
package/lib/cjs/utils.js CHANGED
@@ -134,6 +134,11 @@ const staticFileExtensions = [
134
134
  'xls',
135
135
  'xml',
136
136
  'zip',
137
+ 'ts',
138
+ 'json',
139
+ 'md',
140
+ 'yml',
141
+ 'yaml',
137
142
  ];
138
143
  const sleep = (ms) => {
139
144
  return new Promise((resolve) => {
@@ -153,7 +158,7 @@ exports.isSelfCrawler = isSelfCrawler;
153
158
  * Check if the request is a static file
154
159
  */
155
160
  function isStaticFile(req) {
156
- const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
161
+ const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})([\\?#]|$)`, 'i');
157
162
  return excludeUrlPattern.test(req.path);
158
163
  }
159
164
  /**
@@ -5,17 +5,8 @@ export declare function getDataDir(): Promise<{
5
5
  htmlDir: string;
6
6
  screenshotDir: string;
7
7
  }>;
8
- export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
9
- url: string;
10
- includeScreenshot?: boolean;
11
- includeHtml?: boolean;
12
- width?: number;
13
- height?: number;
14
- quality?: number;
15
- timeout?: number;
16
- fullPage?: boolean;
17
- }) => Promise<{
18
- html: string;
8
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
9
+ html: string | null;
19
10
  screenshot: Uint8Array<ArrayBufferLike> | null;
20
11
  meta: {
21
12
  title?: string;
@@ -27,4 +18,4 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
27
18
  * @param params
28
19
  * @param callback callback when job finished
29
20
  */
30
- export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
21
+ export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
@@ -50,7 +50,12 @@ export function createCrawlQueue() {
50
50
  // }
51
51
  try {
52
52
  // get page content later
53
- const result = yield getPageContent(job);
53
+ const result = yield getPageContent(Object.assign({ localStorage: {
54
+ // for blocklet theme
55
+ blocklet_theme_prefer: 'light',
56
+ // for blocklet domain warning
57
+ 'domain-warning-skip': Date.now().toString(),
58
+ } }, job));
54
59
  if (!result || (!result.html && !result.screenshot)) {
55
60
  logger.error(`failed to crawl ${job.url}, empty content`, job);
56
61
  const snapshot = convertJobToSnapshot({
@@ -128,18 +133,24 @@ function saveSnapshotToLocal(_a) {
128
133
  };
129
134
  });
130
135
  }
131
- function formatHtml(htmlString) {
132
- if (htmlString.includes('<h2>Unexpected Application Error!</h2>')) {
133
- return '';
134
- }
135
- return htmlString;
136
- }
137
- export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
138
- logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
136
+ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies = [], localStorage, }) {
139
137
  const page = yield initPage();
140
138
  if (width && height) {
141
139
  yield page.setViewport({ width, height, deviceScaleFactor: 2 });
142
140
  }
141
+ if (headers) {
142
+ yield page.setExtraHTTPHeaders(headers);
143
+ }
144
+ if (cookies === null || cookies === void 0 ? void 0 : cookies.length) {
145
+ yield page.setCookie(...cookies);
146
+ }
147
+ if (localStorage) {
148
+ yield page.evaluateOnNewDocument((items) => {
149
+ Object.entries(items).forEach(([key, value]) => {
150
+ window.localStorage.setItem(key, value);
151
+ });
152
+ }, localStorage);
153
+ }
143
154
  let html = null;
144
155
  let screenshot = null;
145
156
  const meta = {};
@@ -149,7 +160,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
149
160
  throw new Error(`Failed to load page: response is null for ${url}`);
150
161
  }
151
162
  const statusCode = response.status();
152
- logger.debug('getPageContent.response', { response, statusCode });
153
163
  if (![200, 304].includes(statusCode)) {
154
164
  throw new Error(`Request failed with status ${statusCode}, in ${url}`);
155
165
  }
@@ -198,6 +208,11 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
198
208
  description,
199
209
  };
200
210
  });
211
+ // check if the page is an error page
212
+ const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
213
+ if (isErrorPage) {
214
+ throw new Error('Page is an error page');
215
+ }
201
216
  meta.title = data.title;
202
217
  meta.description = data.description;
203
218
  if (includeHtml) {
@@ -216,7 +231,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
216
231
  finally {
217
232
  yield page.close();
218
233
  }
219
- html = formatHtml(html || '');
220
234
  return {
221
235
  html,
222
236
  screenshot,
@@ -228,24 +242,17 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
228
242
  * @param params
229
243
  * @param callback callback when job finished
230
244
  */
245
+ // eslint-disable-next-line require-await
231
246
  export function crawlUrl(params, callback) {
232
247
  return __awaiter(this, void 0, void 0, function* () {
233
248
  params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
234
249
  // skip duplicate job
235
- const { job: duplicateJob } = (yield Job.findJob({
236
- url: params.url,
237
- includeScreenshot: params.includeScreenshot,
238
- includeHtml: params.includeHtml,
239
- quality: params.quality,
240
- width: params.width,
241
- height: params.height,
242
- fullPage: params.fullPage,
243
- })) || {};
244
- if (duplicateJob) {
250
+ const existsJob = yield Job.isExists(params);
251
+ if (existsJob) {
245
252
  logger.info(`Crawl job already exists for ${params.url}, skip`);
246
- return duplicateJob.id;
253
+ return existsJob.id;
247
254
  }
248
- logger.info('create crawl job', params);
255
+ logger.info('enqueue crawl job', params);
249
256
  const jobId = randomUUID();
250
257
  const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
251
258
  job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
package/lib/esm/site.d.ts CHANGED
@@ -1,2 +1,2 @@
1
1
  import { Site } from './config';
2
- export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null | undefined)[]>;
2
+ export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
package/lib/esm/site.js CHANGED
@@ -36,6 +36,7 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
36
36
  });
37
37
  logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
38
38
  let processCount = 0;
39
+ let crawlCount = 0;
39
40
  crawlBlockletRunningMap.set(key, true);
40
41
  try {
41
42
  const jobIds = yield pMap(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
@@ -52,7 +53,13 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
52
53
  return null;
53
54
  }
54
55
  }
55
- logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`);
56
+ logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`, {
57
+ snapshotExists: !!snapshot,
58
+ lastModified: snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified,
59
+ sitemapLastmod: sitemapItem.lastmod,
60
+ url,
61
+ });
62
+ crawlCount++;
56
63
  return crawlUrl({
57
64
  url,
58
65
  lastModified: sitemapItem.lastmod,
@@ -60,6 +67,12 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
60
67
  includeHtml: true,
61
68
  });
62
69
  }), { concurrency: config.siteCron.sitemapConcurrency });
70
+ logger.info('Enqueued jobs from sitemap finished', {
71
+ url,
72
+ pathname,
73
+ processCount,
74
+ crawlCount,
75
+ });
63
76
  return jobIds;
64
77
  }
65
78
  catch (error) {
@@ -1,3 +1,4 @@
1
+ import { CookieParam } from '@blocklet/puppeteer';
1
2
  import { Model, Sequelize } from '@sequelize/core';
2
3
  export interface JobState {
3
4
  id?: string;
@@ -11,6 +12,9 @@ export interface JobState {
11
12
  timeout?: number;
12
13
  fullPage?: boolean;
13
14
  lastModified?: string;
15
+ headers?: Record<string, string>;
16
+ cookies?: CookieParam[];
17
+ localStorage?: Record<string, string>;
14
18
  }
15
19
  export interface JobModel {
16
20
  id: string;
@@ -31,4 +35,7 @@ export declare class Job extends Model<JobModel> implements JobModel {
31
35
  cancelled: JobModel['cancelled'];
32
36
  static initModel(sequelize: Sequelize): typeof Job;
33
37
  static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
38
+ static isExists(condition: Partial<JobState> & {
39
+ url: string;
40
+ }): Promise<JobModel | null | undefined>;
34
41
  }
@@ -8,6 +8,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
8
8
  });
9
9
  };
10
10
  import sequelize, { DataTypes, Model } from '@sequelize/core';
11
+ import isEqual from 'lodash/isEqual';
11
12
  export class Job extends Model {
12
13
  static initModel(sequelize) {
13
14
  return Job.init({
@@ -70,4 +71,21 @@ export class Job extends Model {
70
71
  return (job === null || job === void 0 ? void 0 : job.toJSON()) || null;
71
72
  });
72
73
  }
74
+ static isExists(condition) {
75
+ return __awaiter(this, void 0, void 0, function* () {
76
+ const jobs = yield Job.findAll({
77
+ where: sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), '$.url'), condition.url),
78
+ });
79
+ if (!(jobs === null || jobs === void 0 ? void 0 : jobs.length)) {
80
+ return null;
81
+ }
82
+ const existsJob = jobs.find((job) => {
83
+ const jobModel = job.get('job');
84
+ return Object.keys(condition).every((key) => {
85
+ return isEqual(condition[key], jobModel[key]);
86
+ });
87
+ });
88
+ return existsJob === null || existsJob === void 0 ? void 0 : existsJob.get();
89
+ });
90
+ }
73
91
  }
@@ -18,6 +18,7 @@ export interface SnapshotModel {
18
18
  includeHtml?: boolean;
19
19
  quality?: number;
20
20
  fullPage?: boolean;
21
+ headers?: Record<string, string>;
21
22
  };
22
23
  }
23
24
  export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
package/lib/esm/utils.js CHANGED
@@ -123,6 +123,11 @@ const staticFileExtensions = [
123
123
  'xls',
124
124
  'xml',
125
125
  'zip',
126
+ 'ts',
127
+ 'json',
128
+ 'md',
129
+ 'yml',
130
+ 'yaml',
126
131
  ];
127
132
  export const sleep = (ms) => {
128
133
  return new Promise((resolve) => {
@@ -140,7 +145,7 @@ export const isSelfCrawler = (req) => {
140
145
  * Check if the request is a static file
141
146
  */
142
147
  export function isStaticFile(req) {
143
- const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
148
+ const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})([\\?#]|$)`, 'i');
144
149
  return excludeUrlPattern.test(req.path);
145
150
  }
146
151
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arcblock/crawler",
3
- "version": "1.1.3",
3
+ "version": "1.1.5",
4
4
  "main": "lib/cjs/index.js",
5
5
  "module": "lib/esm/index.js",
6
6
  "types": "lib/cjs/index.d.ts",
@@ -45,33 +45,32 @@
45
45
  ]
46
46
  },
47
47
  "dependencies": {
48
- "@abtnode/cron": "^1.16.43",
49
- "@abtnode/models": "^1.16.43",
50
- "@abtnode/queue": "^1.16.43",
51
- "@blocklet/logger": "^1.16.43",
48
+ "@abtnode/cron": "^1.16.44",
49
+ "@abtnode/models": "^1.16.44",
50
+ "@abtnode/queue": "^1.16.44",
51
+ "@blocklet/logger": "^1.16.44",
52
52
  "@blocklet/puppeteer": "^22.11.3",
53
- "@blocklet/sdk": "^1.16.43",
53
+ "@blocklet/sdk": "^1.16.44",
54
54
  "@sequelize/core": "7.0.0-alpha.46",
55
55
  "@sequelize/sqlite3": "7.0.0-alpha.46",
56
56
  "axios": "^1.7.9",
57
57
  "fs-extra": "^11.2.0",
58
58
  "lodash": "^4.17.21",
59
59
  "lru-cache": "^10.4.3",
60
+ "p-map": "^7.0.3",
60
61
  "robots-parser": "^3.0.1",
61
62
  "sitemap": "^7.1.2",
62
63
  "sqlite3": "^5.1.7",
63
- "ufo": "^1.5.4",
64
- "p-map": "^7.0.3"
64
+ "ufo": "^1.5.4"
65
65
  },
66
66
  "devDependencies": {
67
- "@blocklet/js-sdk": "^1.16.39",
68
67
  "@types/dotenv-flow": "^3.3.3",
69
68
  "@types/express": "^4.17.21",
70
69
  "@types/fs-extra": "^11.0.4",
71
70
  "@types/lodash": "^4.17.16",
72
71
  "@types/node": "^20.17.19",
73
- "express": "^4.21.2",
74
72
  "bumpp": "^9.11.1",
73
+ "express": "^4.21.2",
75
74
  "nodemon": "^3.1.9",
76
75
  "npm-run-all": "^4.1.5",
77
76
  "puppeteer": "^24.8.2",