@arcblock/crawler 1.1.2 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/lib/cjs/crawler.d.ts +3 -12
- package/lib/cjs/crawler.js +14 -22
- package/lib/cjs/site.d.ts +1 -1
- package/lib/cjs/site.js +14 -1
- package/lib/cjs/store/job.d.ts +4 -0
- package/lib/cjs/store/job.js +21 -0
- package/lib/cjs/store/snapshot.d.ts +1 -0
- package/lib/cjs/utils.js +1 -1
- package/lib/esm/crawler.d.ts +3 -12
- package/lib/esm/crawler.js +14 -22
- package/lib/esm/site.d.ts +1 -1
- package/lib/esm/site.js +14 -1
- package/lib/esm/store/job.d.ts +4 -0
- package/lib/esm/store/job.js +18 -0
- package/lib/esm/store/snapshot.d.ts +1 -0
- package/lib/esm/utils.js +1 -1
- package/package.json +1 -4
package/README.md
CHANGED
|
@@ -61,6 +61,8 @@ If not referenced by a Blocklet, some dependent Blocklet environment variables n
|
|
|
61
61
|
|
|
62
62
|
- `BLOCKLET_DATA_DIR`: (Required) The directory to save webpage screenshots and HTML source files obtained by the crawler.
|
|
63
63
|
|
|
64
|
+
- `BLOCKLET_LOG_DIR`: (Required) Directory path for storing @blocklet/logger logs
|
|
65
|
+
|
|
64
66
|
## SQLite
|
|
65
67
|
|
|
66
68
|
When `initCrawler` is called, it attempts to create an SQLite database at `BLOCKLET_DATA_DIR`. This database is used to cache HTML content and screenshot. Please ensure that the deployment environment supports SQLite.
|
package/lib/cjs/crawler.d.ts
CHANGED
|
@@ -5,17 +5,8 @@ export declare function getDataDir(): Promise<{
|
|
|
5
5
|
htmlDir: string;
|
|
6
6
|
screenshotDir: string;
|
|
7
7
|
}>;
|
|
8
|
-
export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
|
|
9
|
-
|
|
10
|
-
includeScreenshot?: boolean;
|
|
11
|
-
includeHtml?: boolean;
|
|
12
|
-
width?: number;
|
|
13
|
-
height?: number;
|
|
14
|
-
quality?: number;
|
|
15
|
-
timeout?: number;
|
|
16
|
-
fullPage?: boolean;
|
|
17
|
-
}) => Promise<{
|
|
18
|
-
html: string;
|
|
8
|
+
export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
|
|
9
|
+
html: string | null;
|
|
19
10
|
screenshot: Uint8Array<ArrayBufferLike> | null;
|
|
20
11
|
meta: {
|
|
21
12
|
title?: string;
|
|
@@ -27,4 +18,4 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
|
|
|
27
18
|
* @param params
|
|
28
19
|
* @param callback callback when job finished
|
|
29
20
|
*/
|
|
30
|
-
export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
|
|
21
|
+
export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
|
package/lib/cjs/crawler.js
CHANGED
|
@@ -137,18 +137,14 @@ function saveSnapshotToLocal(_a) {
|
|
|
137
137
|
};
|
|
138
138
|
});
|
|
139
139
|
}
|
|
140
|
-
function
|
|
141
|
-
if (htmlString.includes('<h2>Unexpected Application Error!</h2>')) {
|
|
142
|
-
return '';
|
|
143
|
-
}
|
|
144
|
-
return htmlString;
|
|
145
|
-
}
|
|
146
|
-
const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
|
|
147
|
-
config_1.logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
|
|
140
|
+
const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
|
|
148
141
|
const page = yield (0, puppeteer_1.initPage)();
|
|
149
142
|
if (width && height) {
|
|
150
143
|
yield page.setViewport({ width, height, deviceScaleFactor: 2 });
|
|
151
144
|
}
|
|
145
|
+
if (headers) {
|
|
146
|
+
yield page.setExtraHTTPHeaders(headers);
|
|
147
|
+
}
|
|
152
148
|
let html = null;
|
|
153
149
|
let screenshot = null;
|
|
154
150
|
const meta = {};
|
|
@@ -158,7 +154,6 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
|
|
|
158
154
|
throw new Error(`Failed to load page: response is null for ${url}`);
|
|
159
155
|
}
|
|
160
156
|
const statusCode = response.status();
|
|
161
|
-
config_1.logger.debug('getPageContent.response', { response, statusCode });
|
|
162
157
|
if (![200, 304].includes(statusCode)) {
|
|
163
158
|
throw new Error(`Request failed with status ${statusCode}, in ${url}`);
|
|
164
159
|
}
|
|
@@ -207,6 +202,11 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
|
|
|
207
202
|
description,
|
|
208
203
|
};
|
|
209
204
|
});
|
|
205
|
+
// check if the page is an error page
|
|
206
|
+
const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
|
|
207
|
+
if (isErrorPage) {
|
|
208
|
+
throw new Error('Page is an error page');
|
|
209
|
+
}
|
|
210
210
|
meta.title = data.title;
|
|
211
211
|
meta.description = data.description;
|
|
212
212
|
if (includeHtml) {
|
|
@@ -225,7 +225,6 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
|
|
|
225
225
|
finally {
|
|
226
226
|
yield page.close();
|
|
227
227
|
}
|
|
228
|
-
html = formatHtml(html || '');
|
|
229
228
|
return {
|
|
230
229
|
html,
|
|
231
230
|
screenshot,
|
|
@@ -238,24 +237,17 @@ exports.getPageContent = getPageContent;
|
|
|
238
237
|
* @param params
|
|
239
238
|
* @param callback callback when job finished
|
|
240
239
|
*/
|
|
240
|
+
// eslint-disable-next-line require-await
|
|
241
241
|
function crawlUrl(params, callback) {
|
|
242
242
|
return __awaiter(this, void 0, void 0, function* () {
|
|
243
243
|
params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
|
|
244
244
|
// skip duplicate job
|
|
245
|
-
const
|
|
246
|
-
|
|
247
|
-
includeScreenshot: params.includeScreenshot,
|
|
248
|
-
includeHtml: params.includeHtml,
|
|
249
|
-
quality: params.quality,
|
|
250
|
-
width: params.width,
|
|
251
|
-
height: params.height,
|
|
252
|
-
fullPage: params.fullPage,
|
|
253
|
-
})) || {};
|
|
254
|
-
if (duplicateJob) {
|
|
245
|
+
const existsJob = yield job_1.Job.isExists(params);
|
|
246
|
+
if (existsJob) {
|
|
255
247
|
config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
|
|
256
|
-
return;
|
|
248
|
+
return existsJob.id;
|
|
257
249
|
}
|
|
258
|
-
config_1.logger.info('
|
|
250
|
+
config_1.logger.info('enqueue crawl job', params);
|
|
259
251
|
const jobId = (0, crypto_1.randomUUID)();
|
|
260
252
|
const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
|
|
261
253
|
job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
|
package/lib/cjs/site.d.ts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
import { Site } from './config';
|
|
2
|
-
export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null | undefined)[]>;
|
|
2
|
+
export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
|
package/lib/cjs/site.js
CHANGED
|
@@ -42,6 +42,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
|
|
|
42
42
|
});
|
|
43
43
|
config_1.logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
|
|
44
44
|
let processCount = 0;
|
|
45
|
+
let crawlCount = 0;
|
|
45
46
|
crawlBlockletRunningMap.set(key, true);
|
|
46
47
|
try {
|
|
47
48
|
const jobIds = yield (0, p_map_1.default)(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
|
|
@@ -58,7 +59,13 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
|
|
|
58
59
|
return null;
|
|
59
60
|
}
|
|
60
61
|
}
|
|
61
|
-
config_1.logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`);
|
|
62
|
+
config_1.logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`, {
|
|
63
|
+
snapshotExists: !!snapshot,
|
|
64
|
+
lastModified: snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified,
|
|
65
|
+
sitemapLastmod: sitemapItem.lastmod,
|
|
66
|
+
url,
|
|
67
|
+
});
|
|
68
|
+
crawlCount++;
|
|
62
69
|
return (0, crawler_1.crawlUrl)({
|
|
63
70
|
url,
|
|
64
71
|
lastModified: sitemapItem.lastmod,
|
|
@@ -66,6 +73,12 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
|
|
|
66
73
|
includeHtml: true,
|
|
67
74
|
});
|
|
68
75
|
}), { concurrency: config_1.config.siteCron.sitemapConcurrency });
|
|
76
|
+
config_1.logger.info('Enqueued jobs from sitemap finished', {
|
|
77
|
+
url,
|
|
78
|
+
pathname,
|
|
79
|
+
processCount,
|
|
80
|
+
crawlCount,
|
|
81
|
+
});
|
|
69
82
|
return jobIds;
|
|
70
83
|
}
|
|
71
84
|
catch (error) {
|
package/lib/cjs/store/job.d.ts
CHANGED
|
@@ -11,6 +11,7 @@ export interface JobState {
|
|
|
11
11
|
timeout?: number;
|
|
12
12
|
fullPage?: boolean;
|
|
13
13
|
lastModified?: string;
|
|
14
|
+
headers?: Record<string, string>;
|
|
14
15
|
}
|
|
15
16
|
export interface JobModel {
|
|
16
17
|
id: string;
|
|
@@ -31,4 +32,7 @@ export declare class Job extends Model<JobModel> implements JobModel {
|
|
|
31
32
|
cancelled: JobModel['cancelled'];
|
|
32
33
|
static initModel(sequelize: Sequelize): typeof Job;
|
|
33
34
|
static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
|
|
35
|
+
static isExists(condition: Partial<JobState> & {
|
|
36
|
+
url: string;
|
|
37
|
+
}): Promise<JobModel | null | undefined>;
|
|
34
38
|
}
|
package/lib/cjs/store/job.js
CHANGED
|
@@ -41,9 +41,13 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
41
41
|
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
42
42
|
});
|
|
43
43
|
};
|
|
44
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
45
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
46
|
+
};
|
|
44
47
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
45
48
|
exports.Job = void 0;
|
|
46
49
|
const core_1 = __importStar(require("@sequelize/core"));
|
|
50
|
+
const isEqual_1 = __importDefault(require("lodash/isEqual"));
|
|
47
51
|
class Job extends core_1.Model {
|
|
48
52
|
static initModel(sequelize) {
|
|
49
53
|
return Job.init({
|
|
@@ -106,5 +110,22 @@ class Job extends core_1.Model {
|
|
|
106
110
|
return (job === null || job === void 0 ? void 0 : job.toJSON()) || null;
|
|
107
111
|
});
|
|
108
112
|
}
|
|
113
|
+
static isExists(condition) {
|
|
114
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
115
|
+
const jobs = yield Job.findAll({
|
|
116
|
+
where: core_1.default.where(core_1.default.fn('json_extract', core_1.default.col('job'), '$.url'), condition.url),
|
|
117
|
+
});
|
|
118
|
+
if (!(jobs === null || jobs === void 0 ? void 0 : jobs.length)) {
|
|
119
|
+
return null;
|
|
120
|
+
}
|
|
121
|
+
const existsJob = jobs.find((job) => {
|
|
122
|
+
const jobModel = job.get('job');
|
|
123
|
+
return Object.keys(condition).every((key) => {
|
|
124
|
+
return (0, isEqual_1.default)(condition[key], jobModel[key]);
|
|
125
|
+
});
|
|
126
|
+
});
|
|
127
|
+
return existsJob === null || existsJob === void 0 ? void 0 : existsJob.get();
|
|
128
|
+
});
|
|
129
|
+
}
|
|
109
130
|
}
|
|
110
131
|
exports.Job = Job;
|
package/lib/cjs/utils.js
CHANGED
|
@@ -153,7 +153,7 @@ exports.isSelfCrawler = isSelfCrawler;
|
|
|
153
153
|
* Check if the request is a static file
|
|
154
154
|
*/
|
|
155
155
|
function isStaticFile(req) {
|
|
156
|
-
const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
|
|
156
|
+
const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})([\\?#]|$)`, 'i');
|
|
157
157
|
return excludeUrlPattern.test(req.path);
|
|
158
158
|
}
|
|
159
159
|
/**
|
package/lib/esm/crawler.d.ts
CHANGED
|
@@ -5,17 +5,8 @@ export declare function getDataDir(): Promise<{
|
|
|
5
5
|
htmlDir: string;
|
|
6
6
|
screenshotDir: string;
|
|
7
7
|
}>;
|
|
8
|
-
export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
|
|
9
|
-
|
|
10
|
-
includeScreenshot?: boolean;
|
|
11
|
-
includeHtml?: boolean;
|
|
12
|
-
width?: number;
|
|
13
|
-
height?: number;
|
|
14
|
-
quality?: number;
|
|
15
|
-
timeout?: number;
|
|
16
|
-
fullPage?: boolean;
|
|
17
|
-
}) => Promise<{
|
|
18
|
-
html: string;
|
|
8
|
+
export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, }: JobState) => Promise<{
|
|
9
|
+
html: string | null;
|
|
19
10
|
screenshot: Uint8Array<ArrayBufferLike> | null;
|
|
20
11
|
meta: {
|
|
21
12
|
title?: string;
|
|
@@ -27,4 +18,4 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
|
|
|
27
18
|
* @param params
|
|
28
19
|
* @param callback callback when job finished
|
|
29
20
|
*/
|
|
30
|
-
export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
|
|
21
|
+
export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
|
package/lib/esm/crawler.js
CHANGED
|
@@ -128,18 +128,14 @@ function saveSnapshotToLocal(_a) {
|
|
|
128
128
|
};
|
|
129
129
|
});
|
|
130
130
|
}
|
|
131
|
-
function formatHtml(htmlString) {
|
|
132
|
-
if (htmlString.includes('<h2>Unexpected Application Error!</h2>')) {
|
|
133
|
-
return '';
|
|
134
|
-
}
|
|
135
|
-
return htmlString;
|
|
136
|
-
}
|
|
137
|
-
export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
|
|
138
|
-
logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
|
|
131
|
+
export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, }) {
|
|
139
132
|
const page = yield initPage();
|
|
140
133
|
if (width && height) {
|
|
141
134
|
yield page.setViewport({ width, height, deviceScaleFactor: 2 });
|
|
142
135
|
}
|
|
136
|
+
if (headers) {
|
|
137
|
+
yield page.setExtraHTTPHeaders(headers);
|
|
138
|
+
}
|
|
143
139
|
let html = null;
|
|
144
140
|
let screenshot = null;
|
|
145
141
|
const meta = {};
|
|
@@ -149,7 +145,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
149
145
|
throw new Error(`Failed to load page: response is null for ${url}`);
|
|
150
146
|
}
|
|
151
147
|
const statusCode = response.status();
|
|
152
|
-
logger.debug('getPageContent.response', { response, statusCode });
|
|
153
148
|
if (![200, 304].includes(statusCode)) {
|
|
154
149
|
throw new Error(`Request failed with status ${statusCode}, in ${url}`);
|
|
155
150
|
}
|
|
@@ -198,6 +193,11 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
198
193
|
description,
|
|
199
194
|
};
|
|
200
195
|
});
|
|
196
|
+
// check if the page is an error page
|
|
197
|
+
const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
|
|
198
|
+
if (isErrorPage) {
|
|
199
|
+
throw new Error('Page is an error page');
|
|
200
|
+
}
|
|
201
201
|
meta.title = data.title;
|
|
202
202
|
meta.description = data.description;
|
|
203
203
|
if (includeHtml) {
|
|
@@ -216,7 +216,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
216
216
|
finally {
|
|
217
217
|
yield page.close();
|
|
218
218
|
}
|
|
219
|
-
html = formatHtml(html || '');
|
|
220
219
|
return {
|
|
221
220
|
html,
|
|
222
221
|
screenshot,
|
|
@@ -228,24 +227,17 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
228
227
|
* @param params
|
|
229
228
|
* @param callback callback when job finished
|
|
230
229
|
*/
|
|
230
|
+
// eslint-disable-next-line require-await
|
|
231
231
|
export function crawlUrl(params, callback) {
|
|
232
232
|
return __awaiter(this, void 0, void 0, function* () {
|
|
233
233
|
params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
|
|
234
234
|
// skip duplicate job
|
|
235
|
-
const
|
|
236
|
-
|
|
237
|
-
includeScreenshot: params.includeScreenshot,
|
|
238
|
-
includeHtml: params.includeHtml,
|
|
239
|
-
quality: params.quality,
|
|
240
|
-
width: params.width,
|
|
241
|
-
height: params.height,
|
|
242
|
-
fullPage: params.fullPage,
|
|
243
|
-
})) || {};
|
|
244
|
-
if (duplicateJob) {
|
|
235
|
+
const existsJob = yield Job.isExists(params);
|
|
236
|
+
if (existsJob) {
|
|
245
237
|
logger.info(`Crawl job already exists for ${params.url}, skip`);
|
|
246
|
-
return;
|
|
238
|
+
return existsJob.id;
|
|
247
239
|
}
|
|
248
|
-
logger.info('
|
|
240
|
+
logger.info('enqueue crawl job', params);
|
|
249
241
|
const jobId = randomUUID();
|
|
250
242
|
const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
|
|
251
243
|
job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
|
package/lib/esm/site.d.ts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
import { Site } from './config';
|
|
2
|
-
export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null | undefined)[]>;
|
|
2
|
+
export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
|
package/lib/esm/site.js
CHANGED
|
@@ -36,6 +36,7 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
|
|
|
36
36
|
});
|
|
37
37
|
logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
|
|
38
38
|
let processCount = 0;
|
|
39
|
+
let crawlCount = 0;
|
|
39
40
|
crawlBlockletRunningMap.set(key, true);
|
|
40
41
|
try {
|
|
41
42
|
const jobIds = yield pMap(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
|
|
@@ -52,7 +53,13 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
|
|
|
52
53
|
return null;
|
|
53
54
|
}
|
|
54
55
|
}
|
|
55
|
-
logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`);
|
|
56
|
+
logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`, {
|
|
57
|
+
snapshotExists: !!snapshot,
|
|
58
|
+
lastModified: snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified,
|
|
59
|
+
sitemapLastmod: sitemapItem.lastmod,
|
|
60
|
+
url,
|
|
61
|
+
});
|
|
62
|
+
crawlCount++;
|
|
56
63
|
return crawlUrl({
|
|
57
64
|
url,
|
|
58
65
|
lastModified: sitemapItem.lastmod,
|
|
@@ -60,6 +67,12 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
|
|
|
60
67
|
includeHtml: true,
|
|
61
68
|
});
|
|
62
69
|
}), { concurrency: config.siteCron.sitemapConcurrency });
|
|
70
|
+
logger.info('Enqueued jobs from sitemap finished', {
|
|
71
|
+
url,
|
|
72
|
+
pathname,
|
|
73
|
+
processCount,
|
|
74
|
+
crawlCount,
|
|
75
|
+
});
|
|
63
76
|
return jobIds;
|
|
64
77
|
}
|
|
65
78
|
catch (error) {
|
package/lib/esm/store/job.d.ts
CHANGED
|
@@ -11,6 +11,7 @@ export interface JobState {
|
|
|
11
11
|
timeout?: number;
|
|
12
12
|
fullPage?: boolean;
|
|
13
13
|
lastModified?: string;
|
|
14
|
+
headers?: Record<string, string>;
|
|
14
15
|
}
|
|
15
16
|
export interface JobModel {
|
|
16
17
|
id: string;
|
|
@@ -31,4 +32,7 @@ export declare class Job extends Model<JobModel> implements JobModel {
|
|
|
31
32
|
cancelled: JobModel['cancelled'];
|
|
32
33
|
static initModel(sequelize: Sequelize): typeof Job;
|
|
33
34
|
static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
|
|
35
|
+
static isExists(condition: Partial<JobState> & {
|
|
36
|
+
url: string;
|
|
37
|
+
}): Promise<JobModel | null | undefined>;
|
|
34
38
|
}
|
package/lib/esm/store/job.js
CHANGED
|
@@ -8,6 +8,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
8
8
|
});
|
|
9
9
|
};
|
|
10
10
|
import sequelize, { DataTypes, Model } from '@sequelize/core';
|
|
11
|
+
import isEqual from 'lodash/isEqual';
|
|
11
12
|
export class Job extends Model {
|
|
12
13
|
static initModel(sequelize) {
|
|
13
14
|
return Job.init({
|
|
@@ -70,4 +71,21 @@ export class Job extends Model {
|
|
|
70
71
|
return (job === null || job === void 0 ? void 0 : job.toJSON()) || null;
|
|
71
72
|
});
|
|
72
73
|
}
|
|
74
|
+
static isExists(condition) {
|
|
75
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
76
|
+
const jobs = yield Job.findAll({
|
|
77
|
+
where: sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), '$.url'), condition.url),
|
|
78
|
+
});
|
|
79
|
+
if (!(jobs === null || jobs === void 0 ? void 0 : jobs.length)) {
|
|
80
|
+
return null;
|
|
81
|
+
}
|
|
82
|
+
const existsJob = jobs.find((job) => {
|
|
83
|
+
const jobModel = job.get('job');
|
|
84
|
+
return Object.keys(condition).every((key) => {
|
|
85
|
+
return isEqual(condition[key], jobModel[key]);
|
|
86
|
+
});
|
|
87
|
+
});
|
|
88
|
+
return existsJob === null || existsJob === void 0 ? void 0 : existsJob.get();
|
|
89
|
+
});
|
|
90
|
+
}
|
|
73
91
|
}
|
package/lib/esm/utils.js
CHANGED
|
@@ -140,7 +140,7 @@ export const isSelfCrawler = (req) => {
|
|
|
140
140
|
* Check if the request is a static file
|
|
141
141
|
*/
|
|
142
142
|
export function isStaticFile(req) {
|
|
143
|
-
const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
|
|
143
|
+
const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})([\\?#]|$)`, 'i');
|
|
144
144
|
return excludeUrlPattern.test(req.path);
|
|
145
145
|
}
|
|
146
146
|
/**
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@arcblock/crawler",
|
|
3
|
-
"version": "1.1.
|
|
3
|
+
"version": "1.1.4",
|
|
4
4
|
"main": "lib/cjs/index.js",
|
|
5
5
|
"module": "lib/esm/index.js",
|
|
6
6
|
"types": "lib/cjs/index.d.ts",
|
|
@@ -55,12 +55,9 @@
|
|
|
55
55
|
"@sequelize/sqlite3": "7.0.0-alpha.46",
|
|
56
56
|
"axios": "^1.7.9",
|
|
57
57
|
"fs-extra": "^11.2.0",
|
|
58
|
-
"generic-pool": "^3.9.0",
|
|
59
58
|
"lodash": "^4.17.21",
|
|
60
59
|
"lru-cache": "^10.4.3",
|
|
61
|
-
"redis": "^4.7.0",
|
|
62
60
|
"robots-parser": "^3.0.1",
|
|
63
|
-
"sequelize": "^6.37.7",
|
|
64
61
|
"sitemap": "^7.1.2",
|
|
65
62
|
"sqlite3": "^5.1.7",
|
|
66
63
|
"ufo": "^1.5.4",
|