@arcblock/crawler 1.1.3 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/crawler.d.ts +3 -12
- package/lib/cjs/crawler.js +30 -23
- package/lib/cjs/site.d.ts +1 -1
- package/lib/cjs/site.js +14 -1
- package/lib/cjs/store/job.d.ts +7 -0
- package/lib/cjs/store/job.js +21 -0
- package/lib/cjs/store/snapshot.d.ts +1 -0
- package/lib/cjs/utils.js +6 -1
- package/lib/esm/crawler.d.ts +3 -12
- package/lib/esm/crawler.js +30 -23
- package/lib/esm/site.d.ts +1 -1
- package/lib/esm/site.js +14 -1
- package/lib/esm/store/job.d.ts +7 -0
- package/lib/esm/store/job.js +18 -0
- package/lib/esm/store/snapshot.d.ts +1 -0
- package/lib/esm/utils.js +6 -1
- package/package.json +9 -10
package/lib/cjs/crawler.d.ts
CHANGED
@@ -5,17 +5,8 @@ export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
-    url: string;
-    includeScreenshot?: boolean;
-    includeHtml?: boolean;
-    width?: number;
-    height?: number;
-    quality?: number;
-    timeout?: number;
-    fullPage?: boolean;
-}) => Promise<{
-    html: string;
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
+    html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
         title?: string;
@@ -27,4 +18,4 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
  * @param params
  * @param callback callback when job finished
  */
-export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string
+export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
package/lib/cjs/crawler.js
CHANGED
@@ -59,7 +59,12 @@ function createCrawlQueue() {
         // }
         try {
             // get page content later
-            const result = yield (0, exports.getPageContent)(
+            const result = yield (0, exports.getPageContent)(Object.assign({ localStorage: {
+                    // for blocklet theme
+                    blocklet_theme_prefer: 'light',
+                    // for blocklet domain warning
+                    'domain-warning-skip': Date.now().toString(),
+                } }, job));
             if (!result || (!result.html && !result.screenshot)) {
                 config_1.logger.error(`failed to crawl ${job.url}, empty content`, job);
                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
@@ -137,18 +142,24 @@ function saveSnapshotToLocal(_a) {
         };
     });
 }
-function formatHtml(htmlString) {
-    if (htmlString.includes('<h2>Unexpected Application Error!</h2>')) {
-        return '';
-    }
-    return htmlString;
-}
-const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
-    config_1.logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
+const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies = [], localStorage, }) {
     const page = yield (0, puppeteer_1.initPage)();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
     }
+    if (headers) {
+        yield page.setExtraHTTPHeaders(headers);
+    }
+    if (cookies === null || cookies === void 0 ? void 0 : cookies.length) {
+        yield page.setCookie(...cookies);
+    }
+    if (localStorage) {
+        yield page.evaluateOnNewDocument((items) => {
+            Object.entries(items).forEach(([key, value]) => {
+                window.localStorage.setItem(key, value);
+            });
+        }, localStorage);
+    }
     let html = null;
     let screenshot = null;
     const meta = {};
@@ -158,7 +169,6 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
         throw new Error(`Failed to load page: response is null for ${url}`);
     }
     const statusCode = response.status();
-    config_1.logger.debug('getPageContent.response', { response, statusCode });
     if (![200, 304].includes(statusCode)) {
         throw new Error(`Request failed with status ${statusCode}, in ${url}`);
     }
@@ -207,6 +217,11 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
             description,
         };
     });
+    // check if the page is an error page
+    const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
+    if (isErrorPage) {
+        throw new Error('Page is an error page');
+    }
     meta.title = data.title;
     meta.description = data.description;
     if (includeHtml) {
@@ -225,7 +240,6 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
     finally {
         yield page.close();
     }
-    html = formatHtml(html || '');
     return {
         html,
         screenshot,
@@ -238,24 +252,17 @@ exports.getPageContent = getPageContent;
  * @param params
  * @param callback callback when job finished
  */
+// eslint-disable-next-line require-await
 function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
         params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
         // skip duplicate job
-        const duplicateJob = (yield job_1.Job.findJob({
-            url: params.url,
-            includeScreenshot: params.includeScreenshot,
-            includeHtml: params.includeHtml,
-            quality: params.quality,
-            width: params.width,
-            height: params.height,
-            fullPage: params.fullPage,
-        })) || {};
-        if (duplicateJob) {
+        const existsJob = yield job_1.Job.isExists(params);
+        if (existsJob) {
             config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
-            return
+            return existsJob.id;
         }
-        config_1.logger.info('
+        config_1.logger.info('enqueue crawl job', params);
         const jobId = (0, crypto_1.randomUUID)();
         const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
         job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
package/lib/cjs/site.d.ts
CHANGED
@@ -1,2 +1,2 @@
 import { Site } from './config';
-export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null
+export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
package/lib/cjs/site.js
CHANGED
@@ -42,6 +42,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
     });
     config_1.logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
     let processCount = 0;
+    let crawlCount = 0;
     crawlBlockletRunningMap.set(key, true);
     try {
         const jobIds = yield (0, p_map_1.default)(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
@@ -58,7 +59,13 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
                     return null;
                 }
             }
-            config_1.logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}
+            config_1.logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`, {
+                snapshotExists: !!snapshot,
+                lastModified: snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified,
+                sitemapLastmod: sitemapItem.lastmod,
+                url,
+            });
+            crawlCount++;
             return (0, crawler_1.crawlUrl)({
                 url,
                 lastModified: sitemapItem.lastmod,
@@ -66,6 +73,12 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
                 includeHtml: true,
             });
         }), { concurrency: config_1.config.siteCron.sitemapConcurrency });
+        config_1.logger.info('Enqueued jobs from sitemap finished', {
+            url,
+            pathname,
+            processCount,
+            crawlCount,
+        });
         return jobIds;
     }
     catch (error) {
package/lib/cjs/store/job.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { Model, Sequelize } from '@sequelize/core';
 export interface JobState {
     id?: string;
@@ -11,6 +12,9 @@ export interface JobState {
     timeout?: number;
     fullPage?: boolean;
     lastModified?: string;
+    headers?: Record<string, string>;
+    cookies?: CookieParam[];
+    localStorage?: Record<string, string>;
 }
 export interface JobModel {
     id: string;
@@ -31,4 +35,7 @@ export declare class Job extends Model<JobModel> implements JobModel {
     cancelled: JobModel['cancelled'];
     static initModel(sequelize: Sequelize): typeof Job;
     static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
+    static isExists(condition: Partial<JobState> & {
+        url: string;
+    }): Promise<JobModel | null | undefined>;
 }
package/lib/cjs/store/job.js
CHANGED
@@ -41,9 +41,13 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
         step((generator = generator.apply(thisArg, _arguments || [])).next());
     });
 };
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.Job = void 0;
 const core_1 = __importStar(require("@sequelize/core"));
+const isEqual_1 = __importDefault(require("lodash/isEqual"));
 class Job extends core_1.Model {
     static initModel(sequelize) {
         return Job.init({
@@ -106,5 +110,22 @@ class Job extends core_1.Model {
             return (job === null || job === void 0 ? void 0 : job.toJSON()) || null;
         });
     }
+    static isExists(condition) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const jobs = yield Job.findAll({
+                where: core_1.default.where(core_1.default.fn('json_extract', core_1.default.col('job'), '$.url'), condition.url),
+            });
+            if (!(jobs === null || jobs === void 0 ? void 0 : jobs.length)) {
+                return null;
+            }
+            const existsJob = jobs.find((job) => {
+                const jobModel = job.get('job');
+                return Object.keys(condition).every((key) => {
+                    return (0, isEqual_1.default)(condition[key], jobModel[key]);
+                });
+            });
+            return existsJob === null || existsJob === void 0 ? void 0 : existsJob.get();
+        });
+    }
 }
 exports.Job = Job;
package/lib/cjs/utils.js
CHANGED
@@ -134,6 +134,11 @@ const staticFileExtensions = [
     'xls',
     'xml',
     'zip',
+    'ts',
+    'json',
+    'md',
+    'yml',
+    'yaml',
 ];
 const sleep = (ms) => {
     return new Promise((resolve) => {
@@ -153,7 +158,7 @@ exports.isSelfCrawler = isSelfCrawler;
  * Check if the request is a static file
  */
 function isStaticFile(req) {
-    const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})
+    const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})([\\?#]|$)`, 'i');
     return excludeUrlPattern.test(req.path);
 }
 /**
package/lib/esm/crawler.d.ts
CHANGED
@@ -5,17 +5,8 @@ export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
-    url: string;
-    includeScreenshot?: boolean;
-    includeHtml?: boolean;
-    width?: number;
-    height?: number;
-    quality?: number;
-    timeout?: number;
-    fullPage?: boolean;
-}) => Promise<{
-    html: string;
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
+    html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
         title?: string;
@@ -27,4 +18,4 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
  * @param params
  * @param callback callback when job finished
  */
-export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string
+export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
|
package/lib/esm/crawler.js
CHANGED
|
@@ -50,7 +50,12 @@ export function createCrawlQueue() {
|
|
|
50
50
|
// }
|
|
51
51
|
try {
|
|
52
52
|
// get page content later
|
|
53
|
-
const result = yield getPageContent(
|
|
53
|
+
const result = yield getPageContent(Object.assign({ localStorage: {
|
|
54
|
+
// for blocklet theme
|
|
55
|
+
blocklet_theme_prefer: 'light',
|
|
56
|
+
// for blocklet domain warning
|
|
57
|
+
'domain-warning-skip': Date.now().toString(),
|
|
58
|
+
} }, job));
|
|
54
59
|
if (!result || (!result.html && !result.screenshot)) {
|
|
55
60
|
logger.error(`failed to crawl ${job.url}, empty content`, job);
|
|
56
61
|
const snapshot = convertJobToSnapshot({
|
|
@@ -128,18 +133,24 @@ function saveSnapshotToLocal(_a) {
|
|
|
128
133
|
};
|
|
129
134
|
});
|
|
130
135
|
}
|
|
131
|
-
function
|
|
132
|
-
if (htmlString.includes('<h2>Unexpected Application Error!</h2>')) {
|
|
133
|
-
return '';
|
|
134
|
-
}
|
|
135
|
-
return htmlString;
|
|
136
|
-
}
|
|
137
|
-
export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
|
|
138
|
-
logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
|
|
136
|
+
export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies = [], localStorage, }) {
|
|
139
137
|
const page = yield initPage();
|
|
140
138
|
if (width && height) {
|
|
141
139
|
yield page.setViewport({ width, height, deviceScaleFactor: 2 });
|
|
142
140
|
}
|
|
141
|
+
if (headers) {
|
|
142
|
+
yield page.setExtraHTTPHeaders(headers);
|
|
143
|
+
}
|
|
144
|
+
if (cookies === null || cookies === void 0 ? void 0 : cookies.length) {
|
|
145
|
+
yield page.setCookie(...cookies);
|
|
146
|
+
}
|
|
147
|
+
if (localStorage) {
|
|
148
|
+
yield page.evaluateOnNewDocument((items) => {
|
|
149
|
+
Object.entries(items).forEach(([key, value]) => {
|
|
150
|
+
window.localStorage.setItem(key, value);
|
|
151
|
+
});
|
|
152
|
+
}, localStorage);
|
|
153
|
+
}
|
|
143
154
|
let html = null;
|
|
144
155
|
let screenshot = null;
|
|
145
156
|
const meta = {};
|
|
@@ -149,7 +160,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
149
160
|
throw new Error(`Failed to load page: response is null for ${url}`);
|
|
150
161
|
}
|
|
151
162
|
const statusCode = response.status();
|
|
152
|
-
logger.debug('getPageContent.response', { response, statusCode });
|
|
153
163
|
if (![200, 304].includes(statusCode)) {
|
|
154
164
|
throw new Error(`Request failed with status ${statusCode}, in ${url}`);
|
|
155
165
|
}
|
|
@@ -198,6 +208,11 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
198
208
|
description,
|
|
199
209
|
};
|
|
200
210
|
});
|
|
211
|
+
// check if the page is an error page
|
|
212
|
+
const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
|
|
213
|
+
if (isErrorPage) {
|
|
214
|
+
throw new Error('Page is an error page');
|
|
215
|
+
}
|
|
201
216
|
meta.title = data.title;
|
|
202
217
|
meta.description = data.description;
|
|
203
218
|
if (includeHtml) {
|
|
@@ -216,7 +231,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
216
231
|
finally {
|
|
217
232
|
yield page.close();
|
|
218
233
|
}
|
|
219
|
-
html = formatHtml(html || '');
|
|
220
234
|
return {
|
|
221
235
|
html,
|
|
222
236
|
screenshot,
|
|
@@ -228,24 +242,17 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
228
242
|
* @param params
|
|
229
243
|
* @param callback callback when job finished
|
|
230
244
|
*/
|
|
245
|
+
// eslint-disable-next-line require-await
|
|
231
246
|
export function crawlUrl(params, callback) {
|
|
232
247
|
return __awaiter(this, void 0, void 0, function* () {
|
|
233
248
|
params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
|
|
234
249
|
// skip duplicate job
|
|
235
|
-
const
|
|
236
|
-
|
|
237
|
-
includeScreenshot: params.includeScreenshot,
|
|
238
|
-
includeHtml: params.includeHtml,
|
|
239
|
-
quality: params.quality,
|
|
240
|
-
width: params.width,
|
|
241
|
-
height: params.height,
|
|
242
|
-
fullPage: params.fullPage,
|
|
243
|
-
})) || {};
|
|
244
|
-
if (duplicateJob) {
|
|
250
|
+
const existsJob = yield Job.isExists(params);
|
|
251
|
+
if (existsJob) {
|
|
245
252
|
logger.info(`Crawl job already exists for ${params.url}, skip`);
|
|
246
|
-
return
|
|
253
|
+
return existsJob.id;
|
|
247
254
|
}
|
|
248
|
-
logger.info('
|
|
255
|
+
logger.info('enqueue crawl job', params);
|
|
249
256
|
const jobId = randomUUID();
|
|
250
257
|
const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
|
|
251
258
|
job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
|
package/lib/esm/site.d.ts
CHANGED
@@ -1,2 +1,2 @@
 import { Site } from './config';
-export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null
+export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
package/lib/esm/site.js
CHANGED
@@ -36,6 +36,7 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
     });
     logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
     let processCount = 0;
+    let crawlCount = 0;
     crawlBlockletRunningMap.set(key, true);
     try {
         const jobIds = yield pMap(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
@@ -52,7 +53,13 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
                     return null;
                 }
             }
-            logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}
+            logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`, {
+                snapshotExists: !!snapshot,
+                lastModified: snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified,
+                sitemapLastmod: sitemapItem.lastmod,
+                url,
+            });
+            crawlCount++;
             return crawlUrl({
                 url,
                 lastModified: sitemapItem.lastmod,
@@ -60,6 +67,12 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
                 includeHtml: true,
             });
         }), { concurrency: config.siteCron.sitemapConcurrency });
+        logger.info('Enqueued jobs from sitemap finished', {
+            url,
+            pathname,
+            processCount,
+            crawlCount,
+        });
         return jobIds;
     }
     catch (error) {
package/lib/esm/store/job.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { Model, Sequelize } from '@sequelize/core';
 export interface JobState {
     id?: string;
@@ -11,6 +12,9 @@ export interface JobState {
     timeout?: number;
     fullPage?: boolean;
     lastModified?: string;
+    headers?: Record<string, string>;
+    cookies?: CookieParam[];
+    localStorage?: Record<string, string>;
 }
 export interface JobModel {
     id: string;
@@ -31,4 +35,7 @@ export declare class Job extends Model<JobModel> implements JobModel {
     cancelled: JobModel['cancelled'];
     static initModel(sequelize: Sequelize): typeof Job;
     static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
+    static isExists(condition: Partial<JobState> & {
+        url: string;
+    }): Promise<JobModel | null | undefined>;
 }
package/lib/esm/store/job.js
CHANGED
@@ -8,6 +8,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
     });
 };
 import sequelize, { DataTypes, Model } from '@sequelize/core';
+import isEqual from 'lodash/isEqual';
 export class Job extends Model {
     static initModel(sequelize) {
         return Job.init({
@@ -70,4 +71,21 @@ export class Job extends Model {
             return (job === null || job === void 0 ? void 0 : job.toJSON()) || null;
         });
     }
+    static isExists(condition) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const jobs = yield Job.findAll({
+                where: sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), '$.url'), condition.url),
+            });
+            if (!(jobs === null || jobs === void 0 ? void 0 : jobs.length)) {
+                return null;
+            }
+            const existsJob = jobs.find((job) => {
+                const jobModel = job.get('job');
+                return Object.keys(condition).every((key) => {
+                    return isEqual(condition[key], jobModel[key]);
+                });
+            });
+            return existsJob === null || existsJob === void 0 ? void 0 : existsJob.get();
+        });
+    }
 }
package/lib/esm/utils.js
CHANGED
@@ -123,6 +123,11 @@ const staticFileExtensions = [
     'xls',
     'xml',
     'zip',
+    'ts',
+    'json',
+    'md',
+    'yml',
+    'yaml',
 ];
 export const sleep = (ms) => {
     return new Promise((resolve) => {
@@ -140,7 +145,7 @@ export const isSelfCrawler = (req) => {
  * Check if the request is a static file
  */
 export function isStaticFile(req) {
-    const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})
+    const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})([\\?#]|$)`, 'i');
     return excludeUrlPattern.test(req.path);
 }
 /**
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@arcblock/crawler",
-  "version": "1.1.3",
+  "version": "1.1.5",
   "main": "lib/cjs/index.js",
   "module": "lib/esm/index.js",
   "types": "lib/cjs/index.d.ts",
@@ -45,33 +45,32 @@
     ]
   },
   "dependencies": {
-    "@abtnode/cron": "^1.16.
-    "@abtnode/models": "^1.16.
-    "@abtnode/queue": "^1.16.
-    "@blocklet/logger": "^1.16.
+    "@abtnode/cron": "^1.16.44",
+    "@abtnode/models": "^1.16.44",
+    "@abtnode/queue": "^1.16.44",
+    "@blocklet/logger": "^1.16.44",
     "@blocklet/puppeteer": "^22.11.3",
-    "@blocklet/sdk": "^1.16.
+    "@blocklet/sdk": "^1.16.44",
     "@sequelize/core": "7.0.0-alpha.46",
     "@sequelize/sqlite3": "7.0.0-alpha.46",
     "axios": "^1.7.9",
     "fs-extra": "^11.2.0",
     "lodash": "^4.17.21",
     "lru-cache": "^10.4.3",
+    "p-map": "^7.0.3",
     "robots-parser": "^3.0.1",
     "sitemap": "^7.1.2",
    "sqlite3": "^5.1.7",
-    "ufo": "^1.5.4"
-    "p-map": "^7.0.3"
+    "ufo": "^1.5.4"
   },
   "devDependencies": {
-    "@blocklet/js-sdk": "^1.16.39",
     "@types/dotenv-flow": "^3.3.3",
     "@types/express": "^4.17.21",
     "@types/fs-extra": "^11.0.4",
     "@types/lodash": "^4.17.16",
     "@types/node": "^20.17.19",
-    "express": "^4.21.2",
     "bumpp": "^9.11.1",
+    "express": "^4.21.2",
     "nodemon": "^3.1.9",
     "npm-run-all": "^4.1.5",
     "puppeteer": "^24.8.2",