@arcblock/crawler 1.0.5 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/lib/cjs/config.d.ts +22 -0
- package/{dist → lib/cjs}/config.js +9 -3
- package/lib/cjs/crawler.d.ts +26 -0
- package/{dist → lib/cjs}/crawler.js +56 -113
- package/lib/cjs/cron.d.ts +1 -0
- package/lib/cjs/cron.js +49 -0
- package/lib/cjs/index.d.ts +9 -0
- package/lib/cjs/index.js +78 -0
- package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
- package/{dist → lib/cjs}/puppeteer.js +43 -54
- package/lib/cjs/services/snapshot.d.ts +12 -0
- package/lib/cjs/services/snapshot.js +84 -0
- package/lib/cjs/site.d.ts +2 -0
- package/lib/cjs/site.js +76 -0
- package/lib/cjs/store/index.d.ts +3 -0
- package/{dist/db → lib/cjs/store}/index.js +21 -5
- package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
- package/lib/cjs/store/job.js +110 -0
- package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
- package/lib/cjs/store/snapshot.js +68 -0
- package/lib/cjs/utils.d.ts +32 -0
- package/{dist → lib/cjs}/utils.js +88 -78
- package/lib/esm/config.d.ts +22 -0
- package/{esm → lib/esm}/config.js +9 -3
- package/lib/esm/crawler.d.ts +26 -0
- package/{esm → lib/esm}/crawler.js +48 -102
- package/lib/esm/cron.d.ts +1 -0
- package/lib/esm/cron.js +43 -0
- package/lib/esm/index.d.ts +9 -0
- package/{esm → lib/esm}/index.js +19 -10
- package/{dist → lib/esm}/puppeteer.d.ts +2 -2
- package/{esm → lib/esm}/puppeteer.js +26 -37
- package/lib/esm/services/snapshot.d.ts +12 -0
- package/lib/esm/services/snapshot.js +75 -0
- package/lib/esm/site.d.ts +2 -0
- package/lib/esm/site.js +69 -0
- package/lib/esm/store/index.d.ts +3 -0
- package/{esm/db → lib/esm/store}/index.js +22 -6
- package/{esm/db → lib/esm/store}/job.d.ts +4 -3
- package/lib/esm/store/job.js +73 -0
- package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
- package/lib/esm/store/snapshot.js +64 -0
- package/lib/esm/utils.d.ts +32 -0
- package/{esm → lib/esm}/utils.js +84 -71
- package/package.json +22 -33
- package/third.d.ts +0 -0
- package/dist/blocklet.d.ts +0 -6
- package/dist/blocklet.js +0 -199
- package/dist/cache.d.ts +0 -10
- package/dist/cache.js +0 -119
- package/dist/config.d.ts +0 -10
- package/dist/crawler.d.ts +0 -28
- package/dist/db/index.d.ts +0 -1
- package/dist/db/job.js +0 -54
- package/dist/db/snapshot.js +0 -52
- package/dist/index.d.ts +0 -6
- package/dist/index.js +0 -45
- package/dist/middleware.d.ts +0 -4
- package/dist/middleware.js +0 -44
- package/dist/utils.d.ts +0 -15
- package/esm/blocklet.d.ts +0 -6
- package/esm/blocklet.js +0 -190
- package/esm/cache.d.ts +0 -10
- package/esm/cache.js +0 -114
- package/esm/config.d.ts +0 -10
- package/esm/crawler.d.ts +0 -28
- package/esm/db/index.d.ts +0 -1
- package/esm/db/job.js +0 -50
- package/esm/db/snapshot.js +0 -48
- package/esm/index.d.ts +0 -6
- package/esm/middleware.d.ts +0 -4
- package/esm/middleware.js +0 -41
- package/esm/utils.d.ts +0 -15
@@ -0,0 +1,68 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.Snapshot = void 0;
+const core_1 = require("@sequelize/core");
+class Snapshot extends core_1.Model {
+    static initModel(sequelize) {
+        return Snapshot.init({
+            jobId: {
+                type: core_1.DataTypes.STRING,
+                primaryKey: true,
+                allowNull: false,
+            },
+            url: {
+                type: core_1.DataTypes.STRING,
+                allowNull: false,
+                index: true,
+            },
+            status: {
+                type: core_1.DataTypes.ENUM('success', 'failed', 'pending'),
+                allowNull: false,
+            },
+            html: {
+                type: core_1.DataTypes.TEXT,
+                allowNull: true,
+            },
+            screenshot: {
+                type: core_1.DataTypes.STRING,
+                allowNull: true,
+            },
+            error: {
+                type: core_1.DataTypes.STRING,
+                allowNull: true,
+            },
+            lastModified: {
+                type: core_1.DataTypes.STRING,
+                allowNull: true,
+            },
+            options: {
+                type: core_1.DataTypes.JSON,
+                allowNull: true,
+            },
+        }, {
+            sequelize,
+            modelName: 'snapshot',
+            tableName: 'snap',
+            timestamps: true,
+        });
+    }
+    static findSnapshot(condition) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const snapshot = yield Snapshot.findOne(Object.assign({ order: [
+                    ['lastModified', 'DESC'],
+                    ['updatedAt', 'DESC'],
+                ] }, condition));
+            return (snapshot === null || snapshot === void 0 ? void 0 : snapshot.toJSON()) || null;
+        });
+    }
+}
+exports.Snapshot = Snapshot;
@@ -0,0 +1,32 @@
+import { Page } from '@blocklet/puppeteer';
+import { Request } from 'express';
+export declare const axios: import("axios").AxiosInstance;
+export declare const CRAWLER_FLAG = "x-arcblock-crawler";
+export declare const sleep: (ms: number) => Promise<unknown>;
+/**
+ * Check if the request is a arcblock crawler
+ */
+export declare const isSelfCrawler: (req: Request) => boolean;
+/**
+ * Check if the request is a static file
+ */
+export declare function isStaticFile(req: Request): boolean;
+/**
+ * Check if the request is a spider
+ */
+export declare function isSpider(req: Request): boolean;
+/**
+ * Get and parse the robots.txt by `robots-parser`
+ */
+export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
+/**
+ * Check if the url is allowed to crawl from robots.txt
+ */
+export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
+/**
+ * Get and parse the sitemap.xml by `sitemap` package
+ */
+export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
+export declare const formatUrl: (url: string) => string;
+export declare function md5(content: string | Uint8Array): string;
+export declare function findMaxScrollHeight(page: Page): Promise<number>;
@@ -12,10 +12,12 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.formatUrl = exports.
+exports.formatUrl = exports.getSitemapList = exports.isAcceptCrawler = exports.isSelfCrawler = exports.sleep = exports.CRAWLER_FLAG = exports.axios = void 0;
+exports.isStaticFile = isStaticFile;
+exports.isSpider = isSpider;
 exports.getRobots = getRobots;
 exports.md5 = md5;
-
+exports.findMaxScrollHeight = findMaxScrollHeight;
 const axios_1 = __importDefault(require("axios"));
 const flattenDeep_1 = __importDefault(require("lodash/flattenDeep"));
 const uniq_1 = __importDefault(require("lodash/uniq"));
@@ -24,24 +26,14 @@ const robots_parser_1 = __importDefault(require("robots-parser"));
 const sitemap_1 = require("sitemap");
 const stream_1 = require("stream");
 const ufo_1 = require("ufo");
-
-
+const config_1 = require("./config");
+exports.axios = axios_1.default.create({
+    timeout: 1000 * 30,
     headers: {
         'Content-Type': 'application/json',
     },
 });
-
-    return new Promise((resolve) => {
-        setTimeout(resolve, ms);
-    });
-};
-exports.sleep = sleep;
-exports.CRAWLER_FLAG = 'x-crawler';
-const isSelfCrawler = (req) => {
-    const ua = req.get('user-agent') || '';
-    return req.get(exports.CRAWLER_FLAG) === 'true' || `${ua}`.toLowerCase().indexOf('headless') !== -1;
-};
-exports.isSelfCrawler = isSelfCrawler;
+exports.CRAWLER_FLAG = 'x-arcblock-crawler';
 /**
  * A default set of user agent patterns for bots/crawlers that do not perform
  * well with pages that require JavaScript.
@@ -97,12 +89,8 @@ const botUserAgents = [
     /AlibabaGroup/i,
     /adaptive-edge-crawler/i,
 ];
-const isSpider = (ua) => botUserAgents.some((spider) => {
-    return spider.test(ua);
-});
 /**
- * A default set of file extensions for static assets that do not need to be
- * proxied.
+ * A default set of file extensions for static assets that do not need to be proxied.
  */
 const staticFileExtensions = [
     'ai',
@@ -147,89 +135,91 @@ const staticFileExtensions = [
     'xml',
     'zip',
 ];
-const
-
-
+const sleep = (ms) => {
+    return new Promise((resolve) => {
+        setTimeout(resolve, ms);
+    });
 };
-exports.
+exports.sleep = sleep;
+/**
+ * Check if the request is a arcblock crawler
+ */
+const isSelfCrawler = (req) => {
+    const ua = req.get('user-agent') || '';
+    return req.get(exports.CRAWLER_FLAG) === 'true' || ua.toLowerCase().indexOf('headless') !== -1;
+};
+exports.isSelfCrawler = isSelfCrawler;
+/**
+ * Check if the request is a static file
+ */
+function isStaticFile(req) {
+    const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
+    return excludeUrlPattern.test(req.path);
+}
+/**
+ * Check if the request is a spider
+ */
+function isSpider(req) {
+    const ua = req.get('user-agent') || '';
+    return botUserAgents.some((spider) => spider.test(ua));
+}
+/**
+ * Get and parse the robots.txt by `robots-parser`
+ */
 function getRobots(url) {
     return __awaiter(this, void 0, void 0, function* () {
         const { origin } = new URL(url);
         const robotsUrl = (0, ufo_1.joinURL)(origin, 'robots.txt?nocache=1');
-        const { data } = yield exports.
-
-
+        const { data } = yield exports.axios.get(robotsUrl).catch((error) => {
+            config_1.logger.warn(`Failed to fetch robots.txt from ${robotsUrl}:`, { error });
+            return { data: null };
+        });
         return data ? (0, robots_parser_1.default)(robotsUrl, data) : null;
     });
 }
-
-
-
-};
-exports.getDefaultSitemapUrl = getDefaultSitemapUrl;
+/**
+ * Check if the url is allowed to crawl from robots.txt
+ */
 const isAcceptCrawler = (url) => __awaiter(void 0, void 0, void 0, function* () {
     const robots = yield getRobots(url);
     const isAllowed = robots ? yield robots.isAllowed(url) : true;
     return isAllowed;
 });
 exports.isAcceptCrawler = isAcceptCrawler;
+/**
+ * Get and parse the sitemap.xml by `sitemap` package
+ */
 const getSitemapList = (url) => __awaiter(void 0, void 0, void 0, function* () {
-    let sitemapUrlList = [
+    let sitemapUrlList = [];
     const robots = yield getRobots(url);
     if (robots) {
-
-
-
-    }
+        sitemapUrlList = (yield robots.getSitemaps()) || [];
+    }
+    if (!sitemapUrlList.length) {
+        const { origin } = new URL(url);
+        sitemapUrlList.push((0, ufo_1.joinURL)(origin, 'sitemap.xml?nocache=1'));
     }
     // loop site map url list
     const sitemapList = yield Promise.all(sitemapUrlList.map((sitemapUrl) => __awaiter(void 0, void 0, void 0, function* () {
-
-
-
-
-
-
-
-
-
-
+        sitemapUrl = (0, ufo_1.withQuery)(sitemapUrl, { nocache: '1' });
+        try {
+            const { data: sitemapTxt } = yield exports.axios.get(sitemapUrl).catch(() => ({
+                data: '',
+            }));
+            if (sitemapTxt) {
+                const stream = stream_1.Readable.from([sitemapTxt]);
+                const sitemapJson = yield (0, sitemap_1.parseSitemap)(stream);
+                return sitemapJson;
+            }
+        }
+        catch (error) {
+            config_1.logger.error(`Could not get sitemap from ${sitemapUrl}`, { error });
         }
         return [];
     })));
     return (0, uniq_1.default)((0, flattenDeep_1.default)(sitemapList.filter(Boolean)));
 });
 exports.getSitemapList = getSitemapList;
-const isBotUserAgent = (req) => {
-    const ua = req.get('user-agent');
-    const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
-    if (ua === undefined || !isSpider(ua) || excludeUrlPattern.test(req.path)) {
-        return false;
-    }
-    return true;
-};
-exports.isBotUserAgent = isBotUserAgent;
-const getComponentInfo = () => {
-    return config_1.components.find((item) => item.did === config_1.env.componentDid) || {};
-};
-exports.getComponentInfo = getComponentInfo;
-const getFullUrl = (req) => {
-    const blockletPathname = req.headers['x-path-prefix']
-        ? (0, ufo_1.joinURL)(req.headers['x-path-prefix'], req.originalUrl)
-        : req.originalUrl;
-    return (0, ufo_1.joinURL)(config_1.env.appUrl, blockletPathname);
-};
-exports.getFullUrl = getFullUrl;
-const getRelativePath = (url) => {
-    try {
-        return new URL(url).pathname;
-    }
-    catch (error) {
-        // ignore error
-    }
-    return url;
-};
-exports.getRelativePath = getRelativePath;
 const formatUrl = (url) => {
     return url.replace(/\/$/, '').trim();
 };
@@ -237,3 +227,23 @@ exports.formatUrl = formatUrl;
 function md5(content) {
     return (0, node_crypto_1.createHash)('md5').update(content).digest('hex');
 }
+function findMaxScrollHeight(page) {
+    return __awaiter(this, void 0, void 0, function* () {
+        const maxHeightHandler = yield page.evaluateHandle(() => {
+            const elements = Array.from(document.querySelectorAll('*'));
+            let maxHeight = document.body.scrollHeight;
+            for (const el of elements) {
+                const style = window.getComputedStyle(el);
+                if (style.overflowY === 'auto' || style.overflowY === 'scroll') {
+                    if (el.scrollHeight > el.clientHeight && el.scrollHeight > maxHeight) {
+                        maxHeight = el.scrollHeight;
+                    }
+                }
+            }
+            return maxHeight;
+        });
+        const maxHeight = yield maxHeightHandler.jsonValue();
+        maxHeightHandler.dispose();
+        return maxHeight;
+    });
+}
@@ -0,0 +1,22 @@
+export type Site = {
+    url: string;
+    pathname: string;
+    /** Minimum crawl interval to avoid frequent crawling by scheduled tasks, in milliseconds */
+    interval?: number;
+};
+export type Config = {
+    isProd: boolean;
+    dataDir: string;
+    appDir: string;
+    appUrl: string;
+    cacheDir: string;
+    puppeteerPath?: string;
+    siteCron: {
+        sites: Site[];
+        time: string;
+        runOnInit: boolean;
+        concurrency: number;
+    };
+};
+export declare const logger: any;
+export declare const config: Config;
@@ -1,11 +1,17 @@
 import createLogger from '@blocklet/logger';
-export const logger = createLogger('crawler', { level: process.env.LOG_LEVEL || 'info' });
+export const logger = createLogger('@arcblock/crawler', { level: process.env.LOG_LEVEL || 'info' });
 export const config = {
-
+    isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
     appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     appUrl: process.env.BLOCKLET_APP_URL,
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
     cacheDir: process.env.BLOCKLET_CACHE_DIR,
-
+    // cron
+    siteCron: {
+        sites: [],
+        time: '0 0 */12 * * *',
+        runOnInit: false,
+        concurrency: 5,
+    },
 };
@@ -0,0 +1,26 @@
+import { JobState } from './store/job';
+import { SnapshotModel } from './store/snapshot';
+export declare function createCrawlQueue(): void;
+export declare function getDataDir(): Promise<{
+    htmlDir: string;
+    screenshotDir: string;
+}>;
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
+    url: string;
+    includeScreenshot?: boolean;
+    includeHtml?: boolean;
+    width?: number;
+    height?: number;
+    quality?: number;
+    timeout?: number;
+    fullPage?: boolean;
+}) => Promise<{
+    html: string;
+    screenshot: Uint8Array<ArrayBufferLike> | null;
+}>;
+/**
+ * crawl url and return job id
+ * @param params
+ * @param callback callback when job finished
+ */
+export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
@@ -9,17 +9,15 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 };
 import createQueue from '@abtnode/queue';
 import SequelizeStore from '@abtnode/queue/lib/store/sequelize';
-import sequelize from '@sequelize/core';
 import { randomUUID } from 'crypto';
 import fs from 'fs-extra';
-import pick from 'lodash/pick';
 import path from 'path';
-import { joinURL } from 'ufo';
 import { config, logger } from './config';
-import { Job } from './db/job';
-import { Snapshot } from './db/snapshot';
 import { initPage } from './puppeteer';
-import {
+import { convertJobToSnapshot, formatSnapshot } from './services/snapshot';
+import { Job } from './store/job';
+import { Snapshot } from './store/snapshot';
+import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5 } from './utils';
 const { BaseState } = require('@abtnode/models');
 let crawlQueue;
 export function createCrawlQueue() {
@@ -28,7 +26,7 @@ export function createCrawlQueue() {
         store: new SequelizeStore(db, 'crawler'),
         concurrency: 1,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
-            logger.
+            logger.info('Starting to execute crawl job', job);
             const canCrawl = yield isAcceptCrawler(job.url);
             if (!canCrawl) {
                 logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
@@ -81,19 +79,9 @@ export function createCrawlQueue() {
                 });
                 yield Snapshot.upsert(snapshot);
                 return snapshot;
-                // save to redis
-                // if (saveToRedis) {
-                // useCache.set(url, {
-                // html: result.html || '',
-                // lastModified,
-                // });
-                // logger.info(`success to crawl ${url}`, job);
-                // return result;
-                // }
             }
             catch (error) {
                 logger.error(`Failed to crawl ${job.url}`, { error, job });
-                console.error(error.stack);
                 const snapshot = convertJobToSnapshot({
                     job,
                     snapshot: {
@@ -145,11 +133,11 @@ function formatHtml(htmlString) {
     }
     return htmlString;
 }
-export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
+export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
     logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
     const page = yield initPage();
     if (width && height) {
-        yield page.setViewport({ width, height });
+        yield page.setViewport({ width, height, deviceScaleFactor: 2 });
     }
     let html = null;
     let screenshot = null;
@@ -164,26 +152,47 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
             throw new Error(`Request failed with status ${statusCode}, in ${url}`);
         }
         // await for networkidle0
-        // https://pptr.dev/api/puppeteer.page.
+        // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
         yield page.waitForNetworkIdle({
-            idleTime:
+            idleTime: 1.5 * 1000,
         });
         // get screenshot
         if (includeScreenshot) {
+            // Try to find the tallest element and set the browser to the same height
+            if (fullPage) {
+                const maxScrollHeight = yield findMaxScrollHeight(page);
+                logger.info('findMaxScrollHeight', { maxScrollHeight });
+                if (maxScrollHeight) {
+                    yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
+                    yield page.evaluate((scrollHeight) => {
+                        window.scrollTo(0, scrollHeight || 0);
+                        document.documentElement.scrollTo(0, scrollHeight || 0);
+                    }, maxScrollHeight);
+                }
+            }
             try {
                 screenshot = yield page.screenshot({ fullPage, quality, type: 'webp' });
             }
             catch (err) {
                 logger.error('Failed to get screenshot:', err);
+                throw err;
             }
         }
         // get html
         if (includeHtml) {
-
-            html = yield
+            try {
+                html = yield page.evaluate(() => {
+                    // add meta tag to record crawler
+                    const meta = document.createElement('meta');
+                    meta.name = 'arcblock-crawler';
+                    meta.content = 'true';
+                    document.head.appendChild(meta);
+                    return document.documentElement.outerHTML;
+                });
             }
-
-
+            catch (err) {
+                logger.error('Failed to get html:', err);
+                throw err;
             }
         }
     }
@@ -200,11 +209,16 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
         screenshot,
     };
 });
-
+/**
+ * crawl url and return job id
+ * @param params
+ * @param callback callback when job finished
+ */
+export function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
         params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
         // skip duplicate job
-        const
+        const { job: duplicateJob } = (yield Job.findJob({
             url: params.url,
             includeScreenshot: params.includeScreenshot,
             includeHtml: params.includeHtml,
@@ -212,18 +226,18 @@ export function createCrawlJob(params, callback) {
             width: params.width,
             height: params.height,
             fullPage: params.fullPage,
-        });
-
-        if (existsJob) {
+        })) || {};
+        if (duplicateJob) {
             logger.warn(`Crawl job already exists for ${params.url}, skip`);
-            return
+            return duplicateJob.id;
         }
+        logger.info('create crawl job', params);
         const jobId = randomUUID();
         const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
-        job.on('finished', ({ result })
+        job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
             logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
-            callback === null || callback === void 0 ? void 0 : callback(result);
-        });
+            callback === null || callback === void 0 ? void 0 : callback(result ? yield formatSnapshot(result) : null);
+        }));
         job.on('failed', ({ error }) => {
             logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
             callback === null || callback === void 0 ? void 0 : callback(null);
@@ -231,71 +245,3 @@ export function createCrawlJob(params, callback) {
         return jobId;
     });
 }
-// @ts-ignore
-export function getJob(condition) {
-    return __awaiter(this, void 0, void 0, function* () {
-        const where = Object.keys(condition)
-            .filter((key) => condition[key] !== undefined)
-            .map((key) => {
-            return sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), `$.${key}`), condition[key]);
-        });
-        const job = yield crawlQueue.store.db.findOne({
-            where: {
-                [sequelize.Op.and]: where,
-            },
-        });
-        if (job) {
-            return job.job;
-        }
-        return null;
-    });
-}
-function convertJobToSnapshot({ job, snapshot }) {
-    return Object.assign({
-        // @ts-ignore
-        jobId: job.jobId || job.id, url: job.url, options: {
-            width: job.width,
-            height: job.height,
-            includeScreenshot: job.includeScreenshot,
-            includeHtml: job.includeHtml,
-            quality: job.quality,
-            fullPage: job.fullPage,
-        } }, snapshot);
-}
-export function formatSnapshot(snapshot, columns) {
-    return __awaiter(this, void 0, void 0, function* () {
-        let data = Object.assign({}, snapshot);
-        // format screenshot path to full url
-        if (data.screenshot) {
-            data.screenshot = joinURL(config.appUrl, data.screenshot);
-        }
-        // format html path to string
-        if (data.html) {
-            const html = yield fs.readFile(path.join(config.dataDir, data.html));
-            data.html = html.toString();
-        }
-        if (columns === null || columns === void 0 ? void 0 : columns.length) {
-            data = pick(data, columns);
-        }
-        return data;
-    });
-}
-/**
- * get snapshot from db or crawl queue
- */
-export function getSnapshot(jobId) {
-    return __awaiter(this, void 0, void 0, function* () {
-        const snapshotModel = yield Snapshot.findByPk(jobId);
-        if (snapshotModel) {
-            return snapshotModel.toJSON();
-        }
-        const job = yield getJob({ id: jobId });
-        if (job) {
-            return {
-                jobId,
-                status: 'pending',
-            };
-        }
-        return null;
-    });
-}
@@ -0,0 +1 @@
+export declare function initCron(): any;
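
For reference, here is a minimal usage sketch of the new `crawlUrl` API, inferred only from the `crawler.d.ts` declarations and the compiled `crawler.js` shown in this diff. It assumes `crawlUrl` is re-exported from the package entry point and that the job parameters beyond `url` (`includeHtml`, `includeScreenshot`, `fullPage`) match the fields the queue reads; treat it as illustrative rather than authoritative.

```ts
// Hypothetical sketch based on the type declarations in this diff,
// not taken from the package's own documentation.
import { crawlUrl } from '@arcblock/crawler'; // assumes the root entry re-exports crawlUrl

async function main() {
  // crawlUrl resolves with a job id (or undefined) and reports the finished
  // snapshot through the optional callback (null on failure).
  const jobId = await crawlUrl(
    {
      url: 'https://www.example.com', // field names beyond `url` are inferred
      includeHtml: true,
      includeScreenshot: true,
      fullPage: false,
    },
    (snapshot) => {
      if (snapshot) {
        // `status` is one of 'success' | 'failed' | 'pending' per the Snapshot model
        console.log('crawl finished:', snapshot.status);
      } else {
        console.log('crawl failed');
      }
    }
  );
  console.log('queued crawl job', jobId);
}

main();
```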