@arcblock/crawler 1.0.6 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +66 -0
- package/lib/cjs/config.d.ts +24 -0
- package/{dist → lib/cjs}/config.js +13 -5
- package/lib/cjs/crawler.d.ts +30 -0
- package/{dist → lib/cjs}/crawler.js +63 -117
- package/lib/cjs/cron.d.ts +1 -0
- package/lib/cjs/cron.js +49 -0
- package/lib/cjs/index.d.ts +9 -0
- package/lib/cjs/index.js +80 -0
- package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
- package/{dist → lib/cjs}/puppeteer.js +43 -54
- package/lib/cjs/services/snapshot.d.ts +12 -0
- package/lib/cjs/services/snapshot.js +84 -0
- package/lib/cjs/site.d.ts +2 -0
- package/lib/cjs/site.js +79 -0
- package/lib/cjs/store/index.d.ts +3 -0
- package/{dist/db → lib/cjs/store}/index.js +22 -6
- package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
- package/lib/cjs/store/job.js +110 -0
- package/{dist/db → lib/cjs/store}/snapshot.d.ts +10 -6
- package/lib/cjs/store/snapshot.js +72 -0
- package/lib/cjs/utils.d.ts +32 -0
- package/{dist → lib/cjs}/utils.js +67 -78
- package/lib/esm/config.d.ts +24 -0
- package/lib/esm/config.js +19 -0
- package/lib/esm/crawler.d.ts +30 -0
- package/{esm → lib/esm}/crawler.js +54 -105
- package/lib/esm/cron.d.ts +1 -0
- package/lib/esm/cron.js +43 -0
- package/lib/esm/index.d.ts +9 -0
- package/{esm → lib/esm}/index.js +21 -10
- package/{dist → lib/esm}/puppeteer.d.ts +2 -2
- package/{esm → lib/esm}/puppeteer.js +21 -32
- package/lib/esm/services/snapshot.d.ts +12 -0
- package/lib/esm/services/snapshot.js +75 -0
- package/lib/esm/site.d.ts +2 -0
- package/lib/esm/site.js +72 -0
- package/lib/esm/store/index.d.ts +3 -0
- package/{esm/db → lib/esm/store}/index.js +23 -7
- package/{esm/db → lib/esm/store}/job.d.ts +4 -3
- package/lib/esm/store/job.js +73 -0
- package/{esm/db → lib/esm/store}/snapshot.d.ts +10 -6
- package/lib/esm/store/snapshot.js +68 -0
- package/lib/esm/utils.d.ts +32 -0
- package/{esm → lib/esm}/utils.js +64 -71
- package/package.json +20 -32
- package/third.d.ts +0 -0
- package/dist/blocklet.d.ts +0 -6
- package/dist/blocklet.js +0 -199
- package/dist/cache.d.ts +0 -10
- package/dist/cache.js +0 -119
- package/dist/config.d.ts +0 -10
- package/dist/crawler.d.ts +0 -28
- package/dist/db/index.d.ts +0 -1
- package/dist/db/job.js +0 -54
- package/dist/db/snapshot.js +0 -52
- package/dist/index.d.ts +0 -6
- package/dist/index.js +0 -45
- package/dist/middleware.d.ts +0 -4
- package/dist/middleware.js +0 -44
- package/dist/utils.d.ts +0 -17
- package/esm/blocklet.d.ts +0 -6
- package/esm/blocklet.js +0 -190
- package/esm/cache.d.ts +0 -10
- package/esm/cache.js +0 -114
- package/esm/config.d.ts +0 -10
- package/esm/config.js +0 -11
- package/esm/crawler.d.ts +0 -28
- package/esm/db/index.d.ts +0 -1
- package/esm/db/job.js +0 -50
- package/esm/db/snapshot.js +0 -48
- package/esm/index.d.ts +0 -6
- package/esm/middleware.d.ts +0 -4
- package/esm/middleware.js +0 -41
- package/esm/utils.d.ts +0 -17
package/lib/cjs/store/snapshot.js:

@@ -0,0 +1,72 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.Snapshot = void 0;
+const core_1 = require("@sequelize/core");
+class Snapshot extends core_1.Model {
+    static initModel(sequelize) {
+        return Snapshot.init({
+            jobId: {
+                type: core_1.DataTypes.STRING,
+                primaryKey: true,
+                allowNull: false,
+            },
+            url: {
+                type: core_1.DataTypes.STRING,
+                allowNull: false,
+                index: true,
+            },
+            status: {
+                type: core_1.DataTypes.ENUM('success', 'failed', 'pending'),
+                allowNull: false,
+            },
+            html: {
+                type: core_1.DataTypes.TEXT,
+                allowNull: true,
+            },
+            screenshot: {
+                type: core_1.DataTypes.STRING,
+                allowNull: true,
+            },
+            error: {
+                type: core_1.DataTypes.STRING,
+                allowNull: true,
+            },
+            lastModified: {
+                type: core_1.DataTypes.STRING,
+                allowNull: true,
+            },
+            meta: {
+                type: core_1.DataTypes.JSON,
+                allowNull: true,
+            },
+            options: {
+                type: core_1.DataTypes.JSON,
+                allowNull: true,
+            },
+        }, {
+            sequelize,
+            modelName: 'snapshot',
+            tableName: 'snap',
+            timestamps: true,
+        });
+    }
+    static findSnapshot(condition) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const snapshot = yield Snapshot.findOne(Object.assign({ order: [
+                ['lastModified', 'DESC'],
+                ['updatedAt', 'DESC'],
+            ] }, condition));
+            return (snapshot === null || snapshot === void 0 ? void 0 : snapshot.toJSON()) || null;
+        });
+    }
+}
+exports.Snapshot = Snapshot;
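The new `Snapshot` model above is a `@sequelize/core` model with a `findSnapshot` helper that orders rows by `lastModified`/`updatedAt`. A minimal usage sketch, assuming you supply your own Sequelize instance and that the deep import path below is actually exposed by the package (neither is confirmed by this diff):

```ts
import type { Sequelize } from '@sequelize/core';
// Hypothetical deep import; the package's public entry points are not shown in this diff.
import { Snapshot } from '@arcblock/crawler/lib/cjs/store/snapshot';

// Return the newest successful snapshot for a URL, or null if none exists.
export async function latestSnapshotFor(sequelize: Sequelize, url: string) {
  Snapshot.initModel(sequelize); // registers the `snap` table columns shown in the hunk above
  await sequelize.sync();
  // findSnapshot merges these findOne options with
  // order: [['lastModified', 'DESC'], ['updatedAt', 'DESC']] and returns a plain object or null.
  return Snapshot.findSnapshot({ where: { url, status: 'success' } });
}
```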
@@ -0,0 +1,32 @@
+import { Page } from '@blocklet/puppeteer';
+import { Request } from 'express';
+export declare const axios: import("axios").AxiosInstance;
+export declare const CRAWLER_FLAG = "x-arcblock-crawler";
+export declare const sleep: (ms: number) => Promise<unknown>;
+/**
+ * Check if the request is a arcblock crawler
+ */
+export declare const isSelfCrawler: (req: Request) => boolean;
+/**
+ * Check if the request is a static file
+ */
+export declare function isStaticFile(req: Request): boolean;
+/**
+ * Check if the request is a spider
+ */
+export declare function isSpider(req: Request): boolean;
+/**
+ * Get and parse the robots.txt by `robots-parser`
+ */
+export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
+/**
+ * Check if the url is allowed to crawl from robots.txt
+ */
+export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
+/**
+ * Get and parse the sitemap.xml by `sitemap` package
+ */
+export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
+export declare const formatUrl: (url: string) => string;
+export declare function md5(content: string | Uint8Array): string;
+export declare function findMaxScrollHeight(page: Page): Promise<number>;
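The declarations above cover request classification (`isSpider`, `isStaticFile`, `isSelfCrawler`) plus robots/sitemap helpers. A hedged sketch of how the request helpers might be combined in an Express guard; the import path and the guard itself are assumptions rather than part of the package (the old `middleware.js` was removed in this release):

```ts
import type { NextFunction, Request, Response } from 'express';
// Hypothetical deep import; adjust to however the package actually re-exports these helpers.
import { isSelfCrawler, isSpider, isStaticFile } from '@arcblock/crawler/lib/cjs/utils';

// Pre-render only for genuine bot traffic: skip static assets and requests made by the
// crawler itself (marked via the x-arcblock-crawler header or a headless user agent).
export function prerenderGate(req: Request, _res: Response, next: NextFunction) {
  const wantsPrerender = isSpider(req) && !isStaticFile(req) && !isSelfCrawler(req);
  if (!wantsPrerender) return next();
  // ...look up a stored Snapshot for req.originalUrl and send its html here...
  return next();
}
```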
package/{dist → lib/cjs}/utils.js:

@@ -12,11 +12,12 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.formatUrl = exports.
+exports.formatUrl = exports.getSitemapList = exports.isAcceptCrawler = exports.isSelfCrawler = exports.sleep = exports.CRAWLER_FLAG = exports.axios = void 0;
+exports.isStaticFile = isStaticFile;
+exports.isSpider = isSpider;
 exports.getRobots = getRobots;
 exports.md5 = md5;
 exports.findMaxScrollHeight = findMaxScrollHeight;
-const config_1 = require("@blocklet/sdk/lib/config");
 const axios_1 = __importDefault(require("axios"));
 const flattenDeep_1 = __importDefault(require("lodash/flattenDeep"));
 const uniq_1 = __importDefault(require("lodash/uniq"));
@@ -25,24 +26,14 @@ const robots_parser_1 = __importDefault(require("robots-parser"));
 const sitemap_1 = require("sitemap");
 const stream_1 = require("stream");
 const ufo_1 = require("ufo");
-
-
+const config_1 = require("./config");
+exports.axios = axios_1.default.create({
+    timeout: 1000 * 30,
     headers: {
         'Content-Type': 'application/json',
     },
 });
-
-    return new Promise((resolve) => {
-        setTimeout(resolve, ms);
-    });
-};
-exports.sleep = sleep;
-exports.CRAWLER_FLAG = 'x-crawler';
-const isSelfCrawler = (req) => {
-    const ua = req.get('user-agent') || '';
-    return req.get(exports.CRAWLER_FLAG) === 'true' || `${ua}`.toLowerCase().indexOf('headless') !== -1;
-};
-exports.isSelfCrawler = isSelfCrawler;
+exports.CRAWLER_FLAG = 'x-arcblock-crawler';
 /**
  * A default set of user agent patterns for bots/crawlers that do not perform
  * well with pages that require JavaScript.
@@ -98,12 +89,8 @@ const botUserAgents = [
     /AlibabaGroup/i,
     /adaptive-edge-crawler/i,
 ];
-const isSpider = (ua) => botUserAgents.some((spider) => {
-    return spider.test(ua);
-});
 /**
- * A default set of file extensions for static assets that do not need to be
- * proxied.
+ * A default set of file extensions for static assets that do not need to be proxied.
  */
 const staticFileExtensions = [
     'ai',
@@ -148,89 +135,91 @@ const staticFileExtensions = [
     'xml',
     'zip',
 ];
-const
-
-
+const sleep = (ms) => {
+    return new Promise((resolve) => {
+        setTimeout(resolve, ms);
+    });
 };
-exports.
+exports.sleep = sleep;
+/**
+ * Check if the request is a arcblock crawler
+ */
+const isSelfCrawler = (req) => {
+    const ua = req.get('user-agent') || '';
+    return req.get(exports.CRAWLER_FLAG) === 'true' || ua.toLowerCase().indexOf('headless') !== -1;
+};
+exports.isSelfCrawler = isSelfCrawler;
+/**
+ * Check if the request is a static file
+ */
+function isStaticFile(req) {
+    const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
+    return excludeUrlPattern.test(req.path);
+}
+/**
+ * Check if the request is a spider
+ */
+function isSpider(req) {
+    const ua = req.get('user-agent') || '';
+    return botUserAgents.some((spider) => spider.test(ua));
+}
+/**
+ * Get and parse the robots.txt by `robots-parser`
+ */
 function getRobots(url) {
     return __awaiter(this, void 0, void 0, function* () {
         const { origin } = new URL(url);
         const robotsUrl = (0, ufo_1.joinURL)(origin, 'robots.txt?nocache=1');
-        const { data } = yield exports.
-
-
+        const { data } = yield exports.axios.get(robotsUrl).catch((error) => {
+            config_1.logger.warn(`Failed to fetch robots.txt from ${robotsUrl}:`, { error });
+            return { data: null };
+        });
         return data ? (0, robots_parser_1.default)(robotsUrl, data) : null;
     });
 }
-
-
-
-};
-exports.getDefaultSitemapUrl = getDefaultSitemapUrl;
+/**
+ * Check if the url is allowed to crawl from robots.txt
+ */
 const isAcceptCrawler = (url) => __awaiter(void 0, void 0, void 0, function* () {
     const robots = yield getRobots(url);
     const isAllowed = robots ? yield robots.isAllowed(url) : true;
     return isAllowed;
 });
 exports.isAcceptCrawler = isAcceptCrawler;
+/**
+ * Get and parse the sitemap.xml by `sitemap` package
+ */
 const getSitemapList = (url) => __awaiter(void 0, void 0, void 0, function* () {
-    let sitemapUrlList = [
+    let sitemapUrlList = [];
     const robots = yield getRobots(url);
     if (robots) {
-
-
-
-    }
+        sitemapUrlList = (yield robots.getSitemaps()) || [];
+    }
+    if (!sitemapUrlList.length) {
+        const { origin } = new URL(url);
+        sitemapUrlList.push((0, ufo_1.joinURL)(origin, 'sitemap.xml?nocache=1'));
     }
     // loop site map url list
     const sitemapList = yield Promise.all(sitemapUrlList.map((sitemapUrl) => __awaiter(void 0, void 0, void 0, function* () {
-
-
-
-
-
-
-
-
-
-
+        sitemapUrl = (0, ufo_1.withQuery)(sitemapUrl, { nocache: '1' });
+        try {
+            const { data: sitemapTxt } = yield exports.axios.get(sitemapUrl).catch(() => ({
+                data: '',
+            }));
+            if (sitemapTxt) {
+                const stream = stream_1.Readable.from([sitemapTxt]);
+                const sitemapJson = yield (0, sitemap_1.parseSitemap)(stream);
+                return sitemapJson;
+            }
+        }
+        catch (error) {
+            config_1.logger.error(`Could not get sitemap from ${sitemapUrl}`, { error });
         }
         return [];
     })));
     return (0, uniq_1.default)((0, flattenDeep_1.default)(sitemapList.filter(Boolean)));
 });
 exports.getSitemapList = getSitemapList;
-const isBotUserAgent = (req) => {
-    const ua = req.get('user-agent');
-    const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
-    if (ua === undefined || !isSpider(ua) || excludeUrlPattern.test(req.path)) {
-        return false;
-    }
-    return true;
-};
-exports.isBotUserAgent = isBotUserAgent;
-const getComponentInfo = () => {
-    return config_1.components.find((item) => item.did === config_1.env.componentDid) || {};
-};
-exports.getComponentInfo = getComponentInfo;
-const getFullUrl = (req) => {
-    const blockletPathname = req.headers['x-path-prefix']
-        ? (0, ufo_1.joinURL)(req.headers['x-path-prefix'], req.originalUrl)
-        : req.originalUrl;
-    return (0, ufo_1.joinURL)(config_1.env.appUrl, blockletPathname);
-};
-exports.getFullUrl = getFullUrl;
-const getRelativePath = (url) => {
-    try {
-        return new URL(url).pathname;
-    }
-    catch (error) {
-        // ignore error
-    }
-    return url;
-};
-exports.getRelativePath = getRelativePath;
 const formatUrl = (url) => {
     return url.replace(/\/$/, '').trim();
 };
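The rewritten `getSitemapList` now reads sitemap URLs from robots.txt via `robots.getSitemaps()` and falls back to `<origin>/sitemap.xml`, while `getRobots`/`isAcceptCrawler` treat a missing robots.txt as "allowed". A small sketch combining them; the deep import path is an assumption:

```ts
// Hypothetical deep import path.
import { getSitemapList, isAcceptCrawler } from '@arcblock/crawler/lib/cjs/utils';

// Collect sitemap URLs for a site and keep only those robots.txt allows.
export async function listCrawlableUrls(siteUrl: string): Promise<string[]> {
  const items = await getSitemapList(siteUrl); // robots.txt sitemaps, or <origin>/sitemap.xml fallback
  const allowed: string[] = [];
  for (const item of items) {
    // Resolves true when robots.txt is missing or permits the URL.
    if (await isAcceptCrawler(item.url)) allowed.push(item.url);
  }
  return allowed;
}
```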
@@ -0,0 +1,24 @@
+export type Site = {
+    url: string;
+    pathname: string;
+    /** Minimum crawl interval to avoid frequent crawling by scheduled tasks, in milliseconds */
+    interval?: number;
+};
+export type Config = {
+    isProd: boolean;
+    dataDir: string;
+    appDir: string;
+    appUrl: string;
+    cacheDir: string;
+    puppeteerPath?: string;
+    siteCron: {
+        sites: Site[];
+        time: string;
+        enabled: boolean;
+        immediate: boolean;
+        crawlConcurrency: number;
+        sitemapConcurrency: number;
+    };
+};
+export declare const logger: any;
+export declare const config: Config;
package/lib/esm/config.js:

@@ -0,0 +1,19 @@
+import createLogger from '@blocklet/logger';
+export const logger = createLogger('@arcblock/crawler', { level: process.env.LOG_LEVEL || 'info' });
+export const config = {
+    isProd: process.env.NODE_ENV === 'production',
+    dataDir: process.env.BLOCKLET_DATA_DIR,
+    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
+    cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+    appUrl: process.env.BLOCKLET_APP_URL || '/',
+    puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
+    // cron
+    siteCron: {
+        sites: [],
+        enabled: true,
+        time: '0 0 0 * * *',
+        immediate: false,
+        crawlConcurrency: 2,
+        sitemapConcurrency: 30,
+    },
+};
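Putting the `Config`/`Site` types together with these defaults, a scheduled site entry might look like the following; the import path is an assumption, and how overrides are passed to the library is not shown in this diff:

```ts
// Hypothetical import path; Site comes from the config.d.ts shown above.
import type { Site } from '@arcblock/crawler/lib/esm/config';

// One scheduled site; the default cron is '0 0 0 * * *' (daily at midnight).
const docsSite: Site = {
  url: 'https://example.com',
  pathname: '/docs',
  interval: 6 * 60 * 60 * 1000, // minimum interval between scheduled crawls, in milliseconds
};

export default docsSite;
```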
@@ -0,0 +1,30 @@
+import { JobState } from './store/job';
+import { SnapshotModel } from './store/snapshot';
+export declare function createCrawlQueue(): void;
+export declare function getDataDir(): Promise<{
+    htmlDir: string;
+    screenshotDir: string;
+}>;
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
+    url: string;
+    includeScreenshot?: boolean;
+    includeHtml?: boolean;
+    width?: number;
+    height?: number;
+    quality?: number;
+    timeout?: number;
+    fullPage?: boolean;
+}) => Promise<{
+    html: string;
+    screenshot: Uint8Array<ArrayBufferLike> | null;
+    meta: {
+        title?: string;
+        description?: string;
+    };
+}>;
+/**
+ * crawl url and return job id
+ * @param params
+ * @param callback callback when job finished
+ */
+export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string | undefined>;
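Based on the `crawlUrl` declaration above, a hedged call sketch; the root `@arcblock/crawler` entry point and any `JobState` fields beyond those visible in the crawler diff are assumptions, and `createCrawlQueue()` presumably has to run first (see crawler.js below):

```ts
// Hypothetical root import; crawlUrl may instead need a deep import from lib/esm or lib/cjs.
import { crawlUrl } from '@arcblock/crawler';

export async function requestCrawl(targetUrl: string) {
  const jobId = await crawlUrl(
    {
      url: targetUrl,
      includeHtml: true,
      includeScreenshot: true,
      fullPage: true, // the crawler grows the viewport to the tallest element before the screenshot
    },
    (snapshot) => {
      // Fires once the queued job finishes; snapshot is null when the crawl failed.
      console.log('crawl finished:', snapshot?.status, snapshot?.url);
    },
  );
  return jobId; // may be undefined per the declared return type
}
```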
package/{esm → lib/esm}/crawler.js:

@@ -9,16 +9,14 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 };
 import createQueue from '@abtnode/queue';
 import SequelizeStore from '@abtnode/queue/lib/store/sequelize';
-import sequelize from '@sequelize/core';
 import { randomUUID } from 'crypto';
 import fs from 'fs-extra';
-import pick from 'lodash/pick';
 import path from 'path';
-import { joinURL } from 'ufo';
 import { config, logger } from './config';
-import { Job } from './db/job';
-import { Snapshot } from './db/snapshot';
 import { initPage } from './puppeteer';
+import { convertJobToSnapshot, formatSnapshot } from './services/snapshot';
+import { Job } from './store/job';
+import { Snapshot } from './store/snapshot';
 import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5 } from './utils';
 const { BaseState } = require('@abtnode/models');
 let crawlQueue;
@@ -26,7 +24,7 @@ export function createCrawlQueue() {
     const db = new BaseState(Job);
     crawlQueue = createQueue({
         store: new SequelizeStore(db, 'crawler'),
-        concurrency:
+        concurrency: config.siteCron.crawlConcurrency,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
             logger.info('Starting to execute crawl job', job);
             const canCrawl = yield isAcceptCrawler(job.url);
@@ -77,23 +75,14 @@
                         status: 'success',
                         screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
                         html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config.dataDir, ''),
+                        meta: result.meta,
                     },
                 });
                 yield Snapshot.upsert(snapshot);
                 return snapshot;
-                // save to redis
-                // if (saveToRedis) {
-                //   useCache.set(url, {
-                //     html: result.html || '',
-                //     lastModified,
-                //   });
-                //   logger.info(`success to crawl ${url}`, job);
-                //   return result;
-                // }
             }
             catch (error) {
                 logger.error(`Failed to crawl ${job.url}`, { error, job });
-                console.error(error.stack);
                 const snapshot = convertJobToSnapshot({
                     job,
                     snapshot: {
@@ -145,7 +134,7 @@ function formatHtml(htmlString) {
     }
     return htmlString;
 }
-export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
+export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, }) {
     logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
     const page = yield initPage();
     if (width && height) {
@@ -153,6 +142,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
     }
     let html = null;
     let screenshot = null;
+    const meta = {};
     try {
         const response = yield page.goto(url, { timeout });
         if (!response) {
@@ -173,7 +163,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
         // Try to find the tallest element and set the browser to the same height
         if (fullPage) {
             const maxScrollHeight = yield findMaxScrollHeight(page);
-            logger.
+            logger.debug('findMaxScrollHeight', { maxScrollHeight });
             if (maxScrollHeight) {
                 yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
                 yield page.evaluate((scrollHeight) => {
@@ -187,17 +177,37 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
             }
             catch (err) {
                 logger.error('Failed to get screenshot:', err);
+                throw err;
             }
         }
         // get html
-
-
-
-
-
-
+        try {
+            const data = yield page.evaluate(() => {
+                var _a;
+                // add meta tag to record crawler
+                const meta = document.createElement('meta');
+                meta.name = 'arcblock-crawler';
+                meta.content = 'true';
+                document.head.appendChild(meta);
+                // get title and meta description
+                const title = document.title || '';
+                const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
+                return {
+                    html: document.documentElement.outerHTML,
+                    title,
+                    description,
+                };
+            });
+            meta.title = data.title;
+            meta.description = data.description;
+            if (includeHtml) {
+                html = data.html;
            }
        }
+        catch (err) {
+            logger.error('Failed to get html:', err);
+            throw err;
+        }
     }
     catch (error) {
         logger.error('Failed to get page content:', error);
@@ -210,13 +220,19 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
     return {
         html,
         screenshot,
+        meta,
     };
 });
-
+/**
+ * crawl url and return job id
+ * @param params
+ * @param callback callback when job finished
+ */
+export function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
-        params = Object.assign(Object.assign({}, params), {
+        params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
         // skip duplicate job
-        const
+        const { job: duplicateJob } = (yield Job.findJob({
            url: params.url,
            includeScreenshot: params.includeScreenshot,
            includeHtml: params.includeHtml,
@@ -224,89 +240,22 @@ export function createCrawlJob(params, callback) {
            width: params.width,
            height: params.height,
            fullPage: params.fullPage,
-        });
-        if (
-        logger.
-        return
+        })) || {};
+        if (duplicateJob) {
+            logger.info(`Crawl job already exists for ${params.url}, skip`);
+            return duplicateJob.id;
        }
        logger.info('create crawl job', params);
-        const
-        job.
-
-
-
+        const jobId = randomUUID();
+        const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
+        job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
+            logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
+            callback === null || callback === void 0 ? void 0 : callback(result ? yield formatSnapshot(result) : null);
+        }));
        job.on('failed', ({ error }) => {
            logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
            callback === null || callback === void 0 ? void 0 : callback(null);
        });
-        return
-    });
-}
-// @ts-ignore
-export function getJob(condition) {
-    return __awaiter(this, void 0, void 0, function* () {
-        const where = Object.keys(condition)
-            .filter((key) => condition[key] !== undefined)
-            .map((key) => {
-            return sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), `$.${key}`), condition[key]);
-        });
-        const job = yield crawlQueue.store.db.findOne({
-            where: {
-                [sequelize.Op.and]: where,
-            },
-        });
-        if (job) {
-            return job.job;
-        }
-        return null;
-    });
-}
-function convertJobToSnapshot({ job, snapshot }) {
-    return Object.assign({
-        // @ts-ignore
-        jobId: job.jobId || job.id, url: job.url, options: {
-            width: job.width,
-            height: job.height,
-            includeScreenshot: job.includeScreenshot,
-            includeHtml: job.includeHtml,
-            quality: job.quality,
-            fullPage: job.fullPage,
-        } }, snapshot);
-}
-export function formatSnapshot(snapshot, columns) {
-    return __awaiter(this, void 0, void 0, function* () {
-        let data = Object.assign({}, snapshot);
-        // format screenshot path to full url
-        if (data.screenshot) {
-            data.screenshot = joinURL(config.appUrl, data.screenshot);
-        }
-        // format html path to string
-        if (data.html) {
-            const html = yield fs.readFile(path.join(config.dataDir, data.html));
-            data.html = html.toString();
-        }
-        if (columns === null || columns === void 0 ? void 0 : columns.length) {
-            data = pick(data, columns);
-        }
-        return data;
-    });
-}
-/**
- * get snapshot from db or crawl queue
- */
-export function getSnapshot(jobId) {
-    return __awaiter(this, void 0, void 0, function* () {
-        const snapshotModel = yield Snapshot.findByPk(jobId);
-        if (snapshotModel) {
-            return snapshotModel.toJSON();
-        }
-        const job = yield getJob({ id: jobId });
-        if (job) {
-            return {
-                jobId,
-                status: 'pending',
-            };
-        }
-        return null;
+        return jobId;
    });
 }
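`getPageContent` is exported with the defaults visible above (1440×900 viewport, quality 80, 90-second timeout, `fullPage` off). A queue-less capture might look like this; the deep import path is an assumption, and `initPage()` (from ./puppeteer, not shown in this section) is expected to provide the Puppeteer page:

```ts
// Hypothetical deep import path.
import { getPageContent } from '@arcblock/crawler/lib/esm/crawler';

export async function capture(url: string) {
  const { html, screenshot, meta } = await getPageContent({
    url,
    includeScreenshot: false, // skip the image buffer, keep only HTML + title/description
    timeout: 30 * 1000,       // override the 90-second default
  });
  console.log(meta.title, meta.description, html?.length ?? 0);
  return { html, screenshot, meta };
}
```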
@@ -0,0 +1 @@
+export declare function initCron(): any;