@arcblock/crawler 1.0.5 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/lib/cjs/config.d.ts +22 -0
- package/{dist → lib/cjs}/config.js +9 -3
- package/lib/cjs/crawler.d.ts +26 -0
- package/{dist → lib/cjs}/crawler.js +56 -113
- package/lib/cjs/cron.d.ts +1 -0
- package/lib/cjs/cron.js +49 -0
- package/lib/cjs/index.d.ts +9 -0
- package/lib/cjs/index.js +78 -0
- package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
- package/{dist → lib/cjs}/puppeteer.js +43 -54
- package/lib/cjs/services/snapshot.d.ts +12 -0
- package/lib/cjs/services/snapshot.js +84 -0
- package/lib/cjs/site.d.ts +2 -0
- package/lib/cjs/site.js +76 -0
- package/lib/cjs/store/index.d.ts +3 -0
- package/{dist/db → lib/cjs/store}/index.js +21 -5
- package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
- package/lib/cjs/store/job.js +110 -0
- package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
- package/lib/cjs/store/snapshot.js +68 -0
- package/lib/cjs/utils.d.ts +32 -0
- package/{dist → lib/cjs}/utils.js +88 -78
- package/lib/esm/config.d.ts +22 -0
- package/{esm → lib/esm}/config.js +9 -3
- package/lib/esm/crawler.d.ts +26 -0
- package/{esm → lib/esm}/crawler.js +48 -102
- package/lib/esm/cron.d.ts +1 -0
- package/lib/esm/cron.js +43 -0
- package/lib/esm/index.d.ts +9 -0
- package/{esm → lib/esm}/index.js +19 -10
- package/{dist → lib/esm}/puppeteer.d.ts +2 -2
- package/{esm → lib/esm}/puppeteer.js +26 -37
- package/lib/esm/services/snapshot.d.ts +12 -0
- package/lib/esm/services/snapshot.js +75 -0
- package/lib/esm/site.d.ts +2 -0
- package/lib/esm/site.js +69 -0
- package/lib/esm/store/index.d.ts +3 -0
- package/{esm/db → lib/esm/store}/index.js +22 -6
- package/{esm/db → lib/esm/store}/job.d.ts +4 -3
- package/lib/esm/store/job.js +73 -0
- package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
- package/lib/esm/store/snapshot.js +64 -0
- package/lib/esm/utils.d.ts +32 -0
- package/{esm → lib/esm}/utils.js +84 -71
- package/package.json +22 -33
- package/third.d.ts +0 -0
- package/dist/blocklet.d.ts +0 -6
- package/dist/blocklet.js +0 -199
- package/dist/cache.d.ts +0 -10
- package/dist/cache.js +0 -119
- package/dist/config.d.ts +0 -10
- package/dist/crawler.d.ts +0 -28
- package/dist/db/index.d.ts +0 -1
- package/dist/db/job.js +0 -54
- package/dist/db/snapshot.js +0 -52
- package/dist/index.d.ts +0 -6
- package/dist/index.js +0 -45
- package/dist/middleware.d.ts +0 -4
- package/dist/middleware.js +0 -44
- package/dist/utils.d.ts +0 -15
- package/esm/blocklet.d.ts +0 -6
- package/esm/blocklet.js +0 -190
- package/esm/cache.d.ts +0 -10
- package/esm/cache.js +0 -114
- package/esm/config.d.ts +0 -10
- package/esm/crawler.d.ts +0 -28
- package/esm/db/index.d.ts +0 -1
- package/esm/db/job.js +0 -50
- package/esm/db/snapshot.js +0 -48
- package/esm/index.d.ts +0 -6
- package/esm/middleware.d.ts +0 -4
- package/esm/middleware.js +0 -41
- package/esm/utils.d.ts +0 -15
|
@@ -51,33 +51,28 @@ exports.ensureBrowser = ensureBrowser;
|
|
|
51
51
|
exports.connectBrowser = connectBrowser;
|
|
52
52
|
exports.launchBrowser = launchBrowser;
|
|
53
53
|
exports.initPage = initPage;
|
|
54
|
-
// import fs from 'fs-extra';
|
|
55
|
-
// import path from 'path';
|
|
56
54
|
const puppeteer_1 = __importDefault(require("@blocklet/puppeteer"));
|
|
57
55
|
exports.puppeteer = puppeteer_1.default;
|
|
58
|
-
const config_1 = require("@blocklet/sdk/lib/config");
|
|
59
56
|
const fs_extra_1 = __importDefault(require("fs-extra"));
|
|
60
57
|
const path_1 = __importDefault(require("path"));
|
|
61
58
|
const timers_1 = require("timers");
|
|
62
|
-
const
|
|
63
|
-
const config_2 = require("./config");
|
|
59
|
+
const config_1 = require("./config");
|
|
64
60
|
const utils_1 = require("./utils");
|
|
65
|
-
// let puppeteerConfig: {
|
|
66
|
-
// cacheDirectory: string;
|
|
67
|
-
// temporaryDirectory: string;
|
|
68
|
-
// };
|
|
69
|
-
const BROWSER_WS_ENDPOINT_KEY = `browserWSEndpoint-${config_1.env.appId || 'unknown'}`;
|
|
70
61
|
const BrowserStatus = {
|
|
62
|
+
None: 'None',
|
|
71
63
|
Launching: 'Launching',
|
|
72
64
|
Ready: 'Ready',
|
|
73
65
|
};
|
|
66
|
+
let browserStatus = BrowserStatus.None;
|
|
67
|
+
/** Chromium WebSocket endpoint that allows puppeteer browser instance to connect to the browser */
|
|
68
|
+
let browserEndpoint = '';
|
|
74
69
|
let browser;
|
|
75
70
|
let browserActivatedTimer;
|
|
76
71
|
function ensurePuppeteerrc() {
|
|
77
72
|
return __awaiter(this, void 0, void 0, function* () {
|
|
78
|
-
const cacheDirectory = path_1.default.join(
|
|
79
|
-
const temporaryDirectory = path_1.default.join(
|
|
80
|
-
const puppeteerrcPath = path_1.default.join(
|
|
73
|
+
const cacheDirectory = path_1.default.join(config_1.config.cacheDir, 'puppeteer', 'cache');
|
|
74
|
+
const temporaryDirectory = path_1.default.join(config_1.config.cacheDir, 'puppeteer', 'tmp');
|
|
75
|
+
const puppeteerrcPath = path_1.default.join(config_1.config.appDir, '.puppeteerrc.js');
|
|
81
76
|
// ensure directory exists
|
|
82
77
|
yield Promise.all([fs_extra_1.default.ensureDir(cacheDirectory), fs_extra_1.default.ensureDir(temporaryDirectory), fs_extra_1.default.ensureFile(puppeteerrcPath)]);
|
|
83
78
|
const puppeteerConfig = {
|
|
@@ -86,17 +81,17 @@ function ensurePuppeteerrc() {
|
|
|
86
81
|
};
|
|
87
82
|
const fileContent = `module.exports = ${JSON.stringify(puppeteerConfig, null, 2)}`;
|
|
88
83
|
yield fs_extra_1.default.writeFile(puppeteerrcPath, fileContent);
|
|
89
|
-
|
|
84
|
+
config_1.logger.debug(`Puppeteerrc file created at ${puppeteerrcPath}`, puppeteerConfig);
|
|
90
85
|
return puppeteerConfig;
|
|
91
86
|
});
|
|
92
87
|
}
|
|
93
88
|
function ensureBrowser() {
|
|
94
89
|
return __awaiter(this, void 0, void 0, function* () {
|
|
95
90
|
const puppeteerConfig = yield ensurePuppeteerrc();
|
|
96
|
-
const executablePath =
|
|
97
|
-
|
|
98
|
-
if (!fs_extra_1.default.existsSync(executablePath)) {
|
|
99
|
-
|
|
91
|
+
const executablePath = config_1.config.puppeteerPath;
|
|
92
|
+
config_1.logger.debug('executablePath', executablePath);
|
|
93
|
+
if (!executablePath || !fs_extra_1.default.existsSync(executablePath)) {
|
|
94
|
+
config_1.logger.info('start download browser', puppeteerConfig);
|
|
100
95
|
const { downloadBrowser } = yield (() => __awaiter(this, void 0, void 0, function* () {
|
|
101
96
|
try {
|
|
102
97
|
// @ts-ignore
|
|
@@ -104,45 +99,44 @@ function ensureBrowser() {
|
|
|
104
99
|
return yield Promise.resolve().then(() => __importStar(require('@blocklet/puppeteer/internal/node/install.js')));
|
|
105
100
|
}
|
|
106
101
|
catch (err) {
|
|
107
|
-
|
|
102
|
+
config_1.logger.warn('Skipping browser installation because the Puppeteer build is not available. Run `npm install` again after you have re-built Puppeteer.');
|
|
108
103
|
}
|
|
109
104
|
}))();
|
|
110
105
|
if (downloadBrowser) {
|
|
111
106
|
yield downloadBrowser();
|
|
112
|
-
|
|
107
|
+
config_1.logger.info('Browser download completed successfully');
|
|
113
108
|
}
|
|
114
109
|
}
|
|
115
110
|
// try to launch browser
|
|
116
|
-
if (
|
|
111
|
+
if (config_1.config.isProd) {
|
|
117
112
|
const browser = yield launchBrowser();
|
|
118
113
|
if (!browser) {
|
|
119
114
|
throw new Error('Failed to launch browser');
|
|
120
115
|
}
|
|
121
116
|
yield (0, exports.closeBrowser)();
|
|
122
117
|
}
|
|
123
|
-
|
|
118
|
+
config_1.logger.info('Puppeteer is ready');
|
|
124
119
|
});
|
|
125
120
|
}
|
|
126
121
|
function connectBrowser() {
|
|
127
122
|
return __awaiter(this, void 0, void 0, function* () {
|
|
128
|
-
|
|
129
|
-
if (!browserWSEndpoint) {
|
|
123
|
+
if (!browserEndpoint) {
|
|
130
124
|
return null;
|
|
131
125
|
}
|
|
132
126
|
// retry if browser is launching
|
|
133
|
-
if (
|
|
127
|
+
if (browserStatus === BrowserStatus.Launching) {
|
|
134
128
|
yield (0, utils_1.sleep)(Math.floor(Math.random() * 1000));
|
|
135
129
|
return connectBrowser();
|
|
136
130
|
}
|
|
137
131
|
try {
|
|
138
132
|
browser = yield puppeteer_1.default.connect({
|
|
139
|
-
browserWSEndpoint:
|
|
133
|
+
browserWSEndpoint: browserEndpoint,
|
|
140
134
|
});
|
|
141
|
-
|
|
135
|
+
config_1.logger.info('Connect browser success');
|
|
142
136
|
}
|
|
143
137
|
catch (err) {
|
|
144
|
-
|
|
145
|
-
|
|
138
|
+
config_1.logger.warn('Connect browser failed, clear endpoint', err);
|
|
139
|
+
browserEndpoint = '';
|
|
146
140
|
return null;
|
|
147
141
|
}
|
|
148
142
|
return browser;
|
|
@@ -150,12 +144,9 @@ function connectBrowser() {
|
|
|
150
144
|
}
|
|
151
145
|
function launchBrowser() {
|
|
152
146
|
return __awaiter(this, void 0, void 0, function* () {
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
status: BrowserStatus.Launching,
|
|
156
|
-
});
|
|
147
|
+
browserEndpoint = '';
|
|
148
|
+
browserStatus = BrowserStatus.Launching;
|
|
157
149
|
try {
|
|
158
|
-
// @ts-ignore
|
|
159
150
|
browser = yield puppeteer_1.default.launch({
|
|
160
151
|
headless: true,
|
|
161
152
|
args: [
|
|
@@ -182,20 +173,17 @@ function launchBrowser() {
|
|
|
182
173
|
'--font-render-hinting=none',
|
|
183
174
|
],
|
|
184
175
|
});
|
|
185
|
-
|
|
176
|
+
config_1.logger.info('Launch browser');
|
|
186
177
|
}
|
|
187
178
|
catch (error) {
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
179
|
+
config_1.logger.error('launch browser failed: ', error);
|
|
180
|
+
browserStatus = BrowserStatus.None;
|
|
181
|
+
browserEndpoint = '';
|
|
191
182
|
throw error;
|
|
192
183
|
}
|
|
193
184
|
// save browserWSEndpoint to cache
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
endpoint,
|
|
197
|
-
status: BrowserStatus.Ready,
|
|
198
|
-
});
|
|
185
|
+
browserEndpoint = yield browser.wsEndpoint();
|
|
186
|
+
browserStatus = BrowserStatus.Ready;
|
|
199
187
|
return browser;
|
|
200
188
|
});
|
|
201
189
|
}
|
|
@@ -208,13 +196,13 @@ function checkBrowserActivated() {
|
|
|
208
196
|
const pages = yield browser.pages().catch(() => []);
|
|
209
197
|
if (pages.length === 1 && ((_a = pages[0]) === null || _a === void 0 ? void 0 : _a.url()) === 'about:blank') {
|
|
210
198
|
count++;
|
|
211
|
-
|
|
199
|
+
config_1.logger.debug(`Browser inactive count: ${count}/3`);
|
|
212
200
|
}
|
|
213
201
|
else {
|
|
214
202
|
count = 0; // 重置计数器!
|
|
215
203
|
}
|
|
216
204
|
if (count >= 3) {
|
|
217
|
-
|
|
205
|
+
config_1.logger.info('Browser inactive for 3 minutes, closing...');
|
|
218
206
|
yield (0, exports.closeBrowser)({
|
|
219
207
|
trimCache: true,
|
|
220
208
|
});
|
|
@@ -236,14 +224,15 @@ const getBrowser = () => __awaiter(void 0, void 0, void 0, function* () {
|
|
|
236
224
|
// try to connect browser
|
|
237
225
|
const connectedBrowser = yield connectBrowser();
|
|
238
226
|
if (connectedBrowser) {
|
|
239
|
-
|
|
227
|
+
config_1.logger.debug('getBrowser.connectedBrowser');
|
|
240
228
|
browser = connectedBrowser;
|
|
229
|
+
checkBrowserActivated();
|
|
241
230
|
return browser;
|
|
242
231
|
}
|
|
243
232
|
// try to launch browser
|
|
244
233
|
const launchedBrowser = yield launchBrowser();
|
|
245
234
|
if (launchedBrowser) {
|
|
246
|
-
|
|
235
|
+
config_1.logger.debug('getBrowser.launchedBrowser');
|
|
247
236
|
browser = launchedBrowser;
|
|
248
237
|
checkBrowserActivated();
|
|
249
238
|
return browser;
|
|
@@ -260,20 +249,20 @@ const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0, funct
|
|
|
260
249
|
yield Promise.all(pages.map((page) => page.close()));
|
|
261
250
|
}
|
|
262
251
|
catch (err) {
|
|
263
|
-
|
|
252
|
+
config_1.logger.warn('Failed to close all pages:', err);
|
|
264
253
|
}
|
|
265
254
|
// close browser
|
|
266
255
|
try {
|
|
267
256
|
yield browser.close();
|
|
268
257
|
}
|
|
269
258
|
catch (err) {
|
|
270
|
-
|
|
259
|
+
config_1.logger.warn('Failed to close browser:', err);
|
|
271
260
|
}
|
|
272
261
|
// clear cache
|
|
273
262
|
try {
|
|
274
263
|
if (trimCache) {
|
|
275
264
|
yield puppeteer_1.default.trimCache();
|
|
276
|
-
|
|
265
|
+
config_1.logger.debug('Trim cache success');
|
|
277
266
|
}
|
|
278
267
|
// try to clear temporary directory
|
|
279
268
|
// if (puppeteerConfig) {
|
|
@@ -284,12 +273,13 @@ const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0, funct
|
|
|
284
273
|
}
|
|
285
274
|
}
|
|
286
275
|
catch (err) {
|
|
287
|
-
|
|
276
|
+
config_1.logger.warn('Failed to clear browser cache:', err);
|
|
288
277
|
}
|
|
289
278
|
browser = null;
|
|
290
279
|
clearBrowserActivatedTimer();
|
|
291
|
-
|
|
292
|
-
|
|
280
|
+
browserEndpoint = '';
|
|
281
|
+
browserStatus = BrowserStatus.None;
|
|
282
|
+
config_1.logger.info('Close browser success');
|
|
293
283
|
});
|
|
294
284
|
exports.closeBrowser = closeBrowser;
|
|
295
285
|
function initPage() {
|
|
@@ -306,7 +296,6 @@ function initPage() {
|
|
|
306
296
|
if (abortResourceTypes.length > 0) {
|
|
307
297
|
yield page.setRequestInterception(true);
|
|
308
298
|
page.on('request', (req) => {
|
|
309
|
-
// @ts-ignore
|
|
310
299
|
if (abortResourceTypes.includes(req.resourceType())) {
|
|
311
300
|
return req.abort();
|
|
312
301
|
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { JobState } from '../store/job';
|
|
2
|
+
import { SnapshotModel } from '../store/snapshot';
|
|
3
|
+
export declare function convertJobToSnapshot({ job, snapshot }: {
|
|
4
|
+
job: JobState;
|
|
5
|
+
snapshot?: Partial<SnapshotModel>;
|
|
6
|
+
}): SnapshotModel;
|
|
7
|
+
export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel>;
|
|
8
|
+
/**
|
|
9
|
+
* get snapshot from db or crawl queue
|
|
10
|
+
*/
|
|
11
|
+
export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
|
|
12
|
+
export declare function getLatestSnapshot(url: string): Promise<SnapshotModel | null>;
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
|
+
};
|
|
14
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
+
exports.convertJobToSnapshot = convertJobToSnapshot;
|
|
16
|
+
exports.formatSnapshot = formatSnapshot;
|
|
17
|
+
exports.getSnapshot = getSnapshot;
|
|
18
|
+
exports.getLatestSnapshot = getLatestSnapshot;
|
|
19
|
+
const pick_1 = __importDefault(require("lodash/pick"));
|
|
20
|
+
const promises_1 = __importDefault(require("node:fs/promises"));
|
|
21
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
22
|
+
const ufo_1 = require("ufo");
|
|
23
|
+
const config_1 = require("../config");
|
|
24
|
+
const job_1 = require("../store/job");
|
|
25
|
+
const snapshot_1 = require("../store/snapshot");
|
|
26
|
+
const utils_1 = require("../utils");
|
|
27
|
+
function convertJobToSnapshot({ job, snapshot }) {
|
|
28
|
+
return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), options: {
|
|
29
|
+
width: job.width,
|
|
30
|
+
height: job.height,
|
|
31
|
+
includeScreenshot: job.includeScreenshot,
|
|
32
|
+
includeHtml: job.includeHtml,
|
|
33
|
+
quality: job.quality,
|
|
34
|
+
fullPage: job.fullPage,
|
|
35
|
+
} }, snapshot);
|
|
36
|
+
}
|
|
37
|
+
function formatSnapshot(snapshot, columns) {
|
|
38
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
39
|
+
let data = Object.assign({}, snapshot);
|
|
40
|
+
// format screenshot path to full url
|
|
41
|
+
if (data.screenshot) {
|
|
42
|
+
data.screenshot = (0, ufo_1.joinURL)(config_1.config.appUrl, data.screenshot);
|
|
43
|
+
}
|
|
44
|
+
// format html path to string
|
|
45
|
+
if (data.html) {
|
|
46
|
+
const html = yield promises_1.default.readFile(node_path_1.default.join(config_1.config.dataDir, data.html));
|
|
47
|
+
data.html = html.toString();
|
|
48
|
+
}
|
|
49
|
+
if (columns === null || columns === void 0 ? void 0 : columns.length) {
|
|
50
|
+
data = (0, pick_1.default)(data, columns);
|
|
51
|
+
}
|
|
52
|
+
return data;
|
|
53
|
+
});
|
|
54
|
+
}
|
|
55
|
+
/**
|
|
56
|
+
* get snapshot from db or crawl queue
|
|
57
|
+
*/
|
|
58
|
+
function getSnapshot(jobId) {
|
|
59
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
60
|
+
const snapshot = yield snapshot_1.Snapshot.findSnapshot({ where: { jobId } });
|
|
61
|
+
if (snapshot) {
|
|
62
|
+
return formatSnapshot(snapshot);
|
|
63
|
+
}
|
|
64
|
+
const job = yield job_1.Job.findJob({ id: jobId });
|
|
65
|
+
if (job) {
|
|
66
|
+
return {
|
|
67
|
+
jobId,
|
|
68
|
+
status: 'pending',
|
|
69
|
+
};
|
|
70
|
+
}
|
|
71
|
+
return null;
|
|
72
|
+
});
|
|
73
|
+
}
|
|
74
|
+
function getLatestSnapshot(url) {
|
|
75
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
76
|
+
const snapshot = yield snapshot_1.Snapshot.findSnapshot({
|
|
77
|
+
where: {
|
|
78
|
+
url: (0, utils_1.formatUrl)(url),
|
|
79
|
+
status: 'success',
|
|
80
|
+
},
|
|
81
|
+
});
|
|
82
|
+
return snapshot ? formatSnapshot(snapshot) : null;
|
|
83
|
+
});
|
|
84
|
+
}
|
package/lib/cjs/site.js
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
|
+
};
|
|
14
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
+
exports.crawlSite = void 0;
|
|
16
|
+
const uniq_1 = __importDefault(require("lodash/uniq"));
|
|
17
|
+
const p_map_1 = __importDefault(require("p-map"));
|
|
18
|
+
const config_1 = require("./config");
|
|
19
|
+
const crawler_1 = require("./crawler");
|
|
20
|
+
const snapshot_1 = require("./store/snapshot");
|
|
21
|
+
const utils_1 = require("./utils");
|
|
22
|
+
const crawlBlockletRunningMap = new Map();
|
|
23
|
+
function parseSitemapUrl(sitemapItem) {
|
|
24
|
+
var _a;
|
|
25
|
+
const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
|
|
26
|
+
const urls = (0, uniq_1.default)([...links, sitemapItem.url]).filter(Boolean);
|
|
27
|
+
return urls.map((url) => ({ url, sitemapItem }));
|
|
28
|
+
}
|
|
29
|
+
const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
|
|
30
|
+
config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
|
|
31
|
+
const sitemapList = yield (0, utils_1.getSitemapList)(url);
|
|
32
|
+
const pathnameRegex = new RegExp(pathname);
|
|
33
|
+
const sitemapItems = sitemapList
|
|
34
|
+
.filter((item) => new URL(item.url).pathname.match(pathnameRegex))
|
|
35
|
+
.flatMap((sitemapItem) => {
|
|
36
|
+
return parseSitemapUrl(sitemapItem);
|
|
37
|
+
});
|
|
38
|
+
config_1.logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
|
|
39
|
+
const crawlableItems = (yield Promise.all(sitemapItems.map((_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
|
|
40
|
+
const snapshot = yield snapshot_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
|
|
41
|
+
if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
|
|
42
|
+
const lastModified = new Date(snapshot.lastModified);
|
|
43
|
+
// skip if snapshot lastModified is greater than sitemap lastmod
|
|
44
|
+
if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
|
|
45
|
+
return null;
|
|
46
|
+
}
|
|
47
|
+
// skip if interval time has not been reached
|
|
48
|
+
if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
|
|
49
|
+
return null;
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
return { url, sitemapItem };
|
|
53
|
+
})))).filter(Boolean);
|
|
54
|
+
config_1.logger.info(`Found ${crawlableItems.length} pages to crawl from sitemap ${url}`, { pathname });
|
|
55
|
+
const key = `${url}-${pathname}`;
|
|
56
|
+
crawlBlockletRunningMap.set(key, crawlableItems);
|
|
57
|
+
try {
|
|
58
|
+
const jobIds = yield (0, p_map_1.default)(crawlableItems, ({ url, sitemapItem }) => {
|
|
59
|
+
return (0, crawler_1.crawlUrl)({
|
|
60
|
+
url,
|
|
61
|
+
lastModified: sitemapItem.lastmod,
|
|
62
|
+
includeScreenshot: false,
|
|
63
|
+
includeHtml: true,
|
|
64
|
+
});
|
|
65
|
+
}, { concurrency: config_1.config.siteCron.concurrency });
|
|
66
|
+
return jobIds;
|
|
67
|
+
}
|
|
68
|
+
catch (error) {
|
|
69
|
+
config_1.logger.error(`Failed to crawl from sitemap ${url} ${pathname}`, error);
|
|
70
|
+
throw new Error(error);
|
|
71
|
+
}
|
|
72
|
+
finally {
|
|
73
|
+
crawlBlockletRunningMap.delete(key);
|
|
74
|
+
}
|
|
75
|
+
});
|
|
76
|
+
exports.crawlSite = crawlSite;
|
|
@@ -12,23 +12,38 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
12
12
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
13
|
};
|
|
14
14
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
-
exports.
|
|
15
|
+
exports.initDatabase = initDatabase;
|
|
16
16
|
const core_1 = require("@sequelize/core");
|
|
17
17
|
const sqlite3_1 = require("@sequelize/sqlite3");
|
|
18
18
|
const path_1 = __importDefault(require("path"));
|
|
19
19
|
const config_1 = require("../config");
|
|
20
20
|
const job_1 = require("./job");
|
|
21
21
|
const snapshot_1 = require("./snapshot");
|
|
22
|
-
function
|
|
22
|
+
function initDatabase() {
|
|
23
23
|
return __awaiter(this, void 0, void 0, function* () {
|
|
24
24
|
const sequelize = new core_1.Sequelize({
|
|
25
25
|
dialect: sqlite3_1.SqliteDialect,
|
|
26
26
|
storage: path_1.default.join(config_1.config.dataDir, 'snap-kit.db'),
|
|
27
|
-
logging: (msg) => config_1.logger.debug(msg),
|
|
27
|
+
logging: (msg) => process.env.SQLITE_LOG && config_1.logger.debug(msg),
|
|
28
|
+
pool: {
|
|
29
|
+
min: 0,
|
|
30
|
+
max: 10,
|
|
31
|
+
idle: 10000,
|
|
32
|
+
},
|
|
33
|
+
retry: {
|
|
34
|
+
match: [/SQLITE_BUSY/],
|
|
35
|
+
name: 'query',
|
|
36
|
+
max: 10,
|
|
37
|
+
},
|
|
28
38
|
});
|
|
29
|
-
|
|
30
|
-
|
|
39
|
+
job_1.Job.initModel(sequelize);
|
|
40
|
+
snapshot_1.Snapshot.initModel(sequelize);
|
|
31
41
|
try {
|
|
42
|
+
yield Promise.all([
|
|
43
|
+
sequelize.query('pragma journal_mode = WAL;'),
|
|
44
|
+
sequelize.query('pragma synchronous = normal;'),
|
|
45
|
+
sequelize.query('pragma journal_size_limit = 67108864;'),
|
|
46
|
+
]);
|
|
32
47
|
yield sequelize.authenticate();
|
|
33
48
|
yield sequelize.sync();
|
|
34
49
|
config_1.logger.info('Successfully connected to database');
|
|
@@ -37,5 +52,6 @@ function ensureDatabase() {
|
|
|
37
52
|
config_1.logger.error('Failed to connect to database:', error);
|
|
38
53
|
throw error;
|
|
39
54
|
}
|
|
55
|
+
return sequelize;
|
|
40
56
|
});
|
|
41
57
|
}
|
|
@@ -10,6 +10,7 @@ export interface JobState {
|
|
|
10
10
|
quality?: number;
|
|
11
11
|
timeout?: number;
|
|
12
12
|
fullPage?: boolean;
|
|
13
|
+
lastModified?: string;
|
|
13
14
|
}
|
|
14
15
|
export interface JobModel {
|
|
15
16
|
id: string;
|
|
@@ -20,7 +21,7 @@ export interface JobModel {
|
|
|
20
21
|
delay: number;
|
|
21
22
|
cancelled: boolean;
|
|
22
23
|
}
|
|
23
|
-
declare class Job extends Model<JobModel> implements JobModel {
|
|
24
|
+
export declare class Job extends Model<JobModel> implements JobModel {
|
|
24
25
|
id: JobModel['id'];
|
|
25
26
|
queue: JobModel['queue'];
|
|
26
27
|
job: JobModel['job'];
|
|
@@ -28,6 +29,6 @@ declare class Job extends Model<JobModel> implements JobModel {
|
|
|
28
29
|
willRunAt: JobModel['willRunAt'];
|
|
29
30
|
delay: JobModel['delay'];
|
|
30
31
|
cancelled: JobModel['cancelled'];
|
|
32
|
+
static initModel(sequelize: Sequelize): typeof Job;
|
|
33
|
+
static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
|
|
31
34
|
}
|
|
32
|
-
export { Job };
|
|
33
|
-
export declare function initJobModel(sequelize: Sequelize): typeof Job;
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
36
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
37
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
38
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
39
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
40
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
41
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
42
|
+
});
|
|
43
|
+
};
|
|
44
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
45
|
+
exports.Job = void 0;
|
|
46
|
+
const core_1 = __importStar(require("@sequelize/core"));
|
|
47
|
+
class Job extends core_1.Model {
|
|
48
|
+
static initModel(sequelize) {
|
|
49
|
+
return Job.init({
|
|
50
|
+
id: {
|
|
51
|
+
type: core_1.DataTypes.STRING(40),
|
|
52
|
+
primaryKey: true,
|
|
53
|
+
},
|
|
54
|
+
queue: {
|
|
55
|
+
type: core_1.DataTypes.STRING(32),
|
|
56
|
+
allowNull: false,
|
|
57
|
+
},
|
|
58
|
+
job: {
|
|
59
|
+
type: core_1.DataTypes.JSON,
|
|
60
|
+
allowNull: false,
|
|
61
|
+
},
|
|
62
|
+
retryCount: {
|
|
63
|
+
type: core_1.DataTypes.INTEGER,
|
|
64
|
+
},
|
|
65
|
+
delay: {
|
|
66
|
+
type: core_1.DataTypes.INTEGER,
|
|
67
|
+
},
|
|
68
|
+
willRunAt: {
|
|
69
|
+
type: core_1.DataTypes.INTEGER,
|
|
70
|
+
},
|
|
71
|
+
cancelled: {
|
|
72
|
+
type: core_1.DataTypes.BOOLEAN,
|
|
73
|
+
defaultValue: false,
|
|
74
|
+
},
|
|
75
|
+
createdAt: {
|
|
76
|
+
type: core_1.DataTypes.DATE,
|
|
77
|
+
defaultValue: core_1.DataTypes.NOW,
|
|
78
|
+
index: true,
|
|
79
|
+
},
|
|
80
|
+
updatedAt: {
|
|
81
|
+
type: core_1.DataTypes.DATE,
|
|
82
|
+
defaultValue: core_1.DataTypes.NOW,
|
|
83
|
+
index: true,
|
|
84
|
+
},
|
|
85
|
+
}, {
|
|
86
|
+
sequelize,
|
|
87
|
+
indexes: [{ fields: ['queue'] }],
|
|
88
|
+
modelName: 'job',
|
|
89
|
+
tableName: 'jobs',
|
|
90
|
+
timestamps: true,
|
|
91
|
+
});
|
|
92
|
+
}
|
|
93
|
+
static findJob(condition) {
|
|
94
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
95
|
+
const where = Object.keys(condition)
|
|
96
|
+
.filter((key) => condition[key] !== undefined)
|
|
97
|
+
.map((key) => {
|
|
98
|
+
return core_1.default.where(core_1.default.fn('json_extract', core_1.default.col('job'), `$.${key}`), condition[key]);
|
|
99
|
+
});
|
|
100
|
+
const job = yield Job.findOne({
|
|
101
|
+
where: {
|
|
102
|
+
[core_1.default.Op.and]: where,
|
|
103
|
+
},
|
|
104
|
+
order: [['createdAt', 'DESC']],
|
|
105
|
+
});
|
|
106
|
+
return (job === null || job === void 0 ? void 0 : job.toJSON()) || null;
|
|
107
|
+
});
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
exports.Job = Job;
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Model, Sequelize } from '@sequelize/core';
|
|
2
|
-
interface SnapshotModel {
|
|
1
|
+
import { FindOptions, Model, Sequelize } from '@sequelize/core';
|
|
2
|
+
export interface SnapshotModel {
|
|
3
3
|
jobId: string;
|
|
4
4
|
url: string;
|
|
5
5
|
status: 'success' | 'failed' | 'pending';
|
|
@@ -16,7 +16,7 @@ interface SnapshotModel {
|
|
|
16
16
|
fullPage?: boolean;
|
|
17
17
|
};
|
|
18
18
|
}
|
|
19
|
-
declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
|
|
19
|
+
export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
|
|
20
20
|
jobId: SnapshotModel['jobId'];
|
|
21
21
|
url: SnapshotModel['url'];
|
|
22
22
|
status: SnapshotModel['status'];
|
|
@@ -25,7 +25,6 @@ declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
|
|
|
25
25
|
error?: SnapshotModel['error'];
|
|
26
26
|
lastModified?: SnapshotModel['lastModified'];
|
|
27
27
|
options: SnapshotModel['options'];
|
|
28
|
+
static initModel(sequelize: Sequelize): typeof Snapshot;
|
|
29
|
+
static findSnapshot(condition: FindOptions<SnapshotModel>): Promise<SnapshotModel | null>;
|
|
28
30
|
}
|
|
29
|
-
export { Snapshot };
|
|
30
|
-
export type { SnapshotModel };
|
|
31
|
-
export declare function initSnapshotModel(sequelize: Sequelize): typeof Snapshot;
|