@arcblock/crawler 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/blocklet.d.ts +6 -0
- package/dist/blocklet.js +199 -0
- package/dist/cache.d.ts +10 -0
- package/dist/cache.js +119 -0
- package/dist/config.d.ts +10 -0
- package/dist/config.js +17 -0
- package/dist/crawler.d.ts +28 -0
- package/dist/crawler.js +314 -0
- package/dist/db/index.d.ts +1 -0
- package/dist/db/index.js +41 -0
- package/dist/db/job.d.ts +33 -0
- package/dist/db/job.js +54 -0
- package/dist/db/snapshot.d.ts +31 -0
- package/dist/db/snapshot.js +52 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +45 -0
- package/dist/middleware.d.ts +4 -0
- package/dist/middleware.js +44 -0
- package/dist/puppeteer.d.ts +16 -0
- package/dist/puppeteer.js +318 -0
- package/dist/utils.d.ts +15 -0
- package/dist/utils.js +239 -0
- package/esm/blocklet.d.ts +6 -0
- package/esm/blocklet.js +190 -0
- package/esm/cache.d.ts +10 -0
- package/esm/cache.js +114 -0
- package/esm/config.d.ts +10 -0
- package/esm/config.js +11 -0
- package/esm/crawler.d.ts +28 -0
- package/esm/crawler.js +301 -0
- package/esm/db/index.d.ts +1 -0
- package/esm/db/index.js +35 -0
- package/esm/db/job.d.ts +33 -0
- package/esm/db/job.js +50 -0
- package/esm/db/snapshot.d.ts +31 -0
- package/esm/db/snapshot.js +48 -0
- package/esm/index.d.ts +6 -0
- package/esm/index.js +26 -0
- package/esm/middleware.d.ts +4 -0
- package/esm/middleware.js +41 -0
- package/esm/puppeteer.d.ts +16 -0
- package/esm/puppeteer.js +272 -0
- package/esm/utils.d.ts +15 -0
- package/esm/utils.js +220 -0
- package/package.json +10 -3
- package/src/blocklet.ts +0 -223
- package/src/cache.ts +0 -117
- package/src/config.ts +0 -13
- package/src/crawler.ts +0 -364
- package/src/db/index.ts +0 -27
- package/src/db/job.ts +0 -93
- package/src/db/snapshot.ts +0 -89
- package/src/index.ts +0 -19
- package/src/middleware.ts +0 -46
- package/src/puppeteer.ts +0 -296
- package/src/utils.ts +0 -240
- package/third.d.ts +0 -1
- package/tsconfig.json +0 -9
package/dist/crawler.js
ADDED
@@ -0,0 +1,314 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.getPageContent = void 0;
+exports.createCrawlQueue = createCrawlQueue;
+exports.getDataDir = getDataDir;
+exports.createCrawlJob = createCrawlJob;
+exports.getJob = getJob;
+exports.formatSnapshot = formatSnapshot;
+exports.getSnapshot = getSnapshot;
+const queue_1 = __importDefault(require("@abtnode/queue"));
+const sequelize_1 = __importDefault(require("@abtnode/queue/lib/store/sequelize"));
+const core_1 = __importDefault(require("@sequelize/core"));
+const crypto_1 = require("crypto");
+const fs_extra_1 = __importDefault(require("fs-extra"));
+const pick_1 = __importDefault(require("lodash/pick"));
+const path_1 = __importDefault(require("path"));
+const ufo_1 = require("ufo");
+const config_1 = require("./config");
+const job_1 = require("./db/job");
+const snapshot_1 = require("./db/snapshot");
+const puppeteer_1 = require("./puppeteer");
+const utils_1 = require("./utils");
+const { BaseState } = require('@abtnode/models');
+let crawlQueue;
+function createCrawlQueue() {
+    const db = new BaseState(job_1.Job);
+    crawlQueue = (0, queue_1.default)({
+        store: new sequelize_1.default(db, 'crawler'),
+        concurrency: 1,
+        onJob: (job) => __awaiter(this, void 0, void 0, function* () {
+            config_1.logger.debug('job start:', job);
+            const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
+            if (!canCrawl) {
+                config_1.logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
+                const snapshot = convertJobToSnapshot({
+                    job,
+                    snapshot: {
+                        status: 'failed',
+                        error: 'Denied by robots.txt',
+                    },
+                });
+                yield snapshot_1.Snapshot.upsert(snapshot);
+                return snapshot;
+            }
+            // if index reach autoCloseBrowserCount, close browser
+            // try {
+            //   if (index >= autoCloseBrowserCount) {
+            //     await closeBrowser({ trimCache: false });
+            //   }
+            // } catch (error) {
+            //   logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
+            // }
+            try {
+                // get page content later
+                const result = yield (0, exports.getPageContent)(job);
+                if (!result || (!result.html && !result.screenshot)) {
+                    config_1.logger.error(`failed to crawl ${job.url}, empty content`, job);
+                    const snapshot = convertJobToSnapshot({
+                        job,
+                        snapshot: {
+                            status: 'failed',
+                            error: 'Failed to crawl content',
+                        },
+                    });
+                    yield snapshot_1.Snapshot.upsert(snapshot);
+                    return snapshot;
+                }
+                // save html and screenshot to data dir
+                const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
+                    screenshot: result.screenshot,
+                    html: result.html,
+                });
+                // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
+                const snapshot = convertJobToSnapshot({
+                    job,
+                    snapshot: {
+                        status: 'success',
+                        screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
+                        html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config_1.config.dataDir, ''),
+                    },
+                });
+                yield snapshot_1.Snapshot.upsert(snapshot);
+                return snapshot;
+                // save to redis
+                // if (saveToRedis) {
+                //   useCache.set(url, {
+                //     html: result.html || '',
+                //     lastModified,
+                //   });
+                //   logger.info(`success to crawl ${url}`, job);
+                //   return result;
+                // }
+            }
+            catch (error) {
+                config_1.logger.error(`Failed to crawl ${job.url}`, { error, job });
+                console.error(error.stack);
+                const snapshot = convertJobToSnapshot({
+                    job,
+                    snapshot: {
+                        status: 'failed',
+                        error: 'Internal error',
+                    },
+                });
+                yield snapshot_1.Snapshot.upsert(snapshot);
+                return snapshot;
+            }
+        }),
+    });
+}
+function getDataDir() {
+    return __awaiter(this, void 0, void 0, function* () {
+        const htmlDir = path_1.default.join(config_1.config.dataDir, 'data', 'html');
+        const screenshotDir = path_1.default.join(config_1.config.dataDir, 'data', 'screenshot');
+        yield fs_extra_1.default.ensureDir(htmlDir);
+        yield fs_extra_1.default.ensureDir(screenshotDir);
+        return { htmlDir, screenshotDir };
+    });
+}
+function saveSnapshotToLocal(_a) {
+    return __awaiter(this, arguments, void 0, function* ({ screenshot, html }) {
+        const { htmlDir, screenshotDir } = yield getDataDir();
+        let screenshotPath = null;
+        let htmlPath = null;
+        if (screenshot) {
+            const hash = (0, utils_1.md5)(screenshot);
+            screenshotPath = path_1.default.join(screenshotDir, `${hash}.webp`);
+            config_1.logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
+            yield fs_extra_1.default.writeFile(screenshotPath, screenshot);
+        }
+        if (html) {
+            const hash = (0, utils_1.md5)(html);
+            htmlPath = path_1.default.join(htmlDir, `${hash}.html`);
+            config_1.logger.debug('saveSnapshotToLocal.html', { htmlPath });
+            yield fs_extra_1.default.writeFile(htmlPath, html);
+        }
+        return {
+            screenshotPath,
+            htmlPath,
+        };
+    });
+}
+function formatHtml(htmlString) {
+    if (htmlString.includes('<h2>Unexpected Application Error!</h2>')) {
+        return '';
+    }
+    return htmlString;
+}
+const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, formatPageContent, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 60 * 1000, fullPage = false, }) {
+    config_1.logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
+    const page = yield (0, puppeteer_1.initPage)();
+    if (width && height) {
+        yield page.setViewport({ width, height });
+    }
+    let html = null;
+    let screenshot = null;
+    try {
+        const response = yield page.goto(url, { timeout });
+        if (!response) {
+            throw new Error(`Failed to load page: response is null for ${url}`);
+        }
+        const statusCode = response.status();
+        config_1.logger.debug('getPageContent.response', { response, statusCode });
+        if (![200, 304].includes(statusCode)) {
+            throw new Error(`Request failed with status ${statusCode}, in ${url}`);
+        }
+        // await for networkidle0
+        // https://pptr.dev/api/puppeteer.page.goforward/#remarks
+        yield page.waitForNetworkIdle({
+            idleTime: 2 * 1000,
+        });
+        // get screenshot
+        if (includeScreenshot) {
+            try {
+                screenshot = yield page.screenshot({ fullPage, quality, type: 'webp' });
+            }
+            catch (err) {
+                config_1.logger.error('Failed to get screenshot:', err);
+            }
+        }
+        // get html
+        if (includeHtml) {
+            if (formatPageContent) {
+                html = yield formatPageContent({ page, url });
+            }
+            else {
+                html = yield page.content();
+            }
+        }
+    }
+    catch (error) {
+        config_1.logger.error('Failed to get page content:', error);
+        throw error;
+    }
+    finally {
+        yield page.close();
+    }
+    html = formatHtml(html || '');
+    return {
+        html,
+        screenshot,
+    };
+});
+exports.getPageContent = getPageContent;
+function createCrawlJob(params, callback) {
+    return __awaiter(this, void 0, void 0, function* () {
+        params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
+        // skip duplicate job
+        const existsJob = yield getJob({
+            url: params.url,
+            includeScreenshot: params.includeScreenshot,
+            includeHtml: params.includeHtml,
+            quality: params.quality,
+            width: params.width,
+            height: params.height,
+            fullPage: params.fullPage,
+        });
+        config_1.logger.info('create crawl job', params);
+        if (existsJob) {
+            config_1.logger.warn(`Crawl job already exists for ${params.url}, skip`);
+            return existsJob.id;
+        }
+        const jobId = (0, crypto_1.randomUUID)();
+        const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
+        job.on('finished', ({ result }) => {
+            config_1.logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
+            callback === null || callback === void 0 ? void 0 : callback(result);
+        });
+        job.on('failed', ({ error }) => {
+            config_1.logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
+            callback === null || callback === void 0 ? void 0 : callback(null);
+        });
+        return jobId;
+    });
+}
+// @ts-ignore
+function getJob(condition) {
+    return __awaiter(this, void 0, void 0, function* () {
+        const where = Object.keys(condition)
+            .filter((key) => condition[key] !== undefined)
+            .map((key) => {
+            return core_1.default.where(core_1.default.fn('json_extract', core_1.default.col('job'), `$.${key}`), condition[key]);
+        });
+        const job = yield crawlQueue.store.db.findOne({
+            where: {
+                [core_1.default.Op.and]: where,
+            },
+        });
+        if (job) {
+            return job.job;
+        }
+        return null;
+    });
+}
+function convertJobToSnapshot({ job, snapshot }) {
+    return Object.assign({
+        // @ts-ignore
+        jobId: job.jobId || job.id, url: job.url, options: {
+            width: job.width,
+            height: job.height,
+            includeScreenshot: job.includeScreenshot,
+            includeHtml: job.includeHtml,
+            quality: job.quality,
+            fullPage: job.fullPage,
+        } }, snapshot);
+}
+function formatSnapshot(snapshot, columns) {
+    return __awaiter(this, void 0, void 0, function* () {
+        let data = Object.assign({}, snapshot);
+        // format screenshot path to full url
+        if (data.screenshot) {
+            data.screenshot = (0, ufo_1.joinURL)(config_1.config.appUrl, data.screenshot);
+        }
+        // format html path to string
+        if (data.html) {
+            const html = yield fs_extra_1.default.readFile(path_1.default.join(config_1.config.dataDir, data.html));
+            data.html = html.toString();
+        }
+        if (columns === null || columns === void 0 ? void 0 : columns.length) {
+            data = (0, pick_1.default)(data, columns);
+        }
+        return data;
+    });
+}
+/**
+ * get snapshot from db or crawl queue
+ */
+function getSnapshot(jobId) {
+    return __awaiter(this, void 0, void 0, function* () {
+        const snapshotModel = yield snapshot_1.Snapshot.findByPk(jobId);
+        if (snapshotModel) {
+            return snapshotModel.toJSON();
+        }
+        const job = yield getJob({ id: jobId });
+        if (job) {
+            return {
+                jobId,
+                status: 'pending',
+            };
+        }
+        return null;
+    });
+}
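For orientation, the new crawler.js gives the package a queue-backed crawl API: createCrawlJob deduplicates against already-queued jobs and returns a job id, and getSnapshot resolves that id to a stored snapshot, a pending marker, or null. A minimal consumer sketch based on these exports; the URL and option values are illustrative, and it assumes initCrawler (see package/dist/index.js below) has already run:

import { createCrawlJob, getSnapshot, formatSnapshot } from '@arcblock/crawler';

// Enqueue a crawl; the returned id is either a fresh UUID or the id of an
// identical job that is already queued.
const jobId = await createCrawlJob(
  { url: 'https://example.com', includeHtml: true, includeScreenshot: true },
  (snapshot) => console.log('crawl finished:', snapshot?.status),
);

// Resolve the id: a stored snapshot, { jobId, status: 'pending' } while the
// job is still queued, or null for an unknown id.
const snapshot = await getSnapshot(jobId);
if (snapshot?.status === 'success') {
  // formatSnapshot inlines the saved HTML file and expands the screenshot
  // path to a full URL under config.appUrl.
  console.log(await formatSnapshot(snapshot, ['url', 'html', 'screenshot']));
}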
package/dist/db/index.d.ts
ADDED
@@ -0,0 +1 @@
+export declare function ensureDatabase(): Promise<void>;
package/dist/db/index.js
ADDED
@@ -0,0 +1,41 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.ensureDatabase = ensureDatabase;
+const core_1 = require("@sequelize/core");
+const sqlite3_1 = require("@sequelize/sqlite3");
+const path_1 = __importDefault(require("path"));
+const config_1 = require("../config");
+const job_1 = require("./job");
+const snapshot_1 = require("./snapshot");
+function ensureDatabase() {
+    return __awaiter(this, void 0, void 0, function* () {
+        const sequelize = new core_1.Sequelize({
+            dialect: sqlite3_1.SqliteDialect,
+            storage: path_1.default.join(config_1.config.dataDir, 'snap-kit.db'),
+            logging: (msg) => config_1.logger.debug(msg),
+        });
+        yield (0, snapshot_1.initSnapshotModel)(sequelize);
+        yield (0, job_1.initJobModel)(sequelize);
+        try {
+            yield sequelize.authenticate();
+            yield sequelize.sync();
+            config_1.logger.info('Successfully connected to database');
+        }
+        catch (error) {
+            config_1.logger.error('Failed to connect to database:', error);
+            throw error;
+        }
+    });
+}
package/dist/db/job.d.ts
ADDED
@@ -0,0 +1,33 @@
+import { Model, Sequelize } from '@sequelize/core';
+export interface JobState {
+    id?: string;
+    jobId: string;
+    url: string;
+    includeScreenshot?: boolean;
+    includeHtml?: boolean;
+    width?: number;
+    height?: number;
+    quality?: number;
+    timeout?: number;
+    fullPage?: boolean;
+}
+export interface JobModel {
+    id: string;
+    queue: string;
+    job: JobState;
+    retryCount: number;
+    willRunAt: number;
+    delay: number;
+    cancelled: boolean;
+}
+declare class Job extends Model<JobModel> implements JobModel {
+    id: JobModel['id'];
+    queue: JobModel['queue'];
+    job: JobModel['job'];
+    retryCount: JobModel['retryCount'];
+    willRunAt: JobModel['willRunAt'];
+    delay: JobModel['delay'];
+    cancelled: JobModel['cancelled'];
+}
+export { Job };
+export declare function initJobModel(sequelize: Sequelize): typeof Job;
package/dist/db/job.js
ADDED
@@ -0,0 +1,54 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.Job = void 0;
+exports.initJobModel = initJobModel;
+const core_1 = require("@sequelize/core");
+class Job extends core_1.Model {
+}
+exports.Job = Job;
+function initJobModel(sequelize) {
+    Job.init({
+        id: {
+            type: core_1.DataTypes.STRING(40),
+            primaryKey: true,
+        },
+        queue: {
+            type: core_1.DataTypes.STRING(32),
+            allowNull: false,
+        },
+        job: {
+            type: core_1.DataTypes.JSON,
+            allowNull: false,
+        },
+        retryCount: {
+            type: core_1.DataTypes.INTEGER,
+        },
+        delay: {
+            type: core_1.DataTypes.INTEGER,
+        },
+        willRunAt: {
+            type: core_1.DataTypes.INTEGER,
+        },
+        cancelled: {
+            type: core_1.DataTypes.BOOLEAN,
+            defaultValue: false,
+        },
+        createdAt: {
+            type: core_1.DataTypes.DATE,
+            defaultValue: core_1.DataTypes.NOW,
+            index: true,
+        },
+        updatedAt: {
+            type: core_1.DataTypes.DATE,
+            defaultValue: core_1.DataTypes.NOW,
+            index: true,
+        },
+    }, {
+        sequelize,
+        indexes: [{ fields: ['queue'] }],
+        modelName: 'job',
+        tableName: 'jobs',
+        timestamps: true,
+    });
+    return Job;
+}
package/dist/db/snapshot.d.ts
ADDED
@@ -0,0 +1,31 @@
+import { Model, Sequelize } from '@sequelize/core';
+interface SnapshotModel {
+    jobId: string;
+    url: string;
+    status: 'success' | 'failed' | 'pending';
+    html?: string | null;
+    screenshot?: string | null;
+    error?: string;
+    lastModified?: string;
+    options?: {
+        width?: number;
+        height?: number;
+        includeScreenshot?: boolean;
+        includeHtml?: boolean;
+        quality?: number;
+        fullPage?: boolean;
+    };
+}
+declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
+    jobId: SnapshotModel['jobId'];
+    url: SnapshotModel['url'];
+    status: SnapshotModel['status'];
+    html?: SnapshotModel['html'];
+    screenshot?: SnapshotModel['screenshot'];
+    error?: SnapshotModel['error'];
+    lastModified?: SnapshotModel['lastModified'];
+    options: SnapshotModel['options'];
+}
+export { Snapshot };
+export type { SnapshotModel };
+export declare function initSnapshotModel(sequelize: Sequelize): typeof Snapshot;
package/dist/db/snapshot.js
ADDED
@@ -0,0 +1,52 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.Snapshot = void 0;
+exports.initSnapshotModel = initSnapshotModel;
+const core_1 = require("@sequelize/core");
+class Snapshot extends core_1.Model {
+}
+exports.Snapshot = Snapshot;
+function initSnapshotModel(sequelize) {
+    Snapshot.init({
+        jobId: {
+            type: core_1.DataTypes.STRING,
+            primaryKey: true,
+            allowNull: false,
+        },
+        url: {
+            type: core_1.DataTypes.STRING,
+            allowNull: false,
+            index: true,
+        },
+        status: {
+            type: core_1.DataTypes.ENUM('success', 'failed'),
+            allowNull: false,
+        },
+        html: {
+            type: core_1.DataTypes.TEXT,
+            allowNull: true,
+        },
+        screenshot: {
+            type: core_1.DataTypes.STRING,
+            allowNull: true,
+        },
+        error: {
+            type: core_1.DataTypes.STRING,
+            allowNull: true,
+        },
+        lastModified: {
+            type: core_1.DataTypes.STRING,
+            allowNull: true,
+        },
+        options: {
+            type: core_1.DataTypes.JSON,
+            allowNull: true,
+        },
+    }, {
+        sequelize,
+        modelName: 'snapshot',
+        tableName: 'snap',
+        timestamps: true,
+    });
+    return Snapshot;
+}
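Because index.js (below) re-exports the Snapshot model, stored snapshots can also be queried directly with the ordinary Sequelize model API. A sketch, assuming initCrawler has already initialized the connection:

import { Snapshot } from '@arcblock/crawler';

// Plain Sequelize queries against the `snap` table defined above.
const failed = await Snapshot.findAll({ where: { status: 'failed' } });
for (const row of failed) {
  console.log(row.get('url'), row.get('error'));
}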
package/dist/index.d.ts
ADDED
package/dist/index.js
ADDED
@@ -0,0 +1,45 @@
+"use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __exportStar = (this && this.__exportStar) || function(m, exports) {
+    for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
+};
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.Snapshot = void 0;
+exports.initCrawler = initCrawler;
+const config_1 = require("./config");
+const crawler_1 = require("./crawler");
+const db_1 = require("./db");
+const puppeteer_1 = require("./puppeteer");
+__exportStar(require("./blocklet"), exports);
+__exportStar(require("./crawler"), exports);
+__exportStar(require("./middleware"), exports);
+var snapshot_1 = require("./db/snapshot");
+Object.defineProperty(exports, "Snapshot", { enumerable: true, get: function () { return snapshot_1.Snapshot; } });
+function initCrawler(_config) {
+    return __awaiter(this, void 0, void 0, function* () {
+        Object.assign(config_1.config, _config);
+        config_1.logger.debug('init crawler', config_1.config);
+        yield (0, db_1.ensureDatabase)();
+        yield (0, crawler_1.createCrawlQueue)();
+        yield (0, puppeteer_1.ensureBrowser)();
+    });
+}
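initCrawler is the single entry point: it merges the caller's config, prepares the SQLite database, creates the crawl queue, and warms up the browser. A bootstrap sketch; dataDir and appUrl are the two config fields the compiled output reads, and both values here are illustrative:

import { initCrawler } from '@arcblock/crawler';

await initCrawler({
  dataDir: '/var/lib/snap-kit',  // where snap-kit.db, html, and screenshot files are stored
  appUrl: 'https://example.com', // prefix for screenshot URLs returned by formatSnapshot
});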
package/dist/middleware.js
ADDED
@@ -0,0 +1,44 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.initSEOMiddleware = initSEOMiddleware;
+const cache_1 = require("./cache");
+const utils_1 = require("./utils");
+function initSEOMiddleware({ autoReturnHtml = true, allowCrawler = true, }) {
+    return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
+        const isBot = (0, utils_1.isBotUserAgent)(req);
+        const isSelf = (0, utils_1.isSelfCrawler)(req);
+        if (!isBot || isSelf) {
+            return next();
+        }
+        const fullUrl = (0, utils_1.getFullUrl)(req);
+        const canCrawl = yield (0, utils_1.isAcceptCrawler)(fullUrl);
+        const allowCrawlerResult = typeof allowCrawler === 'function' ? allowCrawler(req) : allowCrawler;
+        // can not crawl, skip
+        if (!canCrawl || !allowCrawlerResult) {
+            return next();
+        }
+        const cacheData = yield cache_1.useCache.get(fullUrl);
+        // add cached html to req
+        req.cachedHtml = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.content) || cacheData || null;
+        // add cached lastModified to req, ISO string to GMT string
+        req.cachedLastmod = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified) ? new Date(cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified).toUTCString() : null;
+        if (req.cachedLastmod) {
+            res.setHeader('Last-Modified', req.cachedLastmod);
+        }
+        if (autoReturnHtml && req.cachedHtml) {
+            res.send(req.cachedHtml);
+            return;
+        }
+        // missing cache
+        next();
+    });
+}
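A sketch of how this middleware might be mounted in an Express app: bot traffic that passes the robots.txt check is served the cached snapshot, and everything else falls through to the normal handlers. Note that allowCrawler may also be a (req) => boolean; the route and HTML below are illustrative:

import express from 'express';
import { initSEOMiddleware } from '@arcblock/crawler';

const app = express();

// Must be registered before the routes it should shadow for bots.
app.use(initSEOMiddleware({ autoReturnHtml: true, allowCrawler: true }));

app.get('*', (req, res) => {
  // Non-bot traffic, or a cache miss with autoReturnHtml disabled, lands
  // here; on a hit, req.cachedHtml / req.cachedLastmod are populated.
  res.send('<!doctype html><div id="app"></div>');
});

app.listen(3000);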
package/dist/puppeteer.d.ts
ADDED
@@ -0,0 +1,16 @@
+import puppeteer, { Browser, Page } from '@blocklet/puppeteer';
+export { puppeteer };
+export declare function ensurePuppeteerrc(): Promise<{
+    cacheDirectory: string;
+    temporaryDirectory: string;
+}>;
+export declare function ensureBrowser(): Promise<void>;
+export declare function connectBrowser(): Promise<Browser | null>;
+export declare function launchBrowser(): Promise<Browser>;
+export declare const getBrowser: () => Promise<Browser>;
+export declare const closeBrowser: ({ trimCache }?: {
+    trimCache?: boolean;
+}) => Promise<void>;
+export declare function initPage({ abortResourceTypes }?: {
+    abortResourceTypes?: never[] | undefined;
+}): Promise<Page>;
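The browser layer can also be driven directly through these declarations. A sketch under two assumptions: that the dist/puppeteer module is importable by path (the package's export map is not shown in this diff), and with an illustrative URL:

import { initPage, closeBrowser } from '@arcblock/crawler/dist/puppeteer';

// initPage returns a Page from the shared browser managed by ensureBrowser.
const page = await initPage({ abortResourceTypes: [] });
try {
  await page.goto('https://example.com', { timeout: 60 * 1000 });
  console.log(await page.title());
} finally {
  await page.close();
  // Optionally tear down the shared browser and trim its disk cache.
  await closeBrowser({ trimCache: true });
}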