@arcblock/crawler 1.0.0 → 1.0.2
This diff shows the changes between publicly released versions of the package as they appear in its public registry. It is provided for informational purposes only.
- package/dist/blocklet.d.ts +6 -0
- package/dist/blocklet.js +199 -0
- package/dist/cache.d.ts +10 -0
- package/dist/cache.js +119 -0
- package/dist/config.d.ts +10 -0
- package/dist/config.js +17 -0
- package/dist/crawler.d.ts +28 -0
- package/dist/crawler.js +314 -0
- package/dist/db/index.d.ts +1 -0
- package/dist/db/index.js +41 -0
- package/dist/db/job.d.ts +33 -0
- package/dist/db/job.js +54 -0
- package/dist/db/snapshot.d.ts +31 -0
- package/dist/db/snapshot.js +52 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +45 -0
- package/dist/middleware.d.ts +4 -0
- package/dist/middleware.js +44 -0
- package/dist/puppeteer.d.ts +16 -0
- package/dist/puppeteer.js +318 -0
- package/dist/utils.d.ts +15 -0
- package/dist/utils.js +239 -0
- package/esm/blocklet.d.ts +6 -0
- package/esm/blocklet.js +190 -0
- package/esm/cache.d.ts +10 -0
- package/esm/cache.js +114 -0
- package/esm/config.d.ts +10 -0
- package/esm/config.js +11 -0
- package/esm/crawler.d.ts +28 -0
- package/esm/crawler.js +301 -0
- package/esm/db/index.d.ts +1 -0
- package/esm/db/index.js +35 -0
- package/esm/db/job.d.ts +33 -0
- package/esm/db/job.js +50 -0
- package/esm/db/snapshot.d.ts +31 -0
- package/esm/db/snapshot.js +48 -0
- package/esm/index.d.ts +6 -0
- package/esm/index.js +26 -0
- package/esm/middleware.d.ts +4 -0
- package/esm/middleware.js +41 -0
- package/esm/puppeteer.d.ts +16 -0
- package/esm/puppeteer.js +272 -0
- package/esm/utils.d.ts +15 -0
- package/esm/utils.js +220 -0
- package/package.json +10 -3
- package/src/blocklet.ts +0 -223
- package/src/cache.ts +0 -117
- package/src/config.ts +0 -13
- package/src/crawler.ts +0 -364
- package/src/db/index.ts +0 -27
- package/src/db/job.ts +0 -93
- package/src/db/snapshot.ts +0 -89
- package/src/index.ts +0 -19
- package/src/middleware.ts +0 -46
- package/src/puppeteer.ts +0 -296
- package/src/utils.ts +0 -240
- package/third.d.ts +0 -1
- package/tsconfig.json +0 -9
package/src/crawler.ts
DELETED
@@ -1,364 +0,0 @@
-import createQueue from '@abtnode/queue';
-import SequelizeStore from '@abtnode/queue/lib/store/sequelize';
-import sequelize from '@sequelize/core';
-import { randomUUID } from 'crypto';
-import fs from 'fs-extra';
-import pick from 'lodash/pick';
-import path from 'path';
-import { joinURL } from 'ufo';
-
-import { config, logger } from './config';
-import { Job, JobState } from './db/job';
-import { Snapshot, SnapshotModel } from './db/snapshot';
-import { initPage } from './puppeteer';
-import { formatUrl, isAcceptCrawler, md5 } from './utils';
-
-const { BaseState } = require('@abtnode/models');
-
-let crawlQueue;
-
-export function createCrawlQueue() {
-  const db = new BaseState(Job);
-
-  crawlQueue = createQueue({
-    store: new SequelizeStore(db, 'crawler'),
-    concurrency: 1,
-    onJob: async (job: JobState) => {
-      logger.debug('job start:', job);
-
-      const canCrawl = await isAcceptCrawler(job.url);
-      if (!canCrawl) {
-        logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
-        const snapshot = convertJobToSnapshot({
-          job,
-          snapshot: {
-            status: 'failed',
-            error: 'Denied by robots.txt',
-          },
-        });
-        await Snapshot.upsert(snapshot);
-        return snapshot;
-      }
-
-      // if index reach autoCloseBrowserCount, close browser
-      // try {
-      //   if (index >= autoCloseBrowserCount) {
-      //     await closeBrowser({ trimCache: false });
-      //   }
-      // } catch (error) {
-      //   logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
-      // }
-
-      try {
-        // get page content later
-        const result = await getPageContent(job);
-
-        if (!result || (!result.html && !result.screenshot)) {
-          logger.error(`failed to crawl ${job.url}, empty content`, job);
-
-          const snapshot = convertJobToSnapshot({
-            job,
-            snapshot: {
-              status: 'failed',
-              error: 'Failed to crawl content',
-            },
-          });
-          await Snapshot.upsert(snapshot);
-          return snapshot;
-        }
-
-        // save html and screenshot to data dir
-        const { screenshotPath, htmlPath } = await saveSnapshotToLocal({
-          screenshot: result.screenshot,
-          html: result.html,
-        });
-        // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
-
-        const snapshot = convertJobToSnapshot({
-          job,
-          snapshot: {
-            status: 'success',
-            screenshot: screenshotPath?.replace(config.dataDir, ''),
-            html: htmlPath?.replace(config.dataDir, ''),
-          },
-        });
-        await Snapshot.upsert(snapshot);
-        return snapshot;
-
-        // save to redis
-        // if (saveToRedis) {
-        //   useCache.set(url, {
-        //     html: result.html || '',
-        //     lastModified,
-        //   });
-
-        //   logger.info(`success to crawl ${url}`, job);
-        //   return result;
-        // }
-      } catch (error) {
-        logger.error(`Failed to crawl ${job.url}`, { error, job });
-        console.error(error.stack);
-
-        const snapshot = convertJobToSnapshot({
-          job,
-          snapshot: {
-            status: 'failed',
-            error: 'Internal error',
-          },
-        });
-        await Snapshot.upsert(snapshot);
-        return snapshot;
-      }
-    },
-  });
-}
-
-export async function getDataDir() {
-  const htmlDir = path.join(config.dataDir, 'data', 'html');
-  const screenshotDir = path.join(config.dataDir, 'data', 'screenshot');
-
-  await fs.ensureDir(htmlDir);
-  await fs.ensureDir(screenshotDir);
-
-  return { htmlDir, screenshotDir };
-}
-
-async function saveSnapshotToLocal({ screenshot, html }: { screenshot?: Uint8Array | null; html?: string | null }) {
-  const { htmlDir, screenshotDir } = await getDataDir();
-
-  let screenshotPath: string | null = null;
-  let htmlPath: string | null = null;
-
-  if (screenshot) {
-    const hash = md5(screenshot);
-    screenshotPath = path.join(screenshotDir, `${hash}.webp`);
-
-    logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
-
-    await fs.writeFile(screenshotPath, screenshot);
-  }
-  if (html) {
-    const hash = md5(html);
-    htmlPath = path.join(htmlDir, `${hash}.html`);
-
-    logger.debug('saveSnapshotToLocal.html', { htmlPath });
-
-    await fs.writeFile(htmlPath, html);
-  }
-
-  return {
-    screenshotPath,
-    htmlPath,
-  };
-}
-
-function formatHtml(htmlString: string) {
-  if (htmlString.includes('<h2>Unexpected Application Error!</h2>')) {
-    return '';
-  }
-  return htmlString;
-}
-
-export const getPageContent = async ({
-  url,
-  formatPageContent,
-  includeScreenshot = true,
-  includeHtml = true,
-  width = 1440,
-  height = 900,
-  quality = 80,
-  timeout = 60 * 1000,
-  fullPage = false,
-}: {
-  url: string;
-  formatPageContent?: Function;
-  includeScreenshot?: boolean;
-  includeHtml?: boolean;
-  width?: number;
-  height?: number;
-  quality?: number;
-  timeout?: number;
-  fullPage?: boolean;
-}) => {
-  logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
-
-  const page = await initPage();
-
-  if (width && height) {
-    await page.setViewport({ width, height });
-  }
-
-  let html: string | null = null;
-  let screenshot: Uint8Array | null = null;
-
-  try {
-    const response = await page.goto(url, { timeout });
-
-    if (!response) {
-      throw new Error(`Failed to load page: response is null for ${url}`);
-    }
-
-    const statusCode = response.status();
-
-    logger.debug('getPageContent.response', { response, statusCode });
-
-    if (![200, 304].includes(statusCode)) {
-      throw new Error(`Request failed with status ${statusCode}, in ${url}`);
-    }
-
-    // await for networkidle0
-    // https://pptr.dev/api/puppeteer.page.goforward/#remarks
-    await page.waitForNetworkIdle({
-      idleTime: 2 * 1000,
-    });
-
-    // get screenshot
-    if (includeScreenshot) {
-      try {
-        screenshot = await page.screenshot({ fullPage, quality, type: 'webp' });
-      } catch (err) {
-        logger.error('Failed to get screenshot:', err);
-      }
-    }
-
-    // get html
-    if (includeHtml) {
-      if (formatPageContent) {
-        html = await formatPageContent({ page, url });
-      } else {
-        html = await page.content();
-      }
-    }
-  } catch (error) {
-    logger.error('Failed to get page content:', error);
-    throw error;
-  } finally {
-    await page.close();
-  }
-
-  html = formatHtml(html || '');
-
-  return {
-    html,
-    screenshot,
-  };
-};
-
-export async function createCrawlJob(params: JobState, callback?: (snapshot: SnapshotModel | null) => void) {
-  params = {
-    ...params,
-    url: formatUrl(params.url),
-  };
-
-  // skip duplicate job
-  const existsJob = await getJob({
-    url: params.url,
-    includeScreenshot: params.includeScreenshot,
-    includeHtml: params.includeHtml,
-    quality: params.quality,
-    width: params.width,
-    height: params.height,
-    fullPage: params.fullPage,
-  });
-
-  logger.info('create crawl job', params);
-
-  if (existsJob) {
-    logger.warn(`Crawl job already exists for ${params.url}, skip`);
-    return existsJob.id;
-  }
-
-  const jobId = randomUUID();
-  const job = crawlQueue.push({ ...params, id: jobId });
-
-  job.on('finished', ({ result }) => {
-    logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
-    callback?.(result);
-  });
-
-  job.on('failed', ({ error }) => {
-    logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
-    callback?.(null);
-  });
-
-  return jobId;
-}
-
-// @ts-ignore
-export async function getJob(condition: Partial<JobState>) {
-  const where = Object.keys(condition)
-    .filter((key) => condition[key] !== undefined)
-    .map((key) => {
-      return sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), `$.${key}`), condition[key]);
-    });
-
-  const job = await crawlQueue.store.db.findOne({
-    where: {
-      [sequelize.Op.and]: where,
-    },
-  });
-
-  if (job) {
-    return job.job;
-  }
-
-  return null;
-}
-
-function convertJobToSnapshot({ job, snapshot }: { job: JobState; snapshot?: Partial<SnapshotModel> }) {
-  return {
-    // @ts-ignore
-    jobId: job.jobId || job.id,
-    url: job.url,
-    options: {
-      width: job.width,
-      height: job.height,
-      includeScreenshot: job.includeScreenshot,
-      includeHtml: job.includeHtml,
-      quality: job.quality,
-      fullPage: job.fullPage,
-    },
-    ...snapshot,
-  } as SnapshotModel;
-}
-
-export async function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>) {
-  let data = Object.assign({}, snapshot);
-
-  // format screenshot path to full url
-  if (data.screenshot) {
-    data.screenshot = joinURL(config.appUrl, data.screenshot);
-  }
-  // format html path to string
-  if (data.html) {
-    const html = await fs.readFile(path.join(config.dataDir, data.html));
-    data.html = html.toString();
-  }
-
-  if (columns?.length) {
-    data = pick(data, columns);
-  }
-
-  return data;
-}
-
-/**
- * get snapshot from db or crawl queue
- */
-export async function getSnapshot(jobId: string) {
-  const snapshotModel = await Snapshot.findByPk(jobId);
-
-  if (snapshotModel) {
-    return snapshotModel.toJSON();
-  }
-
-  const job = await getJob({ id: jobId });
-  if (job) {
-    return {
-      jobId,
-      status: 'pending',
-    } as SnapshotModel;
-  }
-
-  return null;
-}
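For context, here is a minimal usage sketch of the crawler API removed above, based only on the exports and signatures visible in this diff; the URL and option values are placeholders, and the surrounding async wrapper is assumed:

import { createCrawlJob, getSnapshot, formatSnapshot } from '@arcblock/crawler';

// Enqueue a crawl. createCrawlJob deduplicates on url + options and
// resolves to a job id immediately, before the crawl has actually run.
const jobId = await createCrawlJob({
  jobId: '', // JobState declares jobId as required; the queue assigns its own id internally
  url: 'https://example.com',
  includeScreenshot: true,
  includeHtml: true,
});

// Later, look up the result: a stored snapshot, a { status: 'pending' }
// marker while the job is still queued, or null for an unknown id.
const snapshot = await getSnapshot(jobId);
if (snapshot?.status === 'success') {
  // formatSnapshot inlines the stored html file and expands the
  // screenshot path to a full URL under config.appUrl.
  const data = await formatSnapshot(snapshot, ['url', 'html', 'screenshot']);
}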
package/src/db/index.ts
DELETED
@@ -1,27 +0,0 @@
-import { Sequelize } from '@sequelize/core';
-import { SqliteDialect } from '@sequelize/sqlite3';
-import path from 'path';
-
-import { config, logger } from '../config';
-import { initJobModel } from './job';
-import { initSnapshotModel } from './snapshot';
-
-export async function ensureDatabase() {
-  const sequelize = new Sequelize({
-    dialect: SqliteDialect,
-    storage: path.join(config.dataDir, 'snap-kit.db'),
-    logging: (msg) => logger.debug(msg),
-  });
-
-  await initSnapshotModel(sequelize);
-  await initJobModel(sequelize);
-
-  try {
-    await sequelize.authenticate();
-    await sequelize.sync();
-    logger.info('Successfully connected to database');
-  } catch (error) {
-    logger.error('Failed to connect to database:', error);
-    throw error;
-  }
-}
package/src/db/job.ts
DELETED
@@ -1,93 +0,0 @@
-import { DataTypes, Model, Sequelize } from '@sequelize/core';
-
-export interface JobState {
-  id?: string;
-  jobId: string;
-  url: string;
-  includeScreenshot?: boolean;
-  includeHtml?: boolean;
-  width?: number;
-  height?: number;
-  quality?: number;
-  timeout?: number;
-  fullPage?: boolean;
-}
-
-export interface JobModel {
-  id: string;
-  queue: string;
-  job: JobState;
-  retryCount: number;
-  willRunAt: number;
-  delay: number;
-  cancelled: boolean;
-}
-
-class Job extends Model<JobModel> implements JobModel {
-  public id!: JobModel['id'];
-
-  public queue!: JobModel['queue'];
-
-  public job!: JobModel['job'];
-
-  public retryCount!: JobModel['retryCount'];
-
-  public willRunAt!: JobModel['willRunAt'];
-
-  public delay!: JobModel['delay'];
-
-  public cancelled!: JobModel['cancelled'];
-}
-
-export { Job };
-
-export function initJobModel(sequelize: Sequelize) {
-  Job.init(
-    {
-      id: {
-        type: DataTypes.STRING(40),
-        primaryKey: true,
-      },
-      queue: {
-        type: DataTypes.STRING(32),
-        allowNull: false,
-      },
-      job: {
-        type: DataTypes.JSON,
-        allowNull: false,
-      },
-      retryCount: {
-        type: DataTypes.INTEGER,
-      },
-      delay: {
-        type: DataTypes.INTEGER,
-      },
-      willRunAt: {
-        type: DataTypes.INTEGER,
-      },
-      cancelled: {
-        type: DataTypes.BOOLEAN,
-        defaultValue: false,
-      },
-      createdAt: {
-        type: DataTypes.DATE,
-        defaultValue: DataTypes.NOW,
-        index: true,
-      },
-      updatedAt: {
-        type: DataTypes.DATE,
-        defaultValue: DataTypes.NOW,
-        index: true,
-      },
-    },
-    {
-      sequelize,
-      indexes: [{ fields: ['queue'] }],
-      modelName: 'job',
-      tableName: 'jobs',
-      timestamps: true,
-    },
-  );
-
-  return Job;
-}
package/src/db/snapshot.ts
DELETED
@@ -1,89 +0,0 @@
-import { DataTypes, Model, Sequelize } from '@sequelize/core';
-
-interface SnapshotModel {
-  jobId: string;
-  url: string;
-  status: 'success' | 'failed' | 'pending';
-  html?: string | null;
-  screenshot?: string | null;
-  error?: string;
-  lastModified?: string;
-  options?: {
-    width?: number;
-    height?: number;
-    includeScreenshot?: boolean;
-    includeHtml?: boolean;
-    quality?: number;
-    fullPage?: boolean;
-  };
-}
-
-class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
-  public jobId!: SnapshotModel['jobId'];
-
-  public url!: SnapshotModel['url'];
-
-  public status!: SnapshotModel['status'];
-
-  public html?: SnapshotModel['html'];
-
-  public screenshot?: SnapshotModel['screenshot'];
-
-  public error?: SnapshotModel['error'];
-
-  public lastModified?: SnapshotModel['lastModified'];
-
-  public options!: SnapshotModel['options'];
-}
-
-export { Snapshot };
-export type { SnapshotModel };
-
-export function initSnapshotModel(sequelize: Sequelize) {
-  Snapshot.init(
-    {
-      jobId: {
-        type: DataTypes.STRING,
-        primaryKey: true,
-        allowNull: false,
-      },
-      url: {
-        type: DataTypes.STRING,
-        allowNull: false,
-        index: true,
-      },
-      status: {
-        type: DataTypes.ENUM('success', 'failed'),
-        allowNull: false,
-      },
-      html: {
-        type: DataTypes.TEXT,
-        allowNull: true,
-      },
-      screenshot: {
-        type: DataTypes.STRING,
-        allowNull: true,
-      },
-      error: {
-        type: DataTypes.STRING,
-        allowNull: true,
-      },
-      lastModified: {
-        type: DataTypes.STRING,
-        allowNull: true,
-      },
-      options: {
-        type: DataTypes.JSON,
-        allowNull: true,
-      },
-    },
-    {
-      sequelize,
-      modelName: 'snapshot',
-      tableName: 'snap',
-      timestamps: true,
-    },
-  );
-
-  return Snapshot;
-}
package/src/index.ts
DELETED
@@ -1,19 +0,0 @@
-import { config, logger } from './config';
-import { createCrawlQueue } from './crawler';
-import { ensureDatabase } from './db';
-import { ensureBrowser } from './puppeteer';
-
-export * from './blocklet';
-export * from './crawler';
-export * from './middleware';
-export { Snapshot } from './db/snapshot';
-
-export async function initCrawler(_config: Partial<typeof config>) {
-  Object.assign(config, _config);
-
-  logger.debug('init crawler', config);
-
-  await ensureDatabase();
-  await createCrawlQueue();
-  await ensureBrowser();
-}
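For context, a sketch of how the removed initCrawler entry point would be called at startup; the config values shown are placeholders, not package defaults:

import { initCrawler } from '@arcblock/crawler';

// Merges the partial config into the module-level config, then boots the
// SQLite-backed database and crawl queue and launches the shared browser.
await initCrawler({
  dataDir: '/var/lib/snap-kit', // where snap-kit.db, html files, and screenshots are stored
  appUrl: 'https://example.com', // base URL used to expand stored screenshot paths
});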
package/src/middleware.ts
DELETED
@@ -1,46 +0,0 @@
-import { useCache } from './cache';
-import { getFullUrl, isAcceptCrawler, isBotUserAgent, isSelfCrawler } from './utils';
-
-export function initSEOMiddleware({
-  autoReturnHtml = true,
-  allowCrawler = true,
-}: {
-  autoReturnHtml?: Boolean;
-  allowCrawler?: Boolean | Function;
-}) {
-  return async (req: any, res: any, next: Function) => {
-    const isBot = isBotUserAgent(req);
-    const isSelf = isSelfCrawler(req);
-
-    if (!isBot || isSelf) {
-      return next();
-    }
-
-    const fullUrl = getFullUrl(req);
-    const canCrawl = await isAcceptCrawler(fullUrl);
-    const allowCrawlerResult = typeof allowCrawler === 'function' ? allowCrawler(req) : allowCrawler;
-
-    // can not crawl, skip
-    if (!canCrawl || !allowCrawlerResult) {
-      return next();
-    }
-
-    const cacheData = await useCache.get(fullUrl);
-
-    // add cached html to req
-    req.cachedHtml = cacheData?.content || cacheData || null;
-    // add cached lastModified to req, ISO string to GMT string
-    req.cachedLastmod = cacheData?.lastModified ? new Date(cacheData?.lastModified).toUTCString() : null;
-
-    if (req.cachedLastmod) {
-      res.setHeader('Last-Modified', req.cachedLastmod);
-    }
-
-    if (autoReturnHtml && req.cachedHtml) {
-      res.send(req.cachedHtml);
-      return;
-    }
-    // missing cache
-    next();
-  };
-}
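And a sketch of mounting the removed SEO middleware in an Express app; the app itself is an assumption, only initSEOMiddleware comes from this package:

import express from 'express';
import { initSEOMiddleware } from '@arcblock/crawler';

const app = express();

// For bot user agents (excluding the package's own crawler), serve cached
// pre-rendered HTML and a Last-Modified header when a snapshot exists;
// otherwise fall through to the normal handler chain.
app.use(
  initSEOMiddleware({
    autoReturnHtml: true,
    // allowCrawler also accepts a predicate, e.g. (req) => !req.path.startsWith('/admin')
    allowCrawler: true,
  }),
);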