@arcblock/crawler 1.0.0 → 1.0.2

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (58)
  1. package/dist/blocklet.d.ts +6 -0
  2. package/dist/blocklet.js +199 -0
  3. package/dist/cache.d.ts +10 -0
  4. package/dist/cache.js +119 -0
  5. package/dist/config.d.ts +10 -0
  6. package/dist/config.js +17 -0
  7. package/dist/crawler.d.ts +28 -0
  8. package/dist/crawler.js +314 -0
  9. package/dist/db/index.d.ts +1 -0
  10. package/dist/db/index.js +41 -0
  11. package/dist/db/job.d.ts +33 -0
  12. package/dist/db/job.js +54 -0
  13. package/dist/db/snapshot.d.ts +31 -0
  14. package/dist/db/snapshot.js +52 -0
  15. package/dist/index.d.ts +6 -0
  16. package/dist/index.js +45 -0
  17. package/dist/middleware.d.ts +4 -0
  18. package/dist/middleware.js +44 -0
  19. package/dist/puppeteer.d.ts +16 -0
  20. package/dist/puppeteer.js +318 -0
  21. package/dist/utils.d.ts +15 -0
  22. package/dist/utils.js +239 -0
  23. package/esm/blocklet.d.ts +6 -0
  24. package/esm/blocklet.js +190 -0
  25. package/esm/cache.d.ts +10 -0
  26. package/esm/cache.js +114 -0
  27. package/esm/config.d.ts +10 -0
  28. package/esm/config.js +11 -0
  29. package/esm/crawler.d.ts +28 -0
  30. package/esm/crawler.js +301 -0
  31. package/esm/db/index.d.ts +1 -0
  32. package/esm/db/index.js +35 -0
  33. package/esm/db/job.d.ts +33 -0
  34. package/esm/db/job.js +50 -0
  35. package/esm/db/snapshot.d.ts +31 -0
  36. package/esm/db/snapshot.js +48 -0
  37. package/esm/index.d.ts +6 -0
  38. package/esm/index.js +26 -0
  39. package/esm/middleware.d.ts +4 -0
  40. package/esm/middleware.js +41 -0
  41. package/esm/puppeteer.d.ts +16 -0
  42. package/esm/puppeteer.js +272 -0
  43. package/esm/utils.d.ts +15 -0
  44. package/esm/utils.js +220 -0
  45. package/package.json +10 -3
  46. package/src/blocklet.ts +0 -223
  47. package/src/cache.ts +0 -117
  48. package/src/config.ts +0 -13
  49. package/src/crawler.ts +0 -364
  50. package/src/db/index.ts +0 -27
  51. package/src/db/job.ts +0 -93
  52. package/src/db/snapshot.ts +0 -89
  53. package/src/index.ts +0 -19
  54. package/src/middleware.ts +0 -46
  55. package/src/puppeteer.ts +0 -296
  56. package/src/utils.ts +0 -240
  57. package/third.d.ts +0 -1
  58. package/tsconfig.json +0 -9
package/src/crawler.ts DELETED
@@ -1,364 +0,0 @@
- import createQueue from '@abtnode/queue';
- import SequelizeStore from '@abtnode/queue/lib/store/sequelize';
- import sequelize from '@sequelize/core';
- import { randomUUID } from 'crypto';
- import fs from 'fs-extra';
- import pick from 'lodash/pick';
- import path from 'path';
- import { joinURL } from 'ufo';
-
- import { config, logger } from './config';
- import { Job, JobState } from './db/job';
- import { Snapshot, SnapshotModel } from './db/snapshot';
- import { initPage } from './puppeteer';
- import { formatUrl, isAcceptCrawler, md5 } from './utils';
-
- const { BaseState } = require('@abtnode/models');
-
- let crawlQueue;
-
- export function createCrawlQueue() {
-   const db = new BaseState(Job);
-
-   crawlQueue = createQueue({
-     store: new SequelizeStore(db, 'crawler'),
-     concurrency: 1,
-     onJob: async (job: JobState) => {
-       logger.debug('job start:', job);
-
-       const canCrawl = await isAcceptCrawler(job.url);
-       if (!canCrawl) {
-         logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
-         const snapshot = convertJobToSnapshot({
-           job,
-           snapshot: {
-             status: 'failed',
-             error: 'Denied by robots.txt',
-           },
-         });
-         await Snapshot.upsert(snapshot);
-         return snapshot;
-       }
-
-       // if index reaches autoCloseBrowserCount, close the browser
-       // try {
-       //   if (index >= autoCloseBrowserCount) {
-       //     await closeBrowser({ trimCache: false });
-       //   }
-       // } catch (error) {
-       //   logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
-       // }
-
-       try {
-         // get page content
-         const result = await getPageContent(job);
-
-         if (!result || (!result.html && !result.screenshot)) {
-           logger.error(`failed to crawl ${job.url}, empty content`, job);
-
-           const snapshot = convertJobToSnapshot({
-             job,
-             snapshot: {
-               status: 'failed',
-               error: 'Failed to crawl content',
-             },
-           });
-           await Snapshot.upsert(snapshot);
-           return snapshot;
-         }
-
-         // save html and screenshot to data dir
-         const { screenshotPath, htmlPath } = await saveSnapshotToLocal({
-           screenshot: result.screenshot,
-           html: result.html,
-         });
-         // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
-
-         const snapshot = convertJobToSnapshot({
-           job,
-           snapshot: {
-             status: 'success',
-             screenshot: screenshotPath?.replace(config.dataDir, ''),
-             html: htmlPath?.replace(config.dataDir, ''),
-           },
-         });
-         await Snapshot.upsert(snapshot);
-         return snapshot;
-
-         // save to redis
-         // if (saveToRedis) {
-         //   useCache.set(url, {
-         //     html: result.html || '',
-         //     lastModified,
-         //   });
-
-         //   logger.info(`success to crawl ${url}`, job);
-         //   return result;
-         // }
-       } catch (error) {
-         logger.error(`Failed to crawl ${job.url}`, { error, job });
-         console.error(error.stack);
-
-         const snapshot = convertJobToSnapshot({
-           job,
-           snapshot: {
-             status: 'failed',
-             error: 'Internal error',
-           },
-         });
-         await Snapshot.upsert(snapshot);
-         return snapshot;
-       }
-     },
-   });
- }
-
- export async function getDataDir() {
-   const htmlDir = path.join(config.dataDir, 'data', 'html');
-   const screenshotDir = path.join(config.dataDir, 'data', 'screenshot');
-
-   await fs.ensureDir(htmlDir);
-   await fs.ensureDir(screenshotDir);
-
-   return { htmlDir, screenshotDir };
- }
-
- async function saveSnapshotToLocal({ screenshot, html }: { screenshot?: Uint8Array | null; html?: string | null }) {
-   const { htmlDir, screenshotDir } = await getDataDir();
-
-   let screenshotPath: string | null = null;
-   let htmlPath: string | null = null;
-
-   if (screenshot) {
-     const hash = md5(screenshot);
-     screenshotPath = path.join(screenshotDir, `${hash}.webp`);
-
-     logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
-
-     await fs.writeFile(screenshotPath, screenshot);
-   }
-   if (html) {
-     const hash = md5(html);
-     htmlPath = path.join(htmlDir, `${hash}.html`);
-
-     logger.debug('saveSnapshotToLocal.html', { htmlPath });
-
-     await fs.writeFile(htmlPath, html);
-   }
-
-   return {
-     screenshotPath,
-     htmlPath,
-   };
- }
-
- function formatHtml(htmlString: string) {
-   if (htmlString.includes('<h2>Unexpected Application Error!</h2>')) {
-     return '';
-   }
-   return htmlString;
- }
-
- export const getPageContent = async ({
-   url,
-   formatPageContent,
-   includeScreenshot = true,
-   includeHtml = true,
-   width = 1440,
-   height = 900,
-   quality = 80,
-   timeout = 60 * 1000,
-   fullPage = false,
- }: {
-   url: string;
-   formatPageContent?: Function;
-   includeScreenshot?: boolean;
-   includeHtml?: boolean;
-   width?: number;
-   height?: number;
-   quality?: number;
-   timeout?: number;
-   fullPage?: boolean;
- }) => {
-   logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
-
-   const page = await initPage();
-
-   if (width && height) {
-     await page.setViewport({ width, height });
-   }
-
-   let html: string | null = null;
-   let screenshot: Uint8Array | null = null;
-
-   try {
-     const response = await page.goto(url, { timeout });
-
-     if (!response) {
-       throw new Error(`Failed to load page: response is null for ${url}`);
-     }
-
-     const statusCode = response.status();
-
-     logger.debug('getPageContent.response', { response, statusCode });
-
-     if (![200, 304].includes(statusCode)) {
-       throw new Error(`Request failed with status ${statusCode}, in ${url}`);
-     }
-
-     // wait for network idle
-     // https://pptr.dev/api/puppeteer.page.goforward/#remarks
-     await page.waitForNetworkIdle({
-       idleTime: 2 * 1000,
-     });
-
-     // get screenshot
-     if (includeScreenshot) {
-       try {
-         screenshot = await page.screenshot({ fullPage, quality, type: 'webp' });
-       } catch (err) {
-         logger.error('Failed to get screenshot:', err);
-       }
-     }
-
-     // get html
-     if (includeHtml) {
-       if (formatPageContent) {
-         html = await formatPageContent({ page, url });
-       } else {
-         html = await page.content();
-       }
-     }
-   } catch (error) {
-     logger.error('Failed to get page content:', error);
-     throw error;
-   } finally {
-     await page.close();
-   }
-
-   html = formatHtml(html || '');
-
-   return {
-     html,
-     screenshot,
-   };
- };
-
- export async function createCrawlJob(params: JobState, callback?: (snapshot: SnapshotModel | null) => void) {
-   params = {
-     ...params,
-     url: formatUrl(params.url),
-   };
-
-   // skip duplicate job
-   const existsJob = await getJob({
-     url: params.url,
-     includeScreenshot: params.includeScreenshot,
-     includeHtml: params.includeHtml,
-     quality: params.quality,
-     width: params.width,
-     height: params.height,
-     fullPage: params.fullPage,
-   });
-
-   logger.info('create crawl job', params);
-
-   if (existsJob) {
-     logger.warn(`Crawl job already exists for ${params.url}, skip`);
-     return existsJob.id;
-   }
-
-   const jobId = randomUUID();
-   const job = crawlQueue.push({ ...params, id: jobId });
-
-   job.on('finished', ({ result }) => {
-     logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
-     callback?.(result);
-   });
-
-   job.on('failed', ({ error }) => {
-     logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
-     callback?.(null);
-   });
-
-   return jobId;
- }
-
- // @ts-ignore
- export async function getJob(condition: Partial<JobState>) {
-   const where = Object.keys(condition)
-     .filter((key) => condition[key] !== undefined)
-     .map((key) => {
-       return sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), `$.${key}`), condition[key]);
-     });
-
-   const job = await crawlQueue.store.db.findOne({
-     where: {
-       [sequelize.Op.and]: where,
-     },
-   });
-
-   if (job) {
-     return job.job;
-   }
-
-   return null;
- }
-
- function convertJobToSnapshot({ job, snapshot }: { job: JobState; snapshot?: Partial<SnapshotModel> }) {
-   return {
-     // @ts-ignore
-     jobId: job.jobId || job.id,
-     url: job.url,
-     options: {
-       width: job.width,
-       height: job.height,
-       includeScreenshot: job.includeScreenshot,
-       includeHtml: job.includeHtml,
-       quality: job.quality,
-       fullPage: job.fullPage,
-     },
-     ...snapshot,
-   } as SnapshotModel;
- }
-
- export async function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>) {
-   let data = Object.assign({}, snapshot);
-
-   // format screenshot path to full url
-   if (data.screenshot) {
-     data.screenshot = joinURL(config.appUrl, data.screenshot);
-   }
-   // format html path to string
-   if (data.html) {
-     const html = await fs.readFile(path.join(config.dataDir, data.html));
-     data.html = html.toString();
-   }
-
-   if (columns?.length) {
-     data = pick(data, columns);
-   }
-
-   return data;
- }
-
- /**
-  * get snapshot from db or crawl queue
-  */
- export async function getSnapshot(jobId: string) {
-   const snapshotModel = await Snapshot.findByPk(jobId);
-
-   if (snapshotModel) {
-     return snapshotModel.toJSON();
-   }
-
-   const job = await getJob({ id: jobId });
-   if (job) {
-     return {
-       jobId,
-       status: 'pending',
-     } as SnapshotModel;
-   }
-
-   return null;
- }
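
Taken together, the deleted crawler module exposed a small queue-backed API: `createCrawlJob` enqueues a URL (deduplicating against pending jobs) and `getSnapshot` reads the result back. A minimal usage sketch, assuming `initCrawler` has already been called (see src/index.ts below); the wrapper function and option values here are illustrative, not part of the package:

```ts
import { createCrawlJob, getSnapshot } from '@arcblock/crawler';

async function crawl(url: string) {
  // Queue the crawl; a duplicate job with identical options is skipped
  // and the existing job id is returned instead.
  const jobId = await createCrawlJob(
    {
      jobId: '', // the queue assigns its own id via crawlQueue.push({ ...params, id: jobId })
      url,
      includeScreenshot: true,
      includeHtml: true,
      width: 1440,
      height: 900,
    },
    (snapshot) => {
      // null on failure, the upserted SnapshotModel on success
      console.log('crawl finished:', snapshot?.status);
    },
  );

  // Or poll: getSnapshot reads the snapshot table first, then falls back
  // to the queue and reports 'pending' while the job is still waiting.
  return getSnapshot(jobId);
}
```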
package/src/db/index.ts DELETED
@@ -1,27 +0,0 @@
- import { Sequelize } from '@sequelize/core';
- import { SqliteDialect } from '@sequelize/sqlite3';
- import path from 'path';
-
- import { config, logger } from '../config';
- import { initJobModel } from './job';
- import { initSnapshotModel } from './snapshot';
-
- export async function ensureDatabase() {
-   const sequelize = new Sequelize({
-     dialect: SqliteDialect,
-     storage: path.join(config.dataDir, 'snap-kit.db'),
-     logging: (msg) => logger.debug(msg),
-   });
-
-   await initSnapshotModel(sequelize);
-   await initJobModel(sequelize);
-
-   try {
-     await sequelize.authenticate();
-     await sequelize.sync();
-     logger.info('Successfully connected to database');
-   } catch (error) {
-     logger.error('Failed to connect to database:', error);
-     throw error;
-   }
- }
package/src/db/job.ts DELETED
@@ -1,93 +0,0 @@
- import { DataTypes, Model, Sequelize } from '@sequelize/core';
-
- export interface JobState {
-   id?: string;
-   jobId: string;
-   url: string;
-   includeScreenshot?: boolean;
-   includeHtml?: boolean;
-   width?: number;
-   height?: number;
-   quality?: number;
-   timeout?: number;
-   fullPage?: boolean;
- }
-
- export interface JobModel {
-   id: string;
-   queue: string;
-   job: JobState;
-   retryCount: number;
-   willRunAt: number;
-   delay: number;
-   cancelled: boolean;
- }
-
- class Job extends Model<JobModel> implements JobModel {
-   public id!: JobModel['id'];
-
-   public queue!: JobModel['queue'];
-
-   public job!: JobModel['job'];
-
-   public retryCount!: JobModel['retryCount'];
-
-   public willRunAt!: JobModel['willRunAt'];
-
-   public delay!: JobModel['delay'];
-
-   public cancelled!: JobModel['cancelled'];
- }
-
- export { Job };
-
- export function initJobModel(sequelize: Sequelize) {
-   Job.init(
-     {
-       id: {
-         type: DataTypes.STRING(40),
-         primaryKey: true,
-       },
-       queue: {
-         type: DataTypes.STRING(32),
-         allowNull: false,
-       },
-       job: {
-         type: DataTypes.JSON,
-         allowNull: false,
-       },
-       retryCount: {
-         type: DataTypes.INTEGER,
-       },
-       delay: {
-         type: DataTypes.INTEGER,
-       },
-       willRunAt: {
-         type: DataTypes.INTEGER,
-       },
-       cancelled: {
-         type: DataTypes.BOOLEAN,
-         defaultValue: false,
-       },
-       createdAt: {
-         type: DataTypes.DATE,
-         defaultValue: DataTypes.NOW,
-         index: true,
-       },
-       updatedAt: {
-         type: DataTypes.DATE,
-         defaultValue: DataTypes.NOW,
-         index: true,
-       },
-     },
-     {
-       sequelize,
-       indexes: [{ fields: ['queue'] }],
-       modelName: 'job',
-       tableName: 'jobs',
-       timestamps: true,
-     },
-   );
-
-   return Job;
- }
package/src/db/snapshot.ts DELETED
@@ -1,89 +0,0 @@
- import { DataTypes, Model, Sequelize } from '@sequelize/core';
-
- interface SnapshotModel {
-   jobId: string;
-   url: string;
-   status: 'success' | 'failed' | 'pending';
-   html?: string | null;
-   screenshot?: string | null;
-   error?: string;
-   lastModified?: string;
-   options?: {
-     width?: number;
-     height?: number;
-     includeScreenshot?: boolean;
-     includeHtml?: boolean;
-     quality?: number;
-     fullPage?: boolean;
-   };
- }
-
- class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
-   public jobId!: SnapshotModel['jobId'];
-
-   public url!: SnapshotModel['url'];
-
-   public status!: SnapshotModel['status'];
-
-   public html?: SnapshotModel['html'];
-
-   public screenshot?: SnapshotModel['screenshot'];
-
-   public error?: SnapshotModel['error'];
-
-   public lastModified?: SnapshotModel['lastModified'];
-
-   public options!: SnapshotModel['options'];
- }
-
- export { Snapshot };
- export type { SnapshotModel };
-
- export function initSnapshotModel(sequelize: Sequelize) {
-   Snapshot.init(
-     {
-       jobId: {
-         type: DataTypes.STRING,
-         primaryKey: true,
-         allowNull: false,
-       },
-       url: {
-         type: DataTypes.STRING,
-         allowNull: false,
-         index: true,
-       },
-       status: {
-         type: DataTypes.ENUM('success', 'failed'),
-         allowNull: false,
-       },
-       html: {
-         type: DataTypes.TEXT,
-         allowNull: true,
-       },
-       screenshot: {
-         type: DataTypes.STRING,
-         allowNull: true,
-       },
-       error: {
-         type: DataTypes.STRING,
-         allowNull: true,
-       },
-       lastModified: {
-         type: DataTypes.STRING,
-         allowNull: true,
-       },
-       options: {
-         type: DataTypes.JSON,
-         allowNull: true,
-       },
-     },
-     {
-       sequelize,
-       modelName: 'snapshot',
-       tableName: 'snap',
-       timestamps: true,
-     },
-   );
-
-   return Snapshot;
- }
package/src/index.ts DELETED
@@ -1,19 +0,0 @@
- import { config, logger } from './config';
- import { createCrawlQueue } from './crawler';
- import { ensureDatabase } from './db';
- import { ensureBrowser } from './puppeteer';
-
- export * from './blocklet';
- export * from './crawler';
- export * from './middleware';
- export { Snapshot } from './db/snapshot';
-
- export async function initCrawler(_config: Partial<typeof config>) {
-   Object.assign(config, _config);
-
-   logger.debug('init crawler', config);
-
-   await ensureDatabase();
-   await createCrawlQueue();
-   await ensureBrowser();
- }
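
The entry point wires everything together in one call. A plausible bootstrap sketch; `dataDir` and `appUrl` are both referenced by the deleted sources, but the full shape of `config` lives in src/config.ts, so treat the exact fields as assumptions:

```ts
import { initCrawler } from '@arcblock/crawler';

// Merge overrides into the module-level config, open the SQLite store
// (snap-kit.db), create the crawl queue, and launch the shared browser.
await initCrawler({
  dataDir: '/var/lib/snap-kit', // root for snap-kit.db plus data/html and data/screenshot
  appUrl: 'https://example.com', // base used by formatSnapshot to build absolute screenshot URLs
});
```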
package/src/middleware.ts DELETED
@@ -1,46 +0,0 @@
- import { useCache } from './cache';
- import { getFullUrl, isAcceptCrawler, isBotUserAgent, isSelfCrawler } from './utils';
-
- export function initSEOMiddleware({
-   autoReturnHtml = true,
-   allowCrawler = true,
- }: {
-   autoReturnHtml?: Boolean;
-   allowCrawler?: Boolean | Function;
- }) {
-   return async (req: any, res: any, next: Function) => {
-     const isBot = isBotUserAgent(req);
-     const isSelf = isSelfCrawler(req);
-
-     if (!isBot || isSelf) {
-       return next();
-     }
-
-     const fullUrl = getFullUrl(req);
-     const canCrawl = await isAcceptCrawler(fullUrl);
-     const allowCrawlerResult = typeof allowCrawler === 'function' ? allowCrawler(req) : allowCrawler;
-
-     // cannot crawl, skip
-     if (!canCrawl || !allowCrawlerResult) {
-       return next();
-     }
-
-     const cacheData = await useCache.get(fullUrl);
-
-     // add cached html to req
-     req.cachedHtml = cacheData?.content || cacheData || null;
-     // add cached lastModified to req (ISO string converted to GMT string)
-     req.cachedLastmod = cacheData?.lastModified ? new Date(cacheData?.lastModified).toUTCString() : null;
-
-     if (req.cachedLastmod) {
-       res.setHeader('Last-Modified', req.cachedLastmod);
-     }
-
-     if (autoReturnHtml && req.cachedHtml) {
-       res.send(req.cachedHtml);
-       return;
-     }
-     // cache miss, fall through
-     next();
-   };
- }
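
Finally, a sketch of mounting the SEO middleware in an Express app. The Express wiring and the route check are assumptions; the option names match `initSEOMiddleware` above:

```ts
import express from 'express';
import { initSEOMiddleware } from '@arcblock/crawler';

const app = express();

// Bot user agents get the cached snapshot HTML (plus a Last-Modified
// header); ordinary visitors and the crawler's own requests fall through.
app.use(
  initSEOMiddleware({
    autoReturnHtml: true,
    // allowCrawler may also be a plain boolean; a function is invoked per request
    allowCrawler: (req: express.Request) => !req.path.startsWith('/admin'),
  }),
);
```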