@arcblock/crawler 1.1.5 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/README.md +1 -2
  2. package/lib/cjs/config.d.ts +9 -3
  3. package/lib/cjs/config.js +2 -10
  4. package/lib/cjs/crawler.d.ts +3 -4
  5. package/lib/cjs/crawler.js +74 -48
  6. package/lib/cjs/cron.js +5 -0
  7. package/lib/cjs/index.d.ts +2 -4
  8. package/lib/cjs/index.js +6 -6
  9. package/lib/cjs/services/snapshot.d.ts +5 -2
  10. package/lib/cjs/services/snapshot.js +44 -7
  11. package/lib/cjs/site.d.ts +1 -1
  12. package/lib/cjs/site.js +11 -4
  13. package/lib/cjs/store/index.d.ts +4 -1
  14. package/lib/cjs/store/index.js +37 -45
  15. package/lib/cjs/store/job.d.ts +6 -1
  16. package/lib/cjs/store/migrate.d.ts +4 -0
  17. package/lib/cjs/store/migrate.js +63 -0
  18. package/lib/cjs/store/migrations/20250615-genesis.d.ts +6 -0
  19. package/lib/cjs/store/migrations/20250615-genesis.js +114 -0
  20. package/lib/cjs/store/migrations/20250616-replace.d.ts +6 -0
  21. package/lib/cjs/store/migrations/20250616-replace.js +40 -0
  22. package/lib/cjs/store/snapshot.d.ts +8 -0
  23. package/lib/cjs/store/snapshot.js +7 -0
  24. package/lib/esm/config.d.ts +9 -3
  25. package/lib/esm/config.js +2 -10
  26. package/lib/esm/crawler.d.ts +3 -4
  27. package/lib/esm/crawler.js +71 -45
  28. package/lib/esm/cron.js +5 -0
  29. package/lib/esm/index.d.ts +2 -4
  30. package/lib/esm/index.js +4 -5
  31. package/lib/esm/services/snapshot.d.ts +5 -2
  32. package/lib/esm/services/snapshot.js +41 -5
  33. package/lib/esm/site.d.ts +1 -1
  34. package/lib/esm/site.js +11 -4
  35. package/lib/esm/store/index.d.ts +4 -1
  36. package/lib/esm/store/index.js +23 -45
  37. package/lib/esm/store/job.d.ts +6 -1
  38. package/lib/esm/store/migrate.d.ts +4 -0
  39. package/lib/esm/store/migrate.js +26 -0
  40. package/lib/esm/store/migrations/20250615-genesis.d.ts +6 -0
  41. package/lib/esm/store/migrations/20250615-genesis.js +110 -0
  42. package/lib/esm/store/migrations/20250616-replace.d.ts +6 -0
  43. package/lib/esm/store/migrations/20250616-replace.js +36 -0
  44. package/lib/esm/store/snapshot.d.ts +8 -0
  45. package/lib/esm/store/snapshot.js +7 -0
  46. package/package.json +3 -2
package/lib/esm/services/snapshot.js CHANGED
@@ -7,16 +7,16 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
7
7
  step((generator = generator.apply(thisArg, _arguments || [])).next());
8
8
  });
9
9
  };
10
+ import cloneDeep from 'lodash/cloneDeep';
10
11
  import pick from 'lodash/pick';
11
12
  import fs from 'node:fs/promises';
12
13
  import path from 'node:path';
13
14
  import { joinURL } from 'ufo';
14
- import { config } from '../config';
15
- import { Job } from '../store/job';
16
- import { Snapshot } from '../store/snapshot';
15
+ import { config, logger } from '../config';
16
+ import { Job, Snapshot } from '../store';
17
17
  import { formatUrl } from '../utils';
18
18
  export function convertJobToSnapshot({ job, snapshot }) {
19
- return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), options: {
19
+ return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), replace: job.replace, options: {
20
20
  width: job.width,
21
21
  height: job.height,
22
22
  includeScreenshot: job.includeScreenshot,
@@ -27,7 +27,7 @@ export function convertJobToSnapshot({ job, snapshot }) {
27
27
  }
28
28
  export function formatSnapshot(snapshot, columns) {
29
29
  return __awaiter(this, void 0, void 0, function* () {
30
- let data = Object.assign({}, snapshot);
30
+ let data = cloneDeep(snapshot);
31
31
  // format screenshot path to full url
32
32
  if (data.screenshot) {
33
33
  data.screenshot = joinURL(config.appUrl, data.screenshot);
@@ -37,6 +37,12 @@ export function formatSnapshot(snapshot, columns) {
37
37
  const html = yield fs.readFile(path.join(config.dataDir, data.html));
38
38
  data.html = html.toString();
39
39
  }
40
+ // remove sensitive options that should not be returned
41
+ if (data.options) {
42
+ delete data.options.cookies;
43
+ delete data.options.localStorage;
44
+ delete data.options.headers;
45
+ }
40
46
  if (columns === null || columns === void 0 ? void 0 : columns.length) {
41
47
  data = pick(data, columns);
42
48
  }
@@ -69,7 +75,37 @@ export function getLatestSnapshot(url) {
69
75
  url: formatUrl(url),
70
76
  status: 'success',
71
77
  },
78
+ order: [
79
+ ['lastModified', 'DESC'],
80
+ ['updatedAt', 'DESC'],
81
+ ],
72
82
  });
73
83
  return snapshot ? formatSnapshot(snapshot) : null;
74
84
  });
75
85
  }
86
+ export function deleteSnapshots(where_1) {
87
+ return __awaiter(this, arguments, void 0, function* (where, { txn } = {}) {
88
+ const snapshots = yield Snapshot.findAll({
89
+ where,
90
+ order: [
91
+ ['lastModified', 'DESC'],
92
+ ['updatedAt', 'DESC'],
93
+ ],
94
+ });
95
+ const jobIds = yield Promise.all(snapshots.map((snapshot) => __awaiter(this, void 0, void 0, function* () {
96
+ try {
97
+ yield Promise.all([
98
+ snapshot.html && fs.unlink(path.join(config.dataDir, snapshot.html)),
99
+ snapshot.screenshot && fs.unlink(path.join(config.dataDir, snapshot.screenshot)),
100
+ ]);
101
+ yield snapshot.destroy({ transaction: txn });
102
+ return snapshot.jobId;
103
+ }
104
+ catch (error) {
105
+ logger.error('Failed to delete snapshot', { error, snapshot });
106
+ throw error;
107
+ }
108
+ })));
109
+ return jobIds.filter(Boolean);
110
+ });
111
+ }
package/lib/esm/site.d.ts CHANGED
@@ -1,2 +1,2 @@
1
1
  import { Site } from './config';
2
- export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
2
+ export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(`${string}-${string}-${string}-${string}-${string}` | null)[]>;
package/lib/esm/site.js CHANGED
@@ -8,12 +8,14 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
8
8
  });
9
9
  };
10
10
  import uniq from 'lodash/uniq';
11
+ import { randomUUID } from 'node:crypto';
11
12
  import pMap from 'p-map';
12
13
  import { config, logger } from './config';
13
- import { crawlUrl } from './crawler';
14
- import { Snapshot } from './store/snapshot';
14
+ import { createCrawlQueue } from './crawler';
15
+ import { Snapshot } from './store';
15
16
  import { formatUrl, getSitemapList } from './utils';
16
17
  const crawlBlockletRunningMap = new Map();
18
+ const crawlQueue = createCrawlQueue('cronJobs');
17
19
  function parseSitemapUrl(sitemapItem) {
18
20
  var _a;
19
21
  const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
@@ -21,6 +23,7 @@ function parseSitemapUrl(sitemapItem) {
21
23
  return urls.map((url) => ({ url, sitemapItem }));
22
24
  }
23
25
  export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
26
+ var _b;
24
27
  logger.info(`Start crawl from sitemap ${url}`, { pathname });
25
28
  const key = `${url}-${pathname}`;
26
29
  if (crawlBlockletRunningMap.has(key)) {
@@ -60,13 +63,17 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
60
63
  url,
61
64
  });
62
65
  crawlCount++;
63
- return crawlUrl({
66
+ const jobId = randomUUID();
67
+ crawlQueue.push({
68
+ id: jobId,
64
69
  url,
65
70
  lastModified: sitemapItem.lastmod,
66
71
  includeScreenshot: false,
67
72
  includeHtml: true,
73
+ replace: true,
68
74
  });
69
- }), { concurrency: config.siteCron.sitemapConcurrency });
75
+ return jobId;
76
+ }), { concurrency: ((_b = config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
70
77
  logger.info('Enqueued jobs from sitemap finished', {
71
78
  url,
72
79
  pathname,
package/lib/esm/store/index.d.ts CHANGED
@@ -1,3 +1,6 @@
1
1
  import { Sequelize } from '@sequelize/core';
2
2
  import { SqliteDialect } from '@sequelize/sqlite3';
3
- export declare function initDatabase(): Promise<Sequelize<SqliteDialect>>;
3
+ declare const sequelize: Sequelize<SqliteDialect>;
4
+ export { sequelize };
5
+ export * from './job';
6
+ export * from './snapshot';
package/lib/esm/store/index.js CHANGED
@@ -1,51 +1,29 @@
1
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
- return new (P || (P = Promise))(function (resolve, reject) {
4
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
- step((generator = generator.apply(thisArg, _arguments || [])).next());
8
- });
9
- };
10
1
  import { Sequelize } from '@sequelize/core';
11
2
  import { SqliteDialect } from '@sequelize/sqlite3';
12
3
  import path from 'path';
13
4
  import { config, logger } from '../config';
14
5
  import { Job } from './job';
15
6
  import { Snapshot } from './snapshot';
16
- export function initDatabase() {
17
- return __awaiter(this, void 0, void 0, function* () {
18
- const sequelize = new Sequelize({
19
- dialect: SqliteDialect,
20
- storage: path.join(config.dataDir, 'snap-kit.db'),
21
- logging: (msg) => process.env.SQLITE_LOG && logger.debug(msg),
22
- pool: {
23
- min: 0,
24
- max: 10,
25
- idle: 10000,
26
- },
27
- retry: {
28
- match: [/SQLITE_BUSY/],
29
- name: 'query',
30
- max: 10,
31
- },
32
- });
33
- Job.initModel(sequelize);
34
- Snapshot.initModel(sequelize);
35
- try {
36
- yield Promise.all([
37
- sequelize.query('pragma journal_mode = WAL;'),
38
- sequelize.query('pragma synchronous = normal;'),
39
- sequelize.query('pragma journal_size_limit = 67108864;'),
40
- ]);
41
- yield sequelize.authenticate();
42
- yield sequelize.sync({ alter: process.env.ALTER_SQLITE === 'true' });
43
- logger.info('Successfully connected to database');
44
- }
45
- catch (error) {
46
- logger.error('Failed to connect to database:', error);
47
- throw error;
48
- }
49
- return sequelize;
50
- });
51
- }
7
+ const sequelize = new Sequelize({
8
+ dialect: SqliteDialect,
9
+ storage: path.join(config.dataDir, 'snap-kit.db'),
10
+ logging: (msg) => process.env.SQLITE_LOG && logger.debug(msg),
11
+ pool: {
12
+ min: 0,
13
+ max: 10,
14
+ idle: 10000,
15
+ },
16
+ retry: {
17
+ match: [/SQLITE_BUSY/],
18
+ name: 'query',
19
+ max: 10,
20
+ },
21
+ });
22
+ sequelize.query('pragma journal_mode = WAL;');
23
+ sequelize.query('pragma synchronous = normal;');
24
+ sequelize.query('pragma journal_size_limit = 67108864;');
25
+ Job.initModel(sequelize);
26
+ Snapshot.initModel(sequelize);
27
+ export { sequelize };
28
+ export * from './job';
29
+ export * from './snapshot';
package/lib/esm/store/job.d.ts CHANGED
@@ -12,9 +12,14 @@ export interface JobState {
12
12
  timeout?: number;
13
13
  fullPage?: boolean;
14
14
  lastModified?: string;
15
+ waitTime?: number;
16
+ replace?: boolean;
15
17
  headers?: Record<string, string>;
16
18
  cookies?: CookieParam[];
17
- localStorage?: Record<string, string>;
19
+ localStorage?: {
20
+ key: string;
21
+ value: string;
22
+ }[];
18
23
  }
19
24
  export interface JobModel {
20
25
  id: string;
package/lib/esm/store/migrate.d.ts ADDED
@@ -0,0 +1,4 @@
1
+ import { Umzug } from 'umzug';
2
+ declare const umzug: Umzug<import("@sequelize/sqlite3").SqliteQueryInterface<import("@sequelize/sqlite3").SqliteDialect>>;
3
+ export declare function migrate(): Promise<import("umzug").MigrationMeta[]>;
4
+ export { umzug };
package/lib/esm/store/migrate.js ADDED
@@ -0,0 +1,26 @@
1
+ /* eslint-disable global-require */
2
+ import { SequelizeStorage, Umzug } from 'umzug';
3
+ import { sequelize } from './index';
4
+ import * as migration20250615 from './migrations/20250615-genesis';
5
+ import * as migration20250616Replace from './migrations/20250616-replace';
6
+ const umzug = new Umzug({
7
+ migrations: [
8
+ {
9
+ name: '20250615-genesis',
10
+ up: ({ context }) => migration20250615.up({ context }),
11
+ down: ({ context }) => migration20250615.down({ context }),
12
+ },
13
+ {
14
+ name: '20250616-replace',
15
+ up: ({ context }) => migration20250616Replace.up({ context }),
16
+ down: ({ context }) => migration20250616Replace.down({ context }),
17
+ },
18
+ ],
19
+ context: sequelize.getQueryInterface(),
20
+ storage: new SequelizeStorage({ sequelize }),
21
+ logger: console,
22
+ });
23
+ export function migrate() {
24
+ return umzug.up();
25
+ }
26
+ export { umzug };
package/lib/esm/store/migrations/20250615-genesis.d.ts ADDED
@@ -0,0 +1,6 @@
1
+ export declare function up({ context }: {
2
+ context: any;
3
+ }): Promise<void>;
4
+ export declare function down({ context }: {
5
+ context: any;
6
+ }): Promise<void>;
package/lib/esm/store/migrations/20250615-genesis.js ADDED
@@ -0,0 +1,110 @@
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ /* eslint-disable no-console */
11
+ import { DataTypes } from '@sequelize/core';
12
+ export function up(_a) {
13
+ return __awaiter(this, arguments, void 0, function* ({ context }) {
14
+ console.log('[20250615-genesis:up] Migrating...');
15
+ yield context.createTable('snap', {
16
+ jobId: {
17
+ type: DataTypes.STRING,
18
+ primaryKey: true,
19
+ allowNull: false,
20
+ },
21
+ url: {
22
+ type: DataTypes.STRING,
23
+ allowNull: false,
24
+ index: true,
25
+ },
26
+ status: {
27
+ type: DataTypes.ENUM('success', 'failed', 'pending'),
28
+ allowNull: false,
29
+ },
30
+ html: {
31
+ type: DataTypes.TEXT,
32
+ allowNull: true,
33
+ },
34
+ screenshot: {
35
+ type: DataTypes.STRING,
36
+ allowNull: true,
37
+ },
38
+ error: {
39
+ type: DataTypes.STRING,
40
+ allowNull: true,
41
+ },
42
+ lastModified: {
43
+ type: DataTypes.STRING,
44
+ allowNull: true,
45
+ },
46
+ meta: {
47
+ type: DataTypes.JSON,
48
+ allowNull: true,
49
+ },
50
+ options: {
51
+ type: DataTypes.JSON,
52
+ allowNull: true,
53
+ },
54
+ createdAt: {
55
+ type: DataTypes.DATE,
56
+ defaultValue: DataTypes.NOW,
57
+ },
58
+ updatedAt: {
59
+ type: DataTypes.DATE,
60
+ defaultValue: DataTypes.NOW,
61
+ },
62
+ });
63
+ yield context.createTable('jobs', {
64
+ id: {
65
+ type: DataTypes.STRING(40),
66
+ primaryKey: true,
67
+ },
68
+ queue: {
69
+ type: DataTypes.STRING(32),
70
+ allowNull: false,
71
+ },
72
+ job: {
73
+ type: DataTypes.JSON,
74
+ allowNull: false,
75
+ },
76
+ retryCount: {
77
+ type: DataTypes.INTEGER,
78
+ },
79
+ delay: {
80
+ type: DataTypes.INTEGER,
81
+ },
82
+ willRunAt: {
83
+ type: DataTypes.INTEGER,
84
+ },
85
+ cancelled: {
86
+ type: DataTypes.BOOLEAN,
87
+ defaultValue: false,
88
+ },
89
+ createdAt: {
90
+ type: DataTypes.DATE,
91
+ defaultValue: DataTypes.NOW,
92
+ index: true,
93
+ },
94
+ updatedAt: {
95
+ type: DataTypes.DATE,
96
+ defaultValue: DataTypes.NOW,
97
+ index: true,
98
+ },
99
+ });
100
+ console.log('[20250615-genesis:up] Migrated successfully!');
101
+ });
102
+ }
103
+ export function down(_a) {
104
+ return __awaiter(this, arguments, void 0, function* ({ context }) {
105
+ console.log('[20250615-genesis:down] Migrating...');
106
+ yield context.dropTable('snap');
107
+ yield context.dropTable('jobs');
108
+ console.log('[20250615-genesis:down] Migrated successfully!');
109
+ });
110
+ }
package/lib/esm/store/migrations/20250616-replace.d.ts ADDED
@@ -0,0 +1,6 @@
1
+ export declare function up({ context }: {
2
+ context: any;
3
+ }): Promise<void>;
4
+ export declare function down({ context }: {
5
+ context: any;
6
+ }): Promise<void>;
package/lib/esm/store/migrations/20250616-replace.js ADDED
@@ -0,0 +1,36 @@
1
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
+ return new (P || (P = Promise))(function (resolve, reject) {
4
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
8
+ });
9
+ };
10
+ /* eslint-disable no-console */
11
+ import { DataTypes } from '@sequelize/core';
12
+ export function up(_a) {
13
+ return __awaiter(this, arguments, void 0, function* ({ context }) {
14
+ console.log('[20250616-replace:up] Migrating...');
15
+ yield context.addColumn('snap', 'replace', {
16
+ type: DataTypes.BOOLEAN,
17
+ allowNull: false,
18
+ defaultValue: false,
19
+ index: true,
20
+ });
21
+ yield context.addIndex('snap', ['createdAt']);
22
+ yield context.addIndex('snap', ['updatedAt']);
23
+ yield context.addIndex('snap', ['status']);
24
+ console.log('[20250616-replace:up] Migrated successfully!');
25
+ });
26
+ }
27
+ export function down(_a) {
28
+ return __awaiter(this, arguments, void 0, function* ({ context }) {
29
+ console.log('[20250616-replace:down] Migrating...');
30
+ yield context.removeColumn('snap', 'replace');
31
+ yield context.removeIndex('snap', ['createdAt']);
32
+ yield context.removeIndex('snap', ['updatedAt']);
33
+ yield context.removeIndex('snap', ['status']);
34
+ console.log('[20250616-replace:down] Migrated successfully!');
35
+ });
36
+ }
package/lib/esm/store/snapshot.d.ts CHANGED
@@ -1,3 +1,4 @@
1
+ import { CookieParam } from '@blocklet/puppeteer';
1
2
  import { FindOptions, Model, Sequelize } from '@sequelize/core';
2
3
  export interface SnapshotModel {
3
4
  jobId: string;
@@ -7,6 +8,7 @@ export interface SnapshotModel {
7
8
  screenshot?: string | null;
8
9
  error?: string;
9
10
  lastModified?: string;
11
+ replace?: boolean;
10
12
  meta?: {
11
13
  title?: string;
12
14
  description?: string;
@@ -19,6 +21,11 @@ export interface SnapshotModel {
19
21
  quality?: number;
20
22
  fullPage?: boolean;
21
23
  headers?: Record<string, string>;
24
+ cookies?: CookieParam[];
25
+ localStorage?: {
26
+ key: string;
27
+ value: string;
28
+ }[];
22
29
  };
23
30
  }
24
31
  export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
@@ -29,6 +36,7 @@ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotMo
29
36
  screenshot?: SnapshotModel['screenshot'];
30
37
  error?: SnapshotModel['error'];
31
38
  lastModified?: SnapshotModel['lastModified'];
39
+ replace?: SnapshotModel['replace'];
32
40
  meta?: SnapshotModel['meta'];
33
41
  options: SnapshotModel['options'];
34
42
  static initModel(sequelize: Sequelize): typeof Snapshot;
package/lib/esm/store/snapshot.js CHANGED
@@ -24,6 +24,7 @@ export class Snapshot extends Model {
24
24
  status: {
25
25
  type: DataTypes.ENUM('success', 'failed', 'pending'),
26
26
  allowNull: false,
27
+ index: true,
27
28
  },
28
29
  html: {
29
30
  type: DataTypes.TEXT,
@@ -41,6 +42,12 @@ export class Snapshot extends Model {
41
42
  type: DataTypes.STRING,
42
43
  allowNull: true,
43
44
  },
45
+ replace: {
46
+ type: DataTypes.BOOLEAN,
47
+ allowNull: false,
48
+ defaultValue: false,
49
+ index: true,
50
+ },
44
51
  meta: {
45
52
  type: DataTypes.JSON,
46
53
  allowNull: true,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@arcblock/crawler",
3
- "version": "1.1.5",
3
+ "version": "1.2.0",
4
4
  "main": "lib/cjs/index.js",
5
5
  "module": "lib/esm/index.js",
6
6
  "types": "lib/cjs/index.d.ts",
@@ -61,7 +61,8 @@
61
61
  "robots-parser": "^3.0.1",
62
62
  "sitemap": "^7.1.2",
63
63
  "sqlite3": "^5.1.7",
64
- "ufo": "^1.5.4"
64
+ "ufo": "^1.5.4",
65
+ "umzug": "^3.8.2"
65
66
  },
66
67
  "devDependencies": {
67
68
  "@types/dotenv-flow": "^3.3.3",