@arcblock/crawler 1.1.6 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43):
  1. package/lib/cjs/crawler.d.ts +11 -4
  2. package/lib/cjs/crawler.js +96 -59
  3. package/lib/cjs/index.d.ts +1 -0
  4. package/lib/cjs/index.js +3 -5
  5. package/lib/cjs/services/carbon.d.ts +3 -0
  6. package/lib/cjs/services/carbon.js +87 -0
  7. package/lib/cjs/services/snapshot.d.ts +5 -2
  8. package/lib/cjs/services/snapshot.js +36 -6
  9. package/lib/cjs/site.d.ts +1 -1
  10. package/lib/cjs/site.js +9 -3
  11. package/lib/cjs/store/index.d.ts +4 -1
  12. package/lib/cjs/store/index.js +37 -45
  13. package/lib/cjs/store/job.d.ts +5 -0
  14. package/lib/cjs/store/migrate.d.ts +4 -0
  15. package/lib/cjs/store/migrate.js +63 -0
  16. package/lib/cjs/store/migrations/20250615-genesis.d.ts +6 -0
  17. package/lib/cjs/store/migrations/20250615-genesis.js +114 -0
  18. package/lib/cjs/store/migrations/20250616-replace.d.ts +6 -0
  19. package/lib/cjs/store/migrations/20250616-replace.js +40 -0
  20. package/lib/cjs/store/snapshot.d.ts +2 -0
  21. package/lib/cjs/store/snapshot.js +7 -0
  22. package/lib/esm/crawler.d.ts +11 -4
  23. package/lib/esm/crawler.js +92 -57
  24. package/lib/esm/index.d.ts +1 -0
  25. package/lib/esm/index.js +1 -4
  26. package/lib/esm/services/carbon.d.ts +3 -0
  27. package/lib/esm/services/carbon.js +84 -0
  28. package/lib/esm/services/snapshot.d.ts +5 -2
  29. package/lib/esm/services/snapshot.js +33 -4
  30. package/lib/esm/site.d.ts +1 -1
  31. package/lib/esm/site.js +9 -3
  32. package/lib/esm/store/index.d.ts +4 -1
  33. package/lib/esm/store/index.js +23 -45
  34. package/lib/esm/store/job.d.ts +5 -0
  35. package/lib/esm/store/migrate.d.ts +4 -0
  36. package/lib/esm/store/migrate.js +26 -0
  37. package/lib/esm/store/migrations/20250615-genesis.d.ts +6 -0
  38. package/lib/esm/store/migrations/20250615-genesis.js +110 -0
  39. package/lib/esm/store/migrations/20250616-replace.d.ts +6 -0
  40. package/lib/esm/store/migrations/20250616-replace.js +36 -0
  41. package/lib/esm/store/snapshot.d.ts +2 -0
  42. package/lib/esm/store/snapshot.js +7 -0
  43. package/package.json +3 -2
@@ -1,57 +1,49 @@
1
1
  "use strict";
2
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
- return new (P || (P = Promise))(function (resolve, reject) {
5
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
- step((generator = generator.apply(thisArg, _arguments || [])).next());
9
- });
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
10
15
  };
11
16
  var __importDefault = (this && this.__importDefault) || function (mod) {
12
17
  return (mod && mod.__esModule) ? mod : { "default": mod };
13
18
  };
14
19
  Object.defineProperty(exports, "__esModule", { value: true });
15
- exports.initDatabase = initDatabase;
20
+ exports.sequelize = void 0;
16
21
  const core_1 = require("@sequelize/core");
17
22
  const sqlite3_1 = require("@sequelize/sqlite3");
18
23
  const path_1 = __importDefault(require("path"));
19
24
  const config_1 = require("../config");
20
25
  const job_1 = require("./job");
21
26
  const snapshot_1 = require("./snapshot");
22
- function initDatabase() {
23
- return __awaiter(this, void 0, void 0, function* () {
24
- const sequelize = new core_1.Sequelize({
25
- dialect: sqlite3_1.SqliteDialect,
26
- storage: path_1.default.join(config_1.config.dataDir, 'snap-kit.db'),
27
- logging: (msg) => process.env.SQLITE_LOG && config_1.logger.debug(msg),
28
- pool: {
29
- min: 0,
30
- max: 10,
31
- idle: 10000,
32
- },
33
- retry: {
34
- match: [/SQLITE_BUSY/],
35
- name: 'query',
36
- max: 10,
37
- },
38
- });
39
- job_1.Job.initModel(sequelize);
40
- snapshot_1.Snapshot.initModel(sequelize);
41
- try {
42
- yield Promise.all([
43
- sequelize.query('pragma journal_mode = WAL;'),
44
- sequelize.query('pragma synchronous = normal;'),
45
- sequelize.query('pragma journal_size_limit = 67108864;'),
46
- ]);
47
- yield sequelize.authenticate();
48
- yield sequelize.sync({ alter: process.env.ALTER_SQLITE === 'true' });
49
- config_1.logger.info('Successfully connected to database');
50
- }
51
- catch (error) {
52
- config_1.logger.error('Failed to connect to database:', error);
53
- throw error;
54
- }
55
- return sequelize;
56
- });
57
- }
27
+ const sequelize = new core_1.Sequelize({
28
+ dialect: sqlite3_1.SqliteDialect,
29
+ storage: path_1.default.join(config_1.config.dataDir, 'snap-kit.db'),
30
+ logging: (msg) => process.env.SQLITE_LOG && config_1.logger.debug(msg),
31
+ pool: {
32
+ min: 0,
33
+ max: 10,
34
+ idle: 10000,
35
+ },
36
+ retry: {
37
+ match: [/SQLITE_BUSY/],
38
+ name: 'query',
39
+ max: 10,
40
+ },
41
+ });
42
+ exports.sequelize = sequelize;
43
+ sequelize.query('pragma journal_mode = WAL;');
44
+ sequelize.query('pragma synchronous = normal;');
45
+ sequelize.query('pragma journal_size_limit = 67108864;');
46
+ job_1.Job.initModel(sequelize);
47
+ snapshot_1.Snapshot.initModel(sequelize);
48
+ __exportStar(require("./job"), exports);
49
+ __exportStar(require("./snapshot"), exports);
@@ -9,9 +9,14 @@ export interface JobState {
9
9
  width?: number;
10
10
  height?: number;
11
11
  quality?: number;
12
+ format?: 'png' | 'jpeg' | 'webp';
12
13
  timeout?: number;
13
14
  fullPage?: boolean;
14
15
  lastModified?: string;
16
+ waitTime?: number;
17
+ replace?: boolean;
18
+ sync?: boolean;
19
+ ignoreRobots?: boolean;
15
20
  headers?: Record<string, string>;
16
21
  cookies?: CookieParam[];
17
22
  localStorage?: {
@@ -0,0 +1,4 @@
1
+ import { Umzug } from 'umzug';
2
+ declare const umzug: Umzug<import("@sequelize/sqlite3").SqliteQueryInterface<import("@sequelize/sqlite3").SqliteDialect>>;
3
+ export declare function migrate(): Promise<import("umzug").MigrationMeta[]>;
4
+ export { umzug };
@@ -0,0 +1,63 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.umzug = void 0;
37
+ exports.migrate = migrate;
38
+ /* eslint-disable global-require */
39
+ const umzug_1 = require("umzug");
40
+ const index_1 = require("./index");
41
+ const migration20250615 = __importStar(require("./migrations/20250615-genesis"));
42
+ const migration20250616Replace = __importStar(require("./migrations/20250616-replace"));
43
+ const umzug = new umzug_1.Umzug({
44
+ migrations: [
45
+ {
46
+ name: '20250615-genesis',
47
+ up: ({ context }) => migration20250615.up({ context }),
48
+ down: ({ context }) => migration20250615.down({ context }),
49
+ },
50
+ {
51
+ name: '20250616-replace',
52
+ up: ({ context }) => migration20250616Replace.up({ context }),
53
+ down: ({ context }) => migration20250616Replace.down({ context }),
54
+ },
55
+ ],
56
+ context: index_1.sequelize.getQueryInterface(),
57
+ storage: new umzug_1.SequelizeStorage({ sequelize: index_1.sequelize }),
58
+ logger: console,
59
+ });
60
+ exports.umzug = umzug;
61
+ function migrate() {
62
+ return umzug.up();
63
+ }
@@ -0,0 +1,6 @@
1
+ export declare function up({ context }: {
2
+ context: any;
3
+ }): Promise<void>;
4
+ export declare function down({ context }: {
5
+ context: any;
6
+ }): Promise<void>;
@@ -0,0 +1,114 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.up = up;
13
+ exports.down = down;
14
+ /* eslint-disable no-console */
15
+ const core_1 = require("@sequelize/core");
16
+ function up(_a) {
17
+ return __awaiter(this, arguments, void 0, function* ({ context }) {
18
+ console.log('[20250615-genesis:up] Migrating...');
19
+ yield context.createTable('snap', {
20
+ jobId: {
21
+ type: core_1.DataTypes.STRING,
22
+ primaryKey: true,
23
+ allowNull: false,
24
+ },
25
+ url: {
26
+ type: core_1.DataTypes.STRING,
27
+ allowNull: false,
28
+ index: true,
29
+ },
30
+ status: {
31
+ type: core_1.DataTypes.ENUM('success', 'failed', 'pending'),
32
+ allowNull: false,
33
+ },
34
+ html: {
35
+ type: core_1.DataTypes.TEXT,
36
+ allowNull: true,
37
+ },
38
+ screenshot: {
39
+ type: core_1.DataTypes.STRING,
40
+ allowNull: true,
41
+ },
42
+ error: {
43
+ type: core_1.DataTypes.STRING,
44
+ allowNull: true,
45
+ },
46
+ lastModified: {
47
+ type: core_1.DataTypes.STRING,
48
+ allowNull: true,
49
+ },
50
+ meta: {
51
+ type: core_1.DataTypes.JSON,
52
+ allowNull: true,
53
+ },
54
+ options: {
55
+ type: core_1.DataTypes.JSON,
56
+ allowNull: true,
57
+ },
58
+ createdAt: {
59
+ type: core_1.DataTypes.DATE,
60
+ defaultValue: core_1.DataTypes.NOW,
61
+ },
62
+ updatedAt: {
63
+ type: core_1.DataTypes.DATE,
64
+ defaultValue: core_1.DataTypes.NOW,
65
+ },
66
+ });
67
+ yield context.createTable('jobs', {
68
+ id: {
69
+ type: core_1.DataTypes.STRING(40),
70
+ primaryKey: true,
71
+ },
72
+ queue: {
73
+ type: core_1.DataTypes.STRING(32),
74
+ allowNull: false,
75
+ },
76
+ job: {
77
+ type: core_1.DataTypes.JSON,
78
+ allowNull: false,
79
+ },
80
+ retryCount: {
81
+ type: core_1.DataTypes.INTEGER,
82
+ },
83
+ delay: {
84
+ type: core_1.DataTypes.INTEGER,
85
+ },
86
+ willRunAt: {
87
+ type: core_1.DataTypes.INTEGER,
88
+ },
89
+ cancelled: {
90
+ type: core_1.DataTypes.BOOLEAN,
91
+ defaultValue: false,
92
+ },
93
+ createdAt: {
94
+ type: core_1.DataTypes.DATE,
95
+ defaultValue: core_1.DataTypes.NOW,
96
+ index: true,
97
+ },
98
+ updatedAt: {
99
+ type: core_1.DataTypes.DATE,
100
+ defaultValue: core_1.DataTypes.NOW,
101
+ index: true,
102
+ },
103
+ });
104
+ console.log('[20250615-genesis:up] Migrated successfully!');
105
+ });
106
+ }
107
+ function down(_a) {
108
+ return __awaiter(this, arguments, void 0, function* ({ context }) {
109
+ console.log('[20250615-genesis:down] Migrating...');
110
+ yield context.dropTable('snap');
111
+ yield context.dropTable('jobs');
112
+ console.log('[20250615-genesis:down] Migrated successfully!');
113
+ });
114
+ }
@@ -0,0 +1,6 @@
1
+ export declare function up({ context }: {
2
+ context: any;
3
+ }): Promise<void>;
4
+ export declare function down({ context }: {
5
+ context: any;
6
+ }): Promise<void>;
@@ -0,0 +1,40 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.up = up;
13
+ exports.down = down;
14
+ /* eslint-disable no-console */
15
+ const core_1 = require("@sequelize/core");
16
+ function up(_a) {
17
+ return __awaiter(this, arguments, void 0, function* ({ context }) {
18
+ console.log('[20250616-replace:up] Migrating...');
19
+ yield context.addColumn('snap', 'replace', {
20
+ type: core_1.DataTypes.BOOLEAN,
21
+ allowNull: false,
22
+ defaultValue: false,
23
+ index: true,
24
+ });
25
+ yield context.addIndex('snap', ['createdAt']);
26
+ yield context.addIndex('snap', ['updatedAt']);
27
+ yield context.addIndex('snap', ['status']);
28
+ console.log('[20250616-replace:up] Migrated successfully!');
29
+ });
30
+ }
31
+ function down(_a) {
32
+ return __awaiter(this, arguments, void 0, function* ({ context }) {
33
+ console.log('[20250616-replace:down] Migrating...');
34
+ yield context.removeColumn('snap', 'replace');
35
+ yield context.removeIndex('snap', ['createdAt']);
36
+ yield context.removeIndex('snap', ['updatedAt']);
37
+ yield context.removeIndex('snap', ['status']);
38
+ console.log('[20250616-replace:down] Migrated successfully!');
39
+ });
40
+ }
@@ -8,6 +8,7 @@ export interface SnapshotModel {
8
8
  screenshot?: string | null;
9
9
  error?: string;
10
10
  lastModified?: string;
11
+ replace?: boolean;
11
12
  meta?: {
12
13
  title?: string;
13
14
  description?: string;
@@ -35,6 +36,7 @@ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotMo
35
36
  screenshot?: SnapshotModel['screenshot'];
36
37
  error?: SnapshotModel['error'];
37
38
  lastModified?: SnapshotModel['lastModified'];
39
+ replace?: SnapshotModel['replace'];
38
40
  meta?: SnapshotModel['meta'];
39
41
  options: SnapshotModel['options'];
40
42
  static initModel(sequelize: Sequelize): typeof Snapshot;
@@ -27,6 +27,7 @@ class Snapshot extends core_1.Model {
27
27
  status: {
28
28
  type: core_1.DataTypes.ENUM('success', 'failed', 'pending'),
29
29
  allowNull: false,
30
+ index: true,
30
31
  },
31
32
  html: {
32
33
  type: core_1.DataTypes.TEXT,
@@ -44,6 +45,12 @@ class Snapshot extends core_1.Model {
44
45
  type: core_1.DataTypes.STRING,
45
46
  allowNull: true,
46
47
  },
48
+ replace: {
49
+ type: core_1.DataTypes.BOOLEAN,
50
+ allowNull: false,
51
+ defaultValue: false,
52
+ index: true,
53
+ },
47
54
  meta: {
48
55
  type: core_1.DataTypes.JSON,
49
56
  allowNull: true,
@@ -1,11 +1,15 @@
1
- import { JobState } from './store/job';
2
- import { SnapshotModel } from './store/snapshot';
3
- export declare function createCrawlQueue(): void;
1
+ import { Page } from '@blocklet/puppeteer';
2
+ import { JobState, SnapshotModel } from './store';
3
+ type PageHandler = {
4
+ handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
5
+ handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
6
+ };
7
+ export declare function createCrawlQueue(queue: string, handler?: PageHandler): any;
4
8
  export declare function getDataDir(): Promise<{
5
9
  htmlDir: string;
6
10
  screenshotDir: string;
7
11
  }>;
8
- export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
12
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, format, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState, handler?: PageHandler) => Promise<{
9
13
  html: string | null;
10
14
  screenshot: Uint8Array<ArrayBufferLike> | null;
11
15
  meta: {
@@ -18,4 +22,7 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
18
22
  * @param params
19
23
  * @param callback callback when job finished
20
24
  */
25
+ export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
21
26
  export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
27
+ export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
28
+ export {};
@@ -14,44 +14,44 @@ import fs from 'fs-extra';
14
14
  import path from 'path';
15
15
  import { config, logger } from './config';
16
16
  import { initPage } from './puppeteer';
17
- import { convertJobToSnapshot, formatSnapshot } from './services/snapshot';
18
- import { Job } from './store/job';
19
- import { Snapshot } from './store/snapshot';
20
- import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5 } from './utils';
17
+ import { createCarbonImage } from './services/carbon';
18
+ import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './services/snapshot';
19
+ import { Job, Snapshot, sequelize } from './store';
20
+ import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5, sleep } from './utils';
21
21
  const { BaseState } = require('@abtnode/models');
22
- let crawlQueue;
23
- export function createCrawlQueue() {
22
+ // eslint-disable-next-line import/no-mutable-exports
23
+ const crawlQueue = createCrawlQueue('urlCrawler');
24
+ const syncQueue = createCrawlQueue('syncCrawler');
25
+ const codeQueue = createCrawlQueue('codeCrawler', {
26
+ handleScreenshot: createCarbonImage,
27
+ });
28
+ export function createCrawlQueue(queue, handler) {
24
29
  const db = new BaseState(Job);
25
- crawlQueue = createQueue({
26
- store: new SequelizeStore(db, 'crawler'),
30
+ return createQueue({
31
+ store: new SequelizeStore(db, queue),
27
32
  concurrency: config.concurrency,
28
33
  onJob: (job) => __awaiter(this, void 0, void 0, function* () {
29
34
  logger.info('Starting to execute crawl job', job);
30
- const canCrawl = yield isAcceptCrawler(job.url);
31
- if (!canCrawl) {
32
- logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
33
- const snapshot = convertJobToSnapshot({
34
- job,
35
- snapshot: {
36
- status: 'failed',
37
- error: 'Denied by robots.txt',
38
- },
39
- });
40
- yield Snapshot.upsert(snapshot);
41
- return snapshot;
35
+ // check robots.txt
36
+ if (!job.ignoreRobots) {
37
+ const canCrawl = yield isAcceptCrawler(job.url);
38
+ if (!canCrawl) {
39
+ logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
40
+ const snapshot = convertJobToSnapshot({
41
+ job,
42
+ snapshot: {
43
+ status: 'failed',
44
+ error: 'Denied by robots.txt',
45
+ },
46
+ });
47
+ yield Snapshot.upsert(snapshot);
48
+ return snapshot;
49
+ }
42
50
  }
43
- // if index reach autoCloseBrowserCount, close browser
44
- // try {
45
- // if (index >= autoCloseBrowserCount) {
46
- // await closeBrowser({ trimCache: false });
47
- // }
48
- // } catch (error) {
49
- // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
50
- // }
51
51
  const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
52
52
  try {
53
53
  // get page content later
54
- const result = yield getPageContent(formattedJob);
54
+ const result = yield getPageContent(formattedJob, handler);
55
55
  if (!result || (!result.html && !result.screenshot)) {
56
56
  logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
57
57
  const snapshot = convertJobToSnapshot({
@@ -64,22 +64,40 @@ export function createCrawlQueue() {
64
64
  yield Snapshot.upsert(snapshot);
65
65
  return snapshot;
66
66
  }
67
- // save html and screenshot to data dir
68
- const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
69
- screenshot: result.screenshot,
70
- html: result.html,
71
- });
72
- // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
73
- const snapshot = convertJobToSnapshot({
74
- job: formattedJob,
75
- snapshot: {
76
- status: 'success',
77
- screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
78
- html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config.dataDir, ''),
79
- meta: result.meta,
80
- },
81
- });
82
- yield Snapshot.upsert(snapshot);
67
+ const snapshot = yield sequelize.transaction((txn) => __awaiter(this, void 0, void 0, function* () {
68
+ // delete old snapshot
69
+ if (formattedJob.replace) {
70
+ try {
71
+ const deletedJobIds = yield deleteSnapshots({
72
+ url: formattedJob.url,
73
+ replace: true,
74
+ }, { txn });
75
+ if (deletedJobIds) {
76
+ logger.info('Deleted old snapshot', { deletedJobIds });
77
+ }
78
+ }
79
+ catch (error) {
80
+ logger.error('Failed to delete old snapshot', { error, formattedJob });
81
+ }
82
+ }
83
+ // save html and screenshot to data dir
84
+ const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
85
+ screenshot: result.screenshot,
86
+ html: result.html,
87
+ format: formattedJob.format,
88
+ });
89
+ const snapshot = convertJobToSnapshot({
90
+ job: formattedJob,
91
+ snapshot: {
92
+ status: 'success',
93
+ screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
94
+ html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config.dataDir, ''),
95
+ meta: result.meta,
96
+ },
97
+ });
98
+ yield Snapshot.upsert(snapshot, { transaction: txn });
99
+ return snapshot;
100
+ }));
83
101
  return snapshot;
84
102
  }
85
103
  catch (error) {
@@ -107,13 +125,13 @@ export function getDataDir() {
107
125
  });
108
126
  }
109
127
  function saveSnapshotToLocal(_a) {
110
- return __awaiter(this, arguments, void 0, function* ({ screenshot, html }) {
128
+ return __awaiter(this, arguments, void 0, function* ({ screenshot, html, format = 'webp', }) {
111
129
  const { htmlDir, screenshotDir } = yield getDataDir();
112
130
  let screenshotPath = null;
113
131
  let htmlPath = null;
114
132
  if (screenshot) {
115
133
  const hash = md5(screenshot);
116
- screenshotPath = path.join(screenshotDir, `${hash}.webp`);
134
+ screenshotPath = path.join(screenshotDir, `${hash}.${format}`);
117
135
  logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
118
136
  yield fs.writeFile(screenshotPath, screenshot);
119
137
  }
@@ -129,7 +147,7 @@ function saveSnapshotToLocal(_a) {
129
147
  };
130
148
  });
131
149
  }
132
- export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
150
+ export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
133
151
  const page = yield initPage();
134
152
  if (width && height) {
135
153
  yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -166,9 +184,18 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
166
184
  }
167
185
  // await for networkidle0
168
186
  // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
169
- yield page.waitForNetworkIdle({
170
- idleTime: 1.5 * 1000,
171
- });
187
+ try {
188
+ yield Promise.all([
189
+ page.waitForNetworkIdle({
190
+ idleTime: 1.5 * 1000,
191
+ timeout,
192
+ }),
193
+ sleep(waitTime),
194
+ ]);
195
+ }
196
+ catch (err) {
197
+ logger.warn(`Failed to wait for network idle in ${url}:`, err);
198
+ }
172
199
  // get screenshot
173
200
  if (includeScreenshot) {
174
201
  // Try to find the tallest element and set the browser to the same height
@@ -184,7 +211,9 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
184
211
  }
185
212
  }
186
213
  try {
187
- screenshot = yield page.screenshot({ fullPage, quality, type: 'webp' });
214
+ screenshot = (handler === null || handler === void 0 ? void 0 : handler.handleScreenshot)
215
+ ? yield handler.handleScreenshot(page)
216
+ : yield page.screenshot({ fullPage, quality, type: format });
188
217
  }
189
218
  catch (err) {
190
219
  logger.error('Failed to get screenshot:', err);
@@ -212,12 +241,12 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
212
241
  // check if the page is an error page
213
242
  const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
214
243
  if (isErrorPage) {
215
- throw new Error('Page is an error page');
244
+ throw new Error(`${url} is an error page`);
216
245
  }
217
246
  meta.title = data.title;
218
247
  meta.description = data.description;
219
248
  if (includeHtml) {
220
- html = data.html;
249
+ html = (handler === null || handler === void 0 ? void 0 : handler.handleHtml) ? yield handler.handleHtml(page) : data.html;
221
250
  }
222
251
  }
223
252
  catch (err) {
@@ -244,17 +273,17 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
244
273
  * @param callback callback when job finished
245
274
  */
246
275
  // eslint-disable-next-line require-await
247
- export function crawlUrl(params, callback) {
276
+ export function enqueue(queue, params, callback) {
248
277
  return __awaiter(this, void 0, void 0, function* () {
249
278
  // skip duplicate job
250
279
  const existsJob = yield Job.isExists(params);
251
- if (existsJob) {
280
+ if (existsJob && !params.sync) {
252
281
  logger.info(`Crawl job already exists for ${params.url}, skip`);
253
282
  return existsJob.id;
254
283
  }
255
284
  logger.info('enqueue crawl job', params);
256
285
  const jobId = randomUUID();
257
- const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
286
+ const job = queue.push(Object.assign(Object.assign({}, params), { id: jobId }));
258
287
  job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
259
288
  logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
260
289
  callback === null || callback === void 0 ? void 0 : callback(result ? yield formatSnapshot(result) : null);
@@ -266,3 +295,9 @@ export function crawlUrl(params, callback) {
266
295
  return jobId;
267
296
  });
268
297
  }
298
+ export function crawlUrl(params, callback) {
299
+ return enqueue(params.sync ? syncQueue : crawlQueue, params, callback);
300
+ }
301
+ export function crawlCode(params, callback) {
302
+ return enqueue(codeQueue, Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
303
+ }