@arcblock/crawler 1.1.5 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/README.md +1 -2
  2. package/lib/cjs/config.d.ts +9 -3
  3. package/lib/cjs/config.js +2 -10
  4. package/lib/cjs/crawler.d.ts +3 -4
  5. package/lib/cjs/crawler.js +74 -48
  6. package/lib/cjs/cron.js +5 -0
  7. package/lib/cjs/index.d.ts +2 -4
  8. package/lib/cjs/index.js +6 -6
  9. package/lib/cjs/services/snapshot.d.ts +5 -2
  10. package/lib/cjs/services/snapshot.js +44 -7
  11. package/lib/cjs/site.d.ts +1 -1
  12. package/lib/cjs/site.js +11 -4
  13. package/lib/cjs/store/index.d.ts +4 -1
  14. package/lib/cjs/store/index.js +37 -45
  15. package/lib/cjs/store/job.d.ts +6 -1
  16. package/lib/cjs/store/migrate.d.ts +4 -0
  17. package/lib/cjs/store/migrate.js +63 -0
  18. package/lib/cjs/store/migrations/20250615-genesis.d.ts +6 -0
  19. package/lib/cjs/store/migrations/20250615-genesis.js +114 -0
  20. package/lib/cjs/store/migrations/20250616-replace.d.ts +6 -0
  21. package/lib/cjs/store/migrations/20250616-replace.js +40 -0
  22. package/lib/cjs/store/snapshot.d.ts +8 -0
  23. package/lib/cjs/store/snapshot.js +7 -0
  24. package/lib/esm/config.d.ts +9 -3
  25. package/lib/esm/config.js +2 -10
  26. package/lib/esm/crawler.d.ts +3 -4
  27. package/lib/esm/crawler.js +71 -45
  28. package/lib/esm/cron.js +5 -0
  29. package/lib/esm/index.d.ts +2 -4
  30. package/lib/esm/index.js +4 -5
  31. package/lib/esm/services/snapshot.d.ts +5 -2
  32. package/lib/esm/services/snapshot.js +41 -5
  33. package/lib/esm/site.d.ts +1 -1
  34. package/lib/esm/site.js +11 -4
  35. package/lib/esm/store/index.d.ts +4 -1
  36. package/lib/esm/store/index.js +23 -45
  37. package/lib/esm/store/job.d.ts +6 -1
  38. package/lib/esm/store/migrate.d.ts +4 -0
  39. package/lib/esm/store/migrate.js +26 -0
  40. package/lib/esm/store/migrations/20250615-genesis.d.ts +6 -0
  41. package/lib/esm/store/migrations/20250615-genesis.js +110 -0
  42. package/lib/esm/store/migrations/20250616-replace.d.ts +6 -0
  43. package/lib/esm/store/migrations/20250616-replace.js +36 -0
  44. package/lib/esm/store/snapshot.d.ts +8 -0
  45. package/lib/esm/store/snapshot.js +7 -0
  46. package/package.json +3 -2
@@ -0,0 +1,63 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.umzug = void 0;
37
+ exports.migrate = migrate;
38
+ /* eslint-disable global-require */
39
+ const umzug_1 = require("umzug");
40
+ const index_1 = require("./index");
41
+ const migration20250615 = __importStar(require("./migrations/20250615-genesis"));
42
+ const migration20250616Replace = __importStar(require("./migrations/20250616-replace"));
43
/**
 * Registry of migrations, kept in the order they are listed. Each entry pairs
 * the migration name with the module that provides its `up`/`down` functions.
 */
const migrationModules = [
    ['20250615-genesis', migration20250615],
    ['20250616-replace', migration20250616Replace],
];
// Shared runner: every migration receives the Sequelize query interface as
// `context`; storage is a SequelizeStorage bound to the same sequelize instance,
// and progress is logged to the console.
const umzug = new umzug_1.Umzug({
    migrations: migrationModules.map(([name, impl]) => ({
        name,
        // Forward only `context`, looking the handler up at call time.
        up: ({ context }) => impl.up({ context }),
        down: ({ context }) => impl.down({ context }),
    })),
    context: index_1.sequelize.getQueryInterface(),
    storage: new umzug_1.SequelizeStorage({ sequelize: index_1.sequelize }),
    logger: console,
});
exports.umzug = umzug;
61
/**
 * Calls `up()` on the shared Umzug runner and returns its result unchanged.
 */
function migrate() {
    const run = umzug.up();
    return run;
}
@@ -0,0 +1,6 @@
1
+ export declare function up({ context }: {
2
+ context: any;
3
+ }): Promise<void>;
4
+ export declare function down({ context }: {
5
+ context: any;
6
+ }): Promise<void>;
@@ -0,0 +1,114 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.up = up;
13
+ exports.down = down;
14
+ /* eslint-disable no-console */
15
+ const core_1 = require("@sequelize/core");
16
/**
 * Genesis migration (forward): creates the `snap` table and then the `jobs`
 * table through the query interface passed in as `context`.
 *
 * @param _a object whose `context` property exposes `createTable`.
 * @returns a promise that settles after both tables have been created.
 */
function up(_a) {
    return __awaiter(this, arguments, void 0, function* ({ context }) {
        console.log('[20250615-genesis:up] Migrating...');
        const types = core_1.DataTypes;
        // Small builders for the column shapes that repeat below.
        const optional = (type) => ({ type, allowNull: true });
        const required = (type) => ({ type, allowNull: false });
        const timestamp = (extra) => Object.assign({ type: types.DATE, defaultValue: types.NOW }, extra);
        // NOTE(review): `index: true` is passed through exactly as before;
        // confirm that createTable honours it on the target dialect.
        const snapColumns = {
            jobId: { type: types.STRING, primaryKey: true, allowNull: false },
            url: { type: types.STRING, allowNull: false, index: true },
            status: required(types.ENUM('success', 'failed', 'pending')),
            html: optional(types.TEXT),
            screenshot: optional(types.STRING),
            error: optional(types.STRING),
            lastModified: optional(types.STRING),
            meta: optional(types.JSON),
            options: optional(types.JSON),
            createdAt: timestamp(),
            updatedAt: timestamp(),
        };
        yield context.createTable('snap', snapColumns);
        const jobColumns = {
            id: { type: types.STRING(40), primaryKey: true },
            queue: required(types.STRING(32)),
            job: required(types.JSON),
            retryCount: { type: types.INTEGER },
            delay: { type: types.INTEGER },
            willRunAt: { type: types.INTEGER },
            cancelled: { type: types.BOOLEAN, defaultValue: false },
            createdAt: timestamp({ index: true }),
            updatedAt: timestamp({ index: true }),
        };
        yield context.createTable('jobs', jobColumns);
        console.log('[20250615-genesis:up] Migrated successfully!');
    });
}
107
/**
 * Genesis migration (rollback): drops the `snap` table, then the `jobs` table.
 *
 * @param _a object whose `context` property exposes `dropTable`.
 * @returns a promise that settles after both tables have been dropped.
 */
function down(_a) {
    return __awaiter(this, arguments, void 0, function* ({ context }) {
        console.log('[20250615-genesis:down] Migrating...');
        // Sequential on purpose: each drop finishes before the next starts.
        for (const table of ['snap', 'jobs']) {
            yield context.dropTable(table);
        }
        console.log('[20250615-genesis:down] Migrated successfully!');
    });
}
@@ -0,0 +1,6 @@
1
+ export declare function up({ context }: {
2
+ context: any;
3
+ }): Promise<void>;
4
+ export declare function down({ context }: {
5
+ context: any;
6
+ }): Promise<void>;
@@ -0,0 +1,40 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.up = up;
13
+ exports.down = down;
14
+ /* eslint-disable no-console */
15
+ const core_1 = require("@sequelize/core");
16
/**
 * Replace migration (forward): adds the non-null boolean `replace` column
 * (default `false`) to `snap`, then adds single-column indexes on
 * `createdAt`, `updatedAt` and `status`.
 *
 * @param _a object whose `context` property exposes `addColumn` and `addIndex`.
 * @returns a promise that settles after the column and indexes are added.
 */
function up(_a) {
    return __awaiter(this, arguments, void 0, function* ({ context }) {
        console.log('[20250616-replace:up] Migrating...');
        // NOTE(review): `index: true` is forwarded unchanged; confirm that
        // addColumn honours it on the target dialect.
        const replaceColumn = {
            type: core_1.DataTypes.BOOLEAN,
            allowNull: false,
            defaultValue: false,
            index: true,
        };
        yield context.addColumn('snap', 'replace', replaceColumn);
        // One index per column, created one after another in this order.
        for (const column of ['createdAt', 'updatedAt', 'status']) {
            yield context.addIndex('snap', [column]);
        }
        console.log('[20250616-replace:up] Migrated successfully!');
    });
}
31
+ function down(_a) {
32
+ return __awaiter(this, arguments, void 0, function* ({ context }) {
33
+ console.log('[20250616-replace:down] Migrating...');
34
+ yield context.removeColumn('snap', 'replace');
35
+ yield context.removeIndex('snap', ['createdAt']);
36
+ yield context.removeIndex('snap', ['updatedAt']);
37
+ yield context.removeIndex('snap', ['status']);
38
+ console.log('[20250616-replace:down] Migrated successfully!');
39
+ });
40
+ }
@@ -1,3 +1,4 @@
1
+ import { CookieParam } from '@blocklet/puppeteer';
1
2
  import { FindOptions, Model, Sequelize } from '@sequelize/core';
2
3
  export interface SnapshotModel {
3
4
  jobId: string;
@@ -7,6 +8,7 @@ export interface SnapshotModel {
7
8
  screenshot?: string | null;
8
9
  error?: string;
9
10
  lastModified?: string;
11
+ replace?: boolean;
10
12
  meta?: {
11
13
  title?: string;
12
14
  description?: string;
@@ -19,6 +21,11 @@ export interface SnapshotModel {
19
21
  quality?: number;
20
22
  fullPage?: boolean;
21
23
  headers?: Record<string, string>;
24
+ cookies?: CookieParam[];
25
+ localStorage?: {
26
+ key: string;
27
+ value: string;
28
+ }[];
22
29
  };
23
30
  }
24
31
  export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
@@ -29,6 +36,7 @@ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotMo
29
36
  screenshot?: SnapshotModel['screenshot'];
30
37
  error?: SnapshotModel['error'];
31
38
  lastModified?: SnapshotModel['lastModified'];
39
+ replace?: SnapshotModel['replace'];
32
40
  meta?: SnapshotModel['meta'];
33
41
  options: SnapshotModel['options'];
34
42
  static initModel(sequelize: Sequelize): typeof Snapshot;
@@ -27,6 +27,7 @@ class Snapshot extends core_1.Model {
27
27
  status: {
28
28
  type: core_1.DataTypes.ENUM('success', 'failed', 'pending'),
29
29
  allowNull: false,
30
+ index: true,
30
31
  },
31
32
  html: {
32
33
  type: core_1.DataTypes.TEXT,
@@ -44,6 +45,12 @@ class Snapshot extends core_1.Model {
44
45
  type: core_1.DataTypes.STRING,
45
46
  allowNull: true,
46
47
  },
48
+ replace: {
49
+ type: core_1.DataTypes.BOOLEAN,
50
+ allowNull: false,
51
+ defaultValue: false,
52
+ index: true,
53
+ },
47
54
  meta: {
48
55
  type: core_1.DataTypes.JSON,
49
56
  allowNull: true,
@@ -1,3 +1,4 @@
1
+ import { CookieParam } from '@blocklet/puppeteer';
1
2
  export type Site = {
2
3
  url: string;
3
4
  pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
11
12
  appUrl: string;
12
13
  cacheDir: string;
13
14
  puppeteerPath?: string;
14
- siteCron: {
15
+ concurrency: number;
16
+ siteCron?: {
15
17
  sites: Site[];
16
18
  time: string;
17
19
  enabled: boolean;
18
20
  immediate: boolean;
19
- crawlConcurrency: number;
20
- sitemapConcurrency: number;
21
+ concurrency: number;
21
22
  };
23
+ cookies?: CookieParam[];
24
+ localStorage?: {
25
+ key: string;
26
+ value: string;
27
+ }[];
22
28
  };
23
29
  export declare const logger: any;
24
30
  export declare const config: Config;
package/lib/esm/config.js CHANGED
@@ -3,17 +3,9 @@ export const logger = createLogger('@arcblock/crawler', { level: process.env.LOG
3
3
  export const config = {
4
4
  isProd: process.env.NODE_ENV === 'production',
5
5
  dataDir: process.env.BLOCKLET_DATA_DIR,
6
- appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
7
6
  cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
7
+ appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
8
8
  appUrl: process.env.BLOCKLET_APP_URL || '/',
9
9
  puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
10
- // cron
11
- siteCron: {
12
- sites: [],
13
- enabled: true,
14
- time: '0 0 0 * * *',
15
- immediate: false,
16
- crawlConcurrency: 2,
17
- sitemapConcurrency: 30,
18
- },
10
+ concurrency: 2,
19
11
  };
@@ -1,11 +1,10 @@
1
- import { JobState } from './store/job';
2
- import { SnapshotModel } from './store/snapshot';
3
- export declare function createCrawlQueue(): void;
1
+ import { JobState, SnapshotModel } from './store';
2
+ export declare function createCrawlQueue(queue: string): any;
4
3
  export declare function getDataDir(): Promise<{
5
4
  htmlDir: string;
6
5
  screenshotDir: string;
7
6
  }>;
8
- export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
7
+ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
9
8
  html: string | null;
10
9
  screenshot: Uint8Array<ArrayBufferLike> | null;
11
10
  meta: {
@@ -14,17 +14,17 @@ import fs from 'fs-extra';
14
14
  import path from 'path';
15
15
  import { config, logger } from './config';
16
16
  import { initPage } from './puppeteer';
17
- import { convertJobToSnapshot, formatSnapshot } from './services/snapshot';
18
- import { Job } from './store/job';
19
- import { Snapshot } from './store/snapshot';
20
- import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5 } from './utils';
17
+ import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './services/snapshot';
18
+ import { Job, Snapshot, sequelize } from './store';
19
+ import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5, sleep } from './utils';
21
20
  const { BaseState } = require('@abtnode/models');
22
- let crawlQueue;
23
- export function createCrawlQueue() {
21
+ // eslint-disable-next-line import/no-mutable-exports
22
+ const crawlQueue = createCrawlQueue('urlCrawler');
23
+ export function createCrawlQueue(queue) {
24
24
  const db = new BaseState(Job);
25
- crawlQueue = createQueue({
26
- store: new SequelizeStore(db, 'crawler'),
27
- concurrency: config.siteCron.crawlConcurrency,
25
+ return createQueue({
26
+ store: new SequelizeStore(db, queue),
27
+ concurrency: config.concurrency,
28
28
  onJob: (job) => __awaiter(this, void 0, void 0, function* () {
29
29
  logger.info('Starting to execute crawl job', job);
30
30
  const canCrawl = yield isAcceptCrawler(job.url);
@@ -48,18 +48,14 @@ export function createCrawlQueue() {
48
48
  // } catch (error) {
49
49
  // logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
50
50
  // }
51
+ const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
51
52
  try {
52
53
  // get page content later
53
- const result = yield getPageContent(Object.assign({ localStorage: {
54
- // for blocklet theme
55
- blocklet_theme_prefer: 'light',
56
- // for blocklet domain warning
57
- 'domain-warning-skip': Date.now().toString(),
58
- } }, job));
54
+ const result = yield getPageContent(formattedJob);
59
55
  if (!result || (!result.html && !result.screenshot)) {
60
- logger.error(`failed to crawl ${job.url}, empty content`, job);
56
+ logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
61
57
  const snapshot = convertJobToSnapshot({
62
- job,
58
+ job: formattedJob,
63
59
  snapshot: {
64
60
  status: 'failed',
65
61
  error: 'Failed to crawl content',
@@ -68,28 +64,45 @@ export function createCrawlQueue() {
68
64
  yield Snapshot.upsert(snapshot);
69
65
  return snapshot;
70
66
  }
71
- // save html and screenshot to data dir
72
- const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
73
- screenshot: result.screenshot,
74
- html: result.html,
75
- });
76
- // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
77
- const snapshot = convertJobToSnapshot({
78
- job,
79
- snapshot: {
80
- status: 'success',
81
- screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
82
- html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config.dataDir, ''),
83
- meta: result.meta,
84
- },
85
- });
86
- yield Snapshot.upsert(snapshot);
67
+ const snapshot = yield sequelize.transaction((txn) => __awaiter(this, void 0, void 0, function* () {
68
+ // delete old snapshot
69
+ if (formattedJob.replace) {
70
+ try {
71
+ const deletedJobIds = yield deleteSnapshots({
72
+ url: formattedJob.url,
73
+ replace: true,
74
+ }, { txn });
75
+ if (deletedJobIds) {
76
+ logger.info('Deleted old snapshot', { deletedJobIds });
77
+ }
78
+ }
79
+ catch (error) {
80
+ logger.error('Failed to delete old snapshot', { error, formattedJob });
81
+ }
82
+ }
83
+ // save html and screenshot to data dir
84
+ const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
85
+ screenshot: result.screenshot,
86
+ html: result.html,
87
+ });
88
+ const snapshot = convertJobToSnapshot({
89
+ job: formattedJob,
90
+ snapshot: {
91
+ status: 'success',
92
+ screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
93
+ html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config.dataDir, ''),
94
+ meta: result.meta,
95
+ },
96
+ });
97
+ yield Snapshot.upsert(snapshot, { transaction: txn });
98
+ return snapshot;
99
+ }));
87
100
  return snapshot;
88
101
  }
89
102
  catch (error) {
90
- logger.error(`Failed to crawl ${job.url}`, { error, job });
103
+ logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
91
104
  const snapshot = convertJobToSnapshot({
92
- job,
105
+ job: formattedJob,
93
106
  snapshot: {
94
107
  status: 'failed',
95
108
  error: 'Internal error',
@@ -133,7 +146,7 @@ function saveSnapshotToLocal(_a) {
133
146
  };
134
147
  });
135
148
  }
136
- export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies = [], localStorage, }) {
149
+ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }) {
137
150
  const page = yield initPage();
138
151
  if (width && height) {
139
152
  yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -141,13 +154,18 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
141
154
  if (headers) {
142
155
  yield page.setExtraHTTPHeaders(headers);
143
156
  }
144
- if (cookies === null || cookies === void 0 ? void 0 : cookies.length) {
145
- yield page.setCookie(...cookies);
157
+ // handle cookies
158
+ if (cookies) {
159
+ const { hostname } = new URL(url);
160
+ const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
161
+ yield page.setCookie(...cookieParams);
146
162
  }
163
+ // handle localStorage
147
164
  if (localStorage) {
148
165
  yield page.evaluateOnNewDocument((items) => {
149
- Object.entries(items).forEach(([key, value]) => {
150
- window.localStorage.setItem(key, value);
166
+ items.forEach((item) => {
167
+ const value = item.value === 'now()' ? new Date().toISOString() : item.value;
168
+ window.localStorage.setItem(item.key, value);
151
169
  });
152
170
  }, localStorage);
153
171
  }
@@ -165,9 +183,18 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
165
183
  }
166
184
  // await for networkidle0
167
185
  // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
168
- yield page.waitForNetworkIdle({
169
- idleTime: 1.5 * 1000,
170
- });
186
+ try {
187
+ yield Promise.all([
188
+ page.waitForNetworkIdle({
189
+ idleTime: 1.5 * 1000,
190
+ timeout,
191
+ }),
192
+ sleep(waitTime),
193
+ ]);
194
+ }
195
+ catch (err) {
196
+ logger.warn(`Failed to wait for network idle in ${url}:`, err);
197
+ }
171
198
  // get screenshot
172
199
  if (includeScreenshot) {
173
200
  // Try to find the tallest element and set the browser to the same height
@@ -211,7 +238,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
211
238
  // check if the page is an error page
212
239
  const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
213
240
  if (isErrorPage) {
214
- throw new Error('Page is an error page');
241
+ throw new Error(`${url} is an error page`);
215
242
  }
216
243
  meta.title = data.title;
217
244
  meta.description = data.description;
@@ -245,7 +272,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
245
272
  // eslint-disable-next-line require-await
246
273
  export function crawlUrl(params, callback) {
247
274
  return __awaiter(this, void 0, void 0, function* () {
248
- params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
249
275
  // skip duplicate job
250
276
  const existsJob = yield Job.isExists(params);
251
277
  if (existsJob) {
package/lib/esm/cron.js CHANGED
@@ -14,6 +14,8 @@ let cron = null;
14
14
  export function initCron() {
15
15
  if (cron)
16
16
  return;
17
+ if (!config.siteCron)
18
+ return;
17
19
  logger.info('Init cron', { config: config.siteCron });
18
20
  cron = Cron.init({
19
21
  context: {},
@@ -23,6 +25,9 @@ export function initCron() {
23
25
  time: config.siteCron.time,
24
26
  options: { runOnInit: config.siteCron.immediate },
25
27
  fn: () => __awaiter(this, void 0, void 0, function* () {
28
+ var _a;
29
+ if (!((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
30
+ return;
26
31
  logger.info('Start cron to crawl site', { sites: config.siteCron.sites });
27
32
  for (const site of config.siteCron.sites) {
28
33
  try {
@@ -3,7 +3,5 @@ export * from './crawler';
3
3
  export * from './site';
4
4
  export * from './services/snapshot';
5
5
  export * as utils from './utils';
6
- type DeepPartial<T> = T extends object ? {
7
- [P in keyof T]?: DeepPartial<T[P]>;
8
- } : T;
9
- export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
6
+ export { migrate } from './store/migrate';
7
+ export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/esm/index.js CHANGED
@@ -7,25 +7,24 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
7
7
  step((generator = generator.apply(thisArg, _arguments || [])).next());
8
8
  });
9
9
  };
10
+ /* eslint-disable @typescript-eslint/indent */
10
11
  import merge from 'lodash/merge';
11
12
  import { config, logger } from './config';
12
- import { createCrawlQueue } from './crawler';
13
13
  import { initCron } from './cron';
14
14
  import { ensureBrowser } from './puppeteer';
15
- import { initDatabase } from './store';
16
15
  export * from './crawler';
17
16
  export * from './site';
18
17
  export * from './services/snapshot';
19
18
  export * as utils from './utils';
19
+ export { migrate } from './store/migrate';
20
20
  export function initCrawler(params) {
21
21
  return __awaiter(this, void 0, void 0, function* () {
22
+ var _a;
22
23
  merge(config, params);
23
24
  logger.info('Init crawler', { params, config });
24
25
  try {
25
- yield initDatabase();
26
26
  yield ensureBrowser();
27
- yield createCrawlQueue();
28
- if (config.siteCron.enabled) {
27
+ if ((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
29
28
  yield initCron();
30
29
  }
31
30
  }
@@ -1,5 +1,5 @@
1
- import { JobState } from '../store/job';
2
- import { SnapshotModel } from '../store/snapshot';
1
+ import { Transaction, WhereOptions } from '@sequelize/core';
2
+ import { JobState, SnapshotModel } from '../store';
3
3
  export declare function convertJobToSnapshot({ job, snapshot }: {
4
4
  job: JobState;
5
5
  snapshot?: Partial<SnapshotModel>;
@@ -10,3 +10,6 @@ export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<
10
10
  */
11
11
  export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
12
12
  export declare function getLatestSnapshot(url: string): Promise<SnapshotModel | null>;
13
+ export declare function deleteSnapshots(where: WhereOptions<SnapshotModel>, { txn }?: {
14
+ txn?: Transaction;
15
+ }): Promise<string[]>;