@arcblock/crawler 1.1.5 → 1.2.0
This diff compares publicly available package versions as published to their respective public registries. It is provided for informational purposes only.
- package/README.md +1 -2
- package/lib/cjs/config.d.ts +9 -3
- package/lib/cjs/config.js +2 -10
- package/lib/cjs/crawler.d.ts +3 -4
- package/lib/cjs/crawler.js +74 -48
- package/lib/cjs/cron.js +5 -0
- package/lib/cjs/index.d.ts +2 -4
- package/lib/cjs/index.js +6 -6
- package/lib/cjs/services/snapshot.d.ts +5 -2
- package/lib/cjs/services/snapshot.js +44 -7
- package/lib/cjs/site.d.ts +1 -1
- package/lib/cjs/site.js +11 -4
- package/lib/cjs/store/index.d.ts +4 -1
- package/lib/cjs/store/index.js +37 -45
- package/lib/cjs/store/job.d.ts +6 -1
- package/lib/cjs/store/migrate.d.ts +4 -0
- package/lib/cjs/store/migrate.js +63 -0
- package/lib/cjs/store/migrations/20250615-genesis.d.ts +6 -0
- package/lib/cjs/store/migrations/20250615-genesis.js +114 -0
- package/lib/cjs/store/migrations/20250616-replace.d.ts +6 -0
- package/lib/cjs/store/migrations/20250616-replace.js +40 -0
- package/lib/cjs/store/snapshot.d.ts +8 -0
- package/lib/cjs/store/snapshot.js +7 -0
- package/lib/esm/config.d.ts +9 -3
- package/lib/esm/config.js +2 -10
- package/lib/esm/crawler.d.ts +3 -4
- package/lib/esm/crawler.js +71 -45
- package/lib/esm/cron.js +5 -0
- package/lib/esm/index.d.ts +2 -4
- package/lib/esm/index.js +4 -5
- package/lib/esm/services/snapshot.d.ts +5 -2
- package/lib/esm/services/snapshot.js +41 -5
- package/lib/esm/site.d.ts +1 -1
- package/lib/esm/site.js +11 -4
- package/lib/esm/store/index.d.ts +4 -1
- package/lib/esm/store/index.js +23 -45
- package/lib/esm/store/job.d.ts +6 -1
- package/lib/esm/store/migrate.d.ts +4 -0
- package/lib/esm/store/migrate.js +26 -0
- package/lib/esm/store/migrations/20250615-genesis.d.ts +6 -0
- package/lib/esm/store/migrations/20250615-genesis.js +110 -0
- package/lib/esm/store/migrations/20250616-replace.d.ts +6 -0
- package/lib/esm/store/migrations/20250616-replace.js +36 -0
- package/lib/esm/store/snapshot.d.ts +8 -0
- package/lib/esm/store/snapshot.js +7 -0
- package/package.json +3 -2
package/lib/cjs/store/migrate.js
ADDED
@@ -0,0 +1,63 @@
+"use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || (function () {
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.umzug = void 0;
+exports.migrate = migrate;
+/* eslint-disable global-require */
+const umzug_1 = require("umzug");
+const index_1 = require("./index");
+const migration20250615 = __importStar(require("./migrations/20250615-genesis"));
+const migration20250616Replace = __importStar(require("./migrations/20250616-replace"));
+const umzug = new umzug_1.Umzug({
+    migrations: [
+        {
+            name: '20250615-genesis',
+            up: ({ context }) => migration20250615.up({ context }),
+            down: ({ context }) => migration20250615.down({ context }),
+        },
+        {
+            name: '20250616-replace',
+            up: ({ context }) => migration20250616Replace.up({ context }),
+            down: ({ context }) => migration20250616Replace.down({ context }),
+        },
+    ],
+    context: index_1.sequelize.getQueryInterface(),
+    storage: new umzug_1.SequelizeStorage({ sequelize: index_1.sequelize }),
+    logger: console,
+});
+exports.umzug = umzug;
+function migrate() {
+    return umzug.up();
+}
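Note: `migrate` is re-exported from the package entry point (see the `index.js` diff below), so schema migrations can run before the crawler starts. A minimal boot sketch under that assumption; the call order and the `concurrency` value are illustrative, not prescribed by the package:

import { initCrawler, migrate } from '@arcblock/crawler';

// Apply pending migrations (delegates to umzug.up() per the file above),
// then initialize the crawler.
await migrate();
await initCrawler({ concurrency: 2 });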
package/lib/cjs/store/migrations/20250615-genesis.js
ADDED
@@ -0,0 +1,114 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.up = up;
+exports.down = down;
+/* eslint-disable no-console */
+const core_1 = require("@sequelize/core");
+function up(_a) {
+    return __awaiter(this, arguments, void 0, function* ({ context }) {
+        console.log('[20250615-genesis:up] Migrating...');
+        yield context.createTable('snap', {
+            jobId: {
+                type: core_1.DataTypes.STRING,
+                primaryKey: true,
+                allowNull: false,
+            },
+            url: {
+                type: core_1.DataTypes.STRING,
+                allowNull: false,
+                index: true,
+            },
+            status: {
+                type: core_1.DataTypes.ENUM('success', 'failed', 'pending'),
+                allowNull: false,
+            },
+            html: {
+                type: core_1.DataTypes.TEXT,
+                allowNull: true,
+            },
+            screenshot: {
+                type: core_1.DataTypes.STRING,
+                allowNull: true,
+            },
+            error: {
+                type: core_1.DataTypes.STRING,
+                allowNull: true,
+            },
+            lastModified: {
+                type: core_1.DataTypes.STRING,
+                allowNull: true,
+            },
+            meta: {
+                type: core_1.DataTypes.JSON,
+                allowNull: true,
+            },
+            options: {
+                type: core_1.DataTypes.JSON,
+                allowNull: true,
+            },
+            createdAt: {
+                type: core_1.DataTypes.DATE,
+                defaultValue: core_1.DataTypes.NOW,
+            },
+            updatedAt: {
+                type: core_1.DataTypes.DATE,
+                defaultValue: core_1.DataTypes.NOW,
+            },
+        });
+        yield context.createTable('jobs', {
+            id: {
+                type: core_1.DataTypes.STRING(40),
+                primaryKey: true,
+            },
+            queue: {
+                type: core_1.DataTypes.STRING(32),
+                allowNull: false,
+            },
+            job: {
+                type: core_1.DataTypes.JSON,
+                allowNull: false,
+            },
+            retryCount: {
+                type: core_1.DataTypes.INTEGER,
+            },
+            delay: {
+                type: core_1.DataTypes.INTEGER,
+            },
+            willRunAt: {
+                type: core_1.DataTypes.INTEGER,
+            },
+            cancelled: {
+                type: core_1.DataTypes.BOOLEAN,
+                defaultValue: false,
+            },
+            createdAt: {
+                type: core_1.DataTypes.DATE,
+                defaultValue: core_1.DataTypes.NOW,
+                index: true,
+            },
+            updatedAt: {
+                type: core_1.DataTypes.DATE,
+                defaultValue: core_1.DataTypes.NOW,
+                index: true,
+            },
+        });
+        console.log('[20250615-genesis:up] Migrated successfully!');
+    });
+}
+function down(_a) {
+    return __awaiter(this, arguments, void 0, function* ({ context }) {
+        console.log('[20250615-genesis:down] Migrating...');
+        yield context.dropTable('snap');
+        yield context.dropTable('jobs');
+        console.log('[20250615-genesis:down] Migrated successfully!');
+    });
+}
package/lib/cjs/store/migrations/20250616-replace.js
ADDED
@@ -0,0 +1,40 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.up = up;
+exports.down = down;
+/* eslint-disable no-console */
+const core_1 = require("@sequelize/core");
+function up(_a) {
+    return __awaiter(this, arguments, void 0, function* ({ context }) {
+        console.log('[20250616-replace:up] Migrating...');
+        yield context.addColumn('snap', 'replace', {
+            type: core_1.DataTypes.BOOLEAN,
+            allowNull: false,
+            defaultValue: false,
+            index: true,
+        });
+        yield context.addIndex('snap', ['createdAt']);
+        yield context.addIndex('snap', ['updatedAt']);
+        yield context.addIndex('snap', ['status']);
+        console.log('[20250616-replace:up] Migrated successfully!');
+    });
+}
+function down(_a) {
+    return __awaiter(this, arguments, void 0, function* ({ context }) {
+        console.log('[20250616-replace:down] Migrating...');
+        yield context.removeColumn('snap', 'replace');
+        yield context.removeIndex('snap', ['createdAt']);
+        yield context.removeIndex('snap', ['updatedAt']);
+        yield context.removeIndex('snap', ['status']);
+        console.log('[20250616-replace:down] Migrated successfully!');
+    });
+}
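Note: because the configured Umzug instance is itself exported (`exports.umzug`), reverting the latest migration should also be possible. A hedged sketch — the deep import path is an assumption and may not be exposed through the package's exports map; Umzug's `down()` with no arguments reverts the most recently executed migration:

import { umzug } from '@arcblock/crawler/lib/cjs/store/migrate'; // hypothetical deep import

// Roll back the most recent migration (here, 20250616-replace).
await umzug.down();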
package/lib/cjs/store/snapshot.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 import { FindOptions, Model, Sequelize } from '@sequelize/core';
 export interface SnapshotModel {
     jobId: string;
@@ -7,6 +8,7 @@ export interface SnapshotModel {
     screenshot?: string | null;
     error?: string;
     lastModified?: string;
+    replace?: boolean;
     meta?: {
         title?: string;
         description?: string;
@@ -19,6 +21,11 @@ export interface SnapshotModel {
         quality?: number;
         fullPage?: boolean;
         headers?: Record<string, string>;
+        cookies?: CookieParam[];
+        localStorage?: {
+            key: string;
+            value: string;
+        }[];
     };
 }
 export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
@@ -29,6 +36,7 @@ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotMo
     screenshot?: SnapshotModel['screenshot'];
     error?: SnapshotModel['error'];
     lastModified?: SnapshotModel['lastModified'];
+    replace?: SnapshotModel['replace'];
     meta?: SnapshotModel['meta'];
     options: SnapshotModel['options'];
     static initModel(sequelize: Sequelize): typeof Snapshot;
package/lib/cjs/store/snapshot.js
CHANGED
@@ -27,6 +27,7 @@ class Snapshot extends core_1.Model {
            status: {
                type: core_1.DataTypes.ENUM('success', 'failed', 'pending'),
                allowNull: false,
+                index: true,
            },
            html: {
                type: core_1.DataTypes.TEXT,
@@ -44,6 +45,12 @@ class Snapshot extends core_1.Model {
                type: core_1.DataTypes.STRING,
                allowNull: true,
            },
+            replace: {
+                type: core_1.DataTypes.BOOLEAN,
+                allowNull: false,
+                defaultValue: false,
+                index: true,
+            },
            meta: {
                type: core_1.DataTypes.JSON,
                allowNull: true,
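Note: the snapshot model gains a `replace` flag plus per-job `cookies` and `localStorage` options. A hedged sketch of a crawl job using the new fields via `crawlUrl` (declared in the `crawler.js` diff below); the option values are illustrative, and the second callback parameter is omitted for brevity:

import { crawlUrl } from '@arcblock/crawler';

await crawlUrl({
  url: 'https://example.com/docs',
  replace: true, // older replace-marked snapshots of this URL are deleted in the same transaction
  cookies: [{ name: 'blocklet_theme_prefer', value: 'light' }], // previously hardcoded, now opt-in
  localStorage: [{ key: 'domain-warning-skip', value: 'now()' }], // 'now()' expands to the current ISO timestamp
});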
package/lib/esm/config.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 export type Site = {
     url: string;
     pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
     appUrl: string;
     cacheDir: string;
     puppeteerPath?: string;
-    siteCron: {
+    concurrency: number;
+    siteCron?: {
         sites: Site[];
         time: string;
         enabled: boolean;
         immediate: boolean;
-        crawlConcurrency: number;
-        sitemapConcurrency: number;
+        concurrency: number;
     };
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 };
 export declare const logger: any;
 export declare const config: Config;
package/lib/esm/config.js
CHANGED
@@ -3,17 +3,9 @@ export const logger = createLogger('@arcblock/crawler', { level: process.env.LOG
 export const config = {
     isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
-    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     appUrl: process.env.BLOCKLET_APP_URL || '/',
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-
-    siteCron: {
-        sites: [],
-        enabled: true,
-        time: '0 0 0 * * *',
-        immediate: false,
-        crawlConcurrency: 2,
-        sitemapConcurrency: 30,
-    },
+    concurrency: 2,
 };
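Note: `Config` flattens the old `crawlConcurrency`/`sitemapConcurrency` pair into `concurrency` fields, makes `siteCron` optional, and adds global `cookies`/`localStorage` that are merged into every job. A sketch of initializing 1.2.0 with the new shape — only the keys come from the `Config` type above; the values are made up for illustration:

import { initCrawler } from '@arcblock/crawler';

await initCrawler({
  concurrency: 2, // crawl queue workers (replaces siteCron.crawlConcurrency)
  cookies: [{ name: 'blocklet_theme_prefer', value: 'light' }],
  localStorage: [{ key: 'domain-warning-skip', value: 'now()' }],
  siteCron: {
    enabled: true,
    immediate: false,
    time: '0 0 0 * * *', // the old default schedule: daily at midnight
    concurrency: 2,
    sites: [],
  },
});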
package/lib/esm/crawler.d.ts
CHANGED
@@ -1,11 +1,10 @@
-import { JobState } from './store
-
-export declare function createCrawlQueue(): void;
+import { JobState, SnapshotModel } from './store';
+export declare function createCrawlQueue(queue: string): any;
 export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
package/lib/esm/crawler.js
CHANGED
@@ -14,17 +14,17 @@ import fs from 'fs-extra';
 import path from 'path';
 import { config, logger } from './config';
 import { initPage } from './puppeteer';
-import { convertJobToSnapshot, formatSnapshot } from './services/snapshot';
-import { Job } from './store
-import {
-import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5 } from './utils';
+import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './services/snapshot';
+import { Job, Snapshot, sequelize } from './store';
+import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5, sleep } from './utils';
 const { BaseState } = require('@abtnode/models');
-
-export function createCrawlQueue() {
+// eslint-disable-next-line import/no-mutable-exports
+const crawlQueue = createCrawlQueue('urlCrawler');
+export function createCrawlQueue(queue) {
     const db = new BaseState(Job);
-
-    store: new SequelizeStore(db,
-    concurrency: config.
+    return createQueue({
+        store: new SequelizeStore(db, queue),
+        concurrency: config.concurrency,
        onJob: (job) => __awaiter(this, void 0, void 0, function* () {
            logger.info('Starting to execute crawl job', job);
            const canCrawl = yield isAcceptCrawler(job.url);
@@ -48,18 +48,14 @@ export function createCrawlQueue() {
            // } catch (error) {
            //   logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
            // }
+            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
            try {
                // get page content later
-                const result = yield getPageContent(
-                // for blocklet theme
-                blocklet_theme_prefer: 'light',
-                // for blocklet domain warning
-                'domain-warning-skip': Date.now().toString(),
-                } }, job));
+                const result = yield getPageContent(formattedJob);
                if (!result || (!result.html && !result.screenshot)) {
-                    logger.error(`failed to crawl ${
+                    logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                    const snapshot = convertJobToSnapshot({
-                        job,
+                        job: formattedJob,
                        snapshot: {
                            status: 'failed',
                            error: 'Failed to crawl content',
@@ -68,28 +64,45 @@
                    yield Snapshot.upsert(snapshot);
                    return snapshot;
                }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                const snapshot = yield sequelize.transaction((txn) => __awaiter(this, void 0, void 0, function* () {
+                    // delete old snapshot
+                    if (formattedJob.replace) {
+                        try {
+                            const deletedJobIds = yield deleteSnapshots({
+                                url: formattedJob.url,
+                                replace: true,
+                            }, { txn });
+                            if (deletedJobIds) {
+                                logger.info('Deleted old snapshot', { deletedJobIds });
+                            }
+                        }
+                        catch (error) {
+                            logger.error('Failed to delete old snapshot', { error, formattedJob });
+                        }
+                    }
+                    // save html and screenshot to data dir
+                    const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
+                        screenshot: result.screenshot,
+                        html: result.html,
+                    });
+                    const snapshot = convertJobToSnapshot({
+                        job: formattedJob,
+                        snapshot: {
+                            status: 'success',
+                            screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
+                            html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config.dataDir, ''),
+                            meta: result.meta,
+                        },
+                    });
+                    yield Snapshot.upsert(snapshot, { transaction: txn });
+                    return snapshot;
+                }));
                return snapshot;
            }
            catch (error) {
-                logger.error(`Failed to crawl ${
+                logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
                const snapshot = convertJobToSnapshot({
-                    job,
+                    job: formattedJob,
                    snapshot: {
                        status: 'failed',
                        error: 'Internal error',
@@ -133,7 +146,7 @@ function saveSnapshotToLocal(_a) {
        };
    });
 }
-export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies
+export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }) {
     const page = yield initPage();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -141,13 +154,18 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
     if (headers) {
         yield page.setExtraHTTPHeaders(headers);
     }
-
-
+    // handle cookies
+    if (cookies) {
+        const { hostname } = new URL(url);
+        const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
+        yield page.setCookie(...cookieParams);
     }
+    // handle localStorage
     if (localStorage) {
         yield page.evaluateOnNewDocument((items) => {
-
-
+            items.forEach((item) => {
+                const value = item.value === 'now()' ? new Date().toISOString() : item.value;
+                window.localStorage.setItem(item.key, value);
             });
         }, localStorage);
     }
@@ -165,9 +183,18 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
     }
     // await for networkidle0
     // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
-
-
-
+    try {
+        yield Promise.all([
+            page.waitForNetworkIdle({
+                idleTime: 1.5 * 1000,
+                timeout,
+            }),
+            sleep(waitTime),
+        ]);
+    }
+    catch (err) {
+        logger.warn(`Failed to wait for network idle in ${url}:`, err);
+    }
     // get screenshot
     if (includeScreenshot) {
         // Try to find the tallest element and set the browser to the same height
@@ -211,7 +238,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
     // check if the page is an error page
     const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
     if (isErrorPage) {
-        throw new Error(
+        throw new Error(`${url} is an error page`);
     }
     meta.title = data.title;
     meta.description = data.description;
@@ -245,7 +272,6 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
 // eslint-disable-next-line require-await
 export function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
-        params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
         // skip duplicate job
         const existsJob = yield Job.isExists(params);
         if (existsJob) {
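Note: `getPageContent` now accepts a `waitTime` that is awaited together with `waitForNetworkIdle` via `Promise.all`, and a page that never settles only logs a warning instead of failing the job. A sketch of calling it directly with the new parameter; the URL and timings are illustrative:

import { getPageContent } from '@arcblock/crawler';

const { html, screenshot, meta } = await getPageContent({
  url: 'https://example.com/',
  waitTime: 3000, // minimum wait in ms, awaited alongside network idle (1.5s idleTime)
  timeout: 60 * 1000, // upper bound for the network-idle wait
});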
package/lib/esm/cron.js
CHANGED
@@ -14,6 +14,8 @@ let cron = null;
 export function initCron() {
     if (cron)
         return;
+    if (!config.siteCron)
+        return;
     logger.info('Init cron', { config: config.siteCron });
     cron = Cron.init({
         context: {},
@@ -23,6 +25,9 @@ export function initCron() {
                time: config.siteCron.time,
                options: { runOnInit: config.siteCron.immediate },
                fn: () => __awaiter(this, void 0, void 0, function* () {
+                    var _a;
+                    if (!((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
+                        return;
                    logger.info('Start cron to crawl site', { sites: config.siteCron.sites });
                    for (const site of config.siteCron.sites) {
                        try {
package/lib/esm/index.d.ts
CHANGED
@@ -3,7 +3,5 @@ export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-
-
-} : T;
-export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
+export { migrate } from './store/migrate';
+export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
package/lib/esm/index.js
CHANGED
@@ -7,25 +7,24 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
 };
+/* eslint-disable @typescript-eslint/indent */
 import merge from 'lodash/merge';
 import { config, logger } from './config';
-import { createCrawlQueue } from './crawler';
 import { initCron } from './cron';
 import { ensureBrowser } from './puppeteer';
-import { initDatabase } from './store';
 export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
+export { migrate } from './store/migrate';
 export function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
+        var _a;
         merge(config, params);
         logger.info('Init crawler', { params, config });
         try {
-            yield initDatabase();
             yield ensureBrowser();
-
-            if (config.siteCron.enabled) {
+            if ((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                 yield initCron();
             }
         }
package/lib/esm/services/snapshot.d.ts
CHANGED
@@ -1,5 +1,5 @@
-import {
-import { SnapshotModel } from '../store
+import { Transaction, WhereOptions } from '@sequelize/core';
+import { JobState, SnapshotModel } from '../store';
 export declare function convertJobToSnapshot({ job, snapshot }: {
     job: JobState;
     snapshot?: Partial<SnapshotModel>;
@@ -10,3 +10,6 @@ export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<
  */
 export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
 export declare function getLatestSnapshot(url: string): Promise<SnapshotModel | null>;
+export declare function deleteSnapshots(where: WhereOptions<SnapshotModel>, { txn }?: {
+    txn?: Transaction;
+}): Promise<string[]>;
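Note: `deleteSnapshots` is a new export that deletes snapshots matching an arbitrary Sequelize where clause, optionally inside a caller-supplied transaction, and resolves with the deleted job IDs. A hedged usage sketch — the where values are illustrative, and reaching the function from the package root assumes the `export * from './services/snapshot'` re-export shown above:

import { deleteSnapshots } from '@arcblock/crawler';

// Remove every snapshot recorded for this URL; resolves with the deleted jobIds.
const deletedJobIds = await deleteSnapshots({ url: 'https://example.com/' });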