@arcblock/crawler 1.1.6 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/crawler.d.ts +11 -4
- package/lib/cjs/crawler.js +96 -59
- package/lib/cjs/index.d.ts +1 -0
- package/lib/cjs/index.js +3 -5
- package/lib/cjs/services/carbon.d.ts +3 -0
- package/lib/cjs/services/carbon.js +87 -0
- package/lib/cjs/services/snapshot.d.ts +5 -2
- package/lib/cjs/services/snapshot.js +36 -6
- package/lib/cjs/site.d.ts +1 -1
- package/lib/cjs/site.js +9 -3
- package/lib/cjs/store/index.d.ts +4 -1
- package/lib/cjs/store/index.js +37 -45
- package/lib/cjs/store/job.d.ts +5 -0
- package/lib/cjs/store/migrate.d.ts +4 -0
- package/lib/cjs/store/migrate.js +63 -0
- package/lib/cjs/store/migrations/20250615-genesis.d.ts +6 -0
- package/lib/cjs/store/migrations/20250615-genesis.js +114 -0
- package/lib/cjs/store/migrations/20250616-replace.d.ts +6 -0
- package/lib/cjs/store/migrations/20250616-replace.js +40 -0
- package/lib/cjs/store/snapshot.d.ts +2 -0
- package/lib/cjs/store/snapshot.js +7 -0
- package/lib/esm/crawler.d.ts +11 -4
- package/lib/esm/crawler.js +92 -57
- package/lib/esm/index.d.ts +1 -0
- package/lib/esm/index.js +1 -4
- package/lib/esm/services/carbon.d.ts +3 -0
- package/lib/esm/services/carbon.js +84 -0
- package/lib/esm/services/snapshot.d.ts +5 -2
- package/lib/esm/services/snapshot.js +33 -4
- package/lib/esm/site.d.ts +1 -1
- package/lib/esm/site.js +9 -3
- package/lib/esm/store/index.d.ts +4 -1
- package/lib/esm/store/index.js +23 -45
- package/lib/esm/store/job.d.ts +5 -0
- package/lib/esm/store/migrate.d.ts +4 -0
- package/lib/esm/store/migrate.js +26 -0
- package/lib/esm/store/migrations/20250615-genesis.d.ts +6 -0
- package/lib/esm/store/migrations/20250615-genesis.js +110 -0
- package/lib/esm/store/migrations/20250616-replace.d.ts +6 -0
- package/lib/esm/store/migrations/20250616-replace.js +36 -0
- package/lib/esm/store/snapshot.d.ts +2 -0
- package/lib/esm/store/snapshot.js +7 -0
- package/package.json +3 -2
package/lib/cjs/store/index.js
CHANGED
|
@@ -1,57 +1,49 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
var
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
10
15
|
};
|
|
11
16
|
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
17
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
18
|
};
|
|
14
19
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
-
exports.
|
|
20
|
+
exports.sequelize = void 0;
|
|
16
21
|
const core_1 = require("@sequelize/core");
|
|
17
22
|
const sqlite3_1 = require("@sequelize/sqlite3");
|
|
18
23
|
const path_1 = __importDefault(require("path"));
|
|
19
24
|
const config_1 = require("../config");
|
|
20
25
|
const job_1 = require("./job");
|
|
21
26
|
const snapshot_1 = require("./snapshot");
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
sequelize.query('pragma journal_size_limit = 67108864;'),
|
|
46
|
-
]);
|
|
47
|
-
yield sequelize.authenticate();
|
|
48
|
-
yield sequelize.sync({ alter: process.env.ALTER_SQLITE === 'true' });
|
|
49
|
-
config_1.logger.info('Successfully connected to database');
|
|
50
|
-
}
|
|
51
|
-
catch (error) {
|
|
52
|
-
config_1.logger.error('Failed to connect to database:', error);
|
|
53
|
-
throw error;
|
|
54
|
-
}
|
|
55
|
-
return sequelize;
|
|
56
|
-
});
|
|
57
|
-
}
|
|
27
|
+
const sequelize = new core_1.Sequelize({
|
|
28
|
+
dialect: sqlite3_1.SqliteDialect,
|
|
29
|
+
storage: path_1.default.join(config_1.config.dataDir, 'snap-kit.db'),
|
|
30
|
+
logging: (msg) => process.env.SQLITE_LOG && config_1.logger.debug(msg),
|
|
31
|
+
pool: {
|
|
32
|
+
min: 0,
|
|
33
|
+
max: 10,
|
|
34
|
+
idle: 10000,
|
|
35
|
+
},
|
|
36
|
+
retry: {
|
|
37
|
+
match: [/SQLITE_BUSY/],
|
|
38
|
+
name: 'query',
|
|
39
|
+
max: 10,
|
|
40
|
+
},
|
|
41
|
+
});
|
|
42
|
+
exports.sequelize = sequelize;
|
|
43
|
+
sequelize.query('pragma journal_mode = WAL;');
|
|
44
|
+
sequelize.query('pragma synchronous = normal;');
|
|
45
|
+
sequelize.query('pragma journal_size_limit = 67108864;');
|
|
46
|
+
job_1.Job.initModel(sequelize);
|
|
47
|
+
snapshot_1.Snapshot.initModel(sequelize);
|
|
48
|
+
__exportStar(require("./job"), exports);
|
|
49
|
+
__exportStar(require("./snapshot"), exports);
|
package/lib/cjs/store/job.d.ts
CHANGED
|
@@ -9,9 +9,14 @@ export interface JobState {
|
|
|
9
9
|
width?: number;
|
|
10
10
|
height?: number;
|
|
11
11
|
quality?: number;
|
|
12
|
+
format?: 'png' | 'jpeg' | 'webp';
|
|
12
13
|
timeout?: number;
|
|
13
14
|
fullPage?: boolean;
|
|
14
15
|
lastModified?: string;
|
|
16
|
+
waitTime?: number;
|
|
17
|
+
replace?: boolean;
|
|
18
|
+
sync?: boolean;
|
|
19
|
+
ignoreRobots?: boolean;
|
|
15
20
|
headers?: Record<string, string>;
|
|
16
21
|
cookies?: CookieParam[];
|
|
17
22
|
localStorage?: {
|
|
package/lib/cjs/store/migrate.js
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.umzug = void 0;
|
|
37
|
+
exports.migrate = migrate;
|
|
38
|
+
/* eslint-disable global-require */
|
|
39
|
+
const umzug_1 = require("umzug");
|
|
40
|
+
const index_1 = require("./index");
|
|
41
|
+
const migration20250615 = __importStar(require("./migrations/20250615-genesis"));
|
|
42
|
+
const migration20250616Replace = __importStar(require("./migrations/20250616-replace"));
|
|
43
|
+
const umzug = new umzug_1.Umzug({
|
|
44
|
+
migrations: [
|
|
45
|
+
{
|
|
46
|
+
name: '20250615-genesis',
|
|
47
|
+
up: ({ context }) => migration20250615.up({ context }),
|
|
48
|
+
down: ({ context }) => migration20250615.down({ context }),
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
name: '20250616-replace',
|
|
52
|
+
up: ({ context }) => migration20250616Replace.up({ context }),
|
|
53
|
+
down: ({ context }) => migration20250616Replace.down({ context }),
|
|
54
|
+
},
|
|
55
|
+
],
|
|
56
|
+
context: index_1.sequelize.getQueryInterface(),
|
|
57
|
+
storage: new umzug_1.SequelizeStorage({ sequelize: index_1.sequelize }),
|
|
58
|
+
logger: console,
|
|
59
|
+
});
|
|
60
|
+
exports.umzug = umzug;
|
|
61
|
+
function migrate() {
|
|
62
|
+
return umzug.up();
|
|
63
|
+
}
|
|
package/lib/cjs/store/migrations/20250615-genesis.js
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.up = up;
|
|
13
|
+
exports.down = down;
|
|
14
|
+
/* eslint-disable no-console */
|
|
15
|
+
const core_1 = require("@sequelize/core");
|
|
16
|
+
function up(_a) {
|
|
17
|
+
return __awaiter(this, arguments, void 0, function* ({ context }) {
|
|
18
|
+
console.log('[20250615-genesis:up] Migrating...');
|
|
19
|
+
yield context.createTable('snap', {
|
|
20
|
+
jobId: {
|
|
21
|
+
type: core_1.DataTypes.STRING,
|
|
22
|
+
primaryKey: true,
|
|
23
|
+
allowNull: false,
|
|
24
|
+
},
|
|
25
|
+
url: {
|
|
26
|
+
type: core_1.DataTypes.STRING,
|
|
27
|
+
allowNull: false,
|
|
28
|
+
index: true,
|
|
29
|
+
},
|
|
30
|
+
status: {
|
|
31
|
+
type: core_1.DataTypes.ENUM('success', 'failed', 'pending'),
|
|
32
|
+
allowNull: false,
|
|
33
|
+
},
|
|
34
|
+
html: {
|
|
35
|
+
type: core_1.DataTypes.TEXT,
|
|
36
|
+
allowNull: true,
|
|
37
|
+
},
|
|
38
|
+
screenshot: {
|
|
39
|
+
type: core_1.DataTypes.STRING,
|
|
40
|
+
allowNull: true,
|
|
41
|
+
},
|
|
42
|
+
error: {
|
|
43
|
+
type: core_1.DataTypes.STRING,
|
|
44
|
+
allowNull: true,
|
|
45
|
+
},
|
|
46
|
+
lastModified: {
|
|
47
|
+
type: core_1.DataTypes.STRING,
|
|
48
|
+
allowNull: true,
|
|
49
|
+
},
|
|
50
|
+
meta: {
|
|
51
|
+
type: core_1.DataTypes.JSON,
|
|
52
|
+
allowNull: true,
|
|
53
|
+
},
|
|
54
|
+
options: {
|
|
55
|
+
type: core_1.DataTypes.JSON,
|
|
56
|
+
allowNull: true,
|
|
57
|
+
},
|
|
58
|
+
createdAt: {
|
|
59
|
+
type: core_1.DataTypes.DATE,
|
|
60
|
+
defaultValue: core_1.DataTypes.NOW,
|
|
61
|
+
},
|
|
62
|
+
updatedAt: {
|
|
63
|
+
type: core_1.DataTypes.DATE,
|
|
64
|
+
defaultValue: core_1.DataTypes.NOW,
|
|
65
|
+
},
|
|
66
|
+
});
|
|
67
|
+
yield context.createTable('jobs', {
|
|
68
|
+
id: {
|
|
69
|
+
type: core_1.DataTypes.STRING(40),
|
|
70
|
+
primaryKey: true,
|
|
71
|
+
},
|
|
72
|
+
queue: {
|
|
73
|
+
type: core_1.DataTypes.STRING(32),
|
|
74
|
+
allowNull: false,
|
|
75
|
+
},
|
|
76
|
+
job: {
|
|
77
|
+
type: core_1.DataTypes.JSON,
|
|
78
|
+
allowNull: false,
|
|
79
|
+
},
|
|
80
|
+
retryCount: {
|
|
81
|
+
type: core_1.DataTypes.INTEGER,
|
|
82
|
+
},
|
|
83
|
+
delay: {
|
|
84
|
+
type: core_1.DataTypes.INTEGER,
|
|
85
|
+
},
|
|
86
|
+
willRunAt: {
|
|
87
|
+
type: core_1.DataTypes.INTEGER,
|
|
88
|
+
},
|
|
89
|
+
cancelled: {
|
|
90
|
+
type: core_1.DataTypes.BOOLEAN,
|
|
91
|
+
defaultValue: false,
|
|
92
|
+
},
|
|
93
|
+
createdAt: {
|
|
94
|
+
type: core_1.DataTypes.DATE,
|
|
95
|
+
defaultValue: core_1.DataTypes.NOW,
|
|
96
|
+
index: true,
|
|
97
|
+
},
|
|
98
|
+
updatedAt: {
|
|
99
|
+
type: core_1.DataTypes.DATE,
|
|
100
|
+
defaultValue: core_1.DataTypes.NOW,
|
|
101
|
+
index: true,
|
|
102
|
+
},
|
|
103
|
+
});
|
|
104
|
+
console.log('[20250615-genesis:up] Migrated successfully!');
|
|
105
|
+
});
|
|
106
|
+
}
|
|
107
|
+
function down(_a) {
|
|
108
|
+
return __awaiter(this, arguments, void 0, function* ({ context }) {
|
|
109
|
+
console.log('[20250615-genesis:down] Migrating...');
|
|
110
|
+
yield context.dropTable('snap');
|
|
111
|
+
yield context.dropTable('jobs');
|
|
112
|
+
console.log('[20250615-genesis:down] Migrated successfully!');
|
|
113
|
+
});
|
|
114
|
+
}
|
|
package/lib/cjs/store/migrations/20250616-replace.js
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.up = up;
|
|
13
|
+
exports.down = down;
|
|
14
|
+
/* eslint-disable no-console */
|
|
15
|
+
const core_1 = require("@sequelize/core");
|
|
16
|
+
function up(_a) {
|
|
17
|
+
return __awaiter(this, arguments, void 0, function* ({ context }) {
|
|
18
|
+
console.log('[20250616-replace:up] Migrating...');
|
|
19
|
+
yield context.addColumn('snap', 'replace', {
|
|
20
|
+
type: core_1.DataTypes.BOOLEAN,
|
|
21
|
+
allowNull: false,
|
|
22
|
+
defaultValue: false,
|
|
23
|
+
index: true,
|
|
24
|
+
});
|
|
25
|
+
yield context.addIndex('snap', ['createdAt']);
|
|
26
|
+
yield context.addIndex('snap', ['updatedAt']);
|
|
27
|
+
yield context.addIndex('snap', ['status']);
|
|
28
|
+
console.log('[20250616-replace:up] Migrated successfully!');
|
|
29
|
+
});
|
|
30
|
+
}
|
|
31
|
+
function down(_a) {
|
|
32
|
+
return __awaiter(this, arguments, void 0, function* ({ context }) {
|
|
33
|
+
console.log('[20250616-replace:down] Migrating...');
|
|
34
|
+
yield context.removeColumn('snap', 'replace');
|
|
35
|
+
yield context.removeIndex('snap', ['createdAt']);
|
|
36
|
+
yield context.removeIndex('snap', ['updatedAt']);
|
|
37
|
+
yield context.removeIndex('snap', ['status']);
|
|
38
|
+
console.log('[20250616-replace:down] Migrated successfully!');
|
|
39
|
+
});
|
|
40
|
+
}
|
|
package/lib/cjs/store/snapshot.d.ts
CHANGED
|
@@ -8,6 +8,7 @@ export interface SnapshotModel {
|
|
|
8
8
|
screenshot?: string | null;
|
|
9
9
|
error?: string;
|
|
10
10
|
lastModified?: string;
|
|
11
|
+
replace?: boolean;
|
|
11
12
|
meta?: {
|
|
12
13
|
title?: string;
|
|
13
14
|
description?: string;
|
|
@@ -35,6 +36,7 @@ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotMo
|
|
|
35
36
|
screenshot?: SnapshotModel['screenshot'];
|
|
36
37
|
error?: SnapshotModel['error'];
|
|
37
38
|
lastModified?: SnapshotModel['lastModified'];
|
|
39
|
+
replace?: SnapshotModel['replace'];
|
|
38
40
|
meta?: SnapshotModel['meta'];
|
|
39
41
|
options: SnapshotModel['options'];
|
|
40
42
|
static initModel(sequelize: Sequelize): typeof Snapshot;
|
|
package/lib/cjs/store/snapshot.js
CHANGED
|
@@ -27,6 +27,7 @@ class Snapshot extends core_1.Model {
|
|
|
27
27
|
status: {
|
|
28
28
|
type: core_1.DataTypes.ENUM('success', 'failed', 'pending'),
|
|
29
29
|
allowNull: false,
|
|
30
|
+
index: true,
|
|
30
31
|
},
|
|
31
32
|
html: {
|
|
32
33
|
type: core_1.DataTypes.TEXT,
|
|
@@ -44,6 +45,12 @@ class Snapshot extends core_1.Model {
|
|
|
44
45
|
type: core_1.DataTypes.STRING,
|
|
45
46
|
allowNull: true,
|
|
46
47
|
},
|
|
48
|
+
replace: {
|
|
49
|
+
type: core_1.DataTypes.BOOLEAN,
|
|
50
|
+
allowNull: false,
|
|
51
|
+
defaultValue: false,
|
|
52
|
+
index: true,
|
|
53
|
+
},
|
|
47
54
|
meta: {
|
|
48
55
|
type: core_1.DataTypes.JSON,
|
|
49
56
|
allowNull: true,
|
package/lib/esm/crawler.d.ts
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { SnapshotModel } from './store
|
|
3
|
-
|
|
1
|
+
import { Page } from '@blocklet/puppeteer';
|
|
2
|
+
import { JobState, SnapshotModel } from './store';
|
|
3
|
+
type PageHandler = {
|
|
4
|
+
handleScreenshot?: (page: Page, params?: JobState) => Promise<Buffer | null>;
|
|
5
|
+
handleHtml?: (page: Page, params?: JobState) => Promise<string | null>;
|
|
6
|
+
};
|
|
7
|
+
export declare function createCrawlQueue(queue: string, handler?: PageHandler): any;
|
|
4
8
|
export declare function getDataDir(): Promise<{
|
|
5
9
|
htmlDir: string;
|
|
6
10
|
screenshotDir: string;
|
|
7
11
|
}>;
|
|
8
|
-
export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
|
|
12
|
+
export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, format, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState, handler?: PageHandler) => Promise<{
|
|
9
13
|
html: string | null;
|
|
10
14
|
screenshot: Uint8Array<ArrayBufferLike> | null;
|
|
11
15
|
meta: {
|
|
@@ -18,4 +22,7 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
|
|
|
18
22
|
* @param params
|
|
19
23
|
* @param callback callback when job finished
|
|
20
24
|
*/
|
|
25
|
+
export declare function enqueue(queue: any, params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
|
|
21
26
|
export declare function crawlUrl(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
|
|
27
|
+
export declare function crawlCode(params: Omit<JobState, 'jobId'>, callback?: (snapshot: SnapshotModel | null) => void): Promise<string>;
|
|
28
|
+
export {};
|
package/lib/esm/crawler.js
CHANGED
|
@@ -14,44 +14,44 @@ import fs from 'fs-extra';
|
|
|
14
14
|
import path from 'path';
|
|
15
15
|
import { config, logger } from './config';
|
|
16
16
|
import { initPage } from './puppeteer';
|
|
17
|
-
import {
|
|
18
|
-
import {
|
|
19
|
-
import { Snapshot } from './store
|
|
20
|
-
import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5 } from './utils';
|
|
17
|
+
import { createCarbonImage } from './services/carbon';
|
|
18
|
+
import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './services/snapshot';
|
|
19
|
+
import { Job, Snapshot, sequelize } from './store';
|
|
20
|
+
import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5, sleep } from './utils';
|
|
21
21
|
const { BaseState } = require('@abtnode/models');
|
|
22
|
-
|
|
23
|
-
|
|
22
|
+
// eslint-disable-next-line import/no-mutable-exports
|
|
23
|
+
const crawlQueue = createCrawlQueue('urlCrawler');
|
|
24
|
+
const syncQueue = createCrawlQueue('syncCrawler');
|
|
25
|
+
const codeQueue = createCrawlQueue('codeCrawler', {
|
|
26
|
+
handleScreenshot: createCarbonImage,
|
|
27
|
+
});
|
|
28
|
+
export function createCrawlQueue(queue, handler) {
|
|
24
29
|
const db = new BaseState(Job);
|
|
25
|
-
|
|
26
|
-
store: new SequelizeStore(db,
|
|
30
|
+
return createQueue({
|
|
31
|
+
store: new SequelizeStore(db, queue),
|
|
27
32
|
concurrency: config.concurrency,
|
|
28
33
|
onJob: (job) => __awaiter(this, void 0, void 0, function* () {
|
|
29
34
|
logger.info('Starting to execute crawl job', job);
|
|
30
|
-
|
|
31
|
-
if (!
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
job,
|
|
35
|
-
snapshot
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
35
|
+
// check robots.txt
|
|
36
|
+
if (!job.ignoreRobots) {
|
|
37
|
+
const canCrawl = yield isAcceptCrawler(job.url);
|
|
38
|
+
if (!canCrawl) {
|
|
39
|
+
logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
|
|
40
|
+
const snapshot = convertJobToSnapshot({
|
|
41
|
+
job,
|
|
42
|
+
snapshot: {
|
|
43
|
+
status: 'failed',
|
|
44
|
+
error: 'Denied by robots.txt',
|
|
45
|
+
},
|
|
46
|
+
});
|
|
47
|
+
yield Snapshot.upsert(snapshot);
|
|
48
|
+
return snapshot;
|
|
49
|
+
}
|
|
42
50
|
}
|
|
43
|
-
// if index reach autoCloseBrowserCount, close browser
|
|
44
|
-
// try {
|
|
45
|
-
// if (index >= autoCloseBrowserCount) {
|
|
46
|
-
// await closeBrowser({ trimCache: false });
|
|
47
|
-
// }
|
|
48
|
-
// } catch (error) {
|
|
49
|
-
// logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
|
|
50
|
-
// }
|
|
51
51
|
const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config.cookies || []).concat(job.cookies || []), localStorage: (config.localStorage || []).concat(job.localStorage || []), url: formatUrl(job.url) });
|
|
52
52
|
try {
|
|
53
53
|
// get page content later
|
|
54
|
-
const result = yield getPageContent(formattedJob);
|
|
54
|
+
const result = yield getPageContent(formattedJob, handler);
|
|
55
55
|
if (!result || (!result.html && !result.screenshot)) {
|
|
56
56
|
logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
|
|
57
57
|
const snapshot = convertJobToSnapshot({
|
|
@@ -64,22 +64,40 @@ export function createCrawlQueue() {
|
|
|
64
64
|
yield Snapshot.upsert(snapshot);
|
|
65
65
|
return snapshot;
|
|
66
66
|
}
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
67
|
+
const snapshot = yield sequelize.transaction((txn) => __awaiter(this, void 0, void 0, function* () {
|
|
68
|
+
// delete old snapshot
|
|
69
|
+
if (formattedJob.replace) {
|
|
70
|
+
try {
|
|
71
|
+
const deletedJobIds = yield deleteSnapshots({
|
|
72
|
+
url: formattedJob.url,
|
|
73
|
+
replace: true,
|
|
74
|
+
}, { txn });
|
|
75
|
+
if (deletedJobIds) {
|
|
76
|
+
logger.info('Deleted old snapshot', { deletedJobIds });
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
catch (error) {
|
|
80
|
+
logger.error('Failed to delete old snapshot', { error, formattedJob });
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
// save html and screenshot to data dir
|
|
84
|
+
const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
|
|
85
|
+
screenshot: result.screenshot,
|
|
86
|
+
html: result.html,
|
|
87
|
+
format: formattedJob.format,
|
|
88
|
+
});
|
|
89
|
+
const snapshot = convertJobToSnapshot({
|
|
90
|
+
job: formattedJob,
|
|
91
|
+
snapshot: {
|
|
92
|
+
status: 'success',
|
|
93
|
+
screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
|
|
94
|
+
html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config.dataDir, ''),
|
|
95
|
+
meta: result.meta,
|
|
96
|
+
},
|
|
97
|
+
});
|
|
98
|
+
yield Snapshot.upsert(snapshot, { transaction: txn });
|
|
99
|
+
return snapshot;
|
|
100
|
+
}));
|
|
83
101
|
return snapshot;
|
|
84
102
|
}
|
|
85
103
|
catch (error) {
|
|
@@ -107,13 +125,13 @@ export function getDataDir() {
|
|
|
107
125
|
});
|
|
108
126
|
}
|
|
109
127
|
function saveSnapshotToLocal(_a) {
|
|
110
|
-
return __awaiter(this, arguments, void 0, function* ({ screenshot, html }) {
|
|
128
|
+
return __awaiter(this, arguments, void 0, function* ({ screenshot, html, format = 'webp', }) {
|
|
111
129
|
const { htmlDir, screenshotDir } = yield getDataDir();
|
|
112
130
|
let screenshotPath = null;
|
|
113
131
|
let htmlPath = null;
|
|
114
132
|
if (screenshot) {
|
|
115
133
|
const hash = md5(screenshot);
|
|
116
|
-
screenshotPath = path.join(screenshotDir, `${hash}
|
|
134
|
+
screenshotPath = path.join(screenshotDir, `${hash}.${format}`);
|
|
117
135
|
logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
|
|
118
136
|
yield fs.writeFile(screenshotPath, screenshot);
|
|
119
137
|
}
|
|
@@ -129,7 +147,7 @@ function saveSnapshotToLocal(_a) {
|
|
|
129
147
|
};
|
|
130
148
|
});
|
|
131
149
|
}
|
|
132
|
-
export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies, localStorage, }) {
|
|
150
|
+
export const getPageContent = (_a, handler_1) => __awaiter(void 0, [_a, handler_1], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, format = 'webp', timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }, handler) {
|
|
133
151
|
const page = yield initPage();
|
|
134
152
|
if (width && height) {
|
|
135
153
|
yield page.setViewport({ width, height, deviceScaleFactor: 2 });
|
|
@@ -166,9 +184,18 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
166
184
|
}
|
|
167
185
|
// await for networkidle0
|
|
168
186
|
// https://pptr.dev/api/puppeteer.page.waitfornetworkidle
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
187
|
+
try {
|
|
188
|
+
yield Promise.all([
|
|
189
|
+
page.waitForNetworkIdle({
|
|
190
|
+
idleTime: 1.5 * 1000,
|
|
191
|
+
timeout,
|
|
192
|
+
}),
|
|
193
|
+
sleep(waitTime),
|
|
194
|
+
]);
|
|
195
|
+
}
|
|
196
|
+
catch (err) {
|
|
197
|
+
logger.warn(`Failed to wait for network idle in ${url}:`, err);
|
|
198
|
+
}
|
|
172
199
|
// get screenshot
|
|
173
200
|
if (includeScreenshot) {
|
|
174
201
|
// Try to find the tallest element and set the browser to the same height
|
|
@@ -184,7 +211,9 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
184
211
|
}
|
|
185
212
|
}
|
|
186
213
|
try {
|
|
187
|
-
screenshot =
|
|
214
|
+
screenshot = (handler === null || handler === void 0 ? void 0 : handler.handleScreenshot)
|
|
215
|
+
? yield handler.handleScreenshot(page)
|
|
216
|
+
: yield page.screenshot({ fullPage, quality, type: format });
|
|
188
217
|
}
|
|
189
218
|
catch (err) {
|
|
190
219
|
logger.error('Failed to get screenshot:', err);
|
|
@@ -212,12 +241,12 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
212
241
|
// check if the page is an error page
|
|
213
242
|
const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
|
|
214
243
|
if (isErrorPage) {
|
|
215
|
-
throw new Error(
|
|
244
|
+
throw new Error(`${url} is an error page`);
|
|
216
245
|
}
|
|
217
246
|
meta.title = data.title;
|
|
218
247
|
meta.description = data.description;
|
|
219
248
|
if (includeHtml) {
|
|
220
|
-
html = data.html;
|
|
249
|
+
html = (handler === null || handler === void 0 ? void 0 : handler.handleHtml) ? yield handler.handleHtml(page) : data.html;
|
|
221
250
|
}
|
|
222
251
|
}
|
|
223
252
|
catch (err) {
|
|
@@ -244,17 +273,17 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
244
273
|
* @param callback callback when job finished
|
|
245
274
|
*/
|
|
246
275
|
// eslint-disable-next-line require-await
|
|
247
|
-
export function
|
|
276
|
+
export function enqueue(queue, params, callback) {
|
|
248
277
|
return __awaiter(this, void 0, void 0, function* () {
|
|
249
278
|
// skip duplicate job
|
|
250
279
|
const existsJob = yield Job.isExists(params);
|
|
251
|
-
if (existsJob) {
|
|
280
|
+
if (existsJob && !params.sync) {
|
|
252
281
|
logger.info(`Crawl job already exists for ${params.url}, skip`);
|
|
253
282
|
return existsJob.id;
|
|
254
283
|
}
|
|
255
284
|
logger.info('enqueue crawl job', params);
|
|
256
285
|
const jobId = randomUUID();
|
|
257
|
-
const job =
|
|
286
|
+
const job = queue.push(Object.assign(Object.assign({}, params), { id: jobId }));
|
|
258
287
|
job.on('finished', (_a) => __awaiter(this, [_a], void 0, function* ({ result }) {
|
|
259
288
|
logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
|
|
260
289
|
callback === null || callback === void 0 ? void 0 : callback(result ? yield formatSnapshot(result) : null);
|
|
@@ -266,3 +295,9 @@ export function crawlUrl(params, callback) {
|
|
|
266
295
|
return jobId;
|
|
267
296
|
});
|
|
268
297
|
}
|
|
298
|
+
export function crawlUrl(params, callback) {
|
|
299
|
+
return enqueue(params.sync ? syncQueue : crawlQueue, params, callback);
|
|
300
|
+
}
|
|
301
|
+
export function crawlCode(params, callback) {
|
|
302
|
+
return enqueue(codeQueue, Object.assign({ ignoreRobots: true, includeHtml: false, includeScreenshot: true }, params), callback);
|
|
303
|
+
}
|