@arcblock/crawler 1.1.5 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/lib/cjs/config.d.ts +9 -3
- package/lib/cjs/config.js +2 -10
- package/lib/cjs/crawler.d.ts +3 -4
- package/lib/cjs/crawler.js +74 -48
- package/lib/cjs/cron.js +5 -0
- package/lib/cjs/index.d.ts +2 -4
- package/lib/cjs/index.js +6 -6
- package/lib/cjs/services/snapshot.d.ts +5 -2
- package/lib/cjs/services/snapshot.js +44 -7
- package/lib/cjs/site.d.ts +1 -1
- package/lib/cjs/site.js +11 -4
- package/lib/cjs/store/index.d.ts +4 -1
- package/lib/cjs/store/index.js +37 -45
- package/lib/cjs/store/job.d.ts +6 -1
- package/lib/cjs/store/migrate.d.ts +4 -0
- package/lib/cjs/store/migrate.js +63 -0
- package/lib/cjs/store/migrations/20250615-genesis.d.ts +6 -0
- package/lib/cjs/store/migrations/20250615-genesis.js +114 -0
- package/lib/cjs/store/migrations/20250616-replace.d.ts +6 -0
- package/lib/cjs/store/migrations/20250616-replace.js +40 -0
- package/lib/cjs/store/snapshot.d.ts +8 -0
- package/lib/cjs/store/snapshot.js +7 -0
- package/lib/esm/config.d.ts +9 -3
- package/lib/esm/config.js +2 -10
- package/lib/esm/crawler.d.ts +3 -4
- package/lib/esm/crawler.js +71 -45
- package/lib/esm/cron.js +5 -0
- package/lib/esm/index.d.ts +2 -4
- package/lib/esm/index.js +4 -5
- package/lib/esm/services/snapshot.d.ts +5 -2
- package/lib/esm/services/snapshot.js +41 -5
- package/lib/esm/site.d.ts +1 -1
- package/lib/esm/site.js +11 -4
- package/lib/esm/store/index.d.ts +4 -1
- package/lib/esm/store/index.js +23 -45
- package/lib/esm/store/job.d.ts +6 -1
- package/lib/esm/store/migrate.d.ts +4 -0
- package/lib/esm/store/migrate.js +26 -0
- package/lib/esm/store/migrations/20250615-genesis.d.ts +6 -0
- package/lib/esm/store/migrations/20250615-genesis.js +110 -0
- package/lib/esm/store/migrations/20250616-replace.d.ts +6 -0
- package/lib/esm/store/migrations/20250616-replace.js +36 -0
- package/lib/esm/store/snapshot.d.ts +8 -0
- package/lib/esm/store/snapshot.js +7 -0
- package/package.json +3 -2
|
@@ -7,16 +7,16 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
7
7
|
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
8
|
});
|
|
9
9
|
};
|
|
10
|
+
import cloneDeep from 'lodash/cloneDeep';
|
|
10
11
|
import pick from 'lodash/pick';
|
|
11
12
|
import fs from 'node:fs/promises';
|
|
12
13
|
import path from 'node:path';
|
|
13
14
|
import { joinURL } from 'ufo';
|
|
14
|
-
import { config } from '../config';
|
|
15
|
-
import { Job } from '../store/job';
|
|
16
|
-
import { Snapshot } from '../store/snapshot';
|
|
15
|
+
import { config, logger } from '../config';
|
|
16
|
+
import { Job, Snapshot } from '../store';
|
|
17
17
|
import { formatUrl } from '../utils';
|
|
18
18
|
export function convertJobToSnapshot({ job, snapshot }) {
|
|
19
|
-
return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), options: {
|
|
19
|
+
return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), replace: job.replace, options: {
|
|
20
20
|
width: job.width,
|
|
21
21
|
height: job.height,
|
|
22
22
|
includeScreenshot: job.includeScreenshot,
|
|
@@ -27,7 +27,7 @@ export function convertJobToSnapshot({ job, snapshot }) {
|
|
|
27
27
|
}
|
|
28
28
|
export function formatSnapshot(snapshot, columns) {
|
|
29
29
|
return __awaiter(this, void 0, void 0, function* () {
|
|
30
|
-
let data =
|
|
30
|
+
let data = cloneDeep(snapshot);
|
|
31
31
|
// format screenshot path to full url
|
|
32
32
|
if (data.screenshot) {
|
|
33
33
|
data.screenshot = joinURL(config.appUrl, data.screenshot);
|
|
@@ -37,6 +37,12 @@ export function formatSnapshot(snapshot, columns) {
|
|
|
37
37
|
const html = yield fs.readFile(path.join(config.dataDir, data.html));
|
|
38
38
|
data.html = html.toString();
|
|
39
39
|
}
|
|
40
|
+
// remove sensitive options that should not be returned
|
|
41
|
+
if (data.options) {
|
|
42
|
+
delete data.options.cookies;
|
|
43
|
+
delete data.options.localStorage;
|
|
44
|
+
delete data.options.headers;
|
|
45
|
+
}
|
|
40
46
|
if (columns === null || columns === void 0 ? void 0 : columns.length) {
|
|
41
47
|
data = pick(data, columns);
|
|
42
48
|
}
|
|
@@ -69,7 +75,37 @@ export function getLatestSnapshot(url) {
|
|
|
69
75
|
url: formatUrl(url),
|
|
70
76
|
status: 'success',
|
|
71
77
|
},
|
|
78
|
+
order: [
|
|
79
|
+
['lastModified', 'DESC'],
|
|
80
|
+
['updatedAt', 'DESC'],
|
|
81
|
+
],
|
|
72
82
|
});
|
|
73
83
|
return snapshot ? formatSnapshot(snapshot) : null;
|
|
74
84
|
});
|
|
75
85
|
}
|
|
86
|
+
export function deleteSnapshots(where_1) {
|
|
87
|
+
return __awaiter(this, arguments, void 0, function* (where, { txn } = {}) {
|
|
88
|
+
const snapshots = yield Snapshot.findAll({
|
|
89
|
+
where,
|
|
90
|
+
order: [
|
|
91
|
+
['lastModified', 'DESC'],
|
|
92
|
+
['updatedAt', 'DESC'],
|
|
93
|
+
],
|
|
94
|
+
});
|
|
95
|
+
const jobIds = yield Promise.all(snapshots.map((snapshot) => __awaiter(this, void 0, void 0, function* () {
|
|
96
|
+
try {
|
|
97
|
+
yield Promise.all([
|
|
98
|
+
snapshot.html && fs.unlink(path.join(config.dataDir, snapshot.html)),
|
|
99
|
+
snapshot.screenshot && fs.unlink(path.join(config.dataDir, snapshot.screenshot)),
|
|
100
|
+
]);
|
|
101
|
+
yield snapshot.destroy({ transaction: txn });
|
|
102
|
+
return snapshot.jobId;
|
|
103
|
+
}
|
|
104
|
+
catch (error) {
|
|
105
|
+
logger.error('Failed to delete snapshot', { error, snapshot });
|
|
106
|
+
throw error;
|
|
107
|
+
}
|
|
108
|
+
})));
|
|
109
|
+
return jobIds.filter(Boolean);
|
|
110
|
+
});
|
|
111
|
+
}
|
package/lib/esm/site.d.ts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
import { Site } from './config';
|
|
2
|
-
export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
|
|
2
|
+
export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(`${string}-${string}-${string}-${string}-${string}` | null)[]>;
|
package/lib/esm/site.js
CHANGED
|
@@ -8,12 +8,14 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
8
8
|
});
|
|
9
9
|
};
|
|
10
10
|
import uniq from 'lodash/uniq';
|
|
11
|
+
import { randomUUID } from 'node:crypto';
|
|
11
12
|
import pMap from 'p-map';
|
|
12
13
|
import { config, logger } from './config';
|
|
13
|
-
import {
|
|
14
|
-
import { Snapshot } from './store/snapshot';
|
|
14
|
+
import { createCrawlQueue } from './crawler';
|
|
15
|
+
import { Snapshot } from './store';
|
|
15
16
|
import { formatUrl, getSitemapList } from './utils';
|
|
16
17
|
const crawlBlockletRunningMap = new Map();
|
|
18
|
+
const crawlQueue = createCrawlQueue('cronJobs');
|
|
17
19
|
function parseSitemapUrl(sitemapItem) {
|
|
18
20
|
var _a;
|
|
19
21
|
const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
|
|
@@ -21,6 +23,7 @@ function parseSitemapUrl(sitemapItem) {
|
|
|
21
23
|
return urls.map((url) => ({ url, sitemapItem }));
|
|
22
24
|
}
|
|
23
25
|
export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
|
|
26
|
+
var _b;
|
|
24
27
|
logger.info(`Start crawl from sitemap ${url}`, { pathname });
|
|
25
28
|
const key = `${url}-${pathname}`;
|
|
26
29
|
if (crawlBlockletRunningMap.has(key)) {
|
|
@@ -60,13 +63,17 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
|
|
|
60
63
|
url,
|
|
61
64
|
});
|
|
62
65
|
crawlCount++;
|
|
63
|
-
|
|
66
|
+
const jobId = randomUUID();
|
|
67
|
+
crawlQueue.push({
|
|
68
|
+
id: jobId,
|
|
64
69
|
url,
|
|
65
70
|
lastModified: sitemapItem.lastmod,
|
|
66
71
|
includeScreenshot: false,
|
|
67
72
|
includeHtml: true,
|
|
73
|
+
replace: true,
|
|
68
74
|
});
|
|
69
|
-
|
|
75
|
+
return jobId;
|
|
76
|
+
}), { concurrency: ((_b = config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
|
|
70
77
|
logger.info('Enqueued jobs from sitemap finished', {
|
|
71
78
|
url,
|
|
72
79
|
pathname,
|
package/lib/esm/store/index.d.ts
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
1
|
import { Sequelize } from '@sequelize/core';
|
|
2
2
|
import { SqliteDialect } from '@sequelize/sqlite3';
|
|
3
|
-
|
|
3
|
+
declare const sequelize: Sequelize<SqliteDialect>;
|
|
4
|
+
export { sequelize };
|
|
5
|
+
export * from './job';
|
|
6
|
+
export * from './snapshot';
|
package/lib/esm/store/index.js
CHANGED
|
@@ -1,51 +1,29 @@
|
|
|
1
|
-
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
-
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
-
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
-
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
-
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
-
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
-
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
-
});
|
|
9
|
-
};
|
|
10
1
|
import { Sequelize } from '@sequelize/core';
|
|
11
2
|
import { SqliteDialect } from '@sequelize/sqlite3';
|
|
12
3
|
import path from 'path';
|
|
13
4
|
import { config, logger } from '../config';
|
|
14
5
|
import { Job } from './job';
|
|
15
6
|
import { Snapshot } from './snapshot';
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
sequelize.query('pragma journal_size_limit = 67108864;'),
|
|
40
|
-
]);
|
|
41
|
-
yield sequelize.authenticate();
|
|
42
|
-
yield sequelize.sync({ alter: process.env.ALTER_SQLITE === 'true' });
|
|
43
|
-
logger.info('Successfully connected to database');
|
|
44
|
-
}
|
|
45
|
-
catch (error) {
|
|
46
|
-
logger.error('Failed to connect to database:', error);
|
|
47
|
-
throw error;
|
|
48
|
-
}
|
|
49
|
-
return sequelize;
|
|
50
|
-
});
|
|
51
|
-
}
|
|
7
|
+
const sequelize = new Sequelize({
|
|
8
|
+
dialect: SqliteDialect,
|
|
9
|
+
storage: path.join(config.dataDir, 'snap-kit.db'),
|
|
10
|
+
logging: (msg) => process.env.SQLITE_LOG && logger.debug(msg),
|
|
11
|
+
pool: {
|
|
12
|
+
min: 0,
|
|
13
|
+
max: 10,
|
|
14
|
+
idle: 10000,
|
|
15
|
+
},
|
|
16
|
+
retry: {
|
|
17
|
+
match: [/SQLITE_BUSY/],
|
|
18
|
+
name: 'query',
|
|
19
|
+
max: 10,
|
|
20
|
+
},
|
|
21
|
+
});
|
|
22
|
+
sequelize.query('pragma journal_mode = WAL;');
|
|
23
|
+
sequelize.query('pragma synchronous = normal;');
|
|
24
|
+
sequelize.query('pragma journal_size_limit = 67108864;');
|
|
25
|
+
Job.initModel(sequelize);
|
|
26
|
+
Snapshot.initModel(sequelize);
|
|
27
|
+
export { sequelize };
|
|
28
|
+
export * from './job';
|
|
29
|
+
export * from './snapshot';
|
package/lib/esm/store/job.d.ts
CHANGED
|
@@ -12,9 +12,14 @@ export interface JobState {
|
|
|
12
12
|
timeout?: number;
|
|
13
13
|
fullPage?: boolean;
|
|
14
14
|
lastModified?: string;
|
|
15
|
+
waitTime?: number;
|
|
16
|
+
replace?: boolean;
|
|
15
17
|
headers?: Record<string, string>;
|
|
16
18
|
cookies?: CookieParam[];
|
|
17
|
-
localStorage?:
|
|
19
|
+
localStorage?: {
|
|
20
|
+
key: string;
|
|
21
|
+
value: string;
|
|
22
|
+
}[];
|
|
18
23
|
}
|
|
19
24
|
export interface JobModel {
|
|
20
25
|
id: string;
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/* eslint-disable global-require */
|
|
2
|
+
import { SequelizeStorage, Umzug } from 'umzug';
|
|
3
|
+
import { sequelize } from './index';
|
|
4
|
+
import * as migration20250615 from './migrations/20250615-genesis';
|
|
5
|
+
import * as migration20250616Replace from './migrations/20250616-replace';
|
|
6
|
+
const umzug = new Umzug({
|
|
7
|
+
migrations: [
|
|
8
|
+
{
|
|
9
|
+
name: '20250615-genesis',
|
|
10
|
+
up: ({ context }) => migration20250615.up({ context }),
|
|
11
|
+
down: ({ context }) => migration20250615.down({ context }),
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
name: '20250616-replace',
|
|
15
|
+
up: ({ context }) => migration20250616Replace.up({ context }),
|
|
16
|
+
down: ({ context }) => migration20250616Replace.down({ context }),
|
|
17
|
+
},
|
|
18
|
+
],
|
|
19
|
+
context: sequelize.getQueryInterface(),
|
|
20
|
+
storage: new SequelizeStorage({ sequelize }),
|
|
21
|
+
logger: console,
|
|
22
|
+
});
|
|
23
|
+
export function migrate() {
|
|
24
|
+
return umzug.up();
|
|
25
|
+
}
|
|
26
|
+
export { umzug };
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
/* eslint-disable no-console */
|
|
11
|
+
import { DataTypes } from '@sequelize/core';
|
|
12
|
+
export function up(_a) {
|
|
13
|
+
return __awaiter(this, arguments, void 0, function* ({ context }) {
|
|
14
|
+
console.log('[20250615-genesis:up] Migrating...');
|
|
15
|
+
yield context.createTable('snap', {
|
|
16
|
+
jobId: {
|
|
17
|
+
type: DataTypes.STRING,
|
|
18
|
+
primaryKey: true,
|
|
19
|
+
allowNull: false,
|
|
20
|
+
},
|
|
21
|
+
url: {
|
|
22
|
+
type: DataTypes.STRING,
|
|
23
|
+
allowNull: false,
|
|
24
|
+
index: true,
|
|
25
|
+
},
|
|
26
|
+
status: {
|
|
27
|
+
type: DataTypes.ENUM('success', 'failed', 'pending'),
|
|
28
|
+
allowNull: false,
|
|
29
|
+
},
|
|
30
|
+
html: {
|
|
31
|
+
type: DataTypes.TEXT,
|
|
32
|
+
allowNull: true,
|
|
33
|
+
},
|
|
34
|
+
screenshot: {
|
|
35
|
+
type: DataTypes.STRING,
|
|
36
|
+
allowNull: true,
|
|
37
|
+
},
|
|
38
|
+
error: {
|
|
39
|
+
type: DataTypes.STRING,
|
|
40
|
+
allowNull: true,
|
|
41
|
+
},
|
|
42
|
+
lastModified: {
|
|
43
|
+
type: DataTypes.STRING,
|
|
44
|
+
allowNull: true,
|
|
45
|
+
},
|
|
46
|
+
meta: {
|
|
47
|
+
type: DataTypes.JSON,
|
|
48
|
+
allowNull: true,
|
|
49
|
+
},
|
|
50
|
+
options: {
|
|
51
|
+
type: DataTypes.JSON,
|
|
52
|
+
allowNull: true,
|
|
53
|
+
},
|
|
54
|
+
createdAt: {
|
|
55
|
+
type: DataTypes.DATE,
|
|
56
|
+
defaultValue: DataTypes.NOW,
|
|
57
|
+
},
|
|
58
|
+
updatedAt: {
|
|
59
|
+
type: DataTypes.DATE,
|
|
60
|
+
defaultValue: DataTypes.NOW,
|
|
61
|
+
},
|
|
62
|
+
});
|
|
63
|
+
yield context.createTable('jobs', {
|
|
64
|
+
id: {
|
|
65
|
+
type: DataTypes.STRING(40),
|
|
66
|
+
primaryKey: true,
|
|
67
|
+
},
|
|
68
|
+
queue: {
|
|
69
|
+
type: DataTypes.STRING(32),
|
|
70
|
+
allowNull: false,
|
|
71
|
+
},
|
|
72
|
+
job: {
|
|
73
|
+
type: DataTypes.JSON,
|
|
74
|
+
allowNull: false,
|
|
75
|
+
},
|
|
76
|
+
retryCount: {
|
|
77
|
+
type: DataTypes.INTEGER,
|
|
78
|
+
},
|
|
79
|
+
delay: {
|
|
80
|
+
type: DataTypes.INTEGER,
|
|
81
|
+
},
|
|
82
|
+
willRunAt: {
|
|
83
|
+
type: DataTypes.INTEGER,
|
|
84
|
+
},
|
|
85
|
+
cancelled: {
|
|
86
|
+
type: DataTypes.BOOLEAN,
|
|
87
|
+
defaultValue: false,
|
|
88
|
+
},
|
|
89
|
+
createdAt: {
|
|
90
|
+
type: DataTypes.DATE,
|
|
91
|
+
defaultValue: DataTypes.NOW,
|
|
92
|
+
index: true,
|
|
93
|
+
},
|
|
94
|
+
updatedAt: {
|
|
95
|
+
type: DataTypes.DATE,
|
|
96
|
+
defaultValue: DataTypes.NOW,
|
|
97
|
+
index: true,
|
|
98
|
+
},
|
|
99
|
+
});
|
|
100
|
+
console.log('[20250615-genesis:up] Migrated successfully!');
|
|
101
|
+
});
|
|
102
|
+
}
|
|
103
|
+
export function down(_a) {
|
|
104
|
+
return __awaiter(this, arguments, void 0, function* ({ context }) {
|
|
105
|
+
console.log('[20250615-genesis:down] Migrating...');
|
|
106
|
+
yield context.dropTable('snap');
|
|
107
|
+
yield context.dropTable('jobs');
|
|
108
|
+
console.log('[20250615-genesis:down] Migrated successfully!');
|
|
109
|
+
});
|
|
110
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
/* eslint-disable no-console */
|
|
11
|
+
import { DataTypes } from '@sequelize/core';
|
|
12
|
+
export function up(_a) {
|
|
13
|
+
return __awaiter(this, arguments, void 0, function* ({ context }) {
|
|
14
|
+
console.log('[20250616-replace:up] Migrating...');
|
|
15
|
+
yield context.addColumn('snap', 'replace', {
|
|
16
|
+
type: DataTypes.BOOLEAN,
|
|
17
|
+
allowNull: false,
|
|
18
|
+
defaultValue: false,
|
|
19
|
+
index: true,
|
|
20
|
+
});
|
|
21
|
+
yield context.addIndex('snap', ['createdAt']);
|
|
22
|
+
yield context.addIndex('snap', ['updatedAt']);
|
|
23
|
+
yield context.addIndex('snap', ['status']);
|
|
24
|
+
console.log('[20250616-replace:up] Migrated successfully!');
|
|
25
|
+
});
|
|
26
|
+
}
|
|
27
|
+
export function down(_a) {
|
|
28
|
+
return __awaiter(this, arguments, void 0, function* ({ context }) {
|
|
29
|
+
console.log('[20250616-replace:down] Migrating...');
|
|
30
|
+
yield context.removeColumn('snap', 'replace');
|
|
31
|
+
yield context.removeIndex('snap', ['createdAt']);
|
|
32
|
+
yield context.removeIndex('snap', ['updatedAt']);
|
|
33
|
+
yield context.removeIndex('snap', ['status']);
|
|
34
|
+
console.log('[20250616-replace:down] Migrated successfully!');
|
|
35
|
+
});
|
|
36
|
+
}
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { CookieParam } from '@blocklet/puppeteer';
|
|
1
2
|
import { FindOptions, Model, Sequelize } from '@sequelize/core';
|
|
2
3
|
export interface SnapshotModel {
|
|
3
4
|
jobId: string;
|
|
@@ -7,6 +8,7 @@ export interface SnapshotModel {
|
|
|
7
8
|
screenshot?: string | null;
|
|
8
9
|
error?: string;
|
|
9
10
|
lastModified?: string;
|
|
11
|
+
replace?: boolean;
|
|
10
12
|
meta?: {
|
|
11
13
|
title?: string;
|
|
12
14
|
description?: string;
|
|
@@ -19,6 +21,11 @@ export interface SnapshotModel {
|
|
|
19
21
|
quality?: number;
|
|
20
22
|
fullPage?: boolean;
|
|
21
23
|
headers?: Record<string, string>;
|
|
24
|
+
cookies?: CookieParam[];
|
|
25
|
+
localStorage?: {
|
|
26
|
+
key: string;
|
|
27
|
+
value: string;
|
|
28
|
+
}[];
|
|
22
29
|
};
|
|
23
30
|
}
|
|
24
31
|
export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
|
|
@@ -29,6 +36,7 @@ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotMo
|
|
|
29
36
|
screenshot?: SnapshotModel['screenshot'];
|
|
30
37
|
error?: SnapshotModel['error'];
|
|
31
38
|
lastModified?: SnapshotModel['lastModified'];
|
|
39
|
+
replace?: SnapshotModel['replace'];
|
|
32
40
|
meta?: SnapshotModel['meta'];
|
|
33
41
|
options: SnapshotModel['options'];
|
|
34
42
|
static initModel(sequelize: Sequelize): typeof Snapshot;
|
|
@@ -24,6 +24,7 @@ export class Snapshot extends Model {
|
|
|
24
24
|
status: {
|
|
25
25
|
type: DataTypes.ENUM('success', 'failed', 'pending'),
|
|
26
26
|
allowNull: false,
|
|
27
|
+
index: true,
|
|
27
28
|
},
|
|
28
29
|
html: {
|
|
29
30
|
type: DataTypes.TEXT,
|
|
@@ -41,6 +42,12 @@ export class Snapshot extends Model {
|
|
|
41
42
|
type: DataTypes.STRING,
|
|
42
43
|
allowNull: true,
|
|
43
44
|
},
|
|
45
|
+
replace: {
|
|
46
|
+
type: DataTypes.BOOLEAN,
|
|
47
|
+
allowNull: false,
|
|
48
|
+
defaultValue: false,
|
|
49
|
+
index: true,
|
|
50
|
+
},
|
|
44
51
|
meta: {
|
|
45
52
|
type: DataTypes.JSON,
|
|
46
53
|
allowNull: true,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@arcblock/crawler",
|
|
3
|
-
"version": "1.1.5",
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"main": "lib/cjs/index.js",
|
|
5
5
|
"module": "lib/esm/index.js",
|
|
6
6
|
"types": "lib/cjs/index.d.ts",
|
|
@@ -61,7 +61,8 @@
|
|
|
61
61
|
"robots-parser": "^3.0.1",
|
|
62
62
|
"sitemap": "^7.1.2",
|
|
63
63
|
"sqlite3": "^5.1.7",
|
|
64
|
-
"ufo": "^1.5.4"
|
|
64
|
+
"ufo": "^1.5.4",
|
|
65
|
+
"umzug": "^3.8.2"
|
|
65
66
|
},
|
|
66
67
|
"devDependencies": {
|
|
67
68
|
"@types/dotenv-flow": "^3.3.3",
|