@arcblock/crawler 1.1.6 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/cjs/crawler.d.ts +11 -4
- package/lib/cjs/crawler.js +96 -59
- package/lib/cjs/index.d.ts +1 -0
- package/lib/cjs/index.js +3 -5
- package/lib/cjs/services/carbon.d.ts +3 -0
- package/lib/cjs/services/carbon.js +87 -0
- package/lib/cjs/services/snapshot.d.ts +5 -2
- package/lib/cjs/services/snapshot.js +36 -6
- package/lib/cjs/site.d.ts +1 -1
- package/lib/cjs/site.js +9 -3
- package/lib/cjs/store/index.d.ts +4 -1
- package/lib/cjs/store/index.js +37 -45
- package/lib/cjs/store/job.d.ts +5 -0
- package/lib/cjs/store/migrate.d.ts +4 -0
- package/lib/cjs/store/migrate.js +63 -0
- package/lib/cjs/store/migrations/20250615-genesis.d.ts +6 -0
- package/lib/cjs/store/migrations/20250615-genesis.js +114 -0
- package/lib/cjs/store/migrations/20250616-replace.d.ts +6 -0
- package/lib/cjs/store/migrations/20250616-replace.js +40 -0
- package/lib/cjs/store/snapshot.d.ts +2 -0
- package/lib/cjs/store/snapshot.js +7 -0
- package/lib/esm/crawler.d.ts +11 -4
- package/lib/esm/crawler.js +92 -57
- package/lib/esm/index.d.ts +1 -0
- package/lib/esm/index.js +1 -4
- package/lib/esm/services/carbon.d.ts +3 -0
- package/lib/esm/services/carbon.js +84 -0
- package/lib/esm/services/snapshot.d.ts +5 -2
- package/lib/esm/services/snapshot.js +33 -4
- package/lib/esm/site.d.ts +1 -1
- package/lib/esm/site.js +9 -3
- package/lib/esm/store/index.d.ts +4 -1
- package/lib/esm/store/index.js +23 -45
- package/lib/esm/store/job.d.ts +5 -0
- package/lib/esm/store/migrate.d.ts +4 -0
- package/lib/esm/store/migrate.js +26 -0
- package/lib/esm/store/migrations/20250615-genesis.d.ts +6 -0
- package/lib/esm/store/migrations/20250615-genesis.js +110 -0
- package/lib/esm/store/migrations/20250616-replace.d.ts +6 -0
- package/lib/esm/store/migrations/20250616-replace.js +36 -0
- package/lib/esm/store/snapshot.d.ts +2 -0
- package/lib/esm/store/snapshot.js +7 -0
- package/package.json +3 -2
package/lib/esm/index.d.ts
CHANGED
|
@@ -3,4 +3,5 @@ export * from './crawler';
|
|
|
3
3
|
export * from './site';
|
|
4
4
|
export * from './services/snapshot';
|
|
5
5
|
export * as utils from './utils';
|
|
6
|
+
export { migrate } from './store/migrate';
|
|
6
7
|
export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
|
package/lib/esm/index.js
CHANGED
|
@@ -10,23 +10,20 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
10
10
|
/* eslint-disable @typescript-eslint/indent */
|
|
11
11
|
import merge from 'lodash/merge';
|
|
12
12
|
import { config, logger } from './config';
|
|
13
|
-
import { createCrawlQueue } from './crawler';
|
|
14
13
|
import { initCron } from './cron';
|
|
15
14
|
import { ensureBrowser } from './puppeteer';
|
|
16
|
-
import { initDatabase } from './store';
|
|
17
15
|
export * from './crawler';
|
|
18
16
|
export * from './site';
|
|
19
17
|
export * from './services/snapshot';
|
|
20
18
|
export * as utils from './utils';
|
|
19
|
+
export { migrate } from './store/migrate';
|
|
21
20
|
export function initCrawler(params) {
|
|
22
21
|
return __awaiter(this, void 0, void 0, function* () {
|
|
23
22
|
var _a;
|
|
24
23
|
merge(config, params);
|
|
25
24
|
logger.info('Init crawler', { params, config });
|
|
26
25
|
try {
|
|
27
|
-
yield initDatabase();
|
|
28
26
|
yield ensureBrowser();
|
|
29
|
-
yield createCrawlQueue();
|
|
30
27
|
if ((_a = config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
|
|
31
28
|
yield initCron();
|
|
32
29
|
}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import { logger } from '../config';
|
|
11
|
+
// TODO expose local version of dom-to-image
|
|
12
|
+
const DOM_TO_IMAGE_URL = 'https://unpkg.com/dom-to-image@2.6.0/dist/dom-to-image.min.js';
|
|
13
|
+
export function createCarbonImage(page, params) {
|
|
14
|
+
return __awaiter(this, void 0, void 0, function* () {
|
|
15
|
+
try {
|
|
16
|
+
yield page.addScriptTag({ url: DOM_TO_IMAGE_URL });
|
|
17
|
+
yield page.waitForSelector('.export-container', { visible: true, timeout: (params === null || params === void 0 ? void 0 : params.timeout) || 120 });
|
|
18
|
+
const targetElement = yield page.$('.export-container');
|
|
19
|
+
const format = (params === null || params === void 0 ? void 0 : params.format) || 'png';
|
|
20
|
+
const dataUrl = yield page.evaluate((target = document, imageFormat = 'png') => {
|
|
21
|
+
const query = new URLSearchParams(document.location.search);
|
|
22
|
+
const EXPORT_SIZES_HASH = {
|
|
23
|
+
'1x': '1',
|
|
24
|
+
'2x': '2',
|
|
25
|
+
'4x': '4',
|
|
26
|
+
};
|
|
27
|
+
const exportSize = EXPORT_SIZES_HASH[query.get('es')] || '2';
|
|
28
|
+
if (!target) {
|
|
29
|
+
throw new Error('Target element not found');
|
|
30
|
+
}
|
|
31
|
+
target.querySelectorAll('span[role="presentation"]').forEach((node) => {
|
|
32
|
+
var _a;
|
|
33
|
+
const el = node;
|
|
34
|
+
if (el && el.innerText && el.innerText.match(/%[A-Fa-f0-9]{2}/)) {
|
|
35
|
+
(_a = el.innerText.match(/%[A-Fa-f0-9]{2}/g)) === null || _a === void 0 ? void 0 : _a.forEach((t) => {
|
|
36
|
+
el.innerHTML = el.innerHTML.replace(t, encodeURIComponent(t));
|
|
37
|
+
});
|
|
38
|
+
}
|
|
39
|
+
});
|
|
40
|
+
const width = target.offsetWidth * exportSize;
|
|
41
|
+
const height = query.get('si') === 'true'
|
|
42
|
+
? target.offsetWidth * exportSize
|
|
43
|
+
: target.offsetHeight * exportSize;
|
|
44
|
+
const config = {
|
|
45
|
+
style: {
|
|
46
|
+
transform: `scale(${exportSize})`,
|
|
47
|
+
'transform-origin': 'center',
|
|
48
|
+
background: query.get('si') ? query.get('bg') : 'none',
|
|
49
|
+
},
|
|
50
|
+
filter: (n) => {
|
|
51
|
+
if (n.className) {
|
|
52
|
+
return String(n.className).indexOf('eliminateOnRender') < 0;
|
|
53
|
+
}
|
|
54
|
+
return true;
|
|
55
|
+
},
|
|
56
|
+
width,
|
|
57
|
+
height,
|
|
58
|
+
};
|
|
59
|
+
switch (imageFormat) {
|
|
60
|
+
case 'jpeg':
|
|
61
|
+
// @ts-ignore: domtoimage is injected by addScriptTag
|
|
62
|
+
return domtoimage.toJpeg(target, config);
|
|
63
|
+
case 'webp':
|
|
64
|
+
// dom-to-image doesn't support webp directly, fall back to png
|
|
65
|
+
// @ts-ignore: domtoimage is injected by addScriptTag
|
|
66
|
+
return domtoimage.toPng(target, config);
|
|
67
|
+
case 'png':
|
|
68
|
+
default:
|
|
69
|
+
// @ts-ignore: domtoimage is injected by addScriptTag
|
|
70
|
+
return domtoimage.toPng(target, config);
|
|
71
|
+
}
|
|
72
|
+
}, targetElement, format);
|
|
73
|
+
const base64Data = dataUrl.split(',')[1];
|
|
74
|
+
if (!base64Data) {
|
|
75
|
+
throw new Error('Failed to extract base64 data from image');
|
|
76
|
+
}
|
|
77
|
+
return Buffer.from(base64Data, 'base64');
|
|
78
|
+
}
|
|
79
|
+
catch (e) {
|
|
80
|
+
logger.error('failed to crawl from carbon', { error: e });
|
|
81
|
+
throw e;
|
|
82
|
+
}
|
|
83
|
+
});
|
|
84
|
+
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { JobState } from '../store/job';
|
|
2
|
-
import { SnapshotModel } from '../store/snapshot';
|
|
1
|
+
import { Transaction, WhereOptions } from '@sequelize/core';
|
|
2
|
+
import { JobState, SnapshotModel } from '../store';
|
|
3
3
|
export declare function convertJobToSnapshot({ job, snapshot }: {
|
|
4
4
|
job: JobState;
|
|
5
5
|
snapshot?: Partial<SnapshotModel>;
|
|
@@ -10,3 +10,6 @@ export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<
|
|
|
10
10
|
*/
|
|
11
11
|
export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
|
|
12
12
|
export declare function getLatestSnapshot(url: string): Promise<SnapshotModel | null>;
|
|
13
|
+
export declare function deleteSnapshots(where: WhereOptions<SnapshotModel>, { txn }?: {
|
|
14
|
+
txn?: Transaction;
|
|
15
|
+
}): Promise<string[]>;
|
|
@@ -12,12 +12,11 @@ import pick from 'lodash/pick';
|
|
|
12
12
|
import fs from 'node:fs/promises';
|
|
13
13
|
import path from 'node:path';
|
|
14
14
|
import { joinURL } from 'ufo';
|
|
15
|
-
import { config } from '../config';
|
|
16
|
-
import { Job } from '../store/job';
|
|
17
|
-
import { Snapshot } from '../store/snapshot';
|
|
15
|
+
import { config, logger } from '../config';
|
|
16
|
+
import { Job, Snapshot } from '../store';
|
|
18
17
|
import { formatUrl } from '../utils';
|
|
19
18
|
export function convertJobToSnapshot({ job, snapshot }) {
|
|
20
|
-
return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), options: {
|
|
19
|
+
return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), replace: job.replace, options: {
|
|
21
20
|
width: job.width,
|
|
22
21
|
height: job.height,
|
|
23
22
|
includeScreenshot: job.includeScreenshot,
|
|
@@ -76,7 +75,37 @@ export function getLatestSnapshot(url) {
|
|
|
76
75
|
url: formatUrl(url),
|
|
77
76
|
status: 'success',
|
|
78
77
|
},
|
|
78
|
+
order: [
|
|
79
|
+
['lastModified', 'DESC'],
|
|
80
|
+
['updatedAt', 'DESC'],
|
|
81
|
+
],
|
|
79
82
|
});
|
|
80
83
|
return snapshot ? formatSnapshot(snapshot) : null;
|
|
81
84
|
});
|
|
82
85
|
}
|
|
86
|
+
export function deleteSnapshots(where_1) {
|
|
87
|
+
return __awaiter(this, arguments, void 0, function* (where, { txn } = {}) {
|
|
88
|
+
const snapshots = yield Snapshot.findAll({
|
|
89
|
+
where,
|
|
90
|
+
order: [
|
|
91
|
+
['lastModified', 'DESC'],
|
|
92
|
+
['updatedAt', 'DESC'],
|
|
93
|
+
],
|
|
94
|
+
});
|
|
95
|
+
const jobIds = yield Promise.all(snapshots.map((snapshot) => __awaiter(this, void 0, void 0, function* () {
|
|
96
|
+
try {
|
|
97
|
+
yield Promise.all([
|
|
98
|
+
snapshot.html && fs.unlink(path.join(config.dataDir, snapshot.html)),
|
|
99
|
+
snapshot.screenshot && fs.unlink(path.join(config.dataDir, snapshot.screenshot)),
|
|
100
|
+
]);
|
|
101
|
+
yield snapshot.destroy({ transaction: txn });
|
|
102
|
+
return snapshot.jobId;
|
|
103
|
+
}
|
|
104
|
+
catch (error) {
|
|
105
|
+
logger.error('Failed to delete snapshot', { error, snapshot });
|
|
106
|
+
throw error;
|
|
107
|
+
}
|
|
108
|
+
})));
|
|
109
|
+
return jobIds.filter(Boolean);
|
|
110
|
+
});
|
|
111
|
+
}
|
package/lib/esm/site.d.ts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
import { Site } from './config';
|
|
2
|
-
export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
|
|
2
|
+
export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(`${string}-${string}-${string}-${string}-${string}` | null)[]>;
|
package/lib/esm/site.js
CHANGED
|
@@ -8,12 +8,14 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
8
8
|
});
|
|
9
9
|
};
|
|
10
10
|
import uniq from 'lodash/uniq';
|
|
11
|
+
import { randomUUID } from 'node:crypto';
|
|
11
12
|
import pMap from 'p-map';
|
|
12
13
|
import { config, logger } from './config';
|
|
13
|
-
import {
|
|
14
|
-
import { Snapshot } from './store/snapshot';
|
|
14
|
+
import { createCrawlQueue } from './crawler';
|
|
15
|
+
import { Snapshot } from './store';
|
|
15
16
|
import { formatUrl, getSitemapList } from './utils';
|
|
16
17
|
const crawlBlockletRunningMap = new Map();
|
|
18
|
+
const crawlQueue = createCrawlQueue('cronJobs');
|
|
17
19
|
function parseSitemapUrl(sitemapItem) {
|
|
18
20
|
var _a;
|
|
19
21
|
const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
|
|
@@ -61,12 +63,16 @@ export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ ur
|
|
|
61
63
|
url,
|
|
62
64
|
});
|
|
63
65
|
crawlCount++;
|
|
64
|
-
|
|
66
|
+
const jobId = randomUUID();
|
|
67
|
+
crawlQueue.push({
|
|
68
|
+
id: jobId,
|
|
65
69
|
url,
|
|
66
70
|
lastModified: sitemapItem.lastmod,
|
|
67
71
|
includeScreenshot: false,
|
|
68
72
|
includeHtml: true,
|
|
73
|
+
replace: true,
|
|
69
74
|
});
|
|
75
|
+
return jobId;
|
|
70
76
|
}), { concurrency: ((_b = config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
|
|
71
77
|
logger.info('Enqueued jobs from sitemap finished', {
|
|
72
78
|
url,
|
package/lib/esm/store/index.d.ts
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
1
|
import { Sequelize } from '@sequelize/core';
|
|
2
2
|
import { SqliteDialect } from '@sequelize/sqlite3';
|
|
3
|
-
|
|
3
|
+
declare const sequelize: Sequelize<SqliteDialect>;
|
|
4
|
+
export { sequelize };
|
|
5
|
+
export * from './job';
|
|
6
|
+
export * from './snapshot';
|
package/lib/esm/store/index.js
CHANGED
|
@@ -1,51 +1,29 @@
|
|
|
1
|
-
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
-
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
-
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
-
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
-
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
-
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
-
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
-
});
|
|
9
|
-
};
|
|
10
1
|
import { Sequelize } from '@sequelize/core';
|
|
11
2
|
import { SqliteDialect } from '@sequelize/sqlite3';
|
|
12
3
|
import path from 'path';
|
|
13
4
|
import { config, logger } from '../config';
|
|
14
5
|
import { Job } from './job';
|
|
15
6
|
import { Snapshot } from './snapshot';
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
sequelize.query('pragma journal_size_limit = 67108864;'),
|
|
40
|
-
]);
|
|
41
|
-
yield sequelize.authenticate();
|
|
42
|
-
yield sequelize.sync({ alter: process.env.ALTER_SQLITE === 'true' });
|
|
43
|
-
logger.info('Successfully connected to database');
|
|
44
|
-
}
|
|
45
|
-
catch (error) {
|
|
46
|
-
logger.error('Failed to connect to database:', error);
|
|
47
|
-
throw error;
|
|
48
|
-
}
|
|
49
|
-
return sequelize;
|
|
50
|
-
});
|
|
51
|
-
}
|
|
7
|
+
const sequelize = new Sequelize({
|
|
8
|
+
dialect: SqliteDialect,
|
|
9
|
+
storage: path.join(config.dataDir, 'snap-kit.db'),
|
|
10
|
+
logging: (msg) => process.env.SQLITE_LOG && logger.debug(msg),
|
|
11
|
+
pool: {
|
|
12
|
+
min: 0,
|
|
13
|
+
max: 10,
|
|
14
|
+
idle: 10000,
|
|
15
|
+
},
|
|
16
|
+
retry: {
|
|
17
|
+
match: [/SQLITE_BUSY/],
|
|
18
|
+
name: 'query',
|
|
19
|
+
max: 10,
|
|
20
|
+
},
|
|
21
|
+
});
|
|
22
|
+
sequelize.query('pragma journal_mode = WAL;');
|
|
23
|
+
sequelize.query('pragma synchronous = normal;');
|
|
24
|
+
sequelize.query('pragma journal_size_limit = 67108864;');
|
|
25
|
+
Job.initModel(sequelize);
|
|
26
|
+
Snapshot.initModel(sequelize);
|
|
27
|
+
export { sequelize };
|
|
28
|
+
export * from './job';
|
|
29
|
+
export * from './snapshot';
|
package/lib/esm/store/job.d.ts
CHANGED
|
@@ -9,9 +9,14 @@ export interface JobState {
|
|
|
9
9
|
width?: number;
|
|
10
10
|
height?: number;
|
|
11
11
|
quality?: number;
|
|
12
|
+
format?: 'png' | 'jpeg' | 'webp';
|
|
12
13
|
timeout?: number;
|
|
13
14
|
fullPage?: boolean;
|
|
14
15
|
lastModified?: string;
|
|
16
|
+
waitTime?: number;
|
|
17
|
+
replace?: boolean;
|
|
18
|
+
sync?: boolean;
|
|
19
|
+
ignoreRobots?: boolean;
|
|
15
20
|
headers?: Record<string, string>;
|
|
16
21
|
cookies?: CookieParam[];
|
|
17
22
|
localStorage?: {
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/* eslint-disable global-require */
|
|
2
|
+
import { SequelizeStorage, Umzug } from 'umzug';
|
|
3
|
+
import { sequelize } from './index';
|
|
4
|
+
import * as migration20250615 from './migrations/20250615-genesis';
|
|
5
|
+
import * as migration20250616Replace from './migrations/20250616-replace';
|
|
6
|
+
const umzug = new Umzug({
|
|
7
|
+
migrations: [
|
|
8
|
+
{
|
|
9
|
+
name: '20250615-genesis',
|
|
10
|
+
up: ({ context }) => migration20250615.up({ context }),
|
|
11
|
+
down: ({ context }) => migration20250615.down({ context }),
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
name: '20250616-replace',
|
|
15
|
+
up: ({ context }) => migration20250616Replace.up({ context }),
|
|
16
|
+
down: ({ context }) => migration20250616Replace.down({ context }),
|
|
17
|
+
},
|
|
18
|
+
],
|
|
19
|
+
context: sequelize.getQueryInterface(),
|
|
20
|
+
storage: new SequelizeStorage({ sequelize }),
|
|
21
|
+
logger: console,
|
|
22
|
+
});
|
|
23
|
+
export function migrate() {
|
|
24
|
+
return umzug.up();
|
|
25
|
+
}
|
|
26
|
+
export { umzug };
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
/* eslint-disable no-console */
|
|
11
|
+
import { DataTypes } from '@sequelize/core';
|
|
12
|
+
export function up(_a) {
|
|
13
|
+
return __awaiter(this, arguments, void 0, function* ({ context }) {
|
|
14
|
+
console.log('[20250615-genesis:up] Migrating...');
|
|
15
|
+
yield context.createTable('snap', {
|
|
16
|
+
jobId: {
|
|
17
|
+
type: DataTypes.STRING,
|
|
18
|
+
primaryKey: true,
|
|
19
|
+
allowNull: false,
|
|
20
|
+
},
|
|
21
|
+
url: {
|
|
22
|
+
type: DataTypes.STRING,
|
|
23
|
+
allowNull: false,
|
|
24
|
+
index: true,
|
|
25
|
+
},
|
|
26
|
+
status: {
|
|
27
|
+
type: DataTypes.ENUM('success', 'failed', 'pending'),
|
|
28
|
+
allowNull: false,
|
|
29
|
+
},
|
|
30
|
+
html: {
|
|
31
|
+
type: DataTypes.TEXT,
|
|
32
|
+
allowNull: true,
|
|
33
|
+
},
|
|
34
|
+
screenshot: {
|
|
35
|
+
type: DataTypes.STRING,
|
|
36
|
+
allowNull: true,
|
|
37
|
+
},
|
|
38
|
+
error: {
|
|
39
|
+
type: DataTypes.STRING,
|
|
40
|
+
allowNull: true,
|
|
41
|
+
},
|
|
42
|
+
lastModified: {
|
|
43
|
+
type: DataTypes.STRING,
|
|
44
|
+
allowNull: true,
|
|
45
|
+
},
|
|
46
|
+
meta: {
|
|
47
|
+
type: DataTypes.JSON,
|
|
48
|
+
allowNull: true,
|
|
49
|
+
},
|
|
50
|
+
options: {
|
|
51
|
+
type: DataTypes.JSON,
|
|
52
|
+
allowNull: true,
|
|
53
|
+
},
|
|
54
|
+
createdAt: {
|
|
55
|
+
type: DataTypes.DATE,
|
|
56
|
+
defaultValue: DataTypes.NOW,
|
|
57
|
+
},
|
|
58
|
+
updatedAt: {
|
|
59
|
+
type: DataTypes.DATE,
|
|
60
|
+
defaultValue: DataTypes.NOW,
|
|
61
|
+
},
|
|
62
|
+
});
|
|
63
|
+
yield context.createTable('jobs', {
|
|
64
|
+
id: {
|
|
65
|
+
type: DataTypes.STRING(40),
|
|
66
|
+
primaryKey: true,
|
|
67
|
+
},
|
|
68
|
+
queue: {
|
|
69
|
+
type: DataTypes.STRING(32),
|
|
70
|
+
allowNull: false,
|
|
71
|
+
},
|
|
72
|
+
job: {
|
|
73
|
+
type: DataTypes.JSON,
|
|
74
|
+
allowNull: false,
|
|
75
|
+
},
|
|
76
|
+
retryCount: {
|
|
77
|
+
type: DataTypes.INTEGER,
|
|
78
|
+
},
|
|
79
|
+
delay: {
|
|
80
|
+
type: DataTypes.INTEGER,
|
|
81
|
+
},
|
|
82
|
+
willRunAt: {
|
|
83
|
+
type: DataTypes.INTEGER,
|
|
84
|
+
},
|
|
85
|
+
cancelled: {
|
|
86
|
+
type: DataTypes.BOOLEAN,
|
|
87
|
+
defaultValue: false,
|
|
88
|
+
},
|
|
89
|
+
createdAt: {
|
|
90
|
+
type: DataTypes.DATE,
|
|
91
|
+
defaultValue: DataTypes.NOW,
|
|
92
|
+
index: true,
|
|
93
|
+
},
|
|
94
|
+
updatedAt: {
|
|
95
|
+
type: DataTypes.DATE,
|
|
96
|
+
defaultValue: DataTypes.NOW,
|
|
97
|
+
index: true,
|
|
98
|
+
},
|
|
99
|
+
});
|
|
100
|
+
console.log('[20250615-genesis:up] Migrated successfully!');
|
|
101
|
+
});
|
|
102
|
+
}
|
|
103
|
+
export function down(_a) {
|
|
104
|
+
return __awaiter(this, arguments, void 0, function* ({ context }) {
|
|
105
|
+
console.log('[20250615-genesis:down] Migrating...');
|
|
106
|
+
yield context.dropTable('snap');
|
|
107
|
+
yield context.dropTable('jobs');
|
|
108
|
+
console.log('[20250615-genesis:down] Migrated successfully!');
|
|
109
|
+
});
|
|
110
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
/* eslint-disable no-console */
|
|
11
|
+
import { DataTypes } from '@sequelize/core';
|
|
12
|
+
export function up(_a) {
|
|
13
|
+
return __awaiter(this, arguments, void 0, function* ({ context }) {
|
|
14
|
+
console.log('[20250616-replace:up] Migrating...');
|
|
15
|
+
yield context.addColumn('snap', 'replace', {
|
|
16
|
+
type: DataTypes.BOOLEAN,
|
|
17
|
+
allowNull: false,
|
|
18
|
+
defaultValue: false,
|
|
19
|
+
index: true,
|
|
20
|
+
});
|
|
21
|
+
yield context.addIndex('snap', ['createdAt']);
|
|
22
|
+
yield context.addIndex('snap', ['updatedAt']);
|
|
23
|
+
yield context.addIndex('snap', ['status']);
|
|
24
|
+
console.log('[20250616-replace:up] Migrated successfully!');
|
|
25
|
+
});
|
|
26
|
+
}
|
|
27
|
+
export function down(_a) {
|
|
28
|
+
return __awaiter(this, arguments, void 0, function* ({ context }) {
|
|
29
|
+
console.log('[20250616-replace:down] Migrating...');
|
|
30
|
+
yield context.removeColumn('snap', 'replace');
|
|
31
|
+
yield context.removeIndex('snap', ['createdAt']);
|
|
32
|
+
yield context.removeIndex('snap', ['updatedAt']);
|
|
33
|
+
yield context.removeIndex('snap', ['status']);
|
|
34
|
+
console.log('[20250616-replace:down] Migrated successfully!');
|
|
35
|
+
});
|
|
36
|
+
}
|
|
@@ -8,6 +8,7 @@ export interface SnapshotModel {
|
|
|
8
8
|
screenshot?: string | null;
|
|
9
9
|
error?: string;
|
|
10
10
|
lastModified?: string;
|
|
11
|
+
replace?: boolean;
|
|
11
12
|
meta?: {
|
|
12
13
|
title?: string;
|
|
13
14
|
description?: string;
|
|
@@ -35,6 +36,7 @@ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotMo
|
|
|
35
36
|
screenshot?: SnapshotModel['screenshot'];
|
|
36
37
|
error?: SnapshotModel['error'];
|
|
37
38
|
lastModified?: SnapshotModel['lastModified'];
|
|
39
|
+
replace?: SnapshotModel['replace'];
|
|
38
40
|
meta?: SnapshotModel['meta'];
|
|
39
41
|
options: SnapshotModel['options'];
|
|
40
42
|
static initModel(sequelize: Sequelize): typeof Snapshot;
|
|
@@ -24,6 +24,7 @@ export class Snapshot extends Model {
|
|
|
24
24
|
status: {
|
|
25
25
|
type: DataTypes.ENUM('success', 'failed', 'pending'),
|
|
26
26
|
allowNull: false,
|
|
27
|
+
index: true,
|
|
27
28
|
},
|
|
28
29
|
html: {
|
|
29
30
|
type: DataTypes.TEXT,
|
|
@@ -41,6 +42,12 @@ export class Snapshot extends Model {
|
|
|
41
42
|
type: DataTypes.STRING,
|
|
42
43
|
allowNull: true,
|
|
43
44
|
},
|
|
45
|
+
replace: {
|
|
46
|
+
type: DataTypes.BOOLEAN,
|
|
47
|
+
allowNull: false,
|
|
48
|
+
defaultValue: false,
|
|
49
|
+
index: true,
|
|
50
|
+
},
|
|
44
51
|
meta: {
|
|
45
52
|
type: DataTypes.JSON,
|
|
46
53
|
allowNull: true,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@arcblock/crawler",
|
|
3
|
-
"version": "1.1.6",
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"main": "lib/cjs/index.js",
|
|
5
5
|
"module": "lib/esm/index.js",
|
|
6
6
|
"types": "lib/cjs/index.d.ts",
|
|
@@ -61,7 +61,8 @@
|
|
|
61
61
|
"robots-parser": "^3.0.1",
|
|
62
62
|
"sitemap": "^7.1.2",
|
|
63
63
|
"sqlite3": "^5.1.7",
|
|
64
|
-
"ufo": "^1.5.4"
|
|
64
|
+
"ufo": "^1.5.4",
|
|
65
|
+
"umzug": "^3.8.2"
|
|
65
66
|
},
|
|
66
67
|
"devDependencies": {
|
|
67
68
|
"@types/dotenv-flow": "^3.3.3",
|