@arcblock/crawler 1.1.5 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/lib/cjs/config.d.ts +9 -3
- package/lib/cjs/config.js +2 -10
- package/lib/cjs/crawler.d.ts +3 -4
- package/lib/cjs/crawler.js +74 -48
- package/lib/cjs/cron.js +5 -0
- package/lib/cjs/index.d.ts +2 -4
- package/lib/cjs/index.js +6 -6
- package/lib/cjs/services/snapshot.d.ts +5 -2
- package/lib/cjs/services/snapshot.js +44 -7
- package/lib/cjs/site.d.ts +1 -1
- package/lib/cjs/site.js +11 -4
- package/lib/cjs/store/index.d.ts +4 -1
- package/lib/cjs/store/index.js +37 -45
- package/lib/cjs/store/job.d.ts +6 -1
- package/lib/cjs/store/migrate.d.ts +4 -0
- package/lib/cjs/store/migrate.js +63 -0
- package/lib/cjs/store/migrations/20250615-genesis.d.ts +6 -0
- package/lib/cjs/store/migrations/20250615-genesis.js +114 -0
- package/lib/cjs/store/migrations/20250616-replace.d.ts +6 -0
- package/lib/cjs/store/migrations/20250616-replace.js +40 -0
- package/lib/cjs/store/snapshot.d.ts +8 -0
- package/lib/cjs/store/snapshot.js +7 -0
- package/lib/esm/config.d.ts +9 -3
- package/lib/esm/config.js +2 -10
- package/lib/esm/crawler.d.ts +3 -4
- package/lib/esm/crawler.js +71 -45
- package/lib/esm/cron.js +5 -0
- package/lib/esm/index.d.ts +2 -4
- package/lib/esm/index.js +4 -5
- package/lib/esm/services/snapshot.d.ts +5 -2
- package/lib/esm/services/snapshot.js +41 -5
- package/lib/esm/site.d.ts +1 -1
- package/lib/esm/site.js +11 -4
- package/lib/esm/store/index.d.ts +4 -1
- package/lib/esm/store/index.js +23 -45
- package/lib/esm/store/job.d.ts +6 -1
- package/lib/esm/store/migrate.d.ts +4 -0
- package/lib/esm/store/migrate.js +26 -0
- package/lib/esm/store/migrations/20250615-genesis.d.ts +6 -0
- package/lib/esm/store/migrations/20250615-genesis.js +110 -0
- package/lib/esm/store/migrations/20250616-replace.d.ts +6 -0
- package/lib/esm/store/migrations/20250616-replace.js +36 -0
- package/lib/esm/store/snapshot.d.ts +8 -0
- package/lib/esm/store/snapshot.js +7 -0
- package/package.json +3 -2
package/README.md
CHANGED
@@ -43,8 +43,7 @@ await initCrawler({
     immediate: !!env.preferences.cronImmediate,
     sites: env.preferences.cronSites,
     time: env.preferences.cronTime,
-
-    sitemapConcurrency: env.preferences.sitemapConcurrency,
+    concurrency: env.preferences.concurrency,
   },
 });
 ```
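The README hunk above renames the cron option `sitemapConcurrency` to `concurrency`. A minimal sketch of a 1.2.0 setup call, assuming the enclosing `siteCron` key and using illustrative values in place of the README's `env.preferences.*`:

```ts
import { initCrawler } from '@arcblock/crawler';

await initCrawler({
  siteCron: {
    enabled: true,
    immediate: false,
    time: '0 0 0 * * *', // the 1.1.x default cron expression from config.js
    sites: [{ url: 'https://example.com', pathname: '/' }], // illustrative Site entries
    concurrency: 30, // formerly `sitemapConcurrency`
  },
});
```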
package/lib/cjs/config.d.ts
CHANGED
@@ -1,3 +1,4 @@
+import { CookieParam } from '@blocklet/puppeteer';
 export type Site = {
     url: string;
     pathname: string;
@@ -11,14 +12,19 @@ export type Config = {
     appUrl: string;
     cacheDir: string;
     puppeteerPath?: string;
-
+    concurrency: number;
+    siteCron?: {
     sites: Site[];
     time: string;
     enabled: boolean;
     immediate: boolean;
-
-    sitemapConcurrency: number;
+    concurrency: number;
     };
+    cookies?: CookieParam[];
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 };
 export declare const logger: any;
 export declare const config: Config;
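Config also gains package-wide `cookies` and `localStorage` entries (see the crawler.js hunks below, where they are concatenated onto each job before crawling). A sketch of the new fields, with illustrative values:

```ts
import type { CookieParam } from '@blocklet/puppeteer';

// Cookies applied to every crawl job; CookieParam is puppeteer's cookie shape.
const cookies: CookieParam[] = [
  { name: 'session', value: 'abc123', domain: 'example.com', path: '/' },
];

// localStorage entries seeded into every crawled page before navigation.
// The literal value 'now()' is replaced with the current ISO timestamp by getPageContent.
// The key here is illustrative, borrowed from the defaults removed in crawler.js.
const localStorage = [{ key: 'domain-warning-skip', value: 'now()' }];

// Both arrays can be passed to initCrawler (see the index.d.ts hunk later in this diff).
```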
package/lib/cjs/config.js
CHANGED
@@ -9,17 +9,9 @@ exports.logger = (0, logger_1.default)('@arcblock/crawler', { level: process.env
 exports.config = {
     isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
-    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
     appUrl: process.env.BLOCKLET_APP_URL || '/',
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-
-    siteCron: {
-        sites: [],
-        enabled: true,
-        time: '0 0 0 * * *',
-        immediate: false,
-        crawlConcurrency: 2,
-        sitemapConcurrency: 30,
-    },
+    concurrency: 2,
 };
package/lib/cjs/crawler.d.ts
CHANGED
@@ -1,11 +1,10 @@
-import { JobState } from './store
-
-export declare function createCrawlQueue(): void;
+import { JobState, SnapshotModel } from './store';
+export declare function createCrawlQueue(queue: string): any;
 export declare function getDataDir(): Promise<{
     htmlDir: string;
     screenshotDir: string;
 }>;
-export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
+export declare const getPageContent: ({ url, includeScreenshot, includeHtml, width, height, quality, timeout, waitTime, fullPage, headers, cookies, localStorage, }: JobState) => Promise<{
     html: string | null;
     screenshot: Uint8Array<ArrayBufferLike> | null;
     meta: {
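getPageContent gains a `waitTime` option: extra milliseconds awaited alongside the network-idle wait, defaulting to 0 in the compiled source below. A usage sketch, assuming `url` is the only required JobState field for this call:

```ts
import { getPageContent } from '@arcblock/crawler';

const { html, screenshot, meta } = await getPageContent({
  url: 'https://example.com/blog',
  includeHtml: true,
  includeScreenshot: false,
  waitTime: 3000, // settle time in addition to page.waitForNetworkIdle
  timeout: 60 * 1000,
});
```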
package/lib/cjs/crawler.js
CHANGED
@@ -24,16 +24,16 @@ const path_1 = __importDefault(require("path"));
 const config_1 = require("./config");
 const puppeteer_1 = require("./puppeteer");
 const snapshot_1 = require("./services/snapshot");
-const
-const snapshot_2 = require("./store/snapshot");
+const store_1 = require("./store");
 const utils_1 = require("./utils");
 const { BaseState } = require('@abtnode/models');
-
-
-
-
-
-
+// eslint-disable-next-line import/no-mutable-exports
+const crawlQueue = createCrawlQueue('urlCrawler');
+function createCrawlQueue(queue) {
+    const db = new BaseState(store_1.Job);
+    return (0, queue_1.default)({
+        store: new sequelize_1.default(db, queue),
+        concurrency: config_1.config.concurrency,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
             config_1.logger.info('Starting to execute crawl job', job);
             const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
@@ -46,7 +46,7 @@ function createCrawlQueue() {
                         error: 'Denied by robots.txt',
                     },
                 });
-                yield
+                yield store_1.Snapshot.upsert(snapshot);
                 return snapshot;
             }
             // if index reach autoCloseBrowserCount, close browser
@@ -57,54 +57,67 @@ function createCrawlQueue() {
             // } catch (error) {
            //   logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
            // }
+            const formattedJob = Object.assign(Object.assign({}, job), { cookies: (config_1.config.cookies || []).concat(job.cookies || []), localStorage: (config_1.config.localStorage || []).concat(job.localStorage || []), url: (0, utils_1.formatUrl)(job.url) });
             try {
                 // get page content later
-                const result = yield (0, exports.getPageContent)(
-                // for blocklet theme
-                blocklet_theme_prefer: 'light',
-                // for blocklet domain warning
-                'domain-warning-skip': Date.now().toString(),
-                } }, job));
+                const result = yield (0, exports.getPageContent)(formattedJob);
                 if (!result || (!result.html && !result.screenshot)) {
-                    config_1.logger.error(`failed to crawl ${
+                    config_1.logger.error(`failed to crawl ${formattedJob.url}, empty content`, formattedJob);
                     const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                        job,
+                        job: formattedJob,
                         snapshot: {
                             status: 'failed',
                             error: 'Failed to crawl content',
                         },
                     });
-                    yield
+                    yield store_1.Snapshot.upsert(snapshot);
                     return snapshot;
                 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                const snapshot = yield store_1.sequelize.transaction((txn) => __awaiter(this, void 0, void 0, function* () {
+                    // delete old snapshot
+                    if (formattedJob.replace) {
+                        try {
+                            const deletedJobIds = yield (0, snapshot_1.deleteSnapshots)({
+                                url: formattedJob.url,
+                                replace: true,
+                            }, { txn });
+                            if (deletedJobIds) {
+                                config_1.logger.info('Deleted old snapshot', { deletedJobIds });
+                            }
+                        }
+                        catch (error) {
+                            config_1.logger.error('Failed to delete old snapshot', { error, formattedJob });
+                        }
+                    }
+                    // save html and screenshot to data dir
+                    const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
+                        screenshot: result.screenshot,
+                        html: result.html,
+                    });
+                    const snapshot = (0, snapshot_1.convertJobToSnapshot)({
+                        job: formattedJob,
+                        snapshot: {
+                            status: 'success',
+                            screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
+                            html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config_1.config.dataDir, ''),
+                            meta: result.meta,
+                        },
+                    });
+                    yield store_1.Snapshot.upsert(snapshot, { transaction: txn });
+                    return snapshot;
+                }));
                 return snapshot;
             }
             catch (error) {
-                config_1.logger.error(`Failed to crawl ${
+                config_1.logger.error(`Failed to crawl ${formattedJob.url}`, { error, formattedJob });
                 const snapshot = (0, snapshot_1.convertJobToSnapshot)({
-                    job,
+                    job: formattedJob,
                     snapshot: {
                         status: 'failed',
                         error: 'Internal error',
                     },
                 });
-                yield
+                yield store_1.Snapshot.upsert(snapshot);
                 return snapshot;
             }
         }),
@@ -142,7 +155,7 @@ function saveSnapshotToLocal(_a) {
         };
     });
 }
-const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, fullPage = false, headers, cookies
+const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 90 * 1000, waitTime = 0, fullPage = false, headers, cookies, localStorage, }) {
     const page = yield (0, puppeteer_1.initPage)();
     if (width && height) {
         yield page.setViewport({ width, height, deviceScaleFactor: 2 });
@@ -150,13 +163,18 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
     if (headers) {
         yield page.setExtraHTTPHeaders(headers);
     }
-
-
+    // handle cookies
+    if (cookies) {
+        const { hostname } = new URL(url);
+        const cookieParams = cookies.map((item) => (Object.assign(Object.assign({}, item), { expires: item.expires ? new Date(item.expires).getTime() : undefined, domain: item.domain || hostname, path: item.path || '/' })));
+        yield page.setCookie(...cookieParams);
     }
+    // handle localStorage
     if (localStorage) {
         yield page.evaluateOnNewDocument((items) => {
-
-
+            items.forEach((item) => {
+                const value = item.value === 'now()' ? new Date().toISOString() : item.value;
+                window.localStorage.setItem(item.key, value);
             });
         }, localStorage);
     }
@@ -174,9 +192,18 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
     }
     // await for networkidle0
     // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
-
-
-
+    try {
+        yield Promise.all([
+            page.waitForNetworkIdle({
+                idleTime: 1.5 * 1000,
+                timeout,
+            }),
+            (0, utils_1.sleep)(waitTime),
+        ]);
+    }
+    catch (err) {
+        config_1.logger.warn(`Failed to wait for network idle in ${url}:`, err);
+    }
     // get screenshot
     if (includeScreenshot) {
         // Try to find the tallest element and set the browser to the same height
@@ -220,7 +247,7 @@ const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url,
         // check if the page is an error page
         const isErrorPage = ['<h2>Unexpected Application Error!</h2>', 'Current route occurred an error'].some((errorHtml) => data.html.includes(errorHtml));
         if (isErrorPage) {
-            throw new Error(
+            throw new Error(`${url} is an error page`);
         }
         meta.title = data.title;
         meta.description = data.description;
@@ -255,9 +282,8 @@ exports.getPageContent = getPageContent;
 // eslint-disable-next-line require-await
 function crawlUrl(params, callback) {
     return __awaiter(this, void 0, void 0, function* () {
-        params = Object.assign(Object.assign({}, params), { url: (0, utils_1.formatUrl)(params.url) });
         // skip duplicate job
-        const existsJob = yield
+        const existsJob = yield store_1.Job.isExists(params);
         if (existsJob) {
            config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
            return existsJob.id;
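For readers skimming the compiled hunk above: the new `formattedJob` line merges package-level cookies and localStorage into each job and normalizes the URL before crawling. A rough TypeScript equivalent of that single expression (paraphrased from the emitted JS, not the package's actual source):

```ts
import { config } from './config';
import { formatUrl } from './utils';
import type { JobState } from './store';

function prepareJob(job: JobState): JobState {
  return {
    ...job,
    // package-wide entries come first, per-job entries are appended
    cookies: [...(config.cookies ?? []), ...(job.cookies ?? [])],
    localStorage: [...(config.localStorage ?? []), ...(job.localStorage ?? [])],
    url: formatUrl(job.url),
  };
}
```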
package/lib/cjs/cron.js
CHANGED
@@ -20,6 +20,8 @@ let cron = null;
 function initCron() {
     if (cron)
         return;
+    if (!config_1.config.siteCron)
+        return;
     config_1.logger.info('Init cron', { config: config_1.config.siteCron });
     cron = cron_1.default.init({
         context: {},
@@ -29,6 +31,9 @@ function initCron() {
                 time: config_1.config.siteCron.time,
                 options: { runOnInit: config_1.config.siteCron.immediate },
                 fn: () => __awaiter(this, void 0, void 0, function* () {
+                    var _a;
+                    if (!((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled))
+                        return;
                     config_1.logger.info('Start cron to crawl site', { sites: config_1.config.siteCron.sites });
                     for (const site of config_1.config.siteCron.sites) {
                         try {
package/lib/cjs/index.d.ts
CHANGED
@@ -3,7 +3,5 @@ export * from './crawler';
 export * from './site';
 export * from './services/snapshot';
 export * as utils from './utils';
-
-
-} : T;
-export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
+export { migrate } from './store/migrate';
+export declare function initCrawler(params: Pick<Config, 'puppeteerPath' | 'siteCron' | 'cookies' | 'localStorage' | 'concurrency'>): Promise<void>;
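initCrawler's params are now a plain Pick over Config, and the store's `migrate` function is re-exported from the package root. A startup sketch; the `migrate()` call shape is an assumption, since migrate.d.ts is not shown in this diff:

```ts
import { initCrawler, migrate } from '@arcblock/crawler';

// Assumed: run the new store migrations before starting the crawler.
await migrate();

await initCrawler({
  puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
  concurrency: 2, // matches the new top-level default in config.js
});
```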
package/lib/cjs/index.js
CHANGED
@@ -48,27 +48,27 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.utils = void 0;
+exports.migrate = exports.utils = void 0;
 exports.initCrawler = initCrawler;
+/* eslint-disable @typescript-eslint/indent */
 const merge_1 = __importDefault(require("lodash/merge"));
 const config_1 = require("./config");
-const crawler_1 = require("./crawler");
 const cron_1 = require("./cron");
 const puppeteer_1 = require("./puppeteer");
-const store_1 = require("./store");
 __exportStar(require("./crawler"), exports);
 __exportStar(require("./site"), exports);
 __exportStar(require("./services/snapshot"), exports);
 exports.utils = __importStar(require("./utils"));
+var migrate_1 = require("./store/migrate");
+Object.defineProperty(exports, "migrate", { enumerable: true, get: function () { return migrate_1.migrate; } });
 function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
+        var _a;
         (0, merge_1.default)(config_1.config, params);
         config_1.logger.info('Init crawler', { params, config: config_1.config });
         try {
-            yield (0, store_1.initDatabase)();
             yield (0, puppeteer_1.ensureBrowser)();
-
-            if (config_1.config.siteCron.enabled) {
+            if ((_a = config_1.config.siteCron) === null || _a === void 0 ? void 0 : _a.enabled) {
                 yield (0, cron_1.initCron)();
             }
         }
package/lib/cjs/services/snapshot.d.ts
CHANGED
@@ -1,5 +1,5 @@
-import {
-import { SnapshotModel } from '../store
+import { Transaction, WhereOptions } from '@sequelize/core';
+import { JobState, SnapshotModel } from '../store';
 export declare function convertJobToSnapshot({ job, snapshot }: {
     job: JobState;
     snapshot?: Partial<SnapshotModel>;
@@ -10,3 +10,6 @@ export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<
  */
 export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
 export declare function getLatestSnapshot(url: string): Promise<SnapshotModel | null>;
+export declare function deleteSnapshots(where: WhereOptions<SnapshotModel>, { txn }?: {
+    txn?: Transaction;
+}): Promise<string[]>;
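`deleteSnapshots` is new: it removes matching snapshot rows plus their stored html/screenshot files and resolves to the affected job IDs; the transaction handle is optional. A usage sketch with an illustrative where clause:

```ts
import { deleteSnapshots } from '@arcblock/crawler';

// Remove every stored snapshot for one URL and log the job IDs that were deleted.
const deletedJobIds = await deleteSnapshots({ url: 'https://example.com/blog' });
console.log('deleted snapshots for jobs:', deletedJobIds);
```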
package/lib/cjs/services/snapshot.js
CHANGED
@@ -16,16 +16,17 @@ exports.convertJobToSnapshot = convertJobToSnapshot;
 exports.formatSnapshot = formatSnapshot;
 exports.getSnapshot = getSnapshot;
 exports.getLatestSnapshot = getLatestSnapshot;
+exports.deleteSnapshots = deleteSnapshots;
+const cloneDeep_1 = __importDefault(require("lodash/cloneDeep"));
 const pick_1 = __importDefault(require("lodash/pick"));
 const promises_1 = __importDefault(require("node:fs/promises"));
 const node_path_1 = __importDefault(require("node:path"));
 const ufo_1 = require("ufo");
 const config_1 = require("../config");
-const
-const snapshot_1 = require("../store/snapshot");
+const store_1 = require("../store");
 const utils_1 = require("../utils");
 function convertJobToSnapshot({ job, snapshot }) {
-    return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), options: {
+    return Object.assign({ jobId: job.jobId || job.id, url: job.url, lastModified: job.lastModified || new Date().toISOString(), replace: job.replace, options: {
         width: job.width,
         height: job.height,
         includeScreenshot: job.includeScreenshot,
@@ -36,7 +37,7 @@ function convertJobToSnapshot({ job, snapshot }) {
 }
 function formatSnapshot(snapshot, columns) {
     return __awaiter(this, void 0, void 0, function* () {
-        let data =
+        let data = (0, cloneDeep_1.default)(snapshot);
         // format screenshot path to full url
         if (data.screenshot) {
             data.screenshot = (0, ufo_1.joinURL)(config_1.config.appUrl, data.screenshot);
@@ -46,6 +47,12 @@ function formatSnapshot(snapshot, columns) {
             const html = yield promises_1.default.readFile(node_path_1.default.join(config_1.config.dataDir, data.html));
             data.html = html.toString();
         }
+        // remove sensitive options that should not be returned
+        if (data.options) {
+            delete data.options.cookies;
+            delete data.options.localStorage;
+            delete data.options.headers;
+        }
         if (columns === null || columns === void 0 ? void 0 : columns.length) {
             data = (0, pick_1.default)(data, columns);
         }
@@ -57,11 +64,11 @@ function formatSnapshot(snapshot, columns) {
  */
 function getSnapshot(jobId) {
     return __awaiter(this, void 0, void 0, function* () {
-        const snapshot = yield
+        const snapshot = yield store_1.Snapshot.findSnapshot({ where: { jobId } });
         if (snapshot) {
             return formatSnapshot(snapshot);
         }
-        const job = yield
+        const job = yield store_1.Job.findJob({ id: jobId });
         if (job) {
             return {
                 jobId,
@@ -73,12 +80,42 @@ function getSnapshot(jobId) {
 }
 function getLatestSnapshot(url) {
     return __awaiter(this, void 0, void 0, function* () {
-        const snapshot = yield
+        const snapshot = yield store_1.Snapshot.findSnapshot({
             where: {
                 url: (0, utils_1.formatUrl)(url),
                 status: 'success',
             },
+            order: [
+                ['lastModified', 'DESC'],
+                ['updatedAt', 'DESC'],
+            ],
         });
         return snapshot ? formatSnapshot(snapshot) : null;
     });
 }
+function deleteSnapshots(where_1) {
+    return __awaiter(this, arguments, void 0, function* (where, { txn } = {}) {
+        const snapshots = yield store_1.Snapshot.findAll({
+            where,
+            order: [
+                ['lastModified', 'DESC'],
+                ['updatedAt', 'DESC'],
+            ],
+        });
+        const jobIds = yield Promise.all(snapshots.map((snapshot) => __awaiter(this, void 0, void 0, function* () {
+            try {
+                yield Promise.all([
+                    snapshot.html && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.html)),
+                    snapshot.screenshot && promises_1.default.unlink(node_path_1.default.join(config_1.config.dataDir, snapshot.screenshot)),
+                ]);
+                yield snapshot.destroy({ transaction: txn });
+                return snapshot.jobId;
+            }
+            catch (error) {
+                config_1.logger.error('Failed to delete snapshot', { error, snapshot });
+                throw error;
+            }
+        })));
+        return jobIds.filter(Boolean);
+    });
+}
package/lib/cjs/site.d.ts
CHANGED
@@ -1,2 +1,2 @@
 import { Site } from './config';
-export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null)[]>;
+export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(`${string}-${string}-${string}-${string}-${string}` | null)[]>;
package/lib/cjs/site.js
CHANGED
@@ -14,12 +14,14 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.crawlSite = void 0;
 const uniq_1 = __importDefault(require("lodash/uniq"));
+const node_crypto_1 = require("node:crypto");
 const p_map_1 = __importDefault(require("p-map"));
 const config_1 = require("./config");
 const crawler_1 = require("./crawler");
-const
+const store_1 = require("./store");
 const utils_1 = require("./utils");
 const crawlBlockletRunningMap = new Map();
+const crawlQueue = (0, crawler_1.createCrawlQueue)('cronJobs');
 function parseSitemapUrl(sitemapItem) {
     var _a;
     const links = ((_a = sitemapItem.links) === null || _a === void 0 ? void 0 : _a.map((item) => item.url)) || [];
@@ -27,6 +29,7 @@ function parseSitemapUrl(sitemapItem) {
     return urls.map((url) => ({ url, sitemapItem }));
 }
 const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
+    var _b;
     config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
     const key = `${url}-${pathname}`;
     if (crawlBlockletRunningMap.has(key)) {
@@ -47,7 +50,7 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
     try {
         const jobIds = yield (0, p_map_1.default)(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
             processCount++;
-            const snapshot = yield
+            const snapshot = yield store_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
             if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
                 const lastModified = new Date(snapshot.lastModified);
                 // skip if snapshot lastModified is greater than sitemap lastmod
@@ -66,13 +69,17 @@ const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, path
                 url,
             });
             crawlCount++;
-
+            const jobId = (0, node_crypto_1.randomUUID)();
+            crawlQueue.push({
+                id: jobId,
                 url,
                 lastModified: sitemapItem.lastmod,
                 includeScreenshot: false,
                 includeHtml: true,
+                replace: true,
             });
-
+            return jobId;
+        }), { concurrency: ((_b = config_1.config.siteCron) === null || _b === void 0 ? void 0 : _b.concurrency) || 30 });
         config_1.logger.info('Enqueued jobs from sitemap finished', {
             url,
             pathname,
package/lib/cjs/store/index.d.ts
CHANGED
@@ -1,3 +1,6 @@
 import { Sequelize } from '@sequelize/core';
 import { SqliteDialect } from '@sequelize/sqlite3';
-
+declare const sequelize: Sequelize<SqliteDialect>;
+export { sequelize };
+export * from './job';
+export * from './snapshot';
package/lib/cjs/store/index.js
CHANGED
@@ -1,57 +1,49 @@
 "use strict";
-var
-
-
-
-
-
-
-
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __exportStar = (this && this.__exportStar) || function(m, exports) {
+    for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
 };
 var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.
+exports.sequelize = void 0;
 const core_1 = require("@sequelize/core");
 const sqlite3_1 = require("@sequelize/sqlite3");
 const path_1 = __importDefault(require("path"));
 const config_1 = require("../config");
 const job_1 = require("./job");
 const snapshot_1 = require("./snapshot");
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-sequelize.query('pragma journal_size_limit = 67108864;'),
-]);
-yield sequelize.authenticate();
-yield sequelize.sync({ alter: process.env.ALTER_SQLITE === 'true' });
-config_1.logger.info('Successfully connected to database');
-}
-catch (error) {
-config_1.logger.error('Failed to connect to database:', error);
-throw error;
-}
-return sequelize;
-});
-}
+const sequelize = new core_1.Sequelize({
+    dialect: sqlite3_1.SqliteDialect,
+    storage: path_1.default.join(config_1.config.dataDir, 'snap-kit.db'),
+    logging: (msg) => process.env.SQLITE_LOG && config_1.logger.debug(msg),
+    pool: {
+        min: 0,
+        max: 10,
+        idle: 10000,
+    },
+    retry: {
+        match: [/SQLITE_BUSY/],
+        name: 'query',
+        max: 10,
+    },
+});
+exports.sequelize = sequelize;
+sequelize.query('pragma journal_mode = WAL;');
+sequelize.query('pragma synchronous = normal;');
+sequelize.query('pragma journal_size_limit = 67108864;');
+job_1.Job.initModel(sequelize);
+snapshot_1.Snapshot.initModel(sequelize);
+__exportStar(require("./job"), exports);
+__exportStar(require("./snapshot"), exports);
package/lib/cjs/store/job.d.ts
CHANGED
@@ -12,9 +12,14 @@ export interface JobState {
     timeout?: number;
     fullPage?: boolean;
     lastModified?: string;
+    waitTime?: number;
+    replace?: boolean;
     headers?: Record<string, string>;
     cookies?: CookieParam[];
-    localStorage?:
+    localStorage?: {
+        key: string;
+        value: string;
+    }[];
 }
 export interface JobModel {
     id: string;
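JobState picks up `waitTime` and `replace`, and `localStorage` becomes an array of key/value pairs. A sketch of a job using the new fields, assuming `crawlUrl` from crawler.js is exported at the package root and resolves to the job id:

```ts
import { crawlUrl } from '@arcblock/crawler';

const jobId = await crawlUrl({
  url: 'https://example.com/blog',
  includeHtml: true,
  includeScreenshot: false,
  waitTime: 2000, // extra wait on top of network idle
  replace: true, // drop older snapshots of this URL before saving the new one
  localStorage: [{ key: 'blocklet_theme_prefer', value: 'light' }], // illustrative key
});
```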
|