@arcblock/crawler 1.0.5 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/lib/cjs/config.d.ts +22 -0
- package/{dist → lib/cjs}/config.js +9 -3
- package/lib/cjs/crawler.d.ts +26 -0
- package/{dist → lib/cjs}/crawler.js +56 -113
- package/lib/cjs/cron.d.ts +1 -0
- package/lib/cjs/cron.js +49 -0
- package/lib/cjs/index.d.ts +9 -0
- package/lib/cjs/index.js +78 -0
- package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
- package/{dist → lib/cjs}/puppeteer.js +43 -54
- package/lib/cjs/services/snapshot.d.ts +12 -0
- package/lib/cjs/services/snapshot.js +84 -0
- package/lib/cjs/site.d.ts +2 -0
- package/lib/cjs/site.js +76 -0
- package/lib/cjs/store/index.d.ts +3 -0
- package/{dist/db → lib/cjs/store}/index.js +21 -5
- package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
- package/lib/cjs/store/job.js +110 -0
- package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
- package/lib/cjs/store/snapshot.js +68 -0
- package/lib/cjs/utils.d.ts +32 -0
- package/{dist → lib/cjs}/utils.js +88 -78
- package/lib/esm/config.d.ts +22 -0
- package/{esm → lib/esm}/config.js +9 -3
- package/lib/esm/crawler.d.ts +26 -0
- package/{esm → lib/esm}/crawler.js +48 -102
- package/lib/esm/cron.d.ts +1 -0
- package/lib/esm/cron.js +43 -0
- package/lib/esm/index.d.ts +9 -0
- package/{esm → lib/esm}/index.js +19 -10
- package/{dist → lib/esm}/puppeteer.d.ts +2 -2
- package/{esm → lib/esm}/puppeteer.js +26 -37
- package/lib/esm/services/snapshot.d.ts +12 -0
- package/lib/esm/services/snapshot.js +75 -0
- package/lib/esm/site.d.ts +2 -0
- package/lib/esm/site.js +69 -0
- package/lib/esm/store/index.d.ts +3 -0
- package/{esm/db → lib/esm/store}/index.js +22 -6
- package/{esm/db → lib/esm/store}/job.d.ts +4 -3
- package/lib/esm/store/job.js +73 -0
- package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
- package/lib/esm/store/snapshot.js +64 -0
- package/lib/esm/utils.d.ts +32 -0
- package/{esm → lib/esm}/utils.js +84 -71
- package/package.json +22 -33
- package/third.d.ts +0 -0
- package/dist/blocklet.d.ts +0 -6
- package/dist/blocklet.js +0 -199
- package/dist/cache.d.ts +0 -10
- package/dist/cache.js +0 -119
- package/dist/config.d.ts +0 -10
- package/dist/crawler.d.ts +0 -28
- package/dist/db/index.d.ts +0 -1
- package/dist/db/job.js +0 -54
- package/dist/db/snapshot.js +0 -52
- package/dist/index.d.ts +0 -6
- package/dist/index.js +0 -45
- package/dist/middleware.d.ts +0 -4
- package/dist/middleware.js +0 -44
- package/dist/utils.d.ts +0 -15
- package/esm/blocklet.d.ts +0 -6
- package/esm/blocklet.js +0 -190
- package/esm/cache.d.ts +0 -10
- package/esm/cache.js +0 -114
- package/esm/config.d.ts +0 -10
- package/esm/crawler.d.ts +0 -28
- package/esm/db/index.d.ts +0 -1
- package/esm/db/job.js +0 -50
- package/esm/db/snapshot.js +0 -48
- package/esm/index.d.ts +0 -6
- package/esm/middleware.d.ts +0 -4
- package/esm/middleware.js +0 -41
- package/esm/utils.d.ts +0 -15
package/lib/esm/cron.js
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import Cron from '@abtnode/cron';
|
|
11
|
+
import { config, logger } from './config';
|
|
12
|
+
import { crawlSite } from './site';
|
|
13
|
+
// Cron instance created by initCron(); stays null until Cron.init() has returned, so repeat calls can bail out early.
let cron = null;
|
|
14
|
+
/**
 * Create the site-crawl cron once per process.
 *
 * Registers a single `crawl-site` job, scheduled from `config.siteCron`, that
 * crawls every configured site one after another. A failing site is logged and
 * does not stop the remaining sites from being crawled.
 *
 * @returns The newly created cron instance, or `undefined` when a cron has
 *          already been initialised.
 */
export function initCron() {
    if (cron) {
        return;
    }
    logger.info('Init cron', { config: config.siteCron });
    // Job body: the site list is read at run time, so later config changes are picked up.
    const crawlAllSites = () => __awaiter(this, void 0, void 0, function* () {
        logger.info('Start cron to crawl site', { sites: config.siteCron.sites });
        for (const site of config.siteCron.sites) {
            try {
                yield crawlSite(site);
            }
            catch (err) {
                logger.error('Cron task error', { err, site });
            }
        }
    });
    const crawlSiteJob = {
        name: 'crawl-site',
        time: config.siteCron.time,
        options: { runOnInit: config.siteCron.runOnInit },
        fn: crawlAllSites,
    };
    cron = Cron.init({
        context: {},
        jobs: [crawlSiteJob],
        onError: (err) => {
            logger.error('Cron error', err);
        },
    });
    return cron;
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { Config } from './config';
|
|
2
|
+
export * from './crawler';
|
|
3
|
+
export * from './site';
|
|
4
|
+
export * from './services/snapshot';
|
|
5
|
+
export * as utils from './utils';
|
|
6
|
+
/**
 * Recursive counterpart of `Partial`: every property of an object type becomes
 * optional at every nesting level, while non-object types are left unchanged.
 */
type DeepPartial<T> = T extends object
    ? { [K in keyof T]?: DeepPartial<T[K]> }
    : T;
|
|
9
|
+
/**
 * Initialises the crawler. Per the accompanying index.js, `params` is merged into
 * the shared config, then the database, browser, crawl queue and cron are set up
 * in that order; a failure in any of those steps is logged and re-thrown.
 *
 * @param params Partial overrides for the `puppeteerPath` and `siteCron` config entries.
 */
export declare function initCrawler(params: DeepPartial<Pick<Config, 'puppeteerPath' | 'siteCron'>>): Promise<void>;
|
package/{esm → lib/esm}/index.js
RENAMED
|
@@ -7,20 +7,29 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
7
7
|
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
8
|
});
|
|
9
9
|
};
|
|
10
|
+
import merge from 'lodash/merge';
|
|
10
11
|
import { config, logger } from './config';
|
|
11
12
|
import { createCrawlQueue } from './crawler';
|
|
12
|
-
import {
|
|
13
|
+
import { initCron } from './cron';
|
|
13
14
|
import { ensureBrowser } from './puppeteer';
|
|
14
|
-
|
|
15
|
+
import { initDatabase } from './store';
|
|
15
16
|
export * from './crawler';
|
|
16
|
-
export * from './
|
|
17
|
-
export
|
|
18
|
-
export
|
|
17
|
+
export * from './site';
|
|
18
|
+
export * from './services/snapshot';
|
|
19
|
+
export * as utils from './utils';
|
|
20
|
+
export function initCrawler(params) {
|
|
19
21
|
return __awaiter(this, void 0, void 0, function* () {
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
22
|
+
logger.info('Init crawler', { params });
|
|
23
|
+
merge(config, params);
|
|
24
|
+
try {
|
|
25
|
+
yield initDatabase();
|
|
26
|
+
yield ensureBrowser();
|
|
27
|
+
yield createCrawlQueue();
|
|
28
|
+
yield initCron();
|
|
29
|
+
}
|
|
30
|
+
catch (err) {
|
|
31
|
+
logger.error('Init crawler error', { err });
|
|
32
|
+
throw err;
|
|
33
|
+
}
|
|
25
34
|
});
|
|
26
35
|
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import puppeteer, { Browser, Page } from '@blocklet/puppeteer';
|
|
1
|
+
import puppeteer, { Browser, Page, ResourceType } from '@blocklet/puppeteer';
|
|
2
2
|
export { puppeteer };
|
|
3
3
|
export declare function ensurePuppeteerrc(): Promise<{
|
|
4
4
|
cacheDirectory: string;
|
|
@@ -12,5 +12,5 @@ export declare const closeBrowser: ({ trimCache }?: {
|
|
|
12
12
|
trimCache?: boolean;
|
|
13
13
|
}) => Promise<void>;
|
|
14
14
|
export declare function initPage({ abortResourceTypes }?: {
|
|
15
|
-
abortResourceTypes?:
|
|
15
|
+
abortResourceTypes?: ResourceType[];
|
|
16
16
|
}): Promise<Page>;
|
|
@@ -7,25 +7,20 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
7
7
|
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
8
|
});
|
|
9
9
|
};
|
|
10
|
-
// import fs from 'fs-extra';
|
|
11
|
-
// import path from 'path';
|
|
12
10
|
import puppeteer from '@blocklet/puppeteer';
|
|
13
|
-
import { env } from '@blocklet/sdk/lib/config';
|
|
14
11
|
import fs from 'fs-extra';
|
|
15
12
|
import path from 'path';
|
|
16
13
|
import { clearInterval, setInterval } from 'timers';
|
|
17
|
-
import { useCache } from './cache';
|
|
18
14
|
import { config, logger } from './config';
|
|
19
15
|
import { CRAWLER_FLAG, sleep } from './utils';
|
|
20
|
-
// let puppeteerConfig: {
|
|
21
|
-
// cacheDirectory: string;
|
|
22
|
-
// temporaryDirectory: string;
|
|
23
|
-
// };
|
|
24
|
-
const BROWSER_WS_ENDPOINT_KEY = `browserWSEndpoint-${env.appId || 'unknown'}`;
|
|
25
16
|
const BrowserStatus = {
|
|
17
|
+
None: 'None',
|
|
26
18
|
Launching: 'Launching',
|
|
27
19
|
Ready: 'Ready',
|
|
28
20
|
};
|
|
21
|
+
let browserStatus = BrowserStatus.None;
|
|
22
|
+
/** Chromium WebSocket endpoint that allows puppeteer browser instance to connect to the browser */
|
|
23
|
+
let browserEndpoint = '';
|
|
29
24
|
let browser;
|
|
30
25
|
let browserActivatedTimer;
|
|
31
26
|
export { puppeteer };
|
|
@@ -49,9 +44,9 @@ export function ensurePuppeteerrc() {
|
|
|
49
44
|
export function ensureBrowser() {
|
|
50
45
|
return __awaiter(this, void 0, void 0, function* () {
|
|
51
46
|
const puppeteerConfig = yield ensurePuppeteerrc();
|
|
52
|
-
const executablePath =
|
|
53
|
-
logger.
|
|
54
|
-
if (!fs.existsSync(executablePath)) {
|
|
47
|
+
const executablePath = config.puppeteerPath;
|
|
48
|
+
logger.debug('executablePath', executablePath);
|
|
49
|
+
if (!executablePath || !fs.existsSync(executablePath)) {
|
|
55
50
|
logger.info('start download browser', puppeteerConfig);
|
|
56
51
|
const { downloadBrowser } = yield (() => __awaiter(this, void 0, void 0, function* () {
|
|
57
52
|
try {
|
|
@@ -69,7 +64,7 @@ export function ensureBrowser() {
|
|
|
69
64
|
}
|
|
70
65
|
}
|
|
71
66
|
// try to launch browser
|
|
72
|
-
if (config.
|
|
67
|
+
if (config.isProd) {
|
|
73
68
|
const browser = yield launchBrowser();
|
|
74
69
|
if (!browser) {
|
|
75
70
|
throw new Error('Failed to launch browser');
|
|
@@ -81,24 +76,23 @@ export function ensureBrowser() {
|
|
|
81
76
|
}
|
|
82
77
|
export function connectBrowser() {
|
|
83
78
|
return __awaiter(this, void 0, void 0, function* () {
|
|
84
|
-
|
|
85
|
-
if (!browserWSEndpoint) {
|
|
79
|
+
if (!browserEndpoint) {
|
|
86
80
|
return null;
|
|
87
81
|
}
|
|
88
82
|
// retry if browser is launching
|
|
89
|
-
if (
|
|
83
|
+
if (browserStatus === BrowserStatus.Launching) {
|
|
90
84
|
yield sleep(Math.floor(Math.random() * 1000));
|
|
91
85
|
return connectBrowser();
|
|
92
86
|
}
|
|
93
87
|
try {
|
|
94
88
|
browser = yield puppeteer.connect({
|
|
95
|
-
browserWSEndpoint:
|
|
89
|
+
browserWSEndpoint: browserEndpoint,
|
|
96
90
|
});
|
|
97
91
|
logger.info('Connect browser success');
|
|
98
92
|
}
|
|
99
93
|
catch (err) {
|
|
100
94
|
logger.warn('Connect browser failed, clear endpoint', err);
|
|
101
|
-
|
|
95
|
+
browserEndpoint = '';
|
|
102
96
|
return null;
|
|
103
97
|
}
|
|
104
98
|
return browser;
|
|
@@ -106,12 +100,9 @@ export function connectBrowser() {
|
|
|
106
100
|
}
|
|
107
101
|
export function launchBrowser() {
|
|
108
102
|
return __awaiter(this, void 0, void 0, function* () {
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
status: BrowserStatus.Launching,
|
|
112
|
-
});
|
|
103
|
+
browserEndpoint = '';
|
|
104
|
+
browserStatus = BrowserStatus.Launching;
|
|
113
105
|
try {
|
|
114
|
-
// @ts-ignore
|
|
115
106
|
browser = yield puppeteer.launch({
|
|
116
107
|
headless: true,
|
|
117
108
|
args: [
|
|
@@ -138,20 +129,17 @@ export function launchBrowser() {
|
|
|
138
129
|
'--font-render-hinting=none',
|
|
139
130
|
],
|
|
140
131
|
});
|
|
141
|
-
logger.info('Launch browser
|
|
132
|
+
logger.info('Launch browser');
|
|
142
133
|
}
|
|
143
134
|
catch (error) {
|
|
144
135
|
logger.error('launch browser failed: ', error);
|
|
145
|
-
|
|
146
|
-
|
|
136
|
+
browserStatus = BrowserStatus.None;
|
|
137
|
+
browserEndpoint = '';
|
|
147
138
|
throw error;
|
|
148
139
|
}
|
|
149
140
|
// save browserWSEndpoint to cache
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
endpoint,
|
|
153
|
-
status: BrowserStatus.Ready,
|
|
154
|
-
});
|
|
141
|
+
browserEndpoint = yield browser.wsEndpoint();
|
|
142
|
+
browserStatus = BrowserStatus.Ready;
|
|
155
143
|
return browser;
|
|
156
144
|
});
|
|
157
145
|
}
|
|
@@ -194,6 +182,7 @@ export const getBrowser = () => __awaiter(void 0, void 0, void 0, function* () {
|
|
|
194
182
|
if (connectedBrowser) {
|
|
195
183
|
logger.debug('getBrowser.connectedBrowser');
|
|
196
184
|
browser = connectedBrowser;
|
|
185
|
+
checkBrowserActivated();
|
|
197
186
|
return browser;
|
|
198
187
|
}
|
|
199
188
|
// try to launch browser
|
|
@@ -215,20 +204,20 @@ export const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0
|
|
|
215
204
|
yield Promise.all(pages.map((page) => page.close()));
|
|
216
205
|
}
|
|
217
206
|
catch (err) {
|
|
218
|
-
logger.
|
|
207
|
+
logger.warn('Failed to close all pages:', err);
|
|
219
208
|
}
|
|
220
209
|
// close browser
|
|
221
210
|
try {
|
|
222
211
|
yield browser.close();
|
|
223
212
|
}
|
|
224
213
|
catch (err) {
|
|
225
|
-
logger.
|
|
214
|
+
logger.warn('Failed to close browser:', err);
|
|
226
215
|
}
|
|
227
216
|
// clear cache
|
|
228
217
|
try {
|
|
229
218
|
if (trimCache) {
|
|
230
219
|
yield puppeteer.trimCache();
|
|
231
|
-
logger.
|
|
220
|
+
logger.debug('Trim cache success');
|
|
232
221
|
}
|
|
233
222
|
// try to clear temporary directory
|
|
234
223
|
// if (puppeteerConfig) {
|
|
@@ -239,11 +228,12 @@ export const closeBrowser = (...args_1) => __awaiter(void 0, [...args_1], void 0
|
|
|
239
228
|
}
|
|
240
229
|
}
|
|
241
230
|
catch (err) {
|
|
242
|
-
logger.
|
|
231
|
+
logger.warn('Failed to clear browser cache:', err);
|
|
243
232
|
}
|
|
244
233
|
browser = null;
|
|
245
234
|
clearBrowserActivatedTimer();
|
|
246
|
-
|
|
235
|
+
browserEndpoint = '';
|
|
236
|
+
browserStatus = BrowserStatus.None;
|
|
247
237
|
logger.info('Close browser success');
|
|
248
238
|
});
|
|
249
239
|
export function initPage() {
|
|
@@ -260,7 +250,6 @@ export function initPage() {
|
|
|
260
250
|
if (abortResourceTypes.length > 0) {
|
|
261
251
|
yield page.setRequestInterception(true);
|
|
262
252
|
page.on('request', (req) => {
|
|
263
|
-
// @ts-ignore
|
|
264
253
|
if (abortResourceTypes.includes(req.resourceType())) {
|
|
265
254
|
return req.abort();
|
|
266
255
|
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { JobState } from '../store/job';
|
|
2
|
+
import { SnapshotModel } from '../store/snapshot';
|
|
3
|
+
/**
 * Builds a snapshot record from a crawl job.
 *
 * @param job      Job whose id, url, last-modified time and capture options seed the snapshot.
 * @param snapshot Optional fields applied on top of the values derived from `job`.
 */
export declare function convertJobToSnapshot({ job, snapshot }: { job: JobState; snapshot?: Partial<SnapshotModel> }): SnapshotModel;
|
|
7
|
+
/**
 * Returns a copy of `snapshot` prepared for consumers. Per the accompanying
 * implementation, `screenshot` becomes a URL joined onto `config.appUrl` and
 * `html` is replaced by the contents of the file it points to under `config.dataDir`.
 *
 * @param columns When non-empty, only these keys are kept in the result.
 */
export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel>;
|
|
8
|
+
/**
 * Looks up the snapshot for a job id.
 *
 * Resolves to the stored snapshot when there is one, to a `pending` placeholder
 * when only the job can be found, and to `null` when neither exists.
 */
export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
|
|
12
|
+
/**
 * Resolves to a formatted snapshot with status `success` for the given URL, or `null` if none is found.
 * NOTE(review): which record counts as "latest" is decided inside `Snapshot.findSnapshot`, not visible here — confirm ordering.
 */
export declare function getLatestSnapshot(url: string): Promise<SnapshotModel | null>;
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import pick from 'lodash/pick';
|
|
11
|
+
import fs from 'node:fs/promises';
|
|
12
|
+
import path from 'node:path';
|
|
13
|
+
import { joinURL } from 'ufo';
|
|
14
|
+
import { config } from '../config';
|
|
15
|
+
import { Job } from '../store/job';
|
|
16
|
+
import { Snapshot } from '../store/snapshot';
|
|
17
|
+
import { formatUrl } from '../utils';
|
|
18
|
+
/**
 * Derive a snapshot record from a crawl job.
 *
 * The job supplies the id (`jobId`, falling back to `id`), the url, the
 * last-modified time (falling back to the current time as an ISO string) and
 * the capture options. Fields in `snapshot` are applied last and therefore
 * override the derived values.
 */
export function convertJobToSnapshot({ job, snapshot }) {
    const captureOptions = {
        width: job.width,
        height: job.height,
        includeScreenshot: job.includeScreenshot,
        includeHtml: job.includeHtml,
        quality: job.quality,
        fullPage: job.fullPage,
    };
    const derived = {
        jobId: job.jobId || job.id,
        url: job.url,
        lastModified: job.lastModified || new Date().toISOString(),
        options: captureOptions,
    };
    return Object.assign(derived, snapshot);
}
|
|
28
|
+
/**
 * Produce a consumer-facing copy of a snapshot; the input object is not modified.
 *
 * - `screenshot` (a stored path) is turned into a URL joined onto `config.appUrl`.
 * - `html` (a path relative to `config.dataDir`) is replaced by that file's contents.
 * - When `columns` is a non-empty list, only those keys are kept.
 *
 * @returns A promise for the formatted copy; it rejects if the html file cannot be read.
 */
export function formatSnapshot(snapshot, columns) {
    return __awaiter(this, void 0, void 0, function* () {
        const formatted = Object.assign({}, snapshot);
        if (formatted.screenshot) {
            formatted.screenshot = joinURL(config.appUrl, formatted.screenshot);
        }
        if (formatted.html) {
            const contents = yield fs.readFile(path.join(config.dataDir, formatted.html));
            formatted.html = contents.toString();
        }
        const hasColumns = columns !== null && columns !== undefined && columns.length;
        return hasColumns ? pick(formatted, columns) : formatted;
    });
}
|
|
46
|
+
/**
 * Resolve the snapshot that belongs to a job id.
 *
 * A stored snapshot is returned in formatted form. When no snapshot is stored
 * but a job with that id can still be found, a `{ jobId, status: 'pending' }`
 * placeholder is returned. Resolves to `null` when neither exists.
 */
export function getSnapshot(jobId) {
    return __awaiter(this, void 0, void 0, function* () {
        const stored = yield Snapshot.findSnapshot({ where: { jobId } });
        if (!stored) {
            // No snapshot yet: report the job as pending if it is still known.
            const knownJob = yield Job.findJob({ id: jobId });
            return knownJob ? { jobId, status: 'pending' } : null;
        }
        return formatSnapshot(stored);
    });
}
|
|
65
|
+
/**
 * Find a successful snapshot for a URL and return it in formatted form.
 *
 * The URL is normalised with `formatUrl` before the lookup, and only snapshots
 * whose status is `success` are considered. Resolves to `null` when nothing matches.
 */
export function getLatestSnapshot(url) {
    return __awaiter(this, void 0, void 0, function* () {
        const where = { url: formatUrl(url), status: 'success' };
        const match = yield Snapshot.findSnapshot({ where });
        if (!match) {
            return null;
        }
        return formatSnapshot(match);
    });
}
|
package/lib/esm/site.js
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import uniq from 'lodash/uniq';
|
|
11
|
+
import pMap from 'p-map';
|
|
12
|
+
import { config, logger } from './config';
|
|
13
|
+
import { crawlUrl } from './crawler';
|
|
14
|
+
import { Snapshot } from './store/snapshot';
|
|
15
|
+
import { formatUrl, getSitemapList } from './utils';
|
|
16
|
+
// Crawls currently in progress: crawlSite() stores its work list under `${url}-${pathname}` and removes the entry once the crawl settles.
const crawlBlockletRunningMap = new Map();
|
|
17
|
+
/**
 * Expand one sitemap entry into the list of URLs it refers to.
 *
 * Collects the `url` of every entry in `sitemapItem.links` (if any) followed by
 * the item's own `url`, removes duplicates and empty values, and pairs each
 * remaining URL with the sitemap item it came from.
 *
 * @returns An array of `{ url, sitemapItem }` objects.
 */
function parseSitemapUrl(sitemapItem) {
    const { links } = sitemapItem;
    const linkUrls = (links !== null && links !== undefined ? links.map((link) => link.url) : undefined) || [];
    const candidates = uniq([...linkUrls, sitemapItem.url]);
    const entries = [];
    for (const url of candidates) {
        // Drop empty / missing URLs.
        if (url) {
            entries.push({ url, sitemapItem });
        }
    }
    return entries;
}
|
|
23
|
+
/**
 * Crawl every page of a site that is listed in its sitemap and is due for a refresh.
 *
 * Steps:
 *  1. Load the sitemap list for `url` and keep the entries whose URL pathname matches `pathname`.
 *  2. Expand each entry into its URLs and drop those that already have a fresh snapshot.
 *  3. Submit the remaining URLs to `crawlUrl` (HTML only, no screenshot) with
 *     `config.siteCron.concurrency` crawls in flight at a time.
 *
 * @param url      Site URL handed to `getSitemapList`.
 * @param pathname Regular-expression source matched against each sitemap URL's pathname.
 * @param interval Minimum age, in days, a snapshot must reach before the page is crawled again (default 0).
 * @returns The values resolved by `crawlUrl`, in the order of the crawled pages (presumably job ids).
 * @throws Re-throws the original failure after logging it; a non-Error value is wrapped in an `Error`.
 */
export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
    logger.info(`Start crawl from sitemap ${url}`, { pathname });
    const sitemapList = yield getSitemapList(url);
    const pathnameRegex = new RegExp(pathname);
    const sitemapItems = sitemapList
        .filter((item) => new URL(item.url).pathname.match(pathnameRegex))
        .flatMap((sitemapItem) => parseSitemapUrl(sitemapItem));
    logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
    // `interval` is expressed in days; convert once instead of once per page.
    const intervalMs = interval * 24 * 60 * 60 * 1000;
    const crawlableItems = (yield Promise.all(sitemapItems.map((_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
        const snapshot = yield Snapshot.findOne({ where: { url: formatUrl(url) } });
        if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
            const lastModified = new Date(snapshot.lastModified);
            // skip if the snapshot is at least as recent as the sitemap's lastmod
            if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
                return null;
            }
            // skip if interval time has not been reached
            if (Date.now() - lastModified.getTime() < intervalMs) {
                return null;
            }
        }
        return { url, sitemapItem };
    })))).filter(Boolean);
    logger.info(`Found ${crawlableItems.length} pages to crawl from sitemap ${url}`, { pathname });
    // Track the crawl while it is running; the entry is always removed in `finally`.
    const key = `${url}-${pathname}`;
    crawlBlockletRunningMap.set(key, crawlableItems);
    try {
        const jobIds = yield pMap(crawlableItems, ({ url, sitemapItem }) => {
            return crawlUrl({
                url,
                lastModified: sitemapItem.lastmod,
                includeScreenshot: false,
                includeHtml: true,
            });
        }, { concurrency: config.siteCron.concurrency });
        return jobIds;
    }
    catch (error) {
        logger.error(`Failed to crawl from sitemap ${url} ${pathname}`, error);
        // Re-throw the original error so its stack and type survive; wrapping it in
        // `new Error(error)` would reduce it to its string form.
        throw error instanceof Error ? error : new Error(String(error));
    }
    finally {
        crawlBlockletRunningMap.delete(key);
    }
});
|
|
@@ -11,18 +11,33 @@ import { Sequelize } from '@sequelize/core';
|
|
|
11
11
|
import { SqliteDialect } from '@sequelize/sqlite3';
|
|
12
12
|
import path from 'path';
|
|
13
13
|
import { config, logger } from '../config';
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
16
|
-
export function
|
|
14
|
+
import { Job } from './job';
|
|
15
|
+
import { Snapshot } from './snapshot';
|
|
16
|
+
export function initDatabase() {
|
|
17
17
|
return __awaiter(this, void 0, void 0, function* () {
|
|
18
18
|
const sequelize = new Sequelize({
|
|
19
19
|
dialect: SqliteDialect,
|
|
20
20
|
storage: path.join(config.dataDir, 'snap-kit.db'),
|
|
21
|
-
logging: (msg) => logger.debug(msg),
|
|
21
|
+
logging: (msg) => process.env.SQLITE_LOG && logger.debug(msg),
|
|
22
|
+
pool: {
|
|
23
|
+
min: 0,
|
|
24
|
+
max: 10,
|
|
25
|
+
idle: 10000,
|
|
26
|
+
},
|
|
27
|
+
retry: {
|
|
28
|
+
match: [/SQLITE_BUSY/],
|
|
29
|
+
name: 'query',
|
|
30
|
+
max: 10,
|
|
31
|
+
},
|
|
22
32
|
});
|
|
23
|
-
|
|
24
|
-
|
|
33
|
+
Job.initModel(sequelize);
|
|
34
|
+
Snapshot.initModel(sequelize);
|
|
25
35
|
try {
|
|
36
|
+
yield Promise.all([
|
|
37
|
+
sequelize.query('pragma journal_mode = WAL;'),
|
|
38
|
+
sequelize.query('pragma synchronous = normal;'),
|
|
39
|
+
sequelize.query('pragma journal_size_limit = 67108864;'),
|
|
40
|
+
]);
|
|
26
41
|
yield sequelize.authenticate();
|
|
27
42
|
yield sequelize.sync();
|
|
28
43
|
logger.info('Successfully connected to database');
|
|
@@ -31,5 +46,6 @@ export function ensureDatabase() {
|
|
|
31
46
|
logger.error('Failed to connect to database:', error);
|
|
32
47
|
throw error;
|
|
33
48
|
}
|
|
49
|
+
return sequelize;
|
|
34
50
|
});
|
|
35
51
|
}
|
|
@@ -10,6 +10,7 @@ export interface JobState {
|
|
|
10
10
|
quality?: number;
|
|
11
11
|
timeout?: number;
|
|
12
12
|
fullPage?: boolean;
|
|
13
|
+
lastModified?: string;
|
|
13
14
|
}
|
|
14
15
|
export interface JobModel {
|
|
15
16
|
id: string;
|
|
@@ -20,7 +21,7 @@ export interface JobModel {
|
|
|
20
21
|
delay: number;
|
|
21
22
|
cancelled: boolean;
|
|
22
23
|
}
|
|
23
|
-
declare class Job extends Model<JobModel> implements JobModel {
|
|
24
|
+
export declare class Job extends Model<JobModel> implements JobModel {
|
|
24
25
|
id: JobModel['id'];
|
|
25
26
|
queue: JobModel['queue'];
|
|
26
27
|
job: JobModel['job'];
|
|
@@ -28,6 +29,6 @@ declare class Job extends Model<JobModel> implements JobModel {
|
|
|
28
29
|
willRunAt: JobModel['willRunAt'];
|
|
29
30
|
delay: JobModel['delay'];
|
|
30
31
|
cancelled: JobModel['cancelled'];
|
|
32
|
+
static initModel(sequelize: Sequelize): typeof Job;
|
|
33
|
+
static findJob(condition: Partial<JobState>): Promise<JobModel | null>;
|
|
31
34
|
}
|
|
32
|
-
export { Job };
|
|
33
|
-
export declare function initJobModel(sequelize: Sequelize): typeof Job;
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
2
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
3
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
4
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
5
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
6
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
7
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
8
|
+
});
|
|
9
|
+
};
|
|
10
|
+
import sequelize, { DataTypes, Model } from '@sequelize/core';
|
|
11
|
+
/**
 * Model for rows of the `jobs` table. The job payload itself is stored as JSON
 * in the `job` column.
 */
export class Job extends Model {
    /**
     * Register the model's columns and table options on the given Sequelize instance.
     *
     * @returns Whatever `Job.init` returns (declared elsewhere as `typeof Job`).
     */
    static initModel(sequelize) {
        // Both timestamp columns share the same definition; build a fresh object for each.
        const timestampColumn = () => ({
            type: DataTypes.DATE,
            defaultValue: DataTypes.NOW,
            index: true,
        });
        const attributes = {
            id: { type: DataTypes.STRING(40), primaryKey: true },
            queue: { type: DataTypes.STRING(32), allowNull: false },
            job: { type: DataTypes.JSON, allowNull: false },
            retryCount: { type: DataTypes.INTEGER },
            delay: { type: DataTypes.INTEGER },
            willRunAt: { type: DataTypes.INTEGER },
            cancelled: { type: DataTypes.BOOLEAN, defaultValue: false },
            createdAt: timestampColumn(),
            updatedAt: timestampColumn(),
        };
        const options = {
            sequelize,
            indexes: [{ fields: ['queue'] }],
            modelName: 'job',
            tableName: 'jobs',
            timestamps: true,
        };
        return Job.init(attributes, options);
    }
    /**
     * Find the most recently created job whose JSON payload matches `condition`.
     *
     * Every key of `condition` with a defined value becomes an equality test on
     * `json_extract(job, '$.<key>')`; all tests must hold.
     *
     * @returns The matching row as a plain object, or `null` when there is none.
     */
    static findJob(condition) {
        return __awaiter(this, void 0, void 0, function* () {
            const payloadFilters = [];
            for (const key of Object.keys(condition)) {
                if (condition[key] === undefined) {
                    continue;
                }
                const extracted = sequelize.fn('json_extract', sequelize.col('job'), `$.${key}`);
                payloadFilters.push(sequelize.where(extracted, condition[key]));
            }
            const row = yield Job.findOne({
                where: { [sequelize.Op.and]: payloadFilters },
                order: [['createdAt', 'DESC']],
            });
            if (row === null || row === undefined) {
                return null;
            }
            return row.toJSON() || null;
        });
    }
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Model, Sequelize } from '@sequelize/core';
|
|
2
|
-
interface SnapshotModel {
|
|
1
|
+
import { FindOptions, Model, Sequelize } from '@sequelize/core';
|
|
2
|
+
export interface SnapshotModel {
|
|
3
3
|
jobId: string;
|
|
4
4
|
url: string;
|
|
5
5
|
status: 'success' | 'failed' | 'pending';
|
|
@@ -16,7 +16,7 @@ interface SnapshotModel {
|
|
|
16
16
|
fullPage?: boolean;
|
|
17
17
|
};
|
|
18
18
|
}
|
|
19
|
-
declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
|
|
19
|
+
export declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
|
|
20
20
|
jobId: SnapshotModel['jobId'];
|
|
21
21
|
url: SnapshotModel['url'];
|
|
22
22
|
status: SnapshotModel['status'];
|
|
@@ -25,7 +25,6 @@ declare class Snapshot extends Model<SnapshotModel> implements SnapshotModel {
|
|
|
25
25
|
error?: SnapshotModel['error'];
|
|
26
26
|
lastModified?: SnapshotModel['lastModified'];
|
|
27
27
|
options: SnapshotModel['options'];
|
|
28
|
+
static initModel(sequelize: Sequelize): typeof Snapshot;
|
|
29
|
+
static findSnapshot(condition: FindOptions<SnapshotModel>): Promise<SnapshotModel | null>;
|
|
28
30
|
}
|
|
29
|
-
export { Snapshot };
|
|
30
|
-
export type { SnapshotModel };
|
|
31
|
-
export declare function initSnapshotModel(sequelize: Sequelize): typeof Snapshot;
|