@arcblock/crawler 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +65 -0
- package/lib/cjs/config.d.ts +4 -2
- package/lib/cjs/config.js +7 -5
- package/lib/cjs/crawler.d.ts +4 -0
- package/lib/cjs/crawler.js +31 -17
- package/lib/cjs/cron.js +1 -1
- package/lib/cjs/index.js +4 -2
- package/lib/cjs/site.d.ts +1 -1
- package/lib/cjs/site.js +23 -20
- package/lib/cjs/store/index.js +1 -1
- package/lib/cjs/store/snapshot.d.ts +5 -0
- package/lib/cjs/store/snapshot.js +4 -0
- package/lib/esm/config.d.ts +4 -2
- package/lib/esm/config.js +7 -5
- package/lib/esm/crawler.d.ts +4 -0
- package/lib/esm/crawler.js +31 -17
- package/lib/esm/cron.js +1 -1
- package/lib/esm/index.js +4 -2
- package/lib/esm/site.d.ts +1 -1
- package/lib/esm/site.js +23 -20
- package/lib/esm/store/index.js +1 -1
- package/lib/esm/store/snapshot.d.ts +5 -0
- package/lib/esm/store/snapshot.js +4 -0
- package/package.json +1 -1
package/README.md
CHANGED
@@ -1 +1,66 @@
 # @arcblock/crawler
+
+A crawler module designed for Blocklets. It supports batch crawling of HTML, webpage screenshots, titles, descriptions, and more, based on a URL or sitemap.
+
+## Usage
+
+```typescript
+import { crawlUrl, getSnapshot, initCrawler } from '@arcblock/crawler';
+
+await initCrawler();
+
+// Asynchronously crawl a page
+const jobId = await crawlUrl({ url: 'https://www.arcblock.io', includeScreenshot: true, includeHtml: true });
+
+// Get the crawling result (you need to wait for the crawler to finish)
+const snapshot = await getSnapshot(jobId);
+```
+
+### initCrawler
+
+Initializes the crawler.
+
+### crawlUrl
+
+Crawls the specified page.
+
+### getSnapshot
+
+Gets the crawling result by `jobId`.
+
+### getLatestSnapshot
+
+Gets the latest crawling result by URL.
+
+## Schedule
+
+Passing `siteCron` to `initCrawler` enables a scheduled task that periodically crawls all pages of the specified websites based on their sitemaps.
+
+```typescript
+await initCrawler({
+  siteCron: {
+    enabled: !!env.preferences.cronEnabled,
+    immediate: !!env.preferences.cronImmediate,
+    sites: env.preferences.cronSites,
+    time: env.preferences.cronTime,
+    crawlConcurrency: env.preferences.crawlConcurrency,
+    sitemapConcurrency: env.preferences.sitemapConcurrency,
+  },
+});
+```
+
+## Environment Variables
+
+- `PUPPETEER_EXECUTABLE_PATH`: The executable path for Puppeteer. This variable is not required when running inside the `arcblock/snap-kit` Docker image. For local development, you can set it to the Chrome path: `/Applications/Google Chrome.app/Contents/MacOS/Google Chrome`
+
+If the module is not used from a Blocklet, some Blocklet environment variables it depends on must be provided:
+
+- `BLOCKLET_CACHE_DIR`: (Optional) The directory where Puppeteer is installed automatically when `PUPPETEER_EXECUTABLE_PATH` is not provided. Defaults to `process.cwd()`.
+
+- `BLOCKLET_APP_URL`: (Optional) The domain prefix for screenshots. Defaults to `/`.
+
+- `BLOCKLET_DATA_DIR`: (Required) The directory where webpage screenshots and HTML source files captured by the crawler are saved.
+
+## SQLite
+
+When `initCrawler` is called, it attempts to create an SQLite database in `BLOCKLET_DATA_DIR`. The database is used to cache HTML content and screenshots, so make sure the deployment environment supports SQLite.
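`getLatestSnapshot` is documented above but not shown in code. A minimal usage sketch, assuming it accepts the page URL and resolves to the most recent stored snapshot (or a null-ish value when the page has never been crawled); the `meta` fields come from the snapshot changes later in this diff:

```typescript
import { getLatestSnapshot, initCrawler } from '@arcblock/crawler';

await initCrawler();

// Assumption: getLatestSnapshot takes the page URL and returns the latest snapshot record.
const latest = await getLatestSnapshot('https://www.arcblock.io');
if (latest) {
  // meta.title / meta.description are newly persisted in 1.1.2
  console.log(latest.meta?.title, latest.meta?.description);
}
```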
package/lib/cjs/config.d.ts
CHANGED
@@ -14,8 +14,10 @@ export type Config = {
     siteCron: {
         sites: Site[];
         time: string;
-
-
+        enabled: boolean;
+        immediate: boolean;
+        crawlConcurrency: number;
+        sitemapConcurrency: number;
     };
 };
 export declare const logger: any;
package/lib/cjs/config.js
CHANGED
@@ -10,14 +10,16 @@ exports.config = {
     isProd: process.env.NODE_ENV === 'production',
     dataDir: process.env.BLOCKLET_DATA_DIR,
     appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
-
+    cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
+    appUrl: process.env.BLOCKLET_APP_URL || '/',
     puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
-    cacheDir: process.env.BLOCKLET_CACHE_DIR,
     // cron
     siteCron: {
         sites: [],
-
-
-
+        enabled: true,
+        time: '0 0 0 * * *',
+        immediate: false,
+        crawlConcurrency: 2,
+        sitemapConcurrency: 30,
     },
 };
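The new defaults above amount to: cron enabled, a daily run at midnight (`'0 0 0 * * *'`), no immediate run on startup, 2 concurrent page crawls, and 30 concurrent sitemap entries. A minimal sketch of overriding them through `initCrawler` (the `sites` entry shape `{ url, pathname, interval }` follows the `Site` type used in `site.d.ts`; the concrete values are illustrative):

```typescript
import { initCrawler } from '@arcblock/crawler';

await initCrawler({
  siteCron: {
    enabled: true,
    immediate: true, // also run one crawl pass right after init (wired to the cron's runOnInit)
    sites: [
      // interval is in days: pages crawled within the last `interval` days are skipped
      { url: 'https://www.arcblock.io', pathname: '/blog', interval: 7 },
    ],
    time: '0 0 3 * * *', // 03:00 every day instead of midnight
    crawlConcurrency: 2,
    sitemapConcurrency: 10,
  },
});
```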
package/lib/cjs/crawler.d.ts
CHANGED
@@ -17,6 +17,10 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
 }) => Promise<{
     html: string;
     screenshot: Uint8Array<ArrayBufferLike> | null;
+    meta: {
+        title?: string;
+        description?: string;
+    };
 }>;
 /**
  * crawl url and return job id
package/lib/cjs/crawler.js
CHANGED
@@ -33,7 +33,7 @@ function createCrawlQueue() {
     const db = new BaseState(job_1.Job);
     crawlQueue = (0, queue_1.default)({
         store: new sequelize_1.default(db, 'crawler'),
-        concurrency:
+        concurrency: config_1.config.siteCron.crawlConcurrency,
         onJob: (job) => __awaiter(this, void 0, void 0, function* () {
             config_1.logger.info('Starting to execute crawl job', job);
             const canCrawl = yield (0, utils_1.isAcceptCrawler)(job.url);
@@ -84,6 +84,7 @@
                     status: 'success',
                     screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config_1.config.dataDir, ''),
                     html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config_1.config.dataDir, ''),
+                    meta: result.meta,
                 },
             });
             yield snapshot_2.Snapshot.upsert(snapshot);
@@ -150,6 +151,7 @@
     }
     let html = null;
     let screenshot = null;
+    const meta = {};
     try {
         const response = yield page.goto(url, { timeout });
         if (!response) {
@@ -170,7 +172,7 @@
         // Try to find the tallest element and set the browser to the same height
         if (fullPage) {
             const maxScrollHeight = yield (0, utils_1.findMaxScrollHeight)(page);
-            config_1.logger.
+            config_1.logger.debug('findMaxScrollHeight', { maxScrollHeight });
             if (maxScrollHeight) {
                 yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
                 yield page.evaluate((scrollHeight) => {
@@ -188,22 +190,33 @@
             }
         }
         // get html
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try {
+            const data = yield page.evaluate(() => {
+                var _a;
+                // add meta tag to record crawler
+                const meta = document.createElement('meta');
+                meta.name = 'arcblock-crawler';
+                meta.content = 'true';
+                document.head.appendChild(meta);
+                // get title and meta description
+                const title = document.title || '';
+                const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
+                return {
+                    html: document.documentElement.outerHTML,
+                    title,
+                    description,
+                };
+            });
+            meta.title = data.title;
+            meta.description = data.description;
+            if (includeHtml) {
+                html = data.html;
             }
         }
+        catch (err) {
+            config_1.logger.error('Failed to get html:', err);
+            throw err;
+        }
     }
     catch (error) {
         config_1.logger.error('Failed to get page content:', error);
@@ -216,6 +229,7 @@
     return {
         html,
         screenshot,
+        meta,
     };
 });
 exports.getPageContent = getPageContent;
@@ -238,7 +252,7 @@ function crawlUrl(params, callback) {
         fullPage: params.fullPage,
     })) || {};
     if (duplicateJob) {
-        config_1.logger.
+        config_1.logger.info(`Crawl job already exists for ${params.url}, skip`);
         return duplicateJob.id;
     }
     config_1.logger.info('create crawl job', params);
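The compiled `page.evaluate` block added above is easier to follow in source form. A rough sketch of what it corresponds to (reconstructed from the compiled output; the inner variable name is changed for readability):

```typescript
// Inside getPageContent, after the page has rendered.
const data = await page.evaluate(() => {
  // add meta tag to record crawler: mark the rendered document so it can be
  // recognized later as crawler output
  const marker = document.createElement('meta');
  marker.name = 'arcblock-crawler';
  marker.content = 'true';
  document.head.appendChild(marker);

  // get title and meta description alongside the serialized HTML
  return {
    html: document.documentElement.outerHTML,
    title: document.title || '',
    description: document.querySelector('meta[name="description"]')?.getAttribute('content') || '',
  };
});

meta.title = data.title;
meta.description = data.description;
if (includeHtml) {
  html = data.html;
}
```

If this block throws, the error is logged and re-thrown to the surrounding error handler in `getPageContent`.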
package/lib/cjs/cron.js
CHANGED
@@ -27,7 +27,7 @@ function initCron() {
         {
             name: 'crawl-site',
             time: config_1.config.siteCron.time,
-            options: { runOnInit: config_1.config.siteCron.
+            options: { runOnInit: config_1.config.siteCron.immediate },
             fn: () => __awaiter(this, void 0, void 0, function* () {
                 config_1.logger.info('Start cron to crawl site', { sites: config_1.config.siteCron.sites });
                 for (const site of config_1.config.siteCron.sites) {
package/lib/cjs/index.js
CHANGED
@@ -62,13 +62,15 @@ __exportStar(require("./services/snapshot"), exports);
 exports.utils = __importStar(require("./utils"));
 function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
-        config_1.logger.info('Init crawler', { params });
         (0, merge_1.default)(config_1.config, params);
+        config_1.logger.info('Init crawler', { params, config: config_1.config });
         try {
             yield (0, store_1.initDatabase)();
             yield (0, puppeteer_1.ensureBrowser)();
             yield (0, crawler_1.createCrawlQueue)();
-
+            if (config_1.config.siteCron.enabled) {
+                yield (0, cron_1.initCron)();
+            }
         }
         catch (err) {
             config_1.logger.error('Init crawler error', { err });
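With this change the site cron is opt-out: it is only registered when `siteCron.enabled` is truthy (the default shipped in `config.js` is `true`). A minimal sketch of initializing the crawler without the scheduled crawl, assuming `initCrawler` accepts a partial config as the `merge` call suggests:

```typescript
import { initCrawler } from '@arcblock/crawler';

// Database, browser and crawl queue are still initialized; no cron job is registered.
await initCrawler({ siteCron: { enabled: false } });
```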
package/lib/cjs/site.d.ts
CHANGED
@@ -1,2 +1,2 @@
 import { Site } from './config';
-export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | undefined)[]>;
+export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null | undefined)[]>;
package/lib/cjs/site.js
CHANGED
@@ -28,6 +28,11 @@ function parseSitemapUrl(sitemapItem) {
 }
 const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
     config_1.logger.info(`Start crawl from sitemap ${url}`, { pathname });
+    const key = `${url}-${pathname}`;
+    if (crawlBlockletRunningMap.has(key)) {
+        config_1.logger.info(`Crawl from sitemap ${url} ${pathname} is already running, skip`);
+        return [];
+    }
     const sitemapList = yield (0, utils_1.getSitemapList)(url);
     const pathnameRegex = new RegExp(pathname);
     const sitemapItems = sitemapList
@@ -36,33 +41,31 @@
         return parseSitemapUrl(sitemapItem);
     });
     config_1.logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
-
-
-        if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
-            const lastModified = new Date(snapshot.lastModified);
-            // skip if snapshot lastModified is greater than sitemap lastmod
-            if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
-                return null;
-            }
-            // skip if interval time has not been reached
-            if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
-                return null;
-            }
-        }
-        return { url, sitemapItem };
-    })))).filter(Boolean);
-    config_1.logger.info(`Found ${crawlableItems.length} pages to crawl from sitemap ${url}`, { pathname });
-    const key = `${url}-${pathname}`;
-    crawlBlockletRunningMap.set(key, crawlableItems);
+    let processCount = 0;
+    crawlBlockletRunningMap.set(key, true);
     try {
-        const jobIds = yield (0, p_map_1.default)(
+        const jobIds = yield (0, p_map_1.default)(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
+            processCount++;
+            const snapshot = yield snapshot_1.Snapshot.findOne({ where: { url: (0, utils_1.formatUrl)(url) } });
+            if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
+                const lastModified = new Date(snapshot.lastModified);
+                // skip if snapshot lastModified is greater than sitemap lastmod
+                if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
+                    return null;
+                }
+                // skip if interval time has not been reached
+                if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
+                    return null;
+                }
+            }
+            config_1.logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`);
             return (0, crawler_1.crawlUrl)({
                 url,
                 lastModified: sitemapItem.lastmod,
                 includeScreenshot: false,
                 includeHtml: true,
             });
-        }, { concurrency: config_1.config.siteCron.
+        }), { concurrency: config_1.config.siteCron.sitemapConcurrency });
         return jobIds;
     }
     catch (error) {
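`crawlSite` now also guards against overlapping runs: a second call for the same `url`/`pathname` combination returns an empty array while the first is still in progress, and skipped sitemap entries resolve to `null` (hence the widened return type in `site.d.ts`). A usage sketch, assuming `crawlSite` is re-exported from the package entry point:

```typescript
import { crawlSite, initCrawler } from '@arcblock/crawler';

await initCrawler();

// Crawl every sitemap entry whose path matches /blog, skipping pages
// that already have a snapshot newer than 7 days (interval is in days).
const jobIds = await crawlSite({ url: 'https://www.arcblock.io', pathname: '/blog', interval: 7 });

// Drop the null/undefined entries for pages that were skipped.
console.log(jobIds.filter(Boolean));
```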
package/lib/cjs/store/index.js
CHANGED
@@ -45,7 +45,7 @@ function initDatabase() {
             sequelize.query('pragma journal_size_limit = 67108864;'),
         ]);
         yield sequelize.authenticate();
-        yield sequelize.sync();
+        yield sequelize.sync({ alter: process.env.ALTER_SQLITE === 'true' });
         config_1.logger.info('Successfully connected to database');
     }
     catch (error) {
package/lib/cjs/store/snapshot.d.ts
CHANGED
@@ -7,6 +7,10 @@ export interface SnapshotModel {
     screenshot?: string | null;
     error?: string;
     lastModified?: string;
+    meta?: {
+        title?: string;
+        description?: string;
+    };
     options?: {
         width?: number;
         height?: number;
@@ -24,6 +28,7 @@ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotMo
     screenshot?: SnapshotModel['screenshot'];
     error?: SnapshotModel['error'];
     lastModified?: SnapshotModel['lastModified'];
+    meta?: SnapshotModel['meta'];
     options: SnapshotModel['options'];
     static initModel(sequelize: Sequelize): typeof Snapshot;
     static findSnapshot(condition: FindOptions<SnapshotModel>): Promise<SnapshotModel | null>;
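Two practical notes on this store change: `sequelize.sync()` does not alter existing tables, so a database created by 1.1.1 only gains the new `meta` column when the process is started with `ALTER_SQLITE=true`; and snapshots read back through the service layer can now carry `meta.title` / `meta.description`. A minimal sketch (the environment variable is exactly the one checked above; the read side assumes `getSnapshot` resolves to a `SnapshotModel`):

```typescript
import { crawlUrl, getSnapshot, initCrawler } from '@arcblock/crawler';

// For databases created before 1.1.2, opt in to the schema migration once:
//   ALTER_SQLITE=true node app.js
await initCrawler();

const jobId = await crawlUrl({ url: 'https://www.arcblock.io', includeHtml: true, includeScreenshot: false });

// Later, once the crawl job has finished:
if (jobId) {
  const snapshot = await getSnapshot(jobId);
  console.log(snapshot?.meta?.title, snapshot?.meta?.description);
}
```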
package/lib/esm/config.d.ts
CHANGED
@@ -14,8 +14,10 @@ export type Config = {
     siteCron: {
         sites: Site[];
         time: string;
-
-
+        enabled: boolean;
+        immediate: boolean;
+        crawlConcurrency: number;
+        sitemapConcurrency: number;
     };
 };
 export declare const logger: any;
|
package/lib/esm/config.js
CHANGED
|
@@ -4,14 +4,16 @@ export const config = {
|
|
|
4
4
|
isProd: process.env.NODE_ENV === 'production',
|
|
5
5
|
dataDir: process.env.BLOCKLET_DATA_DIR,
|
|
6
6
|
appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
|
|
7
|
-
|
|
7
|
+
cacheDir: process.env.BLOCKLET_CACHE_DIR || process.cwd(),
|
|
8
|
+
appUrl: process.env.BLOCKLET_APP_URL || '/',
|
|
8
9
|
puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
|
|
9
|
-
cacheDir: process.env.BLOCKLET_CACHE_DIR,
|
|
10
10
|
// cron
|
|
11
11
|
siteCron: {
|
|
12
12
|
sites: [],
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
13
|
+
enabled: true,
|
|
14
|
+
time: '0 0 0 * * *',
|
|
15
|
+
immediate: false,
|
|
16
|
+
crawlConcurrency: 2,
|
|
17
|
+
sitemapConcurrency: 30,
|
|
16
18
|
},
|
|
17
19
|
};
|
package/lib/esm/crawler.d.ts
CHANGED
|
@@ -17,6 +17,10 @@ export declare const getPageContent: ({ url, includeScreenshot, includeHtml, wid
|
|
|
17
17
|
}) => Promise<{
|
|
18
18
|
html: string;
|
|
19
19
|
screenshot: Uint8Array<ArrayBufferLike> | null;
|
|
20
|
+
meta: {
|
|
21
|
+
title?: string;
|
|
22
|
+
description?: string;
|
|
23
|
+
};
|
|
20
24
|
}>;
|
|
21
25
|
/**
|
|
22
26
|
* crawl url and return job id
|
package/lib/esm/crawler.js
CHANGED
|
@@ -24,7 +24,7 @@ export function createCrawlQueue() {
|
|
|
24
24
|
const db = new BaseState(Job);
|
|
25
25
|
crawlQueue = createQueue({
|
|
26
26
|
store: new SequelizeStore(db, 'crawler'),
|
|
27
|
-
concurrency:
|
|
27
|
+
concurrency: config.siteCron.crawlConcurrency,
|
|
28
28
|
onJob: (job) => __awaiter(this, void 0, void 0, function* () {
|
|
29
29
|
logger.info('Starting to execute crawl job', job);
|
|
30
30
|
const canCrawl = yield isAcceptCrawler(job.url);
|
|
@@ -75,6 +75,7 @@ export function createCrawlQueue() {
|
|
|
75
75
|
status: 'success',
|
|
76
76
|
screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
|
|
77
77
|
html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config.dataDir, ''),
|
|
78
|
+
meta: result.meta,
|
|
78
79
|
},
|
|
79
80
|
});
|
|
80
81
|
yield Snapshot.upsert(snapshot);
|
|
@@ -141,6 +142,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
141
142
|
}
|
|
142
143
|
let html = null;
|
|
143
144
|
let screenshot = null;
|
|
145
|
+
const meta = {};
|
|
144
146
|
try {
|
|
145
147
|
const response = yield page.goto(url, { timeout });
|
|
146
148
|
if (!response) {
|
|
@@ -161,7 +163,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
161
163
|
// Try to find the tallest element and set the browser to the same height
|
|
162
164
|
if (fullPage) {
|
|
163
165
|
const maxScrollHeight = yield findMaxScrollHeight(page);
|
|
164
|
-
logger.
|
|
166
|
+
logger.debug('findMaxScrollHeight', { maxScrollHeight });
|
|
165
167
|
if (maxScrollHeight) {
|
|
166
168
|
yield page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
|
|
167
169
|
yield page.evaluate((scrollHeight) => {
|
|
@@ -179,22 +181,33 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
179
181
|
}
|
|
180
182
|
}
|
|
181
183
|
// get html
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
184
|
+
try {
|
|
185
|
+
const data = yield page.evaluate(() => {
|
|
186
|
+
var _a;
|
|
187
|
+
// add meta tag to record crawler
|
|
188
|
+
const meta = document.createElement('meta');
|
|
189
|
+
meta.name = 'arcblock-crawler';
|
|
190
|
+
meta.content = 'true';
|
|
191
|
+
document.head.appendChild(meta);
|
|
192
|
+
// get title and meta description
|
|
193
|
+
const title = document.title || '';
|
|
194
|
+
const description = ((_a = document.querySelector('meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.getAttribute('content')) || '';
|
|
195
|
+
return {
|
|
196
|
+
html: document.documentElement.outerHTML,
|
|
197
|
+
title,
|
|
198
|
+
description,
|
|
199
|
+
};
|
|
200
|
+
});
|
|
201
|
+
meta.title = data.title;
|
|
202
|
+
meta.description = data.description;
|
|
203
|
+
if (includeHtml) {
|
|
204
|
+
html = data.html;
|
|
196
205
|
}
|
|
197
206
|
}
|
|
207
|
+
catch (err) {
|
|
208
|
+
logger.error('Failed to get html:', err);
|
|
209
|
+
throw err;
|
|
210
|
+
}
|
|
198
211
|
}
|
|
199
212
|
catch (error) {
|
|
200
213
|
logger.error('Failed to get page content:', error);
|
|
@@ -207,6 +220,7 @@ export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function*
|
|
|
207
220
|
return {
|
|
208
221
|
html,
|
|
209
222
|
screenshot,
|
|
223
|
+
meta,
|
|
210
224
|
};
|
|
211
225
|
});
|
|
212
226
|
/**
|
|
@@ -228,7 +242,7 @@ export function crawlUrl(params, callback) {
|
|
|
228
242
|
fullPage: params.fullPage,
|
|
229
243
|
})) || {};
|
|
230
244
|
if (duplicateJob) {
|
|
231
|
-
logger.
|
|
245
|
+
logger.info(`Crawl job already exists for ${params.url}, skip`);
|
|
232
246
|
return duplicateJob.id;
|
|
233
247
|
}
|
|
234
248
|
logger.info('create crawl job', params);
|
package/lib/esm/cron.js
CHANGED
@@ -21,7 +21,7 @@ export function initCron() {
         {
             name: 'crawl-site',
             time: config.siteCron.time,
-            options: { runOnInit: config.siteCron.
+            options: { runOnInit: config.siteCron.immediate },
             fn: () => __awaiter(this, void 0, void 0, function* () {
                 logger.info('Start cron to crawl site', { sites: config.siteCron.sites });
                 for (const site of config.siteCron.sites) {
package/lib/esm/index.js
CHANGED
@@ -19,13 +19,15 @@ export * from './services/snapshot';
 export * as utils from './utils';
 export function initCrawler(params) {
     return __awaiter(this, void 0, void 0, function* () {
-        logger.info('Init crawler', { params });
         merge(config, params);
+        logger.info('Init crawler', { params, config });
         try {
             yield initDatabase();
             yield ensureBrowser();
             yield createCrawlQueue();
-
+            if (config.siteCron.enabled) {
+                yield initCron();
+            }
         }
         catch (err) {
             logger.error('Init crawler error', { err });
package/lib/esm/site.d.ts
CHANGED
@@ -1,2 +1,2 @@
 import { Site } from './config';
-export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | undefined)[]>;
+export declare const crawlSite: ({ url, pathname, interval }: Site) => Promise<(string | null | undefined)[]>;
package/lib/esm/site.js
CHANGED
@@ -22,6 +22,11 @@ function parseSitemapUrl(sitemapItem) {
 }
 export const crawlSite = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, pathname, interval = 0 }) {
     logger.info(`Start crawl from sitemap ${url}`, { pathname });
+    const key = `${url}-${pathname}`;
+    if (crawlBlockletRunningMap.has(key)) {
+        logger.info(`Crawl from sitemap ${url} ${pathname} is already running, skip`);
+        return [];
+    }
     const sitemapList = yield getSitemapList(url);
     const pathnameRegex = new RegExp(pathname);
     const sitemapItems = sitemapList
@@ -30,33 +35,31 @@
         return parseSitemapUrl(sitemapItem);
     });
     logger.info(`Found ${sitemapItems.length} sitemap items which match ${pathname} from ${url}`);
-
-
-        if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
-            const lastModified = new Date(snapshot.lastModified);
-            // skip if snapshot lastModified is greater than sitemap lastmod
-            if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
-                return null;
-            }
-            // skip if interval time has not been reached
-            if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
-                return null;
-            }
-        }
-        return { url, sitemapItem };
-    })))).filter(Boolean);
-    logger.info(`Found ${crawlableItems.length} pages to crawl from sitemap ${url}`, { pathname });
-    const key = `${url}-${pathname}`;
-    crawlBlockletRunningMap.set(key, crawlableItems);
+    let processCount = 0;
+    crawlBlockletRunningMap.set(key, true);
     try {
-        const jobIds = yield pMap(
+        const jobIds = yield pMap(sitemapItems, (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, sitemapItem }) {
+            processCount++;
+            const snapshot = yield Snapshot.findOne({ where: { url: formatUrl(url) } });
+            if (snapshot === null || snapshot === void 0 ? void 0 : snapshot.lastModified) {
+                const lastModified = new Date(snapshot.lastModified);
+                // skip if snapshot lastModified is greater than sitemap lastmod
+                if (sitemapItem.lastmod && lastModified >= new Date(sitemapItem.lastmod)) {
+                    return null;
+                }
+                // skip if interval time has not been reached
+                if (Date.now() - lastModified.getTime() < interval * 24 * 60 * 60 * 1000) {
+                    return null;
+                }
+            }
+            logger.debug(`Sitemap process ${processCount} / ${sitemapItems.length}`);
             return crawlUrl({
                 url,
                 lastModified: sitemapItem.lastmod,
                 includeScreenshot: false,
                 includeHtml: true,
             });
-        }, { concurrency: config.siteCron.
+        }), { concurrency: config.siteCron.sitemapConcurrency });
        return jobIds;
     }
     catch (error) {
package/lib/esm/store/index.js
CHANGED
@@ -39,7 +39,7 @@ export function initDatabase() {
             sequelize.query('pragma journal_size_limit = 67108864;'),
         ]);
         yield sequelize.authenticate();
-        yield sequelize.sync();
+        yield sequelize.sync({ alter: process.env.ALTER_SQLITE === 'true' });
         logger.info('Successfully connected to database');
     }
     catch (error) {
package/lib/esm/store/snapshot.d.ts
CHANGED
@@ -7,6 +7,10 @@ export interface SnapshotModel {
     screenshot?: string | null;
     error?: string;
     lastModified?: string;
+    meta?: {
+        title?: string;
+        description?: string;
+    };
     options?: {
         width?: number;
         height?: number;
@@ -24,6 +28,7 @@ export declare class Snapshot extends Model<SnapshotModel> implements SnapshotMo
     screenshot?: SnapshotModel['screenshot'];
     error?: SnapshotModel['error'];
     lastModified?: SnapshotModel['lastModified'];
+    meta?: SnapshotModel['meta'];
     options: SnapshotModel['options'];
     static initModel(sequelize: Sequelize): typeof Snapshot;
     static findSnapshot(condition: FindOptions<SnapshotModel>): Promise<SnapshotModel | null>;