@arcblock/crawler 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/blocklet.d.ts +6 -0
- package/dist/blocklet.js +199 -0
- package/dist/cache.d.ts +10 -0
- package/dist/cache.js +119 -0
- package/dist/config.d.ts +10 -0
- package/dist/config.js +17 -0
- package/dist/crawler.d.ts +28 -0
- package/dist/crawler.js +314 -0
- package/dist/db/index.d.ts +1 -0
- package/dist/db/index.js +41 -0
- package/dist/db/job.d.ts +33 -0
- package/dist/db/job.js +54 -0
- package/dist/db/snapshot.d.ts +31 -0
- package/dist/db/snapshot.js +52 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +45 -0
- package/dist/middleware.d.ts +4 -0
- package/dist/middleware.js +44 -0
- package/dist/puppeteer.d.ts +16 -0
- package/dist/puppeteer.js +318 -0
- package/dist/utils.d.ts +15 -0
- package/dist/utils.js +239 -0
- package/esm/blocklet.d.ts +6 -0
- package/esm/blocklet.js +190 -0
- package/esm/cache.d.ts +10 -0
- package/esm/cache.js +114 -0
- package/esm/config.d.ts +10 -0
- package/esm/config.js +11 -0
- package/esm/crawler.d.ts +28 -0
- package/esm/crawler.js +301 -0
- package/esm/db/index.d.ts +1 -0
- package/esm/db/index.js +35 -0
- package/esm/db/job.d.ts +33 -0
- package/esm/db/job.js +50 -0
- package/esm/db/snapshot.d.ts +31 -0
- package/esm/db/snapshot.js +48 -0
- package/esm/index.d.ts +6 -0
- package/esm/index.js +26 -0
- package/esm/middleware.d.ts +4 -0
- package/esm/middleware.js +41 -0
- package/esm/puppeteer.d.ts +16 -0
- package/esm/puppeteer.js +272 -0
- package/esm/utils.d.ts +15 -0
- package/esm/utils.js +220 -0
- package/package.json +10 -3
- package/src/blocklet.ts +0 -223
- package/src/cache.ts +0 -117
- package/src/config.ts +0 -13
- package/src/crawler.ts +0 -364
- package/src/db/index.ts +0 -27
- package/src/db/job.ts +0 -93
- package/src/db/snapshot.ts +0 -89
- package/src/index.ts +0 -19
- package/src/middleware.ts +0 -46
- package/src/puppeteer.ts +0 -296
- package/src/utils.ts +0 -240
- package/third.d.ts +0 -1
- package/tsconfig.json +0 -9
package/esm/blocklet.js
ADDED
@@ -0,0 +1,190 @@
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+import Cron from '@abtnode/cron';
+import { components } from '@blocklet/sdk/lib/config';
+import debounce from 'lodash/debounce';
+import { joinURL } from 'ufo';
+import { useCache } from './cache';
+import { config, logger } from './config';
+import { createCrawlJob } from './crawler';
+import { closeBrowser, getBrowser } from './puppeteer';
+import { getComponentInfo, getRelativePath, getSitemapList } from './utils';
+// record crawl blocklet running
+const crawlBlockletRunningMap = new Map();
+// crawl blocklet sitemap urls
+export const crawlBlocklet = () => __awaiter(void 0, void 0, void 0, function* () {
+    // @ts-ignore
+    const { mountPoint, did } = getComponentInfo();
+    if (crawlBlockletRunningMap.has(did) && crawlBlockletRunningMap.get(did)) {
+        logger.info(`Crawler blocklet ${did} is running, skip it`);
+        return;
+    }
+    // check has browser can use
+    try {
+        const browser = yield getBrowser();
+        if (!browser) {
+            throw new Error('No Browser can use');
+        }
+        logger.info('Crawler blocklet existing can use browser');
+    }
+    catch (error) {
+        logger.info(`Crawler blocklet abort by error: ${(error === null || error === void 0 ? void 0 : error.message) || (error === null || error === void 0 ? void 0 : error.reason) || error}`);
+        return;
+    }
+    const { appUrl } = config;
+    if (!appUrl) {
+        throw new Error('appUrl not found');
+    }
+    const sitemapList = yield getSitemapList(appUrl);
+    const matchMountPoint = joinURL(appUrl, !mountPoint || mountPoint === '/' ? '' : mountPoint);
+    const otherMountPointList = components
+        .filter((item) => item.mountPoint && item.mountPoint !== mountPoint)
+        .map((item) => item.mountPoint);
+    // get can use loc
+    const blockletLocList = sitemapList.filter((item) => {
+        var _a;
+        if (mountPoint !== '/') {
+            return ((_a = item === null || item === void 0 ? void 0 : item.url) === null || _a === void 0 ? void 0 : _a.indexOf(matchMountPoint)) > -1;
+        }
+        // if mountPoint is /, skip other mountPoint
+        return otherMountPointList.every((mountPoint) => { var _a; return ((_a = item === null || item === void 0 ? void 0 : item.url) === null || _a === void 0 ? void 0 : _a.indexOf(mountPoint)) === -1; });
+    });
+    const canUseBlockletLocList = [];
+    const lastmodMap = new Map();
+    let skipBlockletLocTotal = 0;
+    let blockletLocTotal = 0;
+    yield Promise.all(blockletLocList.map((item) => __awaiter(void 0, void 0, void 0, function* () {
+        var _a;
+        let tempLocList = [];
+        if (item.url) {
+            tempLocList.push(item.url);
+        }
+        if (((_a = item === null || item === void 0 ? void 0 : item.links) === null || _a === void 0 ? void 0 : _a.length) > 0) {
+            tempLocList.push(...item.links.map((ytem) => ytem.url));
+        }
+        blockletLocTotal += tempLocList.length;
+        // @ts-ignore
+        tempLocList = (yield Promise.all(tempLocList.map((loc) => __awaiter(void 0, void 0, void 0, function* () {
+            try {
+                const { lastModified: cacheLastModified } = yield useCache.get(getRelativePath(loc));
+                // sitemap item lastmod is same as cache lastModified, skip it
+                if (item.lastmod &&
+                    cacheLastModified &&
+                    new Date(cacheLastModified).getTime() === new Date(item.lastmod).getTime()) {
+                    skipBlockletLocTotal++;
+                    return false;
+                }
+                return loc;
+            }
+            catch (error) {
+                // ignore error
+            }
+            // if can not get cache, return loc
+            return loc;
+        })))).filter(Boolean);
+        tempLocList.forEach((loc) => {
+            if (item.lastmod)
+                lastmodMap.set(loc, item.lastmod);
+        });
+        canUseBlockletLocList.push(...tempLocList);
+    })));
+    const crawlerLogText = (step = '') => [
+        `Crawler sitemap.xml about ${did} ${step}: `,
+        {
+            blockletLocTotal,
+            canUseBlockletLocTotal: canUseBlockletLocList.length,
+            skipBlockletLocTotal,
+            lastmodMapTotal: lastmodMap.size,
+        },
+    ];
+    logger.info(...crawlerLogText('start'));
+    try {
+        // record crawl blocklet running
+        crawlBlockletRunningMap.set(did, true);
+        yield createCrawlJob({
+            // @ts-ignore
+            urls: canUseBlockletLocList,
+            saveToRedis: true,
+            lastmodMap,
+            // formatPageContent: async ({ page }: { page: any; url: string; lastmod?: string }) => {
+            //   const pageContent = await page.evaluate(() => {
+            //     const removeElements = (tagName: string) => {
+            //       const elements = document.querySelectorAll(tagName);
+            //       for (let i = elements.length - 1; i >= 0; i--) {
+            //         try {
+            //           elements[i]?.parentNode?.removeChild(elements[i] as Node);
+            //         } catch (error) {
+            //           // do noting
+            //         }
+            //       }
+            //     };
+            //     // remove script, style, link, noscript
+            //     // removeElements('script');
+            //     // removeElements('style');
+            //     // removeElements('link');
+            //     // removeElements('noscript');
+            //     // remove uploader
+            //     removeElements('[id="uploader-container"]');
+            //     removeElements('[class^="uppy-"]');
+            //     // remove point up component
+            //     removeElements('[id="point-up-component"]');
+            //     // add meta tag to record crawler
+            //     const meta = document.createElement('meta');
+            //     meta.name = 'blocklet-crawler';
+            //     meta.content = 'true';
+            //     document.head.appendChild(meta);
+            //     return document.documentElement.outerHTML;
+            //   });
+            //   return pageContent;
+            // },
+        });
+        logger.info(...crawlerLogText('success'));
+        yield closeBrowser({
+            trimCache: true,
+        });
+    }
+    catch (error) {
+        logger.info('Crawler blocklet abort by error', error);
+    }
+    finally {
+        // delete crawl blocklet running
+        crawlBlockletRunningMap.delete(did);
+    }
+});
+const CRON_CRAWL_BLOCKLET_KEY = 'cron-crawl-blocklet';
+let cronCrawlBlockletJob = null;
+// init cron crawl blocklet
+export const initCronCrawlBlocklet = ({ time = '0 0 */12 * * *', // every 12 hours
+options, } = {}) => {
+    if (!cronCrawlBlockletJob) {
+        cronCrawlBlockletJob = Cron.init({
+            context: {},
+            jobs: [
+                {
+                    name: CRON_CRAWL_BLOCKLET_KEY,
+                    time,
+                    fn: debounce(crawlBlocklet),
+                    options: Object.assign({ runOnInit: false }, options),
+                },
+            ],
+            onError: (err) => {
+                console.error('run job failed', err);
+            },
+        });
+    }
+    return cronCrawlBlockletJob;
+};
+export const cancelCronCrawlBlocklet = () => {
+    if (cronCrawlBlockletJob) {
+        cronCrawlBlockletJob.jobs[CRON_CRAWL_BLOCKLET_KEY].stop();
+        cronCrawlBlockletJob = null;
+        logger.info('Cron crawl blocklet stop, clear crawl queue');
+    }
+};
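
For orientation, a minimal usage sketch of the two scheduling exports above. This is not taken from the package; it assumes the package entry point re-exports them (the actual export surface lives in package/esm/index.js, which this excerpt does not show).

import { initCronCrawlBlocklet, cancelCronCrawlBlocklet } from '@arcblock/crawler';

// Schedule crawlBlocklet on a cron. The default expression '0 0 */12 * * *'
// runs every 12 hours, and runOnInit defaults to false.
initCronCrawlBlocklet({
  time: '0 0 */6 * * *', // hypothetical override: every 6 hours
  options: { runOnInit: true },
});

// Stop the named job and clear the module-level singleton so it can be re-initialized later.
cancelCronCrawlBlocklet();
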
package/esm/cache.d.ts
ADDED
@@ -0,0 +1,10 @@
+export declare const cachePool: import("generic-pool").Pool<any>;
+export declare const memoryPool: import("generic-pool").Pool<Map<string, any>>;
+export declare const withCache: (cb: Function) => Promise<any>;
+export declare const formatKey: (key: string) => string;
+export declare const useCache: {
+    get: (key: string) => Promise<any>;
+    set: (key: string, value: any, options?: any) => Promise<any>;
+    remove: (key: string) => Promise<any>;
+    list: (key?: string) => Promise<any>;
+};
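
The declarations above describe a small key/value facade over either Redis or an in-memory pool (see cache.js below). A hedged TypeScript sketch of how it might be called; the key names are placeholders and the import path assumes the entry point re-exports useCache:

import { useCache } from '@arcblock/crawler';

async function cacheDemo() {
  // Non-string values are JSON.stringify'd on set and JSON.parse'd on get;
  // with the Redis-backed pool, entries carry the module's 7-day PX TTL unless overridden via options.
  await useCache.set('page:/docs', { html: '<html>...</html>', lastModified: new Date().toISOString() });
  const cached = await useCache.get('page:/docs');
  const keys = await useCache.list('page:*'); // pattern defaults to '*'
  await useCache.remove('page:/docs');
  return { cached, keys };
}
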
package/esm/cache.js
ADDED
@@ -0,0 +1,114 @@
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var _a;
+import { createPool } from 'generic-pool';
+import { createClient } from 'redis';
+import { config, logger } from './config';
+const cacheKeyPrefix = ((_a = process.env) === null || _a === void 0 ? void 0 : _a.BLOCKLET_REAL_DID) ? `${process.env.BLOCKLET_REAL_DID}:` : '';
+const MAX_REDIS_RETRY = 3;
+const ttl = 1000 * 60 * 60 * 24 * 7;
+export const cachePool = createPool({
+    create: () => __awaiter(void 0, void 0, void 0, function* () {
+        try {
+            const { redisUrl } = config;
+            const redisClient = createClient({
+                url: redisUrl,
+                socket: {
+                    // @ts-ignore
+                    reconnectStrategy: (retries) => {
+                        if (retries >= MAX_REDIS_RETRY) {
+                            return new Error('Retry Time Exhausted');
+                        }
+                        return Math.min(retries * 500, 1000 * 3);
+                    },
+                },
+            });
+            redisClient.on('error', (err) => logger.warn('Redis Client Error:', err));
+            yield redisClient.connect();
+            logger.info(`Successfully connected to Redis: ${redisUrl}`);
+            return redisClient;
+        }
+        catch (error) {
+            logger.warn('Redis connection failed', error);
+            return null;
+        }
+    }),
+    destroy: (client) => __awaiter(void 0, void 0, void 0, function* () {
+        // if is redis client
+        if (client.isReady) {
+            yield client.quit();
+        }
+    }),
+}, {
+    max: 2, // 2 clients
+    min: 0,
+    // evictionRunIntervalMillis: 0,
+});
+export const memoryPool = createPool({
+    create: () => {
+        const map = new Map();
+        // @ts-ignore
+        map.del = map.delete;
+        return Promise.resolve(map);
+    },
+    destroy: (client) => {
+        client.clear();
+        return Promise.resolve();
+    },
+}, {
+    max: 10,
+    min: 0,
+});
+export const withCache = (cb) => __awaiter(void 0, void 0, void 0, function* () {
+    const pool = config.redisUrl ? cachePool : memoryPool;
+    const client = yield pool.acquire();
+    if (client) {
+        try {
+            return cb(client);
+        }
+        finally {
+            // release client to pool, let other use
+            yield pool.release(client);
+        }
+    }
+});
+export const formatKey = (key) => {
+    return `${cacheKeyPrefix}${key}`;
+};
+export const useCache = {
+    get: (key) => {
+        return withCache((client) => __awaiter(void 0, void 0, void 0, function* () {
+            const value = yield client.get(formatKey(key));
+            try {
+                return JSON.parse(value);
+            }
+            catch (error) {
+                // ignore error
+            }
+            return value;
+        }));
+    },
+    set: (key, value, options) => {
+        return withCache((client) => {
+            const formatValue = typeof value === 'string' ? value : JSON.stringify(value);
+            return client.set(formatKey(key), formatValue, Object.assign({ PX: ttl }, options));
+        });
+    },
+    remove: (key) => {
+        return withCache((client) => {
+            return client.del(formatKey(key));
+        });
+    },
+    list: (key = '*') => {
+        return withCache((client) => {
+            return client.keys(formatKey(key));
+        });
+    },
+};
package/esm/config.d.ts
ADDED
package/esm/config.js
ADDED
@@ -0,0 +1,11 @@
+import createLogger from '@blocklet/logger';
+export const logger = createLogger('crawler', { level: process.env.LOG_LEVEL || 'info' });
+export const config = {
+    redisUrl: process.env.REDIS_URL,
+    dataDir: process.env.BLOCKLET_DATA_DIR,
+    appDir: process.env.BLOCKLET_APP_DIR || process.cwd(),
+    appUrl: process.env.BLOCKLET_APP_URL,
+    puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH,
+    cacheDir: process.env.BLOCKLET_CACHE_DIR,
+    testOnInitialize: process.env.NODE_ENV === 'production',
+};
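
This configuration is read from the environment once at import time, so the variables below have to be set before the module loads. A hedged sketch of the values it looks for; the paths and URLs are placeholders:

// Selects the Redis-backed cachePool in cache.js; without it the in-memory pool is used.
process.env.REDIS_URL = 'redis://127.0.0.1:6379';
// Snapshots are written under <dataDir>/data/html and <dataDir>/data/screenshot (see crawler.js below).
process.env.BLOCKLET_DATA_DIR = '/var/lib/crawler';
// Base URL for sitemap discovery and for turning stored screenshot paths into full URLs.
process.env.BLOCKLET_APP_URL = 'https://example.com';
// Optional: explicit browser binary for Puppeteer, cache directory, and log verbosity.
process.env.PUPPETEER_EXECUTABLE_PATH = '/usr/bin/chromium';
process.env.BLOCKLET_CACHE_DIR = '/var/cache/crawler';
process.env.LOG_LEVEL = 'debug';
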
package/esm/crawler.d.ts
ADDED
@@ -0,0 +1,28 @@
+import { JobState } from './db/job';
+import { SnapshotModel } from './db/snapshot';
+export declare function createCrawlQueue(): void;
+export declare function getDataDir(): Promise<{
+    htmlDir: string;
+    screenshotDir: string;
+}>;
+export declare const getPageContent: ({ url, formatPageContent, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage, }: {
+    url: string;
+    formatPageContent?: Function;
+    includeScreenshot?: boolean;
+    includeHtml?: boolean;
+    width?: number;
+    height?: number;
+    quality?: number;
+    timeout?: number;
+    fullPage?: boolean;
+}) => Promise<{
+    html: string;
+    screenshot: Uint8Array<ArrayBufferLike> | null;
+}>;
+export declare function createCrawlJob(params: JobState, callback?: (snapshot: SnapshotModel | null) => void): Promise<any>;
+export declare function getJob(condition: Partial<JobState>): Promise<any>;
+export declare function formatSnapshot(snapshot: SnapshotModel, columns?: Array<keyof SnapshotModel>): Promise<SnapshotModel>;
+/**
+ * get snapshot from db or crawl queue
+ */
+export declare function getSnapshot(jobId: string): Promise<SnapshotModel | null>;
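
Taken together, these declarations sketch the crawl pipeline's public surface. A rough TypeScript illustration of how they could fit together; it is not from the package, it assumes the entry point re-exports these functions, and it assumes database setup (the ensureDatabase declaration that appears later in this diff) has already run:

import { createCrawlQueue, createCrawlJob, getSnapshot, formatSnapshot } from '@arcblock/crawler';

async function crawlOnce() {
  // The module-level queue must exist before jobs are pushed onto it.
  createCrawlQueue();

  // Field names mirror the de-duplication keys used by createCrawlJob;
  // JobState itself is declared in ./db/job, which this excerpt does not show.
  const jobId = await createCrawlJob(
    { url: 'https://example.com/docs', includeHtml: true, includeScreenshot: true, width: 1440, height: 900 },
    (snapshot) => console.log('crawl finished', snapshot?.status),
  );

  // Returns the stored snapshot, { jobId, status: 'pending' } while the job is still queued, or null.
  const snapshot = await getSnapshot(jobId);
  if (snapshot?.status === 'success') {
    // Expands the stored html path into file contents and the screenshot path into a full URL.
    return formatSnapshot(snapshot, ['html', 'screenshot']);
  }
  return null;
}
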
package/esm/crawler.js
ADDED
@@ -0,0 +1,301 @@
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+import createQueue from '@abtnode/queue';
+import SequelizeStore from '@abtnode/queue/lib/store/sequelize';
+import sequelize from '@sequelize/core';
+import { randomUUID } from 'crypto';
+import fs from 'fs-extra';
+import pick from 'lodash/pick';
+import path from 'path';
+import { joinURL } from 'ufo';
+import { config, logger } from './config';
+import { Job } from './db/job';
+import { Snapshot } from './db/snapshot';
+import { initPage } from './puppeteer';
+import { formatUrl, isAcceptCrawler, md5 } from './utils';
+const { BaseState } = require('@abtnode/models');
+let crawlQueue;
+export function createCrawlQueue() {
+    const db = new BaseState(Job);
+    crawlQueue = createQueue({
+        store: new SequelizeStore(db, 'crawler'),
+        concurrency: 1,
+        onJob: (job) => __awaiter(this, void 0, void 0, function* () {
+            logger.debug('job start:', job);
+            const canCrawl = yield isAcceptCrawler(job.url);
+            if (!canCrawl) {
+                logger.error(`failed to crawl ${job.url}, denied by robots.txt`, job);
+                const snapshot = convertJobToSnapshot({
+                    job,
+                    snapshot: {
+                        status: 'failed',
+                        error: 'Denied by robots.txt',
+                    },
+                });
+                yield Snapshot.upsert(snapshot);
+                return snapshot;
+            }
+            // if index reach autoCloseBrowserCount, close browser
+            // try {
+            //   if (index >= autoCloseBrowserCount) {
+            //     await closeBrowser({ trimCache: false });
+            //   }
+            // } catch (error) {
+            //   logger.error('failed to close browser when queue index reached autoCloseBrowserCount:', error);
+            // }
+            try {
+                // get page content later
+                const result = yield getPageContent(job);
+                if (!result || (!result.html && !result.screenshot)) {
+                    logger.error(`failed to crawl ${job.url}, empty content`, job);
+                    const snapshot = convertJobToSnapshot({
+                        job,
+                        snapshot: {
+                            status: 'failed',
+                            error: 'Failed to crawl content',
+                        },
+                    });
+                    yield Snapshot.upsert(snapshot);
+                    return snapshot;
+                }
+                // save html and screenshot to data dir
+                const { screenshotPath, htmlPath } = yield saveSnapshotToLocal({
+                    screenshot: result.screenshot,
+                    html: result.html,
+                });
+                // const lastModified = job.lastmodMap?.get(url) || new Date().toISOString();
+                const snapshot = convertJobToSnapshot({
+                    job,
+                    snapshot: {
+                        status: 'success',
+                        screenshot: screenshotPath === null || screenshotPath === void 0 ? void 0 : screenshotPath.replace(config.dataDir, ''),
+                        html: htmlPath === null || htmlPath === void 0 ? void 0 : htmlPath.replace(config.dataDir, ''),
+                    },
+                });
+                yield Snapshot.upsert(snapshot);
+                return snapshot;
+                // save to redis
+                // if (saveToRedis) {
+                //   useCache.set(url, {
+                //     html: result.html || '',
+                //     lastModified,
+                //   });
+                //   logger.info(`success to crawl ${url}`, job);
+                //   return result;
+                // }
+            }
+            catch (error) {
+                logger.error(`Failed to crawl ${job.url}`, { error, job });
+                console.error(error.stack);
+                const snapshot = convertJobToSnapshot({
+                    job,
+                    snapshot: {
+                        status: 'failed',
+                        error: 'Internal error',
+                    },
+                });
+                yield Snapshot.upsert(snapshot);
+                return snapshot;
+            }
+        }),
+    });
+}
+export function getDataDir() {
+    return __awaiter(this, void 0, void 0, function* () {
+        const htmlDir = path.join(config.dataDir, 'data', 'html');
+        const screenshotDir = path.join(config.dataDir, 'data', 'screenshot');
+        yield fs.ensureDir(htmlDir);
+        yield fs.ensureDir(screenshotDir);
+        return { htmlDir, screenshotDir };
+    });
+}
+function saveSnapshotToLocal(_a) {
+    return __awaiter(this, arguments, void 0, function* ({ screenshot, html }) {
+        const { htmlDir, screenshotDir } = yield getDataDir();
+        let screenshotPath = null;
+        let htmlPath = null;
+        if (screenshot) {
+            const hash = md5(screenshot);
+            screenshotPath = path.join(screenshotDir, `${hash}.webp`);
+            logger.debug('saveSnapshotToLocal.screenshot', { screenshotPath });
+            yield fs.writeFile(screenshotPath, screenshot);
+        }
+        if (html) {
+            const hash = md5(html);
+            htmlPath = path.join(htmlDir, `${hash}.html`);
+            logger.debug('saveSnapshotToLocal.html', { htmlPath });
+            yield fs.writeFile(htmlPath, html);
+        }
+        return {
+            screenshotPath,
+            htmlPath,
+        };
+    });
+}
+function formatHtml(htmlString) {
+    if (htmlString.includes('<h2>Unexpected Application Error!</h2>')) {
+        return '';
+    }
+    return htmlString;
+}
+export const getPageContent = (_a) => __awaiter(void 0, [_a], void 0, function* ({ url, formatPageContent, includeScreenshot = true, includeHtml = true, width = 1440, height = 900, quality = 80, timeout = 60 * 1000, fullPage = false, }) {
+    logger.debug('getPageContent', { url, includeScreenshot, includeHtml, width, height, quality, timeout, fullPage });
+    const page = yield initPage();
+    if (width && height) {
+        yield page.setViewport({ width, height });
+    }
+    let html = null;
+    let screenshot = null;
+    try {
+        const response = yield page.goto(url, { timeout });
+        if (!response) {
+            throw new Error(`Failed to load page: response is null for ${url}`);
+        }
+        const statusCode = response.status();
+        logger.debug('getPageContent.response', { response, statusCode });
+        if (![200, 304].includes(statusCode)) {
+            throw new Error(`Request failed with status ${statusCode}, in ${url}`);
+        }
+        // await for networkidle0
+        // https://pptr.dev/api/puppeteer.page.goforward/#remarks
+        yield page.waitForNetworkIdle({
+            idleTime: 2 * 1000,
+        });
+        // get screenshot
+        if (includeScreenshot) {
+            try {
+                screenshot = yield page.screenshot({ fullPage, quality, type: 'webp' });
+            }
+            catch (err) {
+                logger.error('Failed to get screenshot:', err);
+            }
+        }
+        // get html
+        if (includeHtml) {
+            if (formatPageContent) {
+                html = yield formatPageContent({ page, url });
+            }
+            else {
+                html = yield page.content();
+            }
+        }
+    }
+    catch (error) {
+        logger.error('Failed to get page content:', error);
+        throw error;
+    }
+    finally {
+        yield page.close();
+    }
+    html = formatHtml(html || '');
+    return {
+        html,
+        screenshot,
+    };
+});
+export function createCrawlJob(params, callback) {
+    return __awaiter(this, void 0, void 0, function* () {
+        params = Object.assign(Object.assign({}, params), { url: formatUrl(params.url) });
+        // skip duplicate job
+        const existsJob = yield getJob({
+            url: params.url,
+            includeScreenshot: params.includeScreenshot,
+            includeHtml: params.includeHtml,
+            quality: params.quality,
+            width: params.width,
+            height: params.height,
+            fullPage: params.fullPage,
+        });
+        logger.info('create crawl job', params);
+        if (existsJob) {
+            logger.warn(`Crawl job already exists for ${params.url}, skip`);
+            return existsJob.id;
+        }
+        const jobId = randomUUID();
+        const job = crawlQueue.push(Object.assign(Object.assign({}, params), { id: jobId }));
+        job.on('finished', ({ result }) => {
+            logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
+            callback === null || callback === void 0 ? void 0 : callback(result);
+        });
+        job.on('failed', ({ error }) => {
+            logger.error(`Failed to execute job for ${params.url}`, { error, job: params });
+            callback === null || callback === void 0 ? void 0 : callback(null);
+        });
+        return jobId;
+    });
+}
+// @ts-ignore
+export function getJob(condition) {
+    return __awaiter(this, void 0, void 0, function* () {
+        const where = Object.keys(condition)
+            .filter((key) => condition[key] !== undefined)
+            .map((key) => {
+            return sequelize.where(sequelize.fn('json_extract', sequelize.col('job'), `$.${key}`), condition[key]);
+        });
+        const job = yield crawlQueue.store.db.findOne({
+            where: {
+                [sequelize.Op.and]: where,
+            },
+        });
+        if (job) {
+            return job.job;
+        }
+        return null;
+    });
+}
+function convertJobToSnapshot({ job, snapshot }) {
+    return Object.assign({
+        // @ts-ignore
+        jobId: job.jobId || job.id, url: job.url, options: {
+            width: job.width,
+            height: job.height,
+            includeScreenshot: job.includeScreenshot,
+            includeHtml: job.includeHtml,
+            quality: job.quality,
+            fullPage: job.fullPage,
+        } }, snapshot);
+}
+export function formatSnapshot(snapshot, columns) {
+    return __awaiter(this, void 0, void 0, function* () {
+        let data = Object.assign({}, snapshot);
+        // format screenshot path to full url
+        if (data.screenshot) {
+            data.screenshot = joinURL(config.appUrl, data.screenshot);
+        }
+        // format html path to string
+        if (data.html) {
+            const html = yield fs.readFile(path.join(config.dataDir, data.html));
+            data.html = html.toString();
+        }
+        if (columns === null || columns === void 0 ? void 0 : columns.length) {
+            data = pick(data, columns);
+        }
+        return data;
+    });
+}
+/**
+ * get snapshot from db or crawl queue
+ */
+export function getSnapshot(jobId) {
+    return __awaiter(this, void 0, void 0, function* () {
+        const snapshotModel = yield Snapshot.findByPk(jobId);
+        if (snapshotModel) {
+            return snapshotModel.toJSON();
+        }
+        const job = yield getJob({ id: jobId });
+        if (job) {
+            return {
+                jobId,
+                status: 'pending',
+            };
+        }
+        return null;
+    });
+}
@@ -0,0 +1 @@
+export declare function ensureDatabase(): Promise<void>;