@arcblock/crawler 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +110 -0
- package/src/blocklet.ts +223 -0
- package/src/cache.ts +117 -0
- package/src/config.ts +13 -0
- package/src/crawler.ts +364 -0
- package/src/db/index.ts +27 -0
- package/src/db/job.ts +93 -0
- package/src/db/snapshot.ts +89 -0
- package/src/index.ts +19 -0
- package/src/middleware.ts +46 -0
- package/src/puppeteer.ts +296 -0
- package/src/utils.ts +240 -0
- package/third.d.ts +1 -0
- package/tsconfig.json +9 -0
package/package.json
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@arcblock/crawler",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"main": "dist/index.js",
|
|
5
|
+
"types": "dist/index.d.ts",
|
|
6
|
+
"publishConfig": {
|
|
7
|
+
"access": "public"
|
|
8
|
+
},
|
|
9
|
+
"lint-staged": {
|
|
10
|
+
"*.{mjs,js,jsx,ts,tsx}": [
|
|
11
|
+
"prettier --write",
|
|
12
|
+
"eslint"
|
|
13
|
+
],
|
|
14
|
+
"*.{css,less,scss,json,graphql}": [
|
|
15
|
+
"prettier --write"
|
|
16
|
+
]
|
|
17
|
+
},
|
|
18
|
+
"browserslist": {
|
|
19
|
+
"production": [
|
|
20
|
+
">0.2%",
|
|
21
|
+
"not dead",
|
|
22
|
+
"not op_mini all"
|
|
23
|
+
],
|
|
24
|
+
"development": [
|
|
25
|
+
"last 1 chrome version",
|
|
26
|
+
"last 1 firefox version",
|
|
27
|
+
"last 1 safari version"
|
|
28
|
+
]
|
|
29
|
+
},
|
|
30
|
+
"dependencies": {
|
|
31
|
+
"@abtnode/cron": "^1.16.43",
|
|
32
|
+
"@abtnode/models": "^1.16.43",
|
|
33
|
+
"@abtnode/queue": "^1.16.43",
|
|
34
|
+
"@arcblock/did-auth": "^1.19.15",
|
|
35
|
+
"@arcblock/did-auth-storage-nedb": "^1.7.1",
|
|
36
|
+
"@blocklet/logger": "^1.16.43",
|
|
37
|
+
"@blocklet/puppeteer": "^22.11.3",
|
|
38
|
+
"@blocklet/sdk": "^1.16.43",
|
|
39
|
+
"@ocap/client": "^1.19.15",
|
|
40
|
+
"@ocap/mcrypto": "^1.19.15",
|
|
41
|
+
"@ocap/util": "^1.20.11",
|
|
42
|
+
"@ocap/wallet": "^1.19.15",
|
|
43
|
+
"@sequelize/core": "7.0.0-alpha.46",
|
|
44
|
+
"@sequelize/sqlite3": "7.0.0-alpha.46",
|
|
45
|
+
"axios": "^1.7.9",
|
|
46
|
+
"cookie-parser": "^1.4.7",
|
|
47
|
+
"cors": "^2.8.5",
|
|
48
|
+
"dotenv-flow": "^4.1.0",
|
|
49
|
+
"express": "^4.21.2",
|
|
50
|
+
"express-async-errors": "^3.1.1",
|
|
51
|
+
"fs-extra": "^11.2.0",
|
|
52
|
+
"generic-pool": "^3.9.0",
|
|
53
|
+
"lodash": "^4.17.21",
|
|
54
|
+
"lru-cache": "^10.4.3",
|
|
55
|
+
"p-queue": "6.6.2",
|
|
56
|
+
"p-wait-for": "^5.0.2",
|
|
57
|
+
"redis": "^4.7.0",
|
|
58
|
+
"robots-parser": "^3.0.1",
|
|
59
|
+
"sequelize": "^6.37.7",
|
|
60
|
+
"sitemap": "^7.1.2",
|
|
61
|
+
"sqlite3": "^5.1.7",
|
|
62
|
+
"ufo": "^1.5.4",
|
|
63
|
+
"url-join": "^4.0.1"
|
|
64
|
+
},
|
|
65
|
+
"devDependencies": {
|
|
66
|
+
"@blocklet/js-sdk": "^1.16.39",
|
|
67
|
+
"@types/cookie-parser": "^1.4.8",
|
|
68
|
+
"@types/cors": "^2.8.17",
|
|
69
|
+
"@types/dotenv-flow": "^3.3.3",
|
|
70
|
+
"@types/express": "^4.17.21",
|
|
71
|
+
"@types/fs-extra": "^11.0.4",
|
|
72
|
+
"@types/lodash": "^4.17.16",
|
|
73
|
+
"@types/node": "^20.17.19",
|
|
74
|
+
"@types/react": "^18.3.18",
|
|
75
|
+
"@types/react-dom": "^18.3.5",
|
|
76
|
+
"@vitejs/plugin-react": "^4.3.4",
|
|
77
|
+
"bumpp": "^9.11.1",
|
|
78
|
+
"nodemon": "^3.1.9",
|
|
79
|
+
"npm-run-all": "^4.1.5",
|
|
80
|
+
"puppeteer": "^24.8.2",
|
|
81
|
+
"react": "~18.2.0",
|
|
82
|
+
"react-dom": "~18.2.0",
|
|
83
|
+
"react-router-dom": "^6.29.0",
|
|
84
|
+
"rimraf": "^5.0.10",
|
|
85
|
+
"tsx": "^4.19.3",
|
|
86
|
+
"vite": "^5.4.14",
|
|
87
|
+
"vite-plugin-blocklet": "^0.9.32",
|
|
88
|
+
"vite-plugin-svgr": "^4.3.0",
|
|
89
|
+
"zx": "^8.3.2"
|
|
90
|
+
},
|
|
91
|
+
"importSort": {
|
|
92
|
+
".js, .jsx, .mjs": {
|
|
93
|
+
"parser": "babylon",
|
|
94
|
+
"style": "module"
|
|
95
|
+
},
|
|
96
|
+
".ts, .tsx": {
|
|
97
|
+
"style": "module",
|
|
98
|
+
"parser": "typescript"
|
|
99
|
+
}
|
|
100
|
+
},
|
|
101
|
+
"simple-git-hooks": {
|
|
102
|
+
"pre-commit": "npx lint-staged"
|
|
103
|
+
},
|
|
104
|
+
"scripts": {
|
|
105
|
+
"dev": "tsc --watch",
|
|
106
|
+
"lint": "tsc --noEmit && eslint src --ext .mjs,.js,.jsx,.ts,.tsx",
|
|
107
|
+
"lint:fix": "npm run lint -- --fix",
|
|
108
|
+
"bundle": "tsc"
|
|
109
|
+
}
|
|
110
|
+
}
|
package/src/blocklet.ts
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
import Cron from '@abtnode/cron';
|
|
2
|
+
import { components } from '@blocklet/sdk/lib/config';
|
|
3
|
+
import debounce from 'lodash/debounce';
|
|
4
|
+
import { joinURL } from 'ufo';
|
|
5
|
+
|
|
6
|
+
import { useCache } from './cache';
|
|
7
|
+
import { config, logger } from './config';
|
|
8
|
+
import { createCrawlJob } from './crawler';
|
|
9
|
+
import { closeBrowser, getBrowser } from './puppeteer';
|
|
10
|
+
import { getComponentInfo, getRelativePath, getSitemapList } from './utils';
|
|
11
|
+
|
|
12
|
+
// record crawl blocklet running
|
|
13
|
+
const crawlBlockletRunningMap = new Map();
|
|
14
|
+
|
|
15
|
+
// crawl blocklet sitemap urls
|
|
16
|
+
export const crawlBlocklet = async () => {
|
|
17
|
+
// @ts-ignore
|
|
18
|
+
const { mountPoint, did } = getComponentInfo();
|
|
19
|
+
|
|
20
|
+
if (crawlBlockletRunningMap.has(did) && crawlBlockletRunningMap.get(did)) {
|
|
21
|
+
logger.info(`Crawler blocklet ${did} is running, skip it`);
|
|
22
|
+
return;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
// check has browser can use
|
|
26
|
+
try {
|
|
27
|
+
const browser = await getBrowser();
|
|
28
|
+
if (!browser) {
|
|
29
|
+
throw new Error('No Browser can use');
|
|
30
|
+
}
|
|
31
|
+
logger.info('Crawler blocklet existing can use browser');
|
|
32
|
+
} catch (error: any) {
|
|
33
|
+
logger.info(`Crawler blocklet abort by error: ${error?.message || error?.reason || error}`);
|
|
34
|
+
return;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
const { appUrl } = config;
|
|
38
|
+
|
|
39
|
+
if (!appUrl) {
|
|
40
|
+
throw new Error('appUrl not found');
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
const sitemapList = await getSitemapList(appUrl);
|
|
44
|
+
|
|
45
|
+
const matchMountPoint = joinURL(appUrl, !mountPoint || mountPoint === '/' ? '' : mountPoint);
|
|
46
|
+
const otherMountPointList = components
|
|
47
|
+
.filter((item) => item.mountPoint && item.mountPoint !== mountPoint)
|
|
48
|
+
.map((item) => item.mountPoint);
|
|
49
|
+
|
|
50
|
+
// get can use loc
|
|
51
|
+
const blockletLocList = sitemapList.filter((item: any) => {
|
|
52
|
+
if (mountPoint !== '/') {
|
|
53
|
+
return item?.url?.indexOf(matchMountPoint) > -1;
|
|
54
|
+
}
|
|
55
|
+
// if mountPoint is /, skip other mountPoint
|
|
56
|
+
return otherMountPointList.every((mountPoint) => item?.url?.indexOf(mountPoint) === -1);
|
|
57
|
+
}) as [];
|
|
58
|
+
|
|
59
|
+
const canUseBlockletLocList = [] as string[];
|
|
60
|
+
const lastmodMap = new Map();
|
|
61
|
+
let skipBlockletLocTotal = 0;
|
|
62
|
+
let blockletLocTotal = 0;
|
|
63
|
+
|
|
64
|
+
await Promise.all(
|
|
65
|
+
blockletLocList.map(async (item: any) => {
|
|
66
|
+
let tempLocList: string[] = [];
|
|
67
|
+
|
|
68
|
+
if (item.url) {
|
|
69
|
+
tempLocList.push(item.url);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
if (item?.links?.length > 0) {
|
|
73
|
+
tempLocList.push(...item.links.map((ytem: any) => ytem.url));
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
blockletLocTotal += tempLocList.length;
|
|
77
|
+
|
|
78
|
+
// @ts-ignore
|
|
79
|
+
tempLocList = (
|
|
80
|
+
await Promise.all(
|
|
81
|
+
tempLocList.map(async (loc) => {
|
|
82
|
+
try {
|
|
83
|
+
const { lastModified: cacheLastModified } = await useCache.get(getRelativePath(loc));
|
|
84
|
+
|
|
85
|
+
// sitemap item lastmod is same as cache lastModified, skip it
|
|
86
|
+
if (
|
|
87
|
+
item.lastmod &&
|
|
88
|
+
cacheLastModified &&
|
|
89
|
+
new Date(cacheLastModified).getTime() === new Date(item.lastmod).getTime()
|
|
90
|
+
) {
|
|
91
|
+
skipBlockletLocTotal++;
|
|
92
|
+
return false;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
return loc;
|
|
96
|
+
} catch (error) {
|
|
97
|
+
// ignore error
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
// if can not get cache, return loc
|
|
101
|
+
return loc;
|
|
102
|
+
}),
|
|
103
|
+
)
|
|
104
|
+
).filter(Boolean);
|
|
105
|
+
|
|
106
|
+
tempLocList.forEach((loc) => {
|
|
107
|
+
if (item.lastmod) lastmodMap.set(loc, item.lastmod);
|
|
108
|
+
});
|
|
109
|
+
|
|
110
|
+
canUseBlockletLocList.push(...tempLocList);
|
|
111
|
+
}),
|
|
112
|
+
);
|
|
113
|
+
|
|
114
|
+
const crawlerLogText = (step = '') => [
|
|
115
|
+
`Crawler sitemap.xml about ${did} ${step}: `,
|
|
116
|
+
{
|
|
117
|
+
blockletLocTotal,
|
|
118
|
+
canUseBlockletLocTotal: canUseBlockletLocList.length,
|
|
119
|
+
skipBlockletLocTotal,
|
|
120
|
+
lastmodMapTotal: lastmodMap.size,
|
|
121
|
+
},
|
|
122
|
+
];
|
|
123
|
+
|
|
124
|
+
logger.info(...crawlerLogText('start'));
|
|
125
|
+
|
|
126
|
+
try {
|
|
127
|
+
// record crawl blocklet running
|
|
128
|
+
crawlBlockletRunningMap.set(did, true);
|
|
129
|
+
|
|
130
|
+
await createCrawlJob({
|
|
131
|
+
// @ts-ignore
|
|
132
|
+
urls: canUseBlockletLocList,
|
|
133
|
+
saveToRedis: true,
|
|
134
|
+
lastmodMap,
|
|
135
|
+
// formatPageContent: async ({ page }: { page: any; url: string; lastmod?: string }) => {
|
|
136
|
+
// const pageContent = await page.evaluate(() => {
|
|
137
|
+
// const removeElements = (tagName: string) => {
|
|
138
|
+
// const elements = document.querySelectorAll(tagName);
|
|
139
|
+
// for (let i = elements.length - 1; i >= 0; i--) {
|
|
140
|
+
// try {
|
|
141
|
+
// elements[i]?.parentNode?.removeChild(elements[i] as Node);
|
|
142
|
+
// } catch (error) {
|
|
143
|
+
// // do noting
|
|
144
|
+
// }
|
|
145
|
+
// }
|
|
146
|
+
// };
|
|
147
|
+
|
|
148
|
+
// // remove script, style, link, noscript
|
|
149
|
+
// // removeElements('script');
|
|
150
|
+
// // removeElements('style');
|
|
151
|
+
// // removeElements('link');
|
|
152
|
+
// // removeElements('noscript');
|
|
153
|
+
|
|
154
|
+
// // remove uploader
|
|
155
|
+
// removeElements('[id="uploader-container"]');
|
|
156
|
+
// removeElements('[class^="uppy-"]');
|
|
157
|
+
|
|
158
|
+
// // remove point up component
|
|
159
|
+
// removeElements('[id="point-up-component"]');
|
|
160
|
+
|
|
161
|
+
// // add meta tag to record crawler
|
|
162
|
+
// const meta = document.createElement('meta');
|
|
163
|
+
// meta.name = 'blocklet-crawler';
|
|
164
|
+
// meta.content = 'true';
|
|
165
|
+
// document.head.appendChild(meta);
|
|
166
|
+
|
|
167
|
+
// return document.documentElement.outerHTML;
|
|
168
|
+
// });
|
|
169
|
+
|
|
170
|
+
// return pageContent;
|
|
171
|
+
// },
|
|
172
|
+
});
|
|
173
|
+
|
|
174
|
+
logger.info(...crawlerLogText('success'));
|
|
175
|
+
|
|
176
|
+
await closeBrowser({
|
|
177
|
+
trimCache: true,
|
|
178
|
+
});
|
|
179
|
+
} catch (error) {
|
|
180
|
+
logger.info('Crawler blocklet abort by error', error);
|
|
181
|
+
} finally {
|
|
182
|
+
// delete crawl blocklet running
|
|
183
|
+
crawlBlockletRunningMap.delete(did);
|
|
184
|
+
}
|
|
185
|
+
};
|
|
186
|
+
|
|
187
|
+
const CRON_CRAWL_BLOCKLET_KEY = 'cron-crawl-blocklet';
|
|
188
|
+
let cronCrawlBlockletJob = null as any;
|
|
189
|
+
|
|
190
|
+
// init cron crawl blocklet
|
|
191
|
+
export const initCronCrawlBlocklet = (
|
|
192
|
+
{
|
|
193
|
+
time = '0 0 */12 * * *', // every 12 hours
|
|
194
|
+
options,
|
|
195
|
+
} = {} as { time: string; options: any },
|
|
196
|
+
) => {
|
|
197
|
+
if (!cronCrawlBlockletJob) {
|
|
198
|
+
cronCrawlBlockletJob = Cron.init({
|
|
199
|
+
context: {},
|
|
200
|
+
jobs: [
|
|
201
|
+
{
|
|
202
|
+
name: CRON_CRAWL_BLOCKLET_KEY,
|
|
203
|
+
time,
|
|
204
|
+
fn: debounce(crawlBlocklet),
|
|
205
|
+
options: { runOnInit: false, ...options },
|
|
206
|
+
},
|
|
207
|
+
],
|
|
208
|
+
onError: (err: Error) => {
|
|
209
|
+
console.error('run job failed', err);
|
|
210
|
+
},
|
|
211
|
+
});
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
return cronCrawlBlockletJob;
|
|
215
|
+
};
|
|
216
|
+
|
|
217
|
+
export const cancelCronCrawlBlocklet = () => {
|
|
218
|
+
if (cronCrawlBlockletJob) {
|
|
219
|
+
cronCrawlBlockletJob.jobs[CRON_CRAWL_BLOCKLET_KEY].stop();
|
|
220
|
+
cronCrawlBlockletJob = null;
|
|
221
|
+
logger.info('Cron crawl blocklet stop, clear crawl queue');
|
|
222
|
+
}
|
|
223
|
+
};
|
package/src/cache.ts
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import { createPool } from 'generic-pool';
|
|
2
|
+
import { createClient } from 'redis';
|
|
3
|
+
|
|
4
|
+
import { config, logger } from './config';
|
|
5
|
+
|
|
6
|
+
const cacheKeyPrefix = process.env?.BLOCKLET_REAL_DID ? `${process.env.BLOCKLET_REAL_DID}:` : '';
|
|
7
|
+
const MAX_REDIS_RETRY = 3;
|
|
8
|
+
const ttl = 1000 * 60 * 60 * 24 * 7;
|
|
9
|
+
|
|
10
|
+
export const cachePool = createPool(
|
|
11
|
+
{
|
|
12
|
+
create: async () => {
|
|
13
|
+
try {
|
|
14
|
+
const { redisUrl } = config;
|
|
15
|
+
const redisClient = createClient({
|
|
16
|
+
url: redisUrl,
|
|
17
|
+
socket: {
|
|
18
|
+
// @ts-ignore
|
|
19
|
+
reconnectStrategy: (retries) => {
|
|
20
|
+
if (retries >= MAX_REDIS_RETRY) {
|
|
21
|
+
return new Error('Retry Time Exhausted');
|
|
22
|
+
}
|
|
23
|
+
return Math.min(retries * 500, 1000 * 3);
|
|
24
|
+
},
|
|
25
|
+
},
|
|
26
|
+
});
|
|
27
|
+
|
|
28
|
+
redisClient.on('error', (err) => logger.warn('Redis Client Error:', err));
|
|
29
|
+
await redisClient.connect();
|
|
30
|
+
logger.info(`Successfully connected to Redis: ${redisUrl}`);
|
|
31
|
+
|
|
32
|
+
return redisClient;
|
|
33
|
+
} catch (error) {
|
|
34
|
+
logger.warn('Redis connection failed', error);
|
|
35
|
+
return null;
|
|
36
|
+
}
|
|
37
|
+
},
|
|
38
|
+
destroy: async (client: any) => {
|
|
39
|
+
// if is redis client
|
|
40
|
+
if (client.isReady) {
|
|
41
|
+
await client.quit();
|
|
42
|
+
}
|
|
43
|
+
},
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
max: 2, // 2 clients
|
|
47
|
+
min: 0,
|
|
48
|
+
// evictionRunIntervalMillis: 0,
|
|
49
|
+
},
|
|
50
|
+
);
|
|
51
|
+
|
|
52
|
+
export const memoryPool = createPool(
|
|
53
|
+
{
|
|
54
|
+
create: () => {
|
|
55
|
+
const map = new Map<string, any>();
|
|
56
|
+
// @ts-ignore
|
|
57
|
+
map.del = map.delete;
|
|
58
|
+
return Promise.resolve(map);
|
|
59
|
+
},
|
|
60
|
+
destroy: (client: Map<string, any>) => {
|
|
61
|
+
client.clear();
|
|
62
|
+
return Promise.resolve();
|
|
63
|
+
},
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
max: 10,
|
|
67
|
+
min: 0,
|
|
68
|
+
},
|
|
69
|
+
);
|
|
70
|
+
|
|
71
|
+
export const withCache = async (cb: Function) => {
|
|
72
|
+
const pool = config.redisUrl ? cachePool : memoryPool;
|
|
73
|
+
const client = await pool.acquire();
|
|
74
|
+
|
|
75
|
+
if (client) {
|
|
76
|
+
try {
|
|
77
|
+
return cb(client);
|
|
78
|
+
} finally {
|
|
79
|
+
// release client to pool, let other use
|
|
80
|
+
await pool.release(client);
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
};
|
|
84
|
+
|
|
85
|
+
export const formatKey = (key: string) => {
|
|
86
|
+
return `${cacheKeyPrefix}${key}`;
|
|
87
|
+
};
|
|
88
|
+
|
|
89
|
+
export const useCache = {
|
|
90
|
+
get: (key: string) => {
|
|
91
|
+
return withCache(async (client: any) => {
|
|
92
|
+
const value = await client.get(formatKey(key));
|
|
93
|
+
try {
|
|
94
|
+
return JSON.parse(value);
|
|
95
|
+
} catch (error) {
|
|
96
|
+
// ignore error
|
|
97
|
+
}
|
|
98
|
+
return value;
|
|
99
|
+
});
|
|
100
|
+
},
|
|
101
|
+
set: (key: string, value: any, options?: any) => {
|
|
102
|
+
return withCache((client: any) => {
|
|
103
|
+
const formatValue = typeof value === 'string' ? value : JSON.stringify(value);
|
|
104
|
+
return client.set(formatKey(key), formatValue, { PX: ttl, ...options });
|
|
105
|
+
});
|
|
106
|
+
},
|
|
107
|
+
remove: (key: string) => {
|
|
108
|
+
return withCache((client: any) => {
|
|
109
|
+
return client.del(formatKey(key));
|
|
110
|
+
});
|
|
111
|
+
},
|
|
112
|
+
list: (key: string = '*') => {
|
|
113
|
+
return withCache((client: any) => {
|
|
114
|
+
return client.keys(formatKey(key));
|
|
115
|
+
});
|
|
116
|
+
},
|
|
117
|
+
};
|
package/src/config.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import createLogger from '@blocklet/logger';
|
|
2
|
+
|
|
3
|
+
// Shared logger for the crawler package; verbosity driven by LOG_LEVEL.
export const logger = createLogger('crawler', { level: process.env.LOG_LEVEL || 'info' });

// Runtime configuration resolved once at import time from the blocklet
// environment. NOTE(review): the non-null assertions assume the blocklet
// runtime always provides these env vars — the values are actually
// undefined when it does not; confirm against the deployment environment.
export const config = {
  redisUrl: process.env.REDIS_URL!, // when unset, cache.ts falls back to the in-memory pool
  dataDir: process.env.BLOCKLET_DATA_DIR!,
  appDir: process.env.BLOCKLET_APP_DIR! || process.cwd(), // falls back to cwd outside a blocklet
  appUrl: process.env.BLOCKLET_APP_URL!, // base URL crawled by blocklet.ts
  puppeteerPath: process.env.PUPPETEER_EXECUTABLE_PATH!,
  cacheDir: process.env.BLOCKLET_CACHE_DIR!,
  // NOTE(review): name suggests a dev/test-only switch, but this evaluates
  // to true exactly in production — confirm the intended semantics.
  testOnInitialize: process.env.NODE_ENV === 'production',
};
|