@arcblock/crawler 1.0.5 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/lib/cjs/config.d.ts +22 -0
- package/{dist → lib/cjs}/config.js +9 -3
- package/lib/cjs/crawler.d.ts +26 -0
- package/{dist → lib/cjs}/crawler.js +56 -113
- package/lib/cjs/cron.d.ts +1 -0
- package/lib/cjs/cron.js +49 -0
- package/lib/cjs/index.d.ts +9 -0
- package/lib/cjs/index.js +78 -0
- package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
- package/{dist → lib/cjs}/puppeteer.js +43 -54
- package/lib/cjs/services/snapshot.d.ts +12 -0
- package/lib/cjs/services/snapshot.js +84 -0
- package/lib/cjs/site.d.ts +2 -0
- package/lib/cjs/site.js +76 -0
- package/lib/cjs/store/index.d.ts +3 -0
- package/{dist/db → lib/cjs/store}/index.js +21 -5
- package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
- package/lib/cjs/store/job.js +110 -0
- package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
- package/lib/cjs/store/snapshot.js +68 -0
- package/lib/cjs/utils.d.ts +32 -0
- package/{dist → lib/cjs}/utils.js +88 -78
- package/lib/esm/config.d.ts +22 -0
- package/{esm → lib/esm}/config.js +9 -3
- package/lib/esm/crawler.d.ts +26 -0
- package/{esm → lib/esm}/crawler.js +48 -102
- package/lib/esm/cron.d.ts +1 -0
- package/lib/esm/cron.js +43 -0
- package/lib/esm/index.d.ts +9 -0
- package/{esm → lib/esm}/index.js +19 -10
- package/{dist → lib/esm}/puppeteer.d.ts +2 -2
- package/{esm → lib/esm}/puppeteer.js +26 -37
- package/lib/esm/services/snapshot.d.ts +12 -0
- package/lib/esm/services/snapshot.js +75 -0
- package/lib/esm/site.d.ts +2 -0
- package/lib/esm/site.js +69 -0
- package/lib/esm/store/index.d.ts +3 -0
- package/{esm/db → lib/esm/store}/index.js +22 -6
- package/{esm/db → lib/esm/store}/job.d.ts +4 -3
- package/lib/esm/store/job.js +73 -0
- package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
- package/lib/esm/store/snapshot.js +64 -0
- package/lib/esm/utils.d.ts +32 -0
- package/{esm → lib/esm}/utils.js +84 -71
- package/package.json +22 -33
- package/third.d.ts +0 -0
- package/dist/blocklet.d.ts +0 -6
- package/dist/blocklet.js +0 -199
- package/dist/cache.d.ts +0 -10
- package/dist/cache.js +0 -119
- package/dist/config.d.ts +0 -10
- package/dist/crawler.d.ts +0 -28
- package/dist/db/index.d.ts +0 -1
- package/dist/db/job.js +0 -54
- package/dist/db/snapshot.js +0 -52
- package/dist/index.d.ts +0 -6
- package/dist/index.js +0 -45
- package/dist/middleware.d.ts +0 -4
- package/dist/middleware.js +0 -44
- package/dist/utils.d.ts +0 -15
- package/esm/blocklet.d.ts +0 -6
- package/esm/blocklet.js +0 -190
- package/esm/cache.d.ts +0 -10
- package/esm/cache.js +0 -114
- package/esm/config.d.ts +0 -10
- package/esm/crawler.d.ts +0 -28
- package/esm/db/index.d.ts +0 -1
- package/esm/db/job.js +0 -50
- package/esm/db/snapshot.js +0 -48
- package/esm/index.d.ts +0 -6
- package/esm/middleware.d.ts +0 -4
- package/esm/middleware.js +0 -41
- package/esm/utils.d.ts +0 -15
package/lib/esm/store/snapshot.js
ADDED

@@ -0,0 +1,64 @@
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+import { DataTypes, Model } from '@sequelize/core';
+export class Snapshot extends Model {
+    static initModel(sequelize) {
+        return Snapshot.init({
+            jobId: {
+                type: DataTypes.STRING,
+                primaryKey: true,
+                allowNull: false,
+            },
+            url: {
+                type: DataTypes.STRING,
+                allowNull: false,
+                index: true,
+            },
+            status: {
+                type: DataTypes.ENUM('success', 'failed', 'pending'),
+                allowNull: false,
+            },
+            html: {
+                type: DataTypes.TEXT,
+                allowNull: true,
+            },
+            screenshot: {
+                type: DataTypes.STRING,
+                allowNull: true,
+            },
+            error: {
+                type: DataTypes.STRING,
+                allowNull: true,
+            },
+            lastModified: {
+                type: DataTypes.STRING,
+                allowNull: true,
+            },
+            options: {
+                type: DataTypes.JSON,
+                allowNull: true,
+            },
+        }, {
+            sequelize,
+            modelName: 'snapshot',
+            tableName: 'snap',
+            timestamps: true,
+        });
+    }
+    static findSnapshot(condition) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const snapshot = yield Snapshot.findOne(Object.assign({ order: [
+                ['lastModified', 'DESC'],
+                ['updatedAt', 'DESC'],
+            ] }, condition));
+            return (snapshot === null || snapshot === void 0 ? void 0 : snapshot.toJSON()) || null;
+        });
+    }
+}
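The file above is the compiled ESM output of a Sequelize v7 model. A minimal sketch of how it might be wired up (the sqlite storage path and the sync() call are assumptions for illustration, not taken from the diff):

    import { Sequelize } from '@sequelize/core';
    import { SqliteDialect } from '@sequelize/sqlite3';
    import { Snapshot } from './store/snapshot';

    // assumed setup: @sequelize/sqlite3 is the dialect package listed in dependencies
    const sequelize = new Sequelize({ dialect: SqliteDialect, storage: './crawler.db' });
    Snapshot.initModel(sequelize);
    await sequelize.sync();

    // findSnapshot() returns the newest matching row as plain JSON,
    // ordered by lastModified then updatedAt (both DESC)
    const snap = await Snapshot.findSnapshot({
      where: { url: 'https://example.com', status: 'success' },
    });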
package/lib/esm/utils.d.ts
ADDED

@@ -0,0 +1,32 @@
+import { Page } from '@blocklet/puppeteer';
+import { Request } from 'express';
+export declare const axios: import("axios").AxiosInstance;
+export declare const CRAWLER_FLAG = "x-arcblock-crawler";
+export declare const sleep: (ms: number) => Promise<unknown>;
+/**
+ * Check if the request is a arcblock crawler
+ */
+export declare const isSelfCrawler: (req: Request) => boolean;
+/**
+ * Check if the request is a static file
+ */
+export declare function isStaticFile(req: Request): boolean;
+/**
+ * Check if the request is a spider
+ */
+export declare function isSpider(req: Request): boolean;
+/**
+ * Get and parse the robots.txt by `robots-parser`
+ */
+export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
+/**
+ * Check if the url is allowed to crawl from robots.txt
+ */
+export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
+/**
+ * Get and parse the sitemap.xml by `sitemap` package
+ */
+export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
+export declare const formatUrl: (url: string) => string;
+export declare function md5(content: string | Uint8Array): string;
+export declare function findMaxScrollHeight(page: Page): Promise<number>;
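These declarations are the new public utils surface. A minimal sketch of an Express guard built on the request helpers (the middleware itself is illustrative, not an API shipped by this package):

    import express from 'express';
    import { isSelfCrawler, isSpider, isStaticFile } from '@arcblock/crawler/utils';

    const app = express();
    app.use((req, res, next) => {
      // Pre-render only for real bots: skip the package's own headless
      // crawler, ordinary browsers, and static-asset requests.
      if (isSpider(req) && !isSelfCrawler(req) && !isStaticFile(req)) {
        // look up or enqueue a snapshot for req.originalUrl here
      }
      next();
    });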
package/{esm → lib/esm}/utils.js
RENAMED
@@ -7,31 +7,22 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
     step((generator = generator.apply(thisArg, _arguments || [])).next());
 });
 };
-import
-import axios from 'axios';
+import Axios from 'axios';
 import flattenDeep from 'lodash/flattenDeep';
 import uniq from 'lodash/uniq';
 import { createHash } from 'node:crypto';
 import robotsParser from 'robots-parser';
 import { parseSitemap } from 'sitemap';
 import { Readable } from 'stream';
-import { joinURL } from 'ufo';
-
-
+import { joinURL, withQuery } from 'ufo';
+import { logger } from './config';
+export const axios = Axios.create({
+    timeout: 1000 * 30,
     headers: {
         'Content-Type': 'application/json',
     },
 });
-export const
-    return new Promise((resolve) => {
-        setTimeout(resolve, ms);
-    });
-};
-export const CRAWLER_FLAG = 'x-crawler';
-export const isSelfCrawler = (req) => {
-    const ua = req.get('user-agent') || '';
-    return req.get(CRAWLER_FLAG) === 'true' || `${ua}`.toLowerCase().indexOf('headless') !== -1;
-};
+export const CRAWLER_FLAG = 'x-arcblock-crawler';
 /**
  * A default set of user agent patterns for bots/crawlers that do not perform
  * well with pages that require JavaScript.

@@ -87,12 +78,8 @@ const botUserAgents = [
     /AlibabaGroup/i,
     /adaptive-edge-crawler/i,
 ];
-const isSpider = (ua) => botUserAgents.some((spider) => {
-    return spider.test(ua);
-});
 /**
- * A default set of file extensions for static assets that do not need to be
- * proxied.
+ * A default set of file extensions for static assets that do not need to be proxied.
  */
 const staticFileExtensions = [
     'ai',

@@ -137,84 +124,110 @@ const staticFileExtensions = [
     'xml',
     'zip',
 ];
-export const
-
-
+export const sleep = (ms) => {
+    return new Promise((resolve) => {
+        setTimeout(resolve, ms);
+    });
+};
+/**
+ * Check if the request is a arcblock crawler
+ */
+export const isSelfCrawler = (req) => {
+    const ua = req.get('user-agent') || '';
+    return req.get(CRAWLER_FLAG) === 'true' || ua.toLowerCase().indexOf('headless') !== -1;
 };
+/**
+ * Check if the request is a static file
+ */
+export function isStaticFile(req) {
+    const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
+    return excludeUrlPattern.test(req.path);
+}
+/**
+ * Check if the request is a spider
+ */
+export function isSpider(req) {
+    const ua = req.get('user-agent') || '';
+    return botUserAgents.some((spider) => spider.test(ua));
+}
+/**
+ * Get and parse the robots.txt by `robots-parser`
+ */
 export function getRobots(url) {
     return __awaiter(this, void 0, void 0, function* () {
         const { origin } = new URL(url);
         const robotsUrl = joinURL(origin, 'robots.txt?nocache=1');
-        const { data } = yield
-
-
+        const { data } = yield axios.get(robotsUrl).catch((error) => {
+            logger.warn(`Failed to fetch robots.txt from ${robotsUrl}:`, { error });
+            return { data: null };
+        });
         return data ? robotsParser(robotsUrl, data) : null;
     });
 }
-
-
-
-};
+/**
+ * Check if the url is allowed to crawl from robots.txt
+ */
 export const isAcceptCrawler = (url) => __awaiter(void 0, void 0, void 0, function* () {
     const robots = yield getRobots(url);
     const isAllowed = robots ? yield robots.isAllowed(url) : true;
     return isAllowed;
 });
+/**
+ * Get and parse the sitemap.xml by `sitemap` package
+ */
 export const getSitemapList = (url) => __awaiter(void 0, void 0, void 0, function* () {
-    let sitemapUrlList = [
+    let sitemapUrlList = [];
     const robots = yield getRobots(url);
     if (robots) {
-
-
-
-    }
+        sitemapUrlList = (yield robots.getSitemaps()) || [];
+    }
+    if (!sitemapUrlList.length) {
+        const { origin } = new URL(url);
+        sitemapUrlList.push(joinURL(origin, 'sitemap.xml?nocache=1'));
     }
     // loop site map url list
     const sitemapList = yield Promise.all(sitemapUrlList.map((sitemapUrl) => __awaiter(void 0, void 0, void 0, function* () {
-
-
-
-
-
-
-
-
-
-
+        sitemapUrl = withQuery(sitemapUrl, { nocache: '1' });
+        try {
+            const { data: sitemapTxt } = yield axios.get(sitemapUrl).catch(() => ({
+                data: '',
+            }));
+            if (sitemapTxt) {
+                const stream = Readable.from([sitemapTxt]);
+                const sitemapJson = yield parseSitemap(stream);
+                return sitemapJson;
+            }
+        }
+        catch (error) {
+            logger.error(`Could not get sitemap from ${sitemapUrl}`, { error });
        }
        return [];
    })));
    return uniq(flattenDeep(sitemapList.filter(Boolean)));
});
-export const isBotUserAgent = (req) => {
-    const ua = req.get('user-agent');
-    const excludeUrlPattern = new RegExp(`\\.(${staticFileExtensions.join('|')})$`, 'i');
-    if (ua === undefined || !isSpider(ua) || excludeUrlPattern.test(req.path)) {
-        return false;
-    }
-    return true;
-};
-export const getComponentInfo = () => {
-    return components.find((item) => item.did === env.componentDid) || {};
-};
-export const getFullUrl = (req) => {
-    const blockletPathname = req.headers['x-path-prefix']
-        ? joinURL(req.headers['x-path-prefix'], req.originalUrl)
-        : req.originalUrl;
-    return joinURL(env.appUrl, blockletPathname);
-};
-export const getRelativePath = (url) => {
-    try {
-        return new URL(url).pathname;
-    }
-    catch (error) {
-        // ignore error
-    }
-    return url;
-};
 export const formatUrl = (url) => {
     return url.replace(/\/$/, '').trim();
 };
 export function md5(content) {
     return createHash('md5').update(content).digest('hex');
 }
+export function findMaxScrollHeight(page) {
+    return __awaiter(this, void 0, void 0, function* () {
+        const maxHeightHandler = yield page.evaluateHandle(() => {
+            const elements = Array.from(document.querySelectorAll('*'));
+            let maxHeight = document.body.scrollHeight;
+            for (const el of elements) {
+                const style = window.getComputedStyle(el);
+                if (style.overflowY === 'auto' || style.overflowY === 'scroll') {
+                    if (el.scrollHeight > el.clientHeight && el.scrollHeight > maxHeight) {
+                        maxHeight = el.scrollHeight;
+                    }
+                }
+            }
+            return maxHeight;
+        });
+        const maxHeight = yield maxHeightHandler.jsonValue();
+        maxHeightHandler.dispose();
+        return maxHeight;
+    });
+}
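The reworked getSitemapList first asks robots.txt for Sitemap: entries and only falls back to <origin>/sitemap.xml, tagging each fetch with nocache=1 via withQuery. A short usage sketch (the URL is illustrative):

    import { getSitemapList, isAcceptCrawler } from '@arcblock/crawler/utils';

    const items = await getSitemapList('https://example.com');
    for (const item of items) {
      // honor robots.txt rules before queueing a crawl
      if (await isAcceptCrawler(item.url)) {
        console.log(item.url, item.lastmod);
      }
    }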
package/package.json
CHANGED
@@ -1,16 +1,28 @@
 {
   "name": "@arcblock/crawler",
-  "version": "1.0.5",
-  "main": "
-  "module": "esm/index.js",
-  "types": "
+  "version": "1.1.1",
+  "main": "lib/cjs/index.js",
+  "module": "lib/esm/index.js",
+  "types": "lib/cjs/index.d.ts",
   "publishConfig": {
     "access": "public"
   },
   "files": [
-    "
-    "
+    "lib",
+    "*.d.ts"
   ],
+  "exports": {
+    ".": {
+      "require": "./lib/cjs/index.js",
+      "import": "./lib/esm/index.js",
+      "types": "./lib/cjs/index.d.ts"
+    },
+    "./utils": {
+      "require": "./lib/cjs/utils.js",
+      "import": "./lib/esm/utils.js",
+      "types": "./lib/cjs/utils.d.ts"
+    }
+  },
   "lint-staged": {
     "*.{mjs,js,jsx,ts,tsx}": [
       "prettier --write",

@@ -36,61 +48,37 @@
     "@abtnode/cron": "^1.16.43",
     "@abtnode/models": "^1.16.43",
     "@abtnode/queue": "^1.16.43",
-    "@arcblock/did-auth": "^1.19.15",
-    "@arcblock/did-auth-storage-nedb": "^1.7.1",
     "@blocklet/logger": "^1.16.43",
     "@blocklet/puppeteer": "^22.11.3",
     "@blocklet/sdk": "^1.16.43",
-    "@ocap/client": "^1.19.15",
-    "@ocap/mcrypto": "^1.19.15",
-    "@ocap/util": "^1.20.11",
-    "@ocap/wallet": "^1.19.15",
     "@sequelize/core": "7.0.0-alpha.46",
     "@sequelize/sqlite3": "7.0.0-alpha.46",
     "axios": "^1.7.9",
-    "cookie-parser": "^1.4.7",
-    "cors": "^2.8.5",
-    "dotenv-flow": "^4.1.0",
-    "express": "^4.21.2",
-    "express-async-errors": "^3.1.1",
     "fs-extra": "^11.2.0",
     "generic-pool": "^3.9.0",
     "lodash": "^4.17.21",
     "lru-cache": "^10.4.3",
-    "p-queue": "6.6.2",
-    "p-wait-for": "^5.0.2",
     "redis": "^4.7.0",
     "robots-parser": "^3.0.1",
     "sequelize": "^6.37.7",
     "sitemap": "^7.1.2",
     "sqlite3": "^5.1.7",
     "ufo": "^1.5.4",
-    "
+    "p-map": "^7.0.3"
   },
   "devDependencies": {
     "@blocklet/js-sdk": "^1.16.39",
-    "@types/cookie-parser": "^1.4.8",
-    "@types/cors": "^2.8.17",
     "@types/dotenv-flow": "^3.3.3",
     "@types/express": "^4.17.21",
     "@types/fs-extra": "^11.0.4",
     "@types/lodash": "^4.17.16",
     "@types/node": "^20.17.19",
-    "
-    "@types/react-dom": "^18.3.5",
-    "@vitejs/plugin-react": "^4.3.4",
+    "express": "^4.21.2",
     "bumpp": "^9.11.1",
     "nodemon": "^3.1.9",
     "npm-run-all": "^4.1.5",
     "puppeteer": "^24.8.2",
-    "react": "~18.2.0",
-    "react-dom": "~18.2.0",
-    "react-router-dom": "^6.29.0",
-    "rimraf": "^5.0.10",
     "tsx": "^4.19.3",
-    "vite": "^5.4.14",
-    "vite-plugin-blocklet": "^0.9.32",
-    "vite-plugin-svgr": "^4.3.0",
     "zx": "^8.3.2"
   },
   "importSort": {

@@ -113,6 +101,7 @@
     "bundle": "npm run build",
     "build:cjs": "tsc -p tsconfig.cjs.json",
     "build:esm": "tsc -p tsconfig.esm.json",
-    "build": "npm run build:cjs && npm run build:esm"
+    "build": "npm run build:cjs && npm run build:esm",
+    "fix:sqlite": "cd node_modules/sqlite3 && npm run rebuild"
   }
 }
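The added exports map is the consumer-facing half of the dist/esm → lib/cjs + lib/esm move: only the package root and the ./utils subpath resolve, each to the matching module format. For example, an ESM consumer now resolves lib/esm/utils.js with:

    import { md5, formatUrl } from '@arcblock/crawler/utils';

    console.log(md5(formatUrl('https://example.com/')));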
package/third.d.ts
ADDED

File without changes
package/dist/blocklet.d.ts
DELETED
package/dist/blocklet.js
DELETED
@@ -1,199 +0,0 @@
-"use strict";
-var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
-    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
-    return new (P || (P = Promise))(function (resolve, reject) {
-        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
-        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
-        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
-        step((generator = generator.apply(thisArg, _arguments || [])).next());
-    });
-};
-var __importDefault = (this && this.__importDefault) || function (mod) {
-    return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.cancelCronCrawlBlocklet = exports.initCronCrawlBlocklet = exports.crawlBlocklet = void 0;
-const cron_1 = __importDefault(require("@abtnode/cron"));
-const config_1 = require("@blocklet/sdk/lib/config");
-const debounce_1 = __importDefault(require("lodash/debounce"));
-const ufo_1 = require("ufo");
-const cache_1 = require("./cache");
-const config_2 = require("./config");
-const crawler_1 = require("./crawler");
-const puppeteer_1 = require("./puppeteer");
-const utils_1 = require("./utils");
-// record crawl blocklet running
-const crawlBlockletRunningMap = new Map();
-// crawl blocklet sitemap urls
-const crawlBlocklet = () => __awaiter(void 0, void 0, void 0, function* () {
-    // @ts-ignore
-    const { mountPoint, did } = (0, utils_1.getComponentInfo)();
-    if (crawlBlockletRunningMap.has(did) && crawlBlockletRunningMap.get(did)) {
-        config_2.logger.info(`Crawler blocklet ${did} is running, skip it`);
-        return;
-    }
-    // check has browser can use
-    try {
-        const browser = yield (0, puppeteer_1.getBrowser)();
-        if (!browser) {
-            throw new Error('No Browser can use');
-        }
-        config_2.logger.info('Crawler blocklet existing can use browser');
-    }
-    catch (error) {
-        config_2.logger.info(`Crawler blocklet abort by error: ${(error === null || error === void 0 ? void 0 : error.message) || (error === null || error === void 0 ? void 0 : error.reason) || error}`);
-        return;
-    }
-    const { appUrl } = config_2.config;
-    if (!appUrl) {
-        throw new Error('appUrl not found');
-    }
-    const sitemapList = yield (0, utils_1.getSitemapList)(appUrl);
-    const matchMountPoint = (0, ufo_1.joinURL)(appUrl, !mountPoint || mountPoint === '/' ? '' : mountPoint);
-    const otherMountPointList = config_1.components
-        .filter((item) => item.mountPoint && item.mountPoint !== mountPoint)
-        .map((item) => item.mountPoint);
-    // get can use loc
-    const blockletLocList = sitemapList.filter((item) => {
-        var _a;
-        if (mountPoint !== '/') {
-            return ((_a = item === null || item === void 0 ? void 0 : item.url) === null || _a === void 0 ? void 0 : _a.indexOf(matchMountPoint)) > -1;
-        }
-        // if mountPoint is /, skip other mountPoint
-        return otherMountPointList.every((mountPoint) => { var _a; return ((_a = item === null || item === void 0 ? void 0 : item.url) === null || _a === void 0 ? void 0 : _a.indexOf(mountPoint)) === -1; });
-    });
-    const canUseBlockletLocList = [];
-    const lastmodMap = new Map();
-    let skipBlockletLocTotal = 0;
-    let blockletLocTotal = 0;
-    yield Promise.all(blockletLocList.map((item) => __awaiter(void 0, void 0, void 0, function* () {
-        var _a;
-        let tempLocList = [];
-        if (item.url) {
-            tempLocList.push(item.url);
-        }
-        if (((_a = item === null || item === void 0 ? void 0 : item.links) === null || _a === void 0 ? void 0 : _a.length) > 0) {
-            tempLocList.push(...item.links.map((ytem) => ytem.url));
-        }
-        blockletLocTotal += tempLocList.length;
-        // @ts-ignore
-        tempLocList = (yield Promise.all(tempLocList.map((loc) => __awaiter(void 0, void 0, void 0, function* () {
-            try {
-                const { lastModified: cacheLastModified } = yield cache_1.useCache.get((0, utils_1.getRelativePath)(loc));
-                // sitemap item lastmod is same as cache lastModified, skip it
-                if (item.lastmod &&
-                    cacheLastModified &&
-                    new Date(cacheLastModified).getTime() === new Date(item.lastmod).getTime()) {
-                    skipBlockletLocTotal++;
-                    return false;
-                }
-                return loc;
-            }
-            catch (error) {
-                // ignore error
-            }
-            // if can not get cache, return loc
-            return loc;
-        })))).filter(Boolean);
-        tempLocList.forEach((loc) => {
-            if (item.lastmod)
-                lastmodMap.set(loc, item.lastmod);
-        });
-        canUseBlockletLocList.push(...tempLocList);
-    })));
-    const crawlerLogText = (step = '') => [
-        `Crawler sitemap.xml about ${did} ${step}: `,
-        {
-            blockletLocTotal,
-            canUseBlockletLocTotal: canUseBlockletLocList.length,
-            skipBlockletLocTotal,
-            lastmodMapTotal: lastmodMap.size,
-        },
-    ];
-    config_2.logger.info(...crawlerLogText('start'));
-    try {
-        // record crawl blocklet running
-        crawlBlockletRunningMap.set(did, true);
-        yield (0, crawler_1.createCrawlJob)({
-            // @ts-ignore
-            urls: canUseBlockletLocList,
-            saveToRedis: true,
-            lastmodMap,
-            // formatPageContent: async ({ page }: { page: any; url: string; lastmod?: string }) => {
-            //   const pageContent = await page.evaluate(() => {
-            //     const removeElements = (tagName: string) => {
-            //       const elements = document.querySelectorAll(tagName);
-            //       for (let i = elements.length - 1; i >= 0; i--) {
-            //         try {
-            //           elements[i]?.parentNode?.removeChild(elements[i] as Node);
-            //         } catch (error) {
-            //           // do noting
-            //         }
-            //       }
-            //     };
-            //     // remove script, style, link, noscript
-            //     // removeElements('script');
-            //     // removeElements('style');
-            //     // removeElements('link');
-            //     // removeElements('noscript');
-            //     // remove uploader
-            //     removeElements('[id="uploader-container"]');
-            //     removeElements('[class^="uppy-"]');
-            //     // remove point up component
-            //     removeElements('[id="point-up-component"]');
-            //     // add meta tag to record crawler
-            //     const meta = document.createElement('meta');
-            //     meta.name = 'blocklet-crawler';
-            //     meta.content = 'true';
-            //     document.head.appendChild(meta);
-            //     return document.documentElement.outerHTML;
-            //   });
-            //   return pageContent;
-            // },
-        });
-        config_2.logger.info(...crawlerLogText('success'));
-        yield (0, puppeteer_1.closeBrowser)({
-            trimCache: true,
-        });
-    }
-    catch (error) {
-        config_2.logger.info('Crawler blocklet abort by error', error);
-    }
-    finally {
-        // delete crawl blocklet running
-        crawlBlockletRunningMap.delete(did);
-    }
-});
-exports.crawlBlocklet = crawlBlocklet;
-const CRON_CRAWL_BLOCKLET_KEY = 'cron-crawl-blocklet';
-let cronCrawlBlockletJob = null;
-// init cron crawl blocklet
-const initCronCrawlBlocklet = ({ time = '0 0 */12 * * *', // every 12 hours
-options, } = {}) => {
-    if (!cronCrawlBlockletJob) {
-        cronCrawlBlockletJob = cron_1.default.init({
-            context: {},
-            jobs: [
-                {
-                    name: CRON_CRAWL_BLOCKLET_KEY,
-                    time,
-                    fn: (0, debounce_1.default)(exports.crawlBlocklet),
-                    options: Object.assign({ runOnInit: false }, options),
-                },
-            ],
-            onError: (err) => {
-                console.error('run job failed', err);
-            },
-        });
-    }
-    return cronCrawlBlockletJob;
-};
-exports.initCronCrawlBlocklet = initCronCrawlBlocklet;
-const cancelCronCrawlBlocklet = () => {
-    if (cronCrawlBlockletJob) {
-        cronCrawlBlockletJob.jobs[CRON_CRAWL_BLOCKLET_KEY].stop();
-        cronCrawlBlockletJob = null;
-        config_2.logger.info('Cron crawl blocklet stop, clear crawl queue');
-    }
-};
-exports.cancelCronCrawlBlocklet = cancelCronCrawlBlocklet;
package/dist/cache.d.ts
DELETED
@@ -1,10 +0,0 @@
-export declare const cachePool: import("generic-pool").Pool<any>;
-export declare const memoryPool: import("generic-pool").Pool<Map<string, any>>;
-export declare const withCache: (cb: Function) => Promise<any>;
-export declare const formatKey: (key: string) => string;
-export declare const useCache: {
-    get: (key: string) => Promise<any>;
-    set: (key: string, value: any, options?: any) => Promise<any>;
-    remove: (key: string) => Promise<any>;
-    list: (key?: string) => Promise<any>;
-};