@blocklet/crawler 2.0.172 → 2.0.174
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -9,9 +9,9 @@ var _ufo = require("ufo");
|
|
|
9
9
|
var _utils = require("./utils");
|
|
10
10
|
var _cron = _interopRequireDefault(require("@abtnode/cron"));
|
|
11
11
|
var _debounce = _interopRequireDefault(require("lodash/debounce"));
|
|
12
|
-
function _interopRequireDefault(
|
|
12
|
+
function _interopRequireDefault(e) { return e && e.__esModule ? e : { default: e }; }
|
|
13
13
|
function _getRequireWildcardCache(e) { if ("function" != typeof WeakMap) return null; var r = new WeakMap(), t = new WeakMap(); return (_getRequireWildcardCache = function (e) { return e ? t : r; })(e); }
|
|
14
|
-
function _interopRequireWildcard(e, r) { if (!r && e && e.__esModule) return e; if (null === e || "object" != typeof e && "function" != typeof e) return { default: e }; var t = _getRequireWildcardCache(r); if (t && t.has(e)) return t.get(e); var n = { __proto__: null }, a = Object.defineProperty && Object.getOwnPropertyDescriptor; for (var u in e) if ("default" !== u &&
|
|
14
|
+
function _interopRequireWildcard(e, r) { if (!r && e && e.__esModule) return e; if (null === e || "object" != typeof e && "function" != typeof e) return { default: e }; var t = _getRequireWildcardCache(r); if (t && t.has(e)) return t.get(e); var n = { __proto__: null }, a = Object.defineProperty && Object.getOwnPropertyDescriptor; for (var u in e) if ("default" !== u && {}.hasOwnProperty.call(e, u)) { var i = a ? Object.getOwnPropertyDescriptor(e, u) : null; i && (i.get || i.set) ? Object.defineProperty(n, u, i) : n[u] = e[u]; } return n.default = e, t && t.set(e, n), n; }
|
|
15
15
|
const formatHtml = htmlString => {
|
|
16
16
|
if (typeof htmlString !== "string") {
|
|
17
17
|
return "";
|
|
@@ -41,7 +41,6 @@ const getPageContent = async ({
|
|
|
41
41
|
idleTime: 1 * 1e3
|
|
42
42
|
// 1s
|
|
43
43
|
});
|
|
44
|
-
|
|
45
44
|
if (formatPageContent) {
|
|
46
45
|
pageContent = await formatPageContent({
|
|
47
46
|
page,
|
|
@@ -86,7 +85,7 @@ const crawlUrl = async ({
|
|
|
86
85
|
await _utils.useCache.set((0, _utils.getRelativePath)(url), {
|
|
87
86
|
content: pageContent,
|
|
88
87
|
lastmod: lastmodMap?.get(url),
|
|
89
|
-
updatedAt: /* @__PURE__ */new Date().toISOString(),
|
|
88
|
+
updatedAt: ( /* @__PURE__ */new Date()).toISOString(),
|
|
90
89
|
nextDate: cronCrawlBlockletJob.jobs[CRON_CRAWL_BLOCKLET_KEY].nextDate()
|
|
91
90
|
});
|
|
92
91
|
_utils.logger.info(`Crawler[${index}] ${url} success`);
|
|
@@ -178,7 +177,7 @@ const crawlBlocklet = async () => {
|
|
|
178
177
|
skipBlockletLocTotal++;
|
|
179
178
|
return false;
|
|
180
179
|
}
|
|
181
|
-
if (nextDate && new Date(nextDate).getTime() >= /* @__PURE__ */new Date().getTime()) {
|
|
180
|
+
if (nextDate && new Date(nextDate).getTime() >= ( /* @__PURE__ */new Date()).getTime()) {
|
|
182
181
|
skipBlockletLocTotal++;
|
|
183
182
|
return false;
|
|
184
183
|
}
|
|
@@ -62,8 +62,8 @@ Object.keys(_config).forEach(function (key) {
|
|
|
62
62
|
});
|
|
63
63
|
});
|
|
64
64
|
function _getRequireWildcardCache(e) { if ("function" != typeof WeakMap) return null; var r = new WeakMap(), t = new WeakMap(); return (_getRequireWildcardCache = function (e) { return e ? t : r; })(e); }
|
|
65
|
-
function _interopRequireWildcard(e, r) { if (!r && e && e.__esModule) return e; if (null === e || "object" != typeof e && "function" != typeof e) return { default: e }; var t = _getRequireWildcardCache(r); if (t && t.has(e)) return t.get(e); var n = { __proto__: null }, a = Object.defineProperty && Object.getOwnPropertyDescriptor; for (var u in e) if ("default" !== u &&
|
|
66
|
-
function _interopRequireDefault(
|
|
65
|
+
function _interopRequireWildcard(e, r) { if (!r && e && e.__esModule) return e; if (null === e || "object" != typeof e && "function" != typeof e) return { default: e }; var t = _getRequireWildcardCache(r); if (t && t.has(e)) return t.get(e); var n = { __proto__: null }, a = Object.defineProperty && Object.getOwnPropertyDescriptor; for (var u in e) if ("default" !== u && {}.hasOwnProperty.call(e, u)) { var i = a ? Object.getOwnPropertyDescriptor(e, u) : null; i && (i.get || i.set) ? Object.defineProperty(n, u, i) : n[u] = e[u]; } return n.default = e, t && t.set(e, n), n; }
|
|
66
|
+
function _interopRequireDefault(e) { return e && e.__esModule ? e : { default: e }; }
|
|
67
67
|
const {
|
|
68
68
|
logger
|
|
69
69
|
} = _config.default;
|
|
@@ -338,7 +338,6 @@ const cachePool = exports.cachePool = (0, _genericPool.createPool)({
|
|
|
338
338
|
min: 1
|
|
339
339
|
// evictionRunIntervalMillis: 0,
|
|
340
340
|
});
|
|
341
|
-
|
|
342
341
|
const withCache = async cb => {
|
|
343
342
|
const client = await cachePool.acquire();
|
|
344
343
|
if (client) {
|
|
@@ -165,8 +165,7 @@ export const crawlBlocklet = async () => {
|
|
|
165
165
|
})
|
|
166
166
|
)).filter(Boolean);
|
|
167
167
|
tempLocList.forEach((loc) => {
|
|
168
|
-
if (item.lastmod)
|
|
169
|
-
lastmodMap.set(loc, item.lastmod);
|
|
168
|
+
if (item.lastmod) lastmodMap.set(loc, item.lastmod);
|
|
170
169
|
});
|
|
171
170
|
canUseBlockletLocList.push(...tempLocList);
|
|
172
171
|
})
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@blocklet/crawler",
|
|
3
|
-
"version": "2.0.172",
|
|
3
|
+
"version": "2.0.174",
|
|
4
4
|
"description": "blocklet crawler lib",
|
|
5
5
|
"publishConfig": {
|
|
6
6
|
"access": "public"
|
|
@@ -40,15 +40,6 @@
|
|
|
40
40
|
"lib",
|
|
41
41
|
"*.d.ts"
|
|
42
42
|
],
|
|
43
|
-
"scripts": {
|
|
44
|
-
"coverage": "yarn test -- --coverage",
|
|
45
|
-
"test": "vitest tests",
|
|
46
|
-
"build": "unbuild",
|
|
47
|
-
"build:watch": "npx nodemon --ext 'ts,tsx,json,js,jsx' --exec 'yarn run build' --ignore 'lib/*' ",
|
|
48
|
-
"dev": "yarn run build:watch",
|
|
49
|
-
"prepublish": "yarn run build",
|
|
50
|
-
"prebuild:dep": "yarn run build"
|
|
51
|
-
},
|
|
52
43
|
"keywords": [
|
|
53
44
|
"blocklet",
|
|
54
45
|
"crawler"
|
|
@@ -59,37 +50,45 @@
|
|
|
59
50
|
"@abtnode/cron": "1.16.30",
|
|
60
51
|
"@blocklet/puppeteer": "^22.11.3",
|
|
61
52
|
"@blocklet/sdk": "1.16.30",
|
|
62
|
-
"axios": "^1.
|
|
53
|
+
"axios": "^1.7.7",
|
|
63
54
|
"crypto": "^1.0.1",
|
|
64
|
-
"fs-extra": "^11.
|
|
55
|
+
"fs-extra": "^11.2.0",
|
|
65
56
|
"generic-pool": "^3.9.0",
|
|
66
57
|
"lodash": "^4.17.21",
|
|
67
|
-
"lru-cache": "^10.
|
|
58
|
+
"lru-cache": "^10.4.3",
|
|
68
59
|
"p-queue": "6.6.2",
|
|
69
60
|
"p-wait-for": "^5.0.2",
|
|
70
|
-
"prettier": "^3.3.
|
|
71
|
-
"redis": "^4.
|
|
61
|
+
"prettier": "^3.3.3",
|
|
62
|
+
"redis": "^4.7.0",
|
|
72
63
|
"robots-parser": "^3.0.1",
|
|
73
|
-
"sitemap": "^7.1.
|
|
74
|
-
"ufo": "^1.
|
|
64
|
+
"sitemap": "^7.1.2",
|
|
65
|
+
"ufo": "^1.5.4",
|
|
75
66
|
"url-join": "^4.0.1"
|
|
76
67
|
},
|
|
77
68
|
"devDependencies": {
|
|
78
69
|
"@arcblock/eslint-config-ts": "^0.3.2",
|
|
79
|
-
"@types/express": "^4.17.
|
|
80
|
-
"@types/fs-extra": "^11.0.
|
|
81
|
-
"@types/lodash": "^4.
|
|
82
|
-
"@types/mime-types": "^2.1.
|
|
83
|
-
"@types/node": "^20.
|
|
70
|
+
"@types/express": "^4.17.21",
|
|
71
|
+
"@types/fs-extra": "^11.0.4",
|
|
72
|
+
"@types/lodash": "^4.17.7",
|
|
73
|
+
"@types/mime-types": "^2.1.4",
|
|
74
|
+
"@types/node": "^20.16.5",
|
|
84
75
|
"@types/puppeteer": "^7.0.4",
|
|
85
|
-
"@types/react": "^18.
|
|
86
|
-
"@types/url-join": "^4.0.
|
|
76
|
+
"@types/react": "^18.3.5",
|
|
77
|
+
"@types/url-join": "^4.0.3",
|
|
87
78
|
"@vitest/coverage-c8": "^0.33.0",
|
|
88
79
|
"jsdom": "^22.1.0",
|
|
89
|
-
"typescript": "^5.
|
|
80
|
+
"typescript": "^5.6.2",
|
|
90
81
|
"unbuild": "^1.2.1",
|
|
91
|
-
"vitest": "^1.
|
|
82
|
+
"vitest": "^1.6.0",
|
|
92
83
|
"vitest-fetch-mock": "^0.2.2"
|
|
93
84
|
},
|
|
94
|
-
"
|
|
95
|
-
|
|
85
|
+
"scripts": {
|
|
86
|
+
"coverage": "pnpm test -- --coverage",
|
|
87
|
+
"test": "vitest tests",
|
|
88
|
+
"build": "unbuild",
|
|
89
|
+
"build:watch": "npx nodemon --ext 'ts,tsx,json,js,jsx' --exec 'pnpm run build' --ignore 'lib/*' ",
|
|
90
|
+
"dev": "pnpm run build:watch",
|
|
91
|
+
"prepublish": "pnpm run build",
|
|
92
|
+
"prebuild:dep": "pnpm run build"
|
|
93
|
+
}
|
|
94
|
+
}
|