@blocklet/crawler 2.1.236 → 2.1.237
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/lib/cjs/middlewares/crawler.d.ts +10 -1
- package/lib/cjs/middlewares/crawler.js +39 -10
- package/lib/cjs/middlewares/utils.d.ts +3 -1
- package/lib/cjs/middlewares/utils.js +9 -3
- package/lib/es/middlewares/crawler.d.ts +10 -1
- package/lib/es/middlewares/crawler.js +28 -11
- package/lib/es/middlewares/utils.d.ts +3 -1
- package/lib/es/middlewares/utils.js +7 -3
- package/package.json +1 -1
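
Taken together: both builds add getUrlInfoFromCache/setUrlInfoToCache, a small cache layer keyed by getRelativePath(url) (now re-exported from the crawler module); crawlUrl gains an autoCloseBrowserCount option (default 30) controlling how often the headless browser is restarted mid-crawl; closeBrowser gains a trimCache flag (default true); and the default cron schedule moves to every 12 hours. A minimal usage sketch against the declarations below (the root import specifier and all values are illustrative assumptions; the diff only shows the lib/cjs and lib/es build paths):

import {
  crawlUrl,
  getUrlInfoFromCache,
  setUrlInfoToCache,
  getRelativePath,
} from '@blocklet/crawler'; // assumed entry point

async function demo() {
  const url = 'https://example.com/docs'; // hypothetical page
  // Restart the browser every 10 pages instead of the default 30.
  await crawlUrl({ urls: [url], autoCloseBrowserCount: 10 });
  // Cache entries are keyed by the URL's relative path.
  const info = await getUrlInfoFromCache(url);
  console.log(getRelativePath(url), info?.lastmod, info?.updatedAt);
  // Seed the cache directly; lastmod falls back to the current time.
  await setUrlInfoToCache({ url, content: '<html>hello</html>' });
}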

package/lib/cjs/middlewares/crawler.d.ts

@@ -1,9 +1,18 @@
+import { getRelativePath } from './utils';
 import { PageOptions } from './types';
+export { getRelativePath };
 export declare const getPageContent: ({ url, formatPageContent }: PageOptions) => Promise<string>;
-export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, }: {
+export declare const getUrlInfoFromCache: (url: string) => Promise<any>;
+export declare const setUrlInfoToCache: ({ url, content, lastmod, }: {
+    url: string;
+    content: string;
+    lastmod?: string;
+}) => Promise<any>;
+export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, autoCloseBrowserCount, }: {
     urls: string[] | string;
     lastmodMap?: Map<string, string>;
     formatPageContent?: Function | null | undefined;
+    autoCloseBrowserCount?: number;
 }) => Promise<void>;
 export declare const crawlBlocklet: () => Promise<void>;
 export declare const initCronCrawlBlocklet: ({ time, options, }?: {

package/lib/cjs/middlewares/crawler.js

@@ -3,7 +3,14 @@
 Object.defineProperty(exports, "__esModule", {
   value: true
 });
-exports.
+exports.getPageContent = exports.crawlUrl = exports.crawlBlocklet = exports.cancelCronCrawlBlocklet = void 0;
+Object.defineProperty(exports, "getRelativePath", {
+  enumerable: true,
+  get: function () {
+    return _utils.getRelativePath;
+  }
+});
+exports.setUrlInfoToCache = exports.initCronCrawlBlocklet = exports.getUrlInfoFromCache = void 0;
 var PQueue = _interopRequireWildcard(require("p-queue"));
 var _ufo = require("ufo");
 var _utils = require("./utils");

@@ -59,10 +66,31 @@ const getPageContent = async ({
   return formatHtml(pageContent);
 };
 exports.getPageContent = getPageContent;
+const getUrlInfoFromCache = async url => {
+  const cache = await _utils.useCache.get((0, _utils.getRelativePath)(url));
+  return cache;
+};
+exports.getUrlInfoFromCache = getUrlInfoFromCache;
+const setUrlInfoToCache = async ({
+  url,
+  content,
+  lastmod
+}) => {
+  if (!content || !url) {
+    return;
+  }
+  return await _utils.useCache.set((0, _utils.getRelativePath)(url), {
+    content,
+    lastmod: lastmod || ( /* @__PURE__ */new Date()).toISOString(),
+    updatedAt: ( /* @__PURE__ */new Date()).toISOString()
+  });
+};
+exports.setUrlInfoToCache = setUrlInfoToCache;
 const crawlUrl = async ({
   urls,
   lastmodMap,
-  formatPageContent
+  formatPageContent,
+  autoCloseBrowserCount = 30
 }) => {
   if (typeof urls === "string") {
     urls = [urls];

@@ -74,8 +102,10 @@ const crawlUrl = async ({
   }) => {
     return async () => {
       try {
-        if (index % 30 === 0) {
-          await (0, _utils.closeBrowser)();
+        if (index % autoCloseBrowserCount === 0) {
+          await (0, _utils.closeBrowser)({
+            trimCache: false
+          });
         }
         const canCrawl = await (0, _utils.isAcceptCrawler)(url);
         if (canCrawl) {

@@ -84,11 +114,10 @@ const crawlUrl = async ({
           formatPageContent
         });
         if (pageContent) {
-          await _utils.useCache.set((0, _utils.getRelativePath)(url), {
+          await setUrlInfoToCache({
+            url,
             content: pageContent,
-            lastmod: lastmodMap?.get(url),
-            updatedAt: ( /* @__PURE__ */new Date()).toISOString(),
-            nextDate: cronCrawlBlockletJob.jobs[CRON_CRAWL_BLOCKLET_KEY].nextDate()
+            lastmod: lastmodMap?.get(url)
           });
           _utils.logger.info(`Crawler[${index}] ${url} success`);
         } else if (retryCount < 3) {

@@ -239,8 +268,8 @@ exports.crawlBlocklet = crawlBlocklet;
 const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
 let cronCrawlBlockletJob = null;
 const initCronCrawlBlocklet = ({
-  time = "0 0 */
-  // every
+  time = "0 0 */12 * * *",
+  // every 12 hours
   options
 } = {}) => {
   if (!cronCrawlBlockletJob) {

package/lib/cjs/middlewares/utils.d.ts

@@ -3,7 +3,9 @@ export * from '@blocklet/sdk/lib/config';
 export declare const api: import("axios").AxiosInstance;
 export declare const sleep: (ms: number) => Promise<unknown>;
 export declare const clearCheckBrowserTimer: () => void;
-export declare const closeBrowser: () => Promise<void>;
+export declare const closeBrowser: ({ trimCache }?: {
+    trimCache?: boolean;
+}) => Promise<void>;
 export declare const getBrowser: () => Promise<any>;
 export declare const CRAWLER_FLAG = "x-crawler";
 export declare const isSelfCrawler: (req: any) => boolean;
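
The { trimCache } flag added here (default true) gates a call to Puppeteer's trimCache(), which prunes stale browser builds from the download cache. In the utils.js hunks that follow, the periodic mid-crawl restart passes trimCache: false, while the one-minute inactivity timer closes with trimCache: true. A sketch of that calling pattern (import specifier assumed, as above):

import { closeBrowser } from '@blocklet/crawler'; // assumed entry point

async function shutdownExamples() {
  // Mid-crawl restart: free browser memory but keep the download cache.
  await closeBrowser({ trimCache: false });
  // Idle shutdown: also prune cached browser builds via puppeteer.trimCache().
  await closeBrowser(); // trimCache defaults to true
}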

package/lib/cjs/middlewares/utils.js

@@ -95,7 +95,9 @@ const clearCheckBrowserTimer = () => {
   }
 };
 exports.clearCheckBrowserTimer = clearCheckBrowserTimer;
-const closeBrowser = async () => {
+const closeBrowser = async ({
+  trimCache = true
+} = {}) => {
   try {
     if (browser) {
       const pages = await browser.pages().catch(() => []);

@@ -107,7 +109,9 @@ const closeBrowser = async () => {
       await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
     }
     clearCheckBrowserTimer();
-
+    if (trimCache) {
+      await _puppeteer.default.trimCache();
+    }
     logger.info("Close browser success");
     if (global.gc) {
       global.gc();

@@ -175,7 +179,9 @@ const getBrowser = async () => {
     }
     if (count >= 3) {
       logger.info("Browser inactive for 3 minutes, closing...");
-      await closeBrowser();
+      await closeBrowser({
+        trimCache: true
+      });
     }
   }
 }, 1e3 * 60);

package/lib/es/middlewares/crawler.d.ts

@@ -1,9 +1,18 @@
+import { getRelativePath } from './utils';
 import { PageOptions } from './types';
+export { getRelativePath };
 export declare const getPageContent: ({ url, formatPageContent }: PageOptions) => Promise<string>;
-export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, }: {
+export declare const getUrlInfoFromCache: (url: string) => Promise<any>;
+export declare const setUrlInfoToCache: ({ url, content, lastmod, }: {
+    url: string;
+    content: string;
+    lastmod?: string;
+}) => Promise<any>;
+export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, autoCloseBrowserCount, }: {
     urls: string[] | string;
     lastmodMap?: Map<string, string>;
     formatPageContent?: Function | null | undefined;
+    autoCloseBrowserCount?: number;
 }) => Promise<void>;
 export declare const crawlBlocklet: () => Promise<void>;
 export declare const initCronCrawlBlocklet: ({ time, options, }?: {

package/lib/es/middlewares/crawler.js

@@ -16,6 +16,7 @@ import {
 } from "./utils.js";
 import Cron from "@abtnode/cron";
 import debounce from "lodash/debounce";
+export { getRelativePath };
 const formatHtml = (htmlString) => {
   if (typeof htmlString !== "string") {
     return "";

@@ -53,10 +54,29 @@ export const getPageContent = async ({ url, formatPageContent }) => {
   }
   return formatHtml(pageContent);
 };
+export const getUrlInfoFromCache = async (url) => {
+  const cache = await useCache.get(getRelativePath(url));
+  return cache;
+};
+export const setUrlInfoToCache = async ({
+  url,
+  content,
+  lastmod
+}) => {
+  if (!content || !url) {
+    return;
+  }
+  return await useCache.set(getRelativePath(url), {
+    content,
+    lastmod: lastmod || (/* @__PURE__ */ new Date()).toISOString(),
+    updatedAt: (/* @__PURE__ */ new Date()).toISOString()
+  });
+};
 export const crawlUrl = async ({
   urls,
   lastmodMap,
-  formatPageContent
+  formatPageContent,
+  autoCloseBrowserCount = 30
 }) => {
   if (typeof urls === "string") {
     urls = [urls];

@@ -64,8 +84,10 @@ export const crawlUrl = async ({
   const crawlUrlJob = ({ url, retryCount = 0, index }) => {
     return async () => {
       try {
-        if (index % 30 === 0) {
-          await closeBrowser();
+        if (index % autoCloseBrowserCount === 0) {
+          await closeBrowser({
+            trimCache: false
+          });
         }
         const canCrawl = await isAcceptCrawler(url);
         if (canCrawl) {

@@ -74,12 +96,7 @@ export const crawlUrl = async ({
           formatPageContent
         });
         if (pageContent) {
-          await useCache.set(getRelativePath(url), {
-            content: pageContent,
-            lastmod: lastmodMap?.get(url),
-            updatedAt: (/* @__PURE__ */ new Date()).toISOString(),
-            nextDate: cronCrawlBlockletJob.jobs[CRON_CRAWL_BLOCKLET_KEY].nextDate()
-          });
+          await setUrlInfoToCache({ url, content: pageContent, lastmod: lastmodMap?.get(url) });
           logger.info(`Crawler[${index}] ${url} success`);
         } else if (retryCount < 3) {
           retryCount++;

@@ -218,8 +235,8 @@ export const crawlBlocklet = async () => {
 const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
 let cronCrawlBlockletJob = null;
 export const initCronCrawlBlocklet = ({
-  time = "0 0 */
-  // every
+  time = "0 0 */12 * * *",
+  // every 12 hours
   options
 } = {}) => {
   if (!cronCrawlBlockletJob) {

package/lib/es/middlewares/utils.d.ts

@@ -3,7 +3,9 @@ export * from '@blocklet/sdk/lib/config';
 export declare const api: import("axios").AxiosInstance;
 export declare const sleep: (ms: number) => Promise<unknown>;
 export declare const clearCheckBrowserTimer: () => void;
-export declare const closeBrowser: () => Promise<void>;
+export declare const closeBrowser: ({ trimCache }?: {
+    trimCache?: boolean;
+}) => Promise<void>;
 export declare const getBrowser: () => Promise<any>;
 export declare const CRAWLER_FLAG = "x-crawler";
 export declare const isSelfCrawler: (req: any) => boolean;

package/lib/es/middlewares/utils.js

@@ -39,7 +39,7 @@ export const clearCheckBrowserTimer = () => {
     checkBrowserTimer = null;
   }
 };
-export const closeBrowser = async () => {
+export const closeBrowser = async ({ trimCache = true } = {}) => {
   try {
     if (browser) {
       const pages = await browser.pages().catch(() => []);

@@ -52,7 +52,9 @@ export const closeBrowser = async () => {
       await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
     }
     clearCheckBrowserTimer();
-
+    if (trimCache) {
+      await puppeteer.trimCache();
+    }
     logger.info("Close browser success");
     if (global.gc) {
       global.gc();

@@ -131,7 +133,9 @@ export const getBrowser = async () => {
     }
     if (count >= 3) {
       logger.info("Browser inactive for 3 minutes, closing...");
-      await closeBrowser();
+      await closeBrowser({
+        trimCache: true
+      });
     }
   }
 }, 1e3 * 60);
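
Finally, initCronCrawlBlocklet's default time becomes "0 0 */12 * * *", a six-field cron expression (seconds first) that fires every 12 hours. Callers can pin their own cadence; a sketch (import specifier assumed, and the options shape is not shown in this diff, so it is omitted):

import { initCronCrawlBlocklet } from '@blocklet/crawler'; // assumed entry point

// Run the blocklet crawl every 6 hours instead of the new 12-hour default.
initCronCrawlBlocklet({ time: '0 0 */6 * * *' });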