@blocklet/crawler 2.1.235 → 2.1.237

@@ -1,9 +1,18 @@
+ import { getRelativePath } from './utils';
  import { PageOptions } from './types';
+ export { getRelativePath };
  export declare const getPageContent: ({ url, formatPageContent }: PageOptions) => Promise<string>;
- export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, }: {
+ export declare const getUrlInfoFromCache: (url: string) => Promise<any>;
+ export declare const setUrlInfoToCache: ({ url, content, lastmod, }: {
+     url: string;
+     content: string;
+     lastmod?: string;
+ }) => Promise<any>;
+ export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, autoCloseBrowserCount, }: {
      urls: string[] | string;
      lastmodMap?: Map<string, string>;
      formatPageContent?: Function | null | undefined;
+     autoCloseBrowserCount?: number;
  }) => Promise<void>;
  export declare const crawlBlocklet: () => Promise<void>;
  export declare const initCronCrawlBlocklet: ({ time, options, }?: {
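The headline change in the type declarations is the new cache-oriented API: `getUrlInfoFromCache` and `setUrlInfoToCache` are exported alongside `getRelativePath`, and `crawlUrl` gains an `autoCloseBrowserCount` option. A minimal consumer sketch, assuming these are imported from the package root and behave as declared above (the URLs and the tuning value are hypothetical):

```ts
import { crawlUrl, getUrlInfoFromCache, setUrlInfoToCache } from '@blocklet/crawler';

async function refreshDocs() {
  // Crawl a batch of URLs; restart the headless browser every 10 pages
  // instead of the default 30 (hypothetical tuning for a low-memory host).
  await crawlUrl({
    urls: ['https://example.com/', 'https://example.com/docs'],
    autoCloseBrowserCount: 10,
  });

  // Read back whatever the crawler cached for a URL (return shape is untyped: Promise<any>).
  const cached = await getUrlInfoFromCache('https://example.com/docs');
  console.log(cached?.lastmod, cached?.content?.length);

  // Seed the cache manually, e.g. for a page rendered outside the crawler.
  await setUrlInfoToCache({
    url: 'https://example.com/changelog',
    content: '<html>...</html>',
    lastmod: new Date().toISOString(),
  });
}
```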
@@ -3,7 +3,14 @@
  Object.defineProperty(exports, "__esModule", {
    value: true
  });
- exports.initCronCrawlBlocklet = exports.getPageContent = exports.crawlUrl = exports.crawlBlocklet = exports.cancelCronCrawlBlocklet = void 0;
+ exports.getPageContent = exports.crawlUrl = exports.crawlBlocklet = exports.cancelCronCrawlBlocklet = void 0;
+ Object.defineProperty(exports, "getRelativePath", {
+   enumerable: true,
+   get: function () {
+     return _utils.getRelativePath;
+   }
+ });
+ exports.setUrlInfoToCache = exports.initCronCrawlBlocklet = exports.getUrlInfoFromCache = void 0;
  var PQueue = _interopRequireWildcard(require("p-queue"));
  var _ufo = require("ufo");
  var _utils = require("./utils");
@@ -59,10 +66,31 @@ const getPageContent = async ({
    return formatHtml(pageContent);
  };
  exports.getPageContent = getPageContent;
+ const getUrlInfoFromCache = async url => {
+   const cache = await _utils.useCache.get((0, _utils.getRelativePath)(url));
+   return cache;
+ };
+ exports.getUrlInfoFromCache = getUrlInfoFromCache;
+ const setUrlInfoToCache = async ({
+   url,
+   content,
+   lastmod
+ }) => {
+   if (!content || !url) {
+     return;
+   }
+   return await _utils.useCache.set((0, _utils.getRelativePath)(url), {
+     content,
+     lastmod: lastmod || ( /* @__PURE__ */new Date()).toISOString(),
+     updatedAt: ( /* @__PURE__ */new Date()).toISOString()
+   });
+ };
+ exports.setUrlInfoToCache = setUrlInfoToCache;
  const crawlUrl = async ({
    urls,
    lastmodMap,
-   formatPageContent
+   formatPageContent,
+   autoCloseBrowserCount = 30
  }) => {
    if (typeof urls === "string") {
      urls = [urls];
@@ -74,8 +102,10 @@ const crawlUrl = async ({
  }) => {
    return async () => {
      try {
-       if (index % 50 === 0) {
-         await (0, _utils.closeBrowser)();
+       if (index % autoCloseBrowserCount === 0) {
+         await (0, _utils.closeBrowser)({
+           trimCache: false
+         });
        }
        const canCrawl = await (0, _utils.isAcceptCrawler)(url);
        if (canCrawl) {
@@ -84,11 +114,10 @@ const crawlUrl = async ({
          formatPageContent
        });
        if (pageContent) {
-         await _utils.useCache.set((0, _utils.getRelativePath)(url), {
+         await setUrlInfoToCache({
+           url,
            content: pageContent,
-           lastmod: lastmodMap?.get(url),
-           updatedAt: ( /* @__PURE__ */new Date()).toISOString(),
-           nextDate: cronCrawlBlockletJob.jobs[CRON_CRAWL_BLOCKLET_KEY].nextDate()
+           lastmod: lastmodMap?.get(url)
          });
          _utils.logger.info(`Crawler[${index}] ${url} success`);
        } else if (retryCount < 3) {
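Page writes now go through `setUrlInfoToCache`, which stores `content`, `lastmod` (defaulting to the current time), and `updatedAt`, and no longer records the cron job's `nextDate`. A minimal sketch of reading such an entry back on the serving side, assuming an Express-style handler; the route, freshness window, response header, and the assumption that `getUrlInfoFromCache` accepts a request path are all hypothetical, and bot detection is omitted:

```ts
import { getUrlInfoFromCache } from '@blocklet/crawler';

// Hypothetical freshness window: serve the crawler-rendered HTML only if the
// cached copy is newer than 12 hours, otherwise fall through to normal rendering.
const MAX_AGE_MS = 12 * 60 * 60 * 1000;

export async function serveCachedPage(req: any, res: any, next: any) {
  // Assumes getRelativePath (used internally) normalizes a request path as well as a full URL.
  const cached = await getUrlInfoFromCache(req.originalUrl);
  const fresh =
    cached?.updatedAt && Date.now() - new Date(cached.updatedAt).getTime() < MAX_AGE_MS;
  if (fresh && cached.content) {
    res.set('x-crawler-cache', 'hit'); // hypothetical debug header
    return res.send(cached.content);
  }
  return next();
}
```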
@@ -239,8 +268,8 @@ exports.crawlBlocklet = crawlBlocklet;
  const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
  let cronCrawlBlockletJob = null;
  const initCronCrawlBlocklet = ({
-   time = "0 0 */24 * * *",
-   // every 24 hours
+   time = "0 0 */12 * * *",
+   // every 12 hours
    options
  } = {}) => {
    if (!cronCrawlBlockletJob) {
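The default crawl schedule tightens from every 24 hours to every 12 hours. Callers that prefer the old cadence can still pass their own cron expression; a minimal sketch, assuming `initCronCrawlBlocklet` is imported from the package root as declared above:

```ts
import { initCronCrawlBlocklet } from '@blocklet/crawler';

// Keep the previous 24-hour cadence instead of the new 12-hour default.
initCronCrawlBlocklet({ time: '0 0 */24 * * *' });
```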
@@ -3,7 +3,9 @@ export * from '@blocklet/sdk/lib/config';
  export declare const api: import("axios").AxiosInstance;
  export declare const sleep: (ms: number) => Promise<unknown>;
  export declare const clearCheckBrowserTimer: () => void;
- export declare const closeBrowser: () => Promise<void>;
+ export declare const closeBrowser: ({ trimCache }?: {
+     trimCache?: boolean;
+ }) => Promise<void>;
  export declare const getBrowser: () => Promise<any>;
  export declare const CRAWLER_FLAG = "x-crawler";
  export declare const isSelfCrawler: (req: any) => boolean;
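`closeBrowser` now takes an optional `trimCache` flag (default `true`). Inside `crawlUrl`, the periodic restart every `autoCloseBrowserCount` pages passes `trimCache: false`, so Puppeteer's cache is only trimmed on a full shutdown such as the 3-minute inactivity close. A minimal sketch of calling it directly; whether `closeBrowser` is re-exported from the package root is an assumption:

```ts
import { closeBrowser } from '@blocklet/crawler'; // export path is an assumption

// Mid-run restart: free browser memory but keep Puppeteer's cache intact.
await closeBrowser({ trimCache: false });

// Full shutdown: also trim the cache (same as the default closeBrowser()).
await closeBrowser({ trimCache: true });
```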
@@ -95,7 +95,9 @@ const clearCheckBrowserTimer = () => {
    }
  };
  exports.clearCheckBrowserTimer = clearCheckBrowserTimer;
- const closeBrowser = async () => {
+ const closeBrowser = async ({
+   trimCache = true
+ } = {}) => {
    try {
      if (browser) {
        const pages = await browser.pages().catch(() => []);
@@ -107,7 +109,9 @@ const closeBrowser = async () => {
      await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
    }
    clearCheckBrowserTimer();
-   await _puppeteer.default.trimCache();
+   if (trimCache) {
+     await _puppeteer.default.trimCache();
+   }
    logger.info("Close browser success");
    if (global.gc) {
      global.gc();
@@ -175,7 +179,9 @@ const getBrowser = async () => {
      }
      if (count >= 3) {
        logger.info("Browser inactive for 3 minutes, closing...");
-       await closeBrowser();
+       await closeBrowser({
+         trimCache: true
+       });
      }
    }
  }, 1e3 * 60);
@@ -1,9 +1,18 @@
+ import { getRelativePath } from './utils';
  import { PageOptions } from './types';
+ export { getRelativePath };
  export declare const getPageContent: ({ url, formatPageContent }: PageOptions) => Promise<string>;
- export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, }: {
+ export declare const getUrlInfoFromCache: (url: string) => Promise<any>;
+ export declare const setUrlInfoToCache: ({ url, content, lastmod, }: {
+     url: string;
+     content: string;
+     lastmod?: string;
+ }) => Promise<any>;
+ export declare const crawlUrl: ({ urls, lastmodMap, formatPageContent, autoCloseBrowserCount, }: {
      urls: string[] | string;
      lastmodMap?: Map<string, string>;
      formatPageContent?: Function | null | undefined;
+     autoCloseBrowserCount?: number;
  }) => Promise<void>;
  export declare const crawlBlocklet: () => Promise<void>;
  export declare const initCronCrawlBlocklet: ({ time, options, }?: {
@@ -16,6 +16,7 @@ import {
  } from "./utils.js";
  import Cron from "@abtnode/cron";
  import debounce from "lodash/debounce";
+ export { getRelativePath };
  const formatHtml = (htmlString) => {
    if (typeof htmlString !== "string") {
      return "";
@@ -53,10 +54,29 @@ export const getPageContent = async ({ url, formatPageContent }) => {
    }
    return formatHtml(pageContent);
  };
+ export const getUrlInfoFromCache = async (url) => {
+   const cache = await useCache.get(getRelativePath(url));
+   return cache;
+ };
+ export const setUrlInfoToCache = async ({
+   url,
+   content,
+   lastmod
+ }) => {
+   if (!content || !url) {
+     return;
+   }
+   return await useCache.set(getRelativePath(url), {
+     content,
+     lastmod: lastmod || (/* @__PURE__ */ new Date()).toISOString(),
+     updatedAt: (/* @__PURE__ */ new Date()).toISOString()
+   });
+ };
  export const crawlUrl = async ({
    urls,
    lastmodMap,
-   formatPageContent
+   formatPageContent,
+   autoCloseBrowserCount = 30
  }) => {
    if (typeof urls === "string") {
      urls = [urls];
@@ -64,8 +84,10 @@ export const crawlUrl = async ({
  const crawlUrlJob = ({ url, retryCount = 0, index }) => {
    return async () => {
      try {
-       if (index % 50 === 0) {
-         await closeBrowser();
+       if (index % autoCloseBrowserCount === 0) {
+         await closeBrowser({
+           trimCache: false
+         });
        }
        const canCrawl = await isAcceptCrawler(url);
        if (canCrawl) {
@@ -74,12 +96,7 @@ export const crawlUrl = async ({
          formatPageContent
        });
        if (pageContent) {
-         await useCache.set(getRelativePath(url), {
-           content: pageContent,
-           lastmod: lastmodMap?.get(url),
-           updatedAt: (/* @__PURE__ */ new Date()).toISOString(),
-           nextDate: cronCrawlBlockletJob.jobs[CRON_CRAWL_BLOCKLET_KEY].nextDate()
-         });
+         await setUrlInfoToCache({ url, content: pageContent, lastmod: lastmodMap?.get(url) });
          logger.info(`Crawler[${index}] ${url} success`);
        } else if (retryCount < 3) {
          retryCount++;
@@ -218,8 +235,8 @@ export const crawlBlocklet = async () => {
  const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
  let cronCrawlBlockletJob = null;
  export const initCronCrawlBlocklet = ({
-   time = "0 0 */24 * * *",
-   // every 24 hours
+   time = "0 0 */12 * * *",
+   // every 12 hours
    options
  } = {}) => {
    if (!cronCrawlBlockletJob) {
@@ -3,7 +3,9 @@ export * from '@blocklet/sdk/lib/config';
  export declare const api: import("axios").AxiosInstance;
  export declare const sleep: (ms: number) => Promise<unknown>;
  export declare const clearCheckBrowserTimer: () => void;
- export declare const closeBrowser: () => Promise<void>;
+ export declare const closeBrowser: ({ trimCache }?: {
+     trimCache?: boolean;
+ }) => Promise<void>;
  export declare const getBrowser: () => Promise<any>;
  export declare const CRAWLER_FLAG = "x-crawler";
  export declare const isSelfCrawler: (req: any) => boolean;
@@ -39,7 +39,7 @@ export const clearCheckBrowserTimer = () => {
      checkBrowserTimer = null;
    }
  };
- export const closeBrowser = async () => {
+ export const closeBrowser = async ({ trimCache = true } = {}) => {
    try {
      if (browser) {
        const pages = await browser.pages().catch(() => []);
@@ -52,7 +52,9 @@ export const closeBrowser = async () => {
      await useCache.remove(BROWSER_WS_ENDPOINT_KEY);
    }
    clearCheckBrowserTimer();
-   await puppeteer.trimCache();
+   if (trimCache) {
+     await puppeteer.trimCache();
+   }
    logger.info("Close browser success");
    if (global.gc) {
      global.gc();
@@ -131,7 +133,9 @@ export const getBrowser = async () => {
      }
      if (count >= 3) {
        logger.info("Browser inactive for 3 minutes, closing...");
-       await closeBrowser();
+       await closeBrowser({
+         trimCache: true
+       });
      }
    }
  }, 1e3 * 60);
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@blocklet/crawler",
-   "version": "2.1.235",
+   "version": "2.1.237",
    "description": "blocklet crawler lib",
    "publishConfig": {
      "access": "public"