@blocklet/crawler 2.2.11 → 2.2.12
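Summary: both compiled builds (CJS and ESM) receive the same changes. getPageContent raises the page.goto timeout from 20 s to 60 s and now throws when navigation yields a null response rather than crashing on response.status(). crawlUrl's periodic browser recycle passes trimCache: false instead of trimming every autoCloseBrowserCount * 5 pages, leaving the single trimCache: true call at the end of a full crawl. crawlBlocklet adds a module-level crawlBlockletRunningMap keyed by component DID to skip overlapping runs, and wraps the crawl in try/catch/finally so failures are logged and the running flag is always cleared. closeBrowser logs "Trim cache success" after emptying the temporary directory, and package.json bumps the version to 2.2.12.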

@@ -40,8 +40,12 @@ const getPageContent = async ({
   let pageContent = null;
   try {
     const response = await page.goto(url, {
-      timeout: 20 * 1e3
+      timeout: 60 * 1e3
+      // 60s
     });
+    if (!response) {
+      throw new Error(`Failed to load page: response is null for ${url}`);
+    }
     const statusCode = response.status();
     if (![200, 304].includes(statusCode)) {
       throw new Error(`Request failed with status ${statusCode}, in ${url}`);
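Context for this hunk: Puppeteer-style page.goto() resolves with a response or null (null can occur, for example, on a same-document hash navigation), so 2.2.11 could hit a TypeError on response.status(). The new guard fails fast instead, and the navigation timeout is tripled. A minimal standalone sketch of the same pattern, assuming Puppeteer; fetchHtml is a hypothetical wrapper, not an export of @blocklet/crawler:

import puppeteer from "puppeteer";

const fetchHtml = async (url: string): Promise<string> => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  try {
    // 2.2.12 waits up to 60s instead of 20s.
    const response = await page.goto(url, { timeout: 60 * 1e3 });
    // page.goto() resolves to null when no main-frame response is produced;
    // throw rather than calling response.status() on null.
    if (!response) {
      throw new Error(`Failed to load page: response is null for ${url}`);
    }
    const statusCode = response.status();
    if (![200, 304].includes(statusCode)) {
      throw new Error(`Request failed with status ${statusCode}, in ${url}`);
    }
    return await page.content();
  } finally {
    await browser.close();
  }
};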
@@ -127,7 +131,7 @@ const crawlUrl = async ({
   try {
     if (index % autoCloseBrowserCount === 0) {
       await (0, _utils.closeBrowser)({
-        trimCache: index % (autoCloseBrowserCount * 5) === 0
+        trimCache: false
       });
     }
     const canCrawl = await (0, _utils.isAcceptCrawler)(url);
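The effect: the periodic browser recycle (every autoCloseBrowserCount pages) no longer empties the on-disk cache mid-crawl; in 2.2.11 it did so every autoCloseBrowserCount * 5 pages. Trimming now happens once, via the trimCache: true call kept at the end of crawlBlocklet (see the later hunk). A rough sketch of the new schedule; the loop and the crawlOne/crawlAll names are illustrative assumptions, only closeBrowser and autoCloseBrowserCount mirror the package:

declare const autoCloseBrowserCount: number;
declare function closeBrowser(opts: { trimCache: boolean }): Promise<void>;
declare function crawlOne(url: string): Promise<void>;

async function crawlAll(urls: string[]): Promise<void> {
  for (let index = 0; index < urls.length; index++) {
    if (index % autoCloseBrowserCount === 0) {
      // 2.2.11: trimCache: index % (autoCloseBrowserCount * 5) === 0
      // 2.2.12: recycle the browser but keep the cache warm mid-crawl
      await closeBrowser({ trimCache: false });
    }
    await crawlOne(urls[index]);
  }
  await closeBrowser({ trimCache: true }); // single trim at the end
}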
@@ -180,7 +184,16 @@ const crawlUrl = async ({
   }
 };
 exports.crawlUrl = crawlUrl;
+const crawlBlockletRunningMap = /* @__PURE__ */new Map();
 const crawlBlocklet = async () => {
+  const {
+    mountPoint,
+    did
+  } = (0, _utils.getComponentInfo)();
+  if (crawlBlockletRunningMap.has(did) && crawlBlockletRunningMap.get(did)) {
+    _utils.logger.info(`Crawler blocklet ${did} is running, skip it`);
+    return;
+  }
   try {
     const browser = await (0, _utils.getBrowser)();
     if (!browser) {
@@ -197,10 +210,6 @@ const crawlBlocklet = async () => {
     if (!appUrl) {
       throw new Error("appUrl not found");
     }
-    const {
-      mountPoint,
-      did
-    } = (0, _utils.getComponentInfo)();
     const sitemapList = await (0, _utils.getSitemapList)(appUrl);
     const matchMountPoint = (0, _ufo.joinURL)(appUrl, !mountPoint || mountPoint === "/" ? "" : mountPoint);
     const otherMountPointList = _utils.components.filter(item => item.mountPoint && item.mountPoint !== mountPoint).map(item => item.mountPoint);
@@ -253,39 +262,46 @@ const crawlBlocklet = async () => {
     lastmodMapTotal: lastmodMap.size
   }];
   _utils.logger.info(...crawlerLogText("start"));
-  await crawlUrl({
-    // @ts-ignore
-    urls: canUseBlockletLocList,
-    lastmodMap,
-    formatPageContent: async ({
-      page,
-      url
-    }) => {
-      const pageContent = await page.evaluate(() => {
-        const removeElements = tagName => {
-          const elements = document.querySelectorAll(tagName);
-          for (let i = elements.length - 1; i >= 0; i--) {
-            try {
-              elements[i]?.parentNode?.removeChild(elements[i]);
-            } catch (error) {}
-          }
-        };
-        removeElements('[id="uploader-container"]');
-        removeElements('[class^="uppy-"]');
-        removeElements('[id="point-up-component"]');
-        const meta = document.createElement("meta");
-        meta.name = "blocklet-crawler";
-        meta.content = "true";
-        document.head.appendChild(meta);
-        return document.documentElement.outerHTML;
-      });
-      return pageContent;
-    }
-  });
-  _utils.logger.info(...crawlerLogText("success"));
-  await (0, _utils.closeBrowser)({
-    trimCache: true
-  });
+  try {
+    crawlBlockletRunningMap.set(did, true);
+    await crawlUrl({
+      // @ts-ignore
+      urls: canUseBlockletLocList,
+      lastmodMap,
+      formatPageContent: async ({
+        page,
+        url
+      }) => {
+        const pageContent = await page.evaluate(() => {
+          const removeElements = tagName => {
+            const elements = document.querySelectorAll(tagName);
+            for (let i = elements.length - 1; i >= 0; i--) {
+              try {
+                elements[i]?.parentNode?.removeChild(elements[i]);
+              } catch (error) {}
+            }
+          };
+          removeElements('[id="uploader-container"]');
+          removeElements('[class^="uppy-"]');
+          removeElements('[id="point-up-component"]');
+          const meta = document.createElement("meta");
+          meta.name = "blocklet-crawler";
+          meta.content = "true";
+          document.head.appendChild(meta);
+          return document.documentElement.outerHTML;
+        });
+        return pageContent;
+      }
+    });
+    _utils.logger.info(...crawlerLogText("success"));
+    await (0, _utils.closeBrowser)({
+      trimCache: true
+    });
+  } catch (error) {
+    _utils.logger.info(`Crawler blocklet abort by error`, error);
+  } finally {
+    crawlBlockletRunningMap.delete(did);
+  }
 };
 exports.crawlBlocklet = crawlBlocklet;
 const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
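Taken together, the crawlBlocklet hunks add a re-entrancy guard: a module-level Map keyed by component DID marks a crawl as in flight, overlapping invocations (e.g. from the cron job) log and return early, and the finally block clears the flag even when crawlUrl throws, so a failed run cannot block the next tick. The pattern in isolation, as a sketch; runExclusive is a hypothetical name and the flag is set immediately here, whereas the published code sets it only after browser and sitemap setup:

const crawlBlockletRunningMap = new Map<string, boolean>();

async function runExclusive(did: string, job: () => Promise<void>): Promise<void> {
  // Skip if a crawl for this DID is already in flight.
  if (crawlBlockletRunningMap.has(did) && crawlBlockletRunningMap.get(did)) {
    console.info(`Crawler blocklet ${did} is running, skip it`);
    return;
  }
  try {
    crawlBlockletRunningMap.set(did, true);
    await job();
  } catch (error) {
    // Errors abort this run but must not leave the flag set.
    console.info("Crawler blocklet abort by error", error);
  } finally {
    crawlBlockletRunningMap.delete(did);
  }
}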
@@ -121,6 +121,7 @@ const closeBrowser = async ({
     if (temporaryDirectory) {
       _fsExtra.default.emptyDirSync(temporaryDirectory);
     }
+    logger.info("Trim cache success");
   }
   logger.info("Close browser success");
   if (global.gc) {
@@ -32,8 +32,12 @@ export const getPageContent = async ({ url, formatPageContent }) => {
   let pageContent = null;
   try {
     const response = await page.goto(url, {
-      timeout: 20 * 1e3
+      timeout: 60 * 1e3
+      // 60s
     });
+    if (!response) {
+      throw new Error(`Failed to load page: response is null for ${url}`);
+    }
     const statusCode = response.status();
     if (![200, 304].includes(statusCode)) {
       throw new Error(`Request failed with status ${statusCode}, in ${url}`);
@@ -112,7 +116,7 @@ export const crawlUrl = async ({
   try {
     if (index % autoCloseBrowserCount === 0) {
       await closeBrowser({
-        trimCache: index % (autoCloseBrowserCount * 5) === 0
+        trimCache: false
       });
     }
     const canCrawl = await isAcceptCrawler(url);
@@ -157,7 +161,13 @@ export const crawlUrl = async ({
     await crawlQueue.add(crawlUrlJob({ url, index: index + 1 }));
   }
 };
+const crawlBlockletRunningMap = /* @__PURE__ */ new Map();
 export const crawlBlocklet = async () => {
+  const { mountPoint, did } = getComponentInfo();
+  if (crawlBlockletRunningMap.has(did) && crawlBlockletRunningMap.get(did)) {
+    logger.info(`Crawler blocklet ${did} is running, skip it`);
+    return;
+  }
   try {
     const browser = await getBrowser();
     if (!browser) {
@@ -172,7 +182,6 @@ export const crawlBlocklet = async () => {
     if (!appUrl) {
       throw new Error("appUrl not found");
     }
-    const { mountPoint, did } = getComponentInfo();
     const sitemapList = await getSitemapList(appUrl);
     const matchMountPoint = joinURL(appUrl, !mountPoint || mountPoint === "/" ? "" : mountPoint);
     const otherMountPointList = components.filter((item) => item.mountPoint && item.mountPoint !== mountPoint).map((item) => item.mountPoint);
@@ -230,37 +239,44 @@ export const crawlBlocklet = async () => {
     }
   ];
   logger.info(...crawlerLogText("start"));
-  await crawlUrl({
-    // @ts-ignore
-    urls: canUseBlockletLocList,
-    lastmodMap,
-    formatPageContent: async ({ page, url }) => {
-      const pageContent = await page.evaluate(() => {
-        const removeElements = (tagName) => {
-          const elements = document.querySelectorAll(tagName);
-          for (let i = elements.length - 1; i >= 0; i--) {
-            try {
-              elements[i]?.parentNode?.removeChild(elements[i]);
-            } catch (error) {
+  try {
+    crawlBlockletRunningMap.set(did, true);
+    await crawlUrl({
+      // @ts-ignore
+      urls: canUseBlockletLocList,
+      lastmodMap,
+      formatPageContent: async ({ page, url }) => {
+        const pageContent = await page.evaluate(() => {
+          const removeElements = (tagName) => {
+            const elements = document.querySelectorAll(tagName);
+            for (let i = elements.length - 1; i >= 0; i--) {
+              try {
+                elements[i]?.parentNode?.removeChild(elements[i]);
+              } catch (error) {
+              }
             }
-          }
-        };
-        removeElements('[id="uploader-container"]');
-        removeElements('[class^="uppy-"]');
-        removeElements('[id="point-up-component"]');
-        const meta = document.createElement("meta");
-        meta.name = "blocklet-crawler";
-        meta.content = "true";
-        document.head.appendChild(meta);
-        return document.documentElement.outerHTML;
-      });
-      return pageContent;
-    }
-  });
-  logger.info(...crawlerLogText("success"));
-  await closeBrowser({
-    trimCache: true
-  });
+          };
+          removeElements('[id="uploader-container"]');
+          removeElements('[class^="uppy-"]');
+          removeElements('[id="point-up-component"]');
+          const meta = document.createElement("meta");
+          meta.name = "blocklet-crawler";
+          meta.content = "true";
+          document.head.appendChild(meta);
+          return document.documentElement.outerHTML;
+        });
+        return pageContent;
+      }
+    });
+    logger.info(...crawlerLogText("success"));
+    await closeBrowser({
+      trimCache: true
+    });
+  } catch (error) {
+    logger.info(`Crawler blocklet abort by error`, error);
+  } finally {
+    crawlBlockletRunningMap.delete(did);
+  }
 };
 const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
 let cronCrawlBlockletJob = null;
@@ -61,6 +61,7 @@ export const closeBrowser = async ({ trimCache = true } = {}) => {
     if (temporaryDirectory) {
       fs.emptyDirSync(temporaryDirectory);
     }
+    logger.info("Trim cache success");
   }
   logger.info("Close browser success");
   if (global.gc) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@blocklet/crawler",
-  "version": "2.2.11",
+  "version": "2.2.12",
   "description": "blocklet crawler lib",
   "publishConfig": {
     "access": "public"