@blocklet/crawler 2.2.11 → 2.2.13
This diff shows the content of publicly available package versions as released to one of the supported registries; it is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -40,8 +40,12 @@ const getPageContent = async ({
   let pageContent = null;
   try {
     const response = await page.goto(url, {
-      timeout:
+      timeout: 60 * 1e3
+      // 60s
     });
+    if (!response) {
+      throw new Error(`Failed to load page: response is null for ${url}`);
+    }
     const statusCode = response.status();
     if (![200, 304].includes(statusCode)) {
       throw new Error(`Request failed with status ${statusCode}, in ${url}`);
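The change above gives `page.goto` an explicit 60-second timeout and treats a null response as a failure: Puppeteer-style `page.goto` can resolve to null (for example on navigation to the same URL with a different hash), which would otherwise crash later on `response.status()`. A minimal sketch of the new guard, assuming a Puppeteer-style `page` object and a plain `url` string:

const loadPage = async (page, url) => {
  const response = await page.goto(url, {
    timeout: 60 * 1e3 // 60s
  });
  // goto can resolve to null instead of a Response, e.g. for same-URL
  // hash navigations; fail fast rather than throwing on response.status().
  if (!response) {
    throw new Error(`Failed to load page: response is null for ${url}`);
  }
  const statusCode = response.status();
  if (![200, 304].includes(statusCode)) {
    throw new Error(`Request failed with status ${statusCode}, in ${url}`);
  }
  return response;
};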
@@ -127,7 +131,7 @@ const crawlUrl = async ({
   try {
     if (index % autoCloseBrowserCount === 0) {
       await (0, _utils.closeBrowser)({
-        trimCache:
+        trimCache: false
       });
     }
     const canCrawl = await (0, _utils.isAcceptCrawler)(url);
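`crawlUrl` restarts the browser every `autoCloseBrowserCount` pages; this hunk pins that mid-crawl restart to `trimCache: false`, so the browser cache survives the restarts and is only trimmed by the single `closeBrowser({ trimCache: true })` call after a successful run (see the last hunk of this file). A hypothetical skeleton of the pattern, where `closeBrowser` and `crawlOne` stand in for the package's internals:

const crawlAll = async (urls, autoCloseBrowserCount) => {
  for (let index = 0; index < urls.length; index++) {
    if (index % autoCloseBrowserCount === 0) {
      // Restart the browser to bound memory, but keep its cache intact.
      await closeBrowser({ trimCache: false });
    }
    await crawlOne(urls[index]);
  }
  // Trim the cache once, after the whole run has succeeded.
  await closeBrowser({ trimCache: true });
};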
@@ -180,7 +184,16 @@ const crawlUrl = async ({
   }
 };
 exports.crawlUrl = crawlUrl;
+const crawlBlockletRunningMap = /* @__PURE__ */new Map();
 const crawlBlocklet = async () => {
+  const {
+    mountPoint,
+    did
+  } = (0, _utils.getComponentInfo)();
+  if (crawlBlockletRunningMap.has(did) && crawlBlockletRunningMap.get(did)) {
+    _utils.logger.info(`Crawler blocklet ${did} is running, skip it`);
+    return;
+  }
   try {
     const browser = await (0, _utils.getBrowser)();
     if (!browser) {
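The module-level `crawlBlockletRunningMap` makes `crawlBlocklet` re-entrancy-safe: while a crawl for a given component DID is in flight, a second trigger (for example an overlapping cron tick) logs and returns. `getComponentInfo()` is hoisted above the `try` block so the guard key is available up front; the old call site is deleted in the next hunk, and the flag itself is set and released in the last hunk. Reduced to a skeleton, with `did` and `doCrawl` as placeholders:

const running = new Map();

const runExclusive = async (did, doCrawl) => {
  if (running.has(did) && running.get(did)) {
    console.info(`Crawler blocklet ${did} is running, skip it`);
    return;
  }
  try {
    running.set(did, true);
    await doCrawl();
  } finally {
    running.delete(did); // release even when doCrawl throws
  }
};

Since `Map.prototype.get` already returns `undefined` for missing keys, the `has(did) &&` check is redundant but harmless; the `finally` is what keeps a crashed run from wedging the guard.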
@@ -197,10 +210,6 @@ const crawlBlocklet = async () => {
   if (!appUrl) {
     throw new Error("appUrl not found");
   }
-  const {
-    mountPoint,
-    did
-  } = (0, _utils.getComponentInfo)();
   const sitemapList = await (0, _utils.getSitemapList)(appUrl);
   const matchMountPoint = (0, _ufo.joinURL)(appUrl, !mountPoint || mountPoint === "/" ? "" : mountPoint);
   const otherMountPointList = _utils.components.filter(item => item.mountPoint && item.mountPoint !== mountPoint).map(item => item.mountPoint);
@@ -253,39 +262,46 @@ const crawlBlocklet = async () => {
     lastmodMapTotal: lastmodMap.size
   }];
   _utils.logger.info(...crawlerLogText("start"));
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  try {
+    crawlBlockletRunningMap.set(did, true);
+    await crawlUrl({
+      // @ts-ignore
+      urls: canUseBlockletLocList,
+      lastmodMap,
+      formatPageContent: async ({
+        page,
+        url
+      }) => {
+        const pageContent = await page.evaluate(() => {
+          const removeElements = tagName => {
+            const elements = document.querySelectorAll(tagName);
+            for (let i = elements.length - 1; i >= 0; i--) {
+              try {
+                elements[i]?.parentNode?.removeChild(elements[i]);
+              } catch (error) {}
+            }
+          };
+          removeElements('[id="uploader-container"]');
+          removeElements('[class^="uppy-"]');
+          removeElements('[id="point-up-component"]');
+          const meta = document.createElement("meta");
+          meta.name = "blocklet-crawler";
+          meta.content = "true";
+          document.head.appendChild(meta);
+          return document.documentElement.outerHTML;
+        });
+        return pageContent;
+      }
+    });
+    _utils.logger.info(...crawlerLogText("success"));
+    await (0, _utils.closeBrowser)({
+      trimCache: true
+    });
+  } catch (error) {
+    _utils.logger.info(`Crawler blocklet abort by error`, error);
+  } finally {
+    crawlBlockletRunningMap.delete(did);
+  }
 };
 exports.crawlBlocklet = crawlBlocklet;
 const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
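The crawl body is now wrapped in try/catch/finally: the running flag set at the top is always released, a failure aborts the whole blocklet crawl with a log line (`Crawler blocklet abort by error`) instead of an unhandled rejection, and `closeBrowser({ trimCache: true })` runs only on success. The new `formatPageContent` callback does its cleanup inside `page.evaluate`, removing uploader UI (`#uploader-container`, `uppy-*` classes, `#point-up-component`) and stamping a `<meta name="blocklet-crawler" content="true">` marker into the snapshot before serializing it. The snapshot step in isolation, as a sketch assuming a Puppeteer-style `page`:

const snapshotHtml = page =>
  page.evaluate(() => {
    const removeElements = selector => {
      // querySelectorAll returns a static NodeList, so removing nodes while
      // iterating is safe; walking backwards is merely defensive.
      const elements = document.querySelectorAll(selector);
      for (let i = elements.length - 1; i >= 0; i--) {
        try {
          elements[i]?.parentNode?.removeChild(elements[i]);
        } catch (error) {
          // ignore nodes that were already detached
        }
      }
    };
    removeElements('[id="uploader-container"]');
    removeElements('[class^="uppy-"]');
    removeElements('[id="point-up-component"]');
    // Mark the stored HTML so it is identifiable as a crawler snapshot.
    const meta = document.createElement("meta");
    meta.name = "blocklet-crawler";
    meta.content = "true";
    document.head.appendChild(meta);
    return document.documentElement.outerHTML;
  });

The remaining hunks apply the same changes to the package's ESM build: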
@@ -32,8 +32,12 @@ export const getPageContent = async ({ url, formatPageContent }) => {
   let pageContent = null;
   try {
     const response = await page.goto(url, {
-      timeout:
+      timeout: 60 * 1e3
+      // 60s
     });
+    if (!response) {
+      throw new Error(`Failed to load page: response is null for ${url}`);
+    }
     const statusCode = response.status();
     if (![200, 304].includes(statusCode)) {
       throw new Error(`Request failed with status ${statusCode}, in ${url}`);
@@ -112,7 +116,7 @@ export const crawlUrl = async ({
   try {
     if (index % autoCloseBrowserCount === 0) {
       await closeBrowser({
-        trimCache:
+        trimCache: false
      });
     }
     const canCrawl = await isAcceptCrawler(url);
@@ -157,7 +161,13 @@ export const crawlUrl = async ({
     await crawlQueue.add(crawlUrlJob({ url, index: index + 1 }));
   }
 };
+const crawlBlockletRunningMap = /* @__PURE__ */ new Map();
 export const crawlBlocklet = async () => {
+  const { mountPoint, did } = getComponentInfo();
+  if (crawlBlockletRunningMap.has(did) && crawlBlockletRunningMap.get(did)) {
+    logger.info(`Crawler blocklet ${did} is running, skip it`);
+    return;
+  }
   try {
     const browser = await getBrowser();
     if (!browser) {
@@ -172,7 +182,6 @@ export const crawlBlocklet = async () => {
   if (!appUrl) {
     throw new Error("appUrl not found");
   }
-  const { mountPoint, did } = getComponentInfo();
   const sitemapList = await getSitemapList(appUrl);
   const matchMountPoint = joinURL(appUrl, !mountPoint || mountPoint === "/" ? "" : mountPoint);
   const otherMountPointList = components.filter((item) => item.mountPoint && item.mountPoint !== mountPoint).map((item) => item.mountPoint);
@@ -230,37 +239,44 @@ export const crawlBlocklet = async () => {
     }
   ];
   logger.info(...crawlerLogText("start"));
-
-
-
-
-
-
-
-
-
-
-
-
+  try {
+    crawlBlockletRunningMap.set(did, true);
+    await crawlUrl({
+      // @ts-ignore
+      urls: canUseBlockletLocList,
+      lastmodMap,
+      formatPageContent: async ({ page, url }) => {
+        const pageContent = await page.evaluate(() => {
+          const removeElements = (tagName) => {
+            const elements = document.querySelectorAll(tagName);
+            for (let i = elements.length - 1; i >= 0; i--) {
+              try {
+                elements[i]?.parentNode?.removeChild(elements[i]);
+              } catch (error) {
+              }
             }
-          }
-
-
-
-
-
-
-
-
-
-
-
-    }
-
-
-
-
-  })
+          };
+          removeElements('[id="uploader-container"]');
+          removeElements('[class^="uppy-"]');
+          removeElements('[id="point-up-component"]');
+          const meta = document.createElement("meta");
+          meta.name = "blocklet-crawler";
+          meta.content = "true";
+          document.head.appendChild(meta);
+          return document.documentElement.outerHTML;
+        });
+        return pageContent;
+      }
+    });
+    logger.info(...crawlerLogText("success"));
+    await closeBrowser({
+      trimCache: true
+    });
+  } catch (error) {
+    logger.info(`Crawler blocklet abort by error`, error);
+  } finally {
+    crawlBlockletRunningMap.delete(did);
+  }
 };
 const CRON_CRAWL_BLOCKLET_KEY = "cron-crawl-blocklet";
 let cronCrawlBlockletJob = null;