@govtechsg/oobee 0.10.39 → 0.10.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
- import crawlee, { Request, RequestList } from 'crawlee';
+ import crawlee, { LaunchContext, Request, RequestList } from 'crawlee';
  import printMessage from 'print-message';
  import fs from 'fs';
  import {
@@ -8,7 +8,7 @@ import {
  isUrlPdf,
  } from './commonCrawlerFunc.js';

- import constants, { guiInfoStatusTypes } from '../constants/constants.js';
+ import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, UrlsCrawled } from '../constants/constants.js';
  import {
  getLinksFromSitemap,
  getPlaywrightLaunchOptions,
@@ -22,31 +22,32 @@ import {
  import { areLinksEqual, isWhitelistedContentType, isFollowStrategy } from '../utils.js';
  import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
  import { guiInfoLog } from '../logs.js';
+ import { ViewportSettingsClass } from '../combine.js';

  const crawlSitemap = async (
- sitemapUrl,
- randomToken,
- host,
- viewportSettings,
- maxRequestsPerCrawl,
- browser,
- userDataDirectory,
- specifiedMaxConcurrency,
- fileTypes,
- blacklistedPatterns,
- includeScreenshots,
- extraHTTPHeaders,
+ sitemapUrl: string,
+ randomToken: string,
+ _host: string,
+ viewportSettings: ViewportSettingsClass,
+ maxRequestsPerCrawl: number,
+ browser: string,
+ userDataDirectory: string,
+ specifiedMaxConcurrency: number,
+ fileTypes: string,
+ blacklistedPatterns: string[],
+ includeScreenshots: boolean,
+ extraHTTPHeaders: Record<string, string>,
  fromCrawlIntelligentSitemap = false, // optional
- userUrlInputFromIntelligent = null, // optional
- datasetFromIntelligent = null, // optional
- urlsCrawledFromIntelligent = null, // optional
+ userUrlInputFromIntelligent: string = null, // optional
+ datasetFromIntelligent: crawlee.Dataset = null, // optional
+ urlsCrawledFromIntelligent: UrlsCrawled = null, // optional
  crawledFromLocalFile = false, // optional
  ) => {
- let dataset;
- let urlsCrawled;
+ let dataset: crawlee.Dataset;
+ let urlsCrawled: UrlsCrawled;

  // Boolean to omit axe scan for basic auth URL
- let isBasicAuth;
+ let isBasicAuth: boolean;
  let basicAuthPage = 0;
  let finalLinks = [];
  let authHeader = '';
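Note: the parameters now carry explicit types, including UrlsCrawled from the constants module. Its definition is not part of this diff; purely as an inferred sketch from how urlsCrawled is used in this file (the interface and field names below are assumptions, not oobee's actual types), the shape is roughly:

    // Inferred sketch only; the real UrlsCrawled lives in ../constants/constants.js
    // and may differ in names and types.
    interface PageEntry {
      url: string;
      pageTitle?: string;
      actualUrl?: string;
      metadata?: string; // assumed to match the STATUS_CODE_METADATA value type
      httpStatusCode?: number;
    }

    interface UrlsCrawledSketch {
      scanned: PageEntry[];
      scannedRedirects: { fromUrl: string; toUrl: string }[];
      notScannedRedirects: { fromUrl: string; toUrl: string }[];
      invalid: PageEntry[];
      error: PageEntry[];
      userExcluded: PageEntry[];
    }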
@@ -119,8 +120,8 @@ const crawlSitemap = async (
  basicAuthPage = -2;
  }

- const pdfDownloads = [];
- const uuidToPdfMapping = {};
+ const pdfDownloads: Promise<void>[] = [];
+ const uuidToPdfMapping: Record<string, string> = {};
  const isScanHtml = ['all', 'html-only'].includes(fileTypes);
  const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
  const { playwrightDeviceDetailsObject } = viewportSettings;
@@ -152,7 +153,7 @@ const crawlSitemap = async (
  browserPoolOptions: {
  useFingerprints: false,
  preLaunchHooks: [
- async (pageId, launchContext) => {
+ async (_pageId: string, launchContext: LaunchContext) => {
  launchContext.launchOptions = {
  ...launchContext.launchOptions,
  bypassCSP: true,
@@ -164,39 +165,43 @@ const crawlSitemap = async (
  },
  requestList,
  postNavigationHooks: [
- async ({ page, request }) => {
+
+ async ({ page }) => {
  try {
  // Wait for a quiet period in the DOM, but with safeguards
  await page.evaluate(() => {
- return new Promise((resolve) => {
+ return new Promise(resolve => {
  let timeout;
  let mutationCount = 0;
- const MAX_MUTATIONS = 250; // Prevent infinite mutations
- const OBSERVER_TIMEOUT = 5000; // Hard timeout to exit
-
+ const MAX_MUTATIONS = 250; // stop if things never quiet down
+ const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
+
  const observer = new MutationObserver(() => {
  clearTimeout(timeout);
-
+
  mutationCount++;
  if (mutationCount > MAX_MUTATIONS) {
  observer.disconnect();
- resolve('Too many mutations detected, exiting.');
+ resolve('Too many mutations, exiting.');
  return;
  }
-
+
+ // restart quiet‑period timer
  timeout = setTimeout(() => {
  observer.disconnect();
- resolve('DOM stabilized after mutations.');
+ resolve('DOM stabilized.');
  }, 1000);
  });
-
+
+ // overall timeout in case the page never settles
  timeout = setTimeout(() => {
  observer.disconnect();
- resolve('Observer timeout reached, exiting.');
- }, OBSERVER_TIMEOUT); // Ensure the observer stops after X seconds
-
- observer.observe(document.documentElement, { childList: true, subtree: true });
-
+ resolve('Observer timeout reached.');
+ }, OBSERVER_TIMEOUT);
+
+ // **HERE**: select the real DOM node inside evaluate
+ const root = document.documentElement;
+ observer.observe(root, { childList: true, subtree: true });
  });
  });
  } catch (err) {
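Note: the rewritten post-navigation hook waits for a quiet period in the DOM before scanning, with two safeguards (a mutation cap and a hard overall timeout). A self-contained sketch of the same technique, assuming a Playwright Page; the helper name and default timings are illustrative, not oobee's API:

    import { Page } from 'playwright';

    // Resolve once no mutations have been seen for `quietMs`, or bail out after
    // `maxWaitMs` in total or `maxMutations` mutations, mirroring the safeguards above.
    async function waitForDomQuietPeriod(
      page: Page,
      quietMs = 1000,
      maxWaitMs = 5000,
      maxMutations = 250,
    ): Promise<string> {
      return page.evaluate(
        ({ quietMs, maxWaitMs, maxMutations }) =>
          new Promise<string>(resolve => {
            let quietTimer: ReturnType<typeof setTimeout>;
            let mutationCount = 0;

            const finish = (reason: string, observer: MutationObserver) => {
              observer.disconnect();
              resolve(reason);
            };

            const observer = new MutationObserver(() => {
              clearTimeout(quietTimer);
              if (++mutationCount > maxMutations) {
                finish('Too many mutations, exiting.', observer);
                return;
              }
              // restart the quiet-period timer on every mutation
              quietTimer = setTimeout(() => finish('DOM stabilized.', observer), quietMs);
            });

            // hard cap so a page that never settles cannot hang the crawl
            setTimeout(() => finish('Observer timeout reached.', observer), maxWaitMs);

            observer.observe(document.documentElement, { childList: true, subtree: true });
            // start the quiet timer immediately in case no mutations occur at all
            quietTimer = setTimeout(() => finish('DOM stabilized.', observer), quietMs);
          }),
        { quietMs, maxWaitMs, maxMutations },
      );
    }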
@@ -207,6 +212,7 @@ const crawlSitemap = async (
  throw err; // Rethrow unknown errors
  }
  },
+
  ],

  preNavigationHooks: isBasicAuth
@@ -246,16 +252,18 @@ const crawlSitemap = async (
  return;
  }

- if (isUrlPdf(actualUrl)) {
+ if (request.skipNavigation && actualUrl === "about:blank") {
  if (!isScanPdfs) {
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
  numScanned: urlsCrawled.scanned.length,
  urlScanned: request.url,
  });
- urlsCrawled.blacklisted.push({
+ urlsCrawled.userExcluded.push({
  url: request.url,
  pageTitle: request.url,
- actualUrl: actualUrl, // i.e. actualUrl
+ actualUrl: request.url, // because about:blank is not useful
+ metadata: STATUS_CODE_METADATA[1],
+ httpStatusCode: 0,
  });

  return;
@@ -276,85 +284,64 @@ const crawlSitemap = async (
  const contentType = response?.headers?.()['content-type'] || '';
  const status = response ? response.status() : 0;

- if (blacklistedPatterns && !isFollowStrategy(actualUrl, request.url, "same-hostname") && isSkippedUrl(actualUrl, blacklistedPatterns)) {
- urlsCrawled.userExcluded.push({
- url: request.url,
- pageTitle: request.url,
- actualUrl: actualUrl,
- });
-
- guiInfoLog(guiInfoStatusTypes.SKIPPED, {
- numScanned: urlsCrawled.scanned.length,
- urlScanned: request.url,
- });
- return;
- }
+ if (basicAuthPage < 0) {
+ basicAuthPage += 1;
+ } else if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
+ const isRedirected = !areLinksEqual(page.url(), request.url);
+ const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
+ item => (item.actualUrl || item.url) === page.url(),
+ );

- if (status === 403) {
- guiInfoLog(guiInfoStatusTypes.SKIPPED, {
- numScanned: urlsCrawled.scanned.length,
- urlScanned: request.url,
- });
- urlsCrawled.forbidden.push({ url: request.url });
- return;
- }
+ if (isRedirected && isLoadedUrlInCrawledUrls) {
+ urlsCrawled.notScannedRedirects.push({
+ fromUrl: request.url,
+ toUrl: actualUrl, // i.e. actualUrl
+ });
+ return;
+ }

- if (status !== 200) {
- guiInfoLog(guiInfoStatusTypes.SKIPPED, {
- numScanned: urlsCrawled.scanned.length,
- urlScanned: request.url,
- });
- urlsCrawled.invalid.push({
- url: request.url,
- pageTitle: request.url,
- actualUrl: actualUrl, // i.e. actualUrl
- });
+ // This logic is different from crawlDomain, as it also checks if the pae is redirected before checking if it is excluded using exclusions.txt
+ if (
+ isRedirected &&
+ blacklistedPatterns &&
+ isSkippedUrl(actualUrl, blacklistedPatterns)
+ ) {
+ urlsCrawled.userExcluded.push({
+ url: request.url,
+ pageTitle: request.url,
+ actualUrl: actualUrl,
+ metadata: STATUS_CODE_METADATA[0],
+ httpStatusCode: 0,
+ });

- return;
- }
+ guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+ numScanned: urlsCrawled.scanned.length,
+ urlScanned: request.url,
+ });
+ return;
+ }

- if (basicAuthPage < 0) {
- basicAuthPage += 1;
- } else if (isScanHtml && status === 200 && isWhitelistedContentType(contentType)) {
  const results = await runAxeScript({ includeScreenshots, page, randomToken });
+
  guiInfoLog(guiInfoStatusTypes.SCANNED, {
  numScanned: urlsCrawled.scanned.length,
  urlScanned: request.url,
  });

- const isRedirected = !areLinksEqual(page.url(), request.url);
- if (isRedirected) {
- const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
- item => (item.actualUrl || item.url.href) === page,
- );
-
- if (isLoadedUrlInCrawledUrls) {
- urlsCrawled.notScannedRedirects.push({
- fromUrl: request.url,
- toUrl: actualUrl, // i.e. actualUrl
- });
- return;
- }
+ urlsCrawled.scanned.push({
+ url: urlWithoutAuth(request.url),
+ pageTitle: results.pageTitle,
+ actualUrl: actualUrl, // i.e. actualUrl
+ });

- urlsCrawled.scanned.push({
- url: urlWithoutAuth(request.url),
- pageTitle: results.pageTitle,
- actualUrl: actualUrl, // i.e. actualUrl
- });
+ urlsCrawled.scannedRedirects.push({
+ fromUrl: urlWithoutAuth(request.url),
+ toUrl: actualUrl,
+ });

- urlsCrawled.scannedRedirects.push({
- fromUrl: urlWithoutAuth(request.url),
- toUrl: actualUrl,
- });
+ results.url = request.url;
+ results.actualUrl = actualUrl;

- results.url = request.url;
- results.actualUrl = actualUrl;
- } else {
- urlsCrawled.scanned.push({
- url: urlWithoutAuth(request.url),
- pageTitle: results.pageTitle,
- });
- }
  await dataset.pushData(results);
  } else {
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
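Note: the redirect bookkeeping moved ahead of the exclusion check, and the duplicate-redirect test now compares against page.url() (the removed code compared item.url.href against the page object itself). The core check, as a standalone sketch with an injected link-equality helper (names below are illustrative):

    interface ScannedEntry {
      url: string;
      actualUrl?: string;
    }

    // A request is an already-covered redirect when the loaded URL differs from the
    // requested one and an earlier scan already recorded that loaded URL.
    function isAlreadyScannedRedirect(
      requestUrl: string,
      loadedUrl: string,
      scanned: ScannedEntry[],
      areLinksEqual: (a: string, b: string) => boolean,
    ): boolean {
      const isRedirected = !areLinksEqual(loadedUrl, requestUrl);
      const isLoadedUrlInCrawledUrls = scanned.some(
        item => (item.actualUrl || item.url) === loadedUrl,
      );
      return isRedirected && isLoadedUrlInCrawledUrls;
    }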
@@ -363,11 +350,23 @@ const crawlSitemap = async (
  });

  if (isScanHtml) {
- urlsCrawled.invalid.push(actualUrl);
+ // carry through the HTTP status metadata
+ const status = response?.status();
+ const metadata = typeof status === 'number'
+ ? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
+ : STATUS_CODE_METADATA[2];
+
+ urlsCrawled.invalid.push({
+ actualUrl,
+ url: request.url,
+ pageTitle: request.url,
+ metadata,
+ httpStatusCode: typeof status === 'number' ? status : 0
+ });
  }
  }
  },
- failedRequestHandler: async ({ request }) => {
+ failedRequestHandler: async ({ request, response, error }) => {
  if (isBasicAuth && request.url) {
  request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}`;
  }
@@ -381,7 +380,19 @@ const crawlSitemap = async (
  numScanned: urlsCrawled.scanned.length,
  urlScanned: request.url,
  });
- urlsCrawled.error.push(request.url);
+
+ const status = response?.status();
+ const metadata = typeof status === 'number'
+ ? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
+ : STATUS_CODE_METADATA[2];
+
+ urlsCrawled.error.push({
+ url: request.url,
+ pageTitle: request.url,
+ actualUrl: request.url,
+ metadata,
+ httpStatusCode: typeof status === 'number' ? status : 0
+ });
  crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
  },
  maxRequestsPerCrawl: Infinity,
@@ -16,7 +16,7 @@ export function findElementByCssSelector(cssSelector: string): string | null {

  // Handle Shadow DOM if the element is not found
  if (!element) {
- const shadowRoots = [];
+ const shadowRoots: ShadowRoot[] = [];
  const allElements = document.querySelectorAll('*');

  // Look for elements with shadow roots
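Note: the change here only adds a type to the shadowRoots array. For context, the collect-and-search pattern findElementByCssSelector relies on can be sketched standalone as below (generic DOM code, not the exact oobee implementation; closed shadow roots remain unreachable):

    // Query the light DOM first, then fall back to searching every open shadow root.
    function queryIncludingShadowRoots(cssSelector: string): Element | null {
      const direct = document.querySelector(cssSelector);
      if (direct) return direct;

      const shadowRoots: ShadowRoot[] = [];
      document.querySelectorAll('*').forEach(el => {
        if (el.shadowRoot) shadowRoots.push(el.shadowRoot);
      });

      for (const root of shadowRoots) {
        const match = root.querySelector(cssSelector);
        if (match) return match;
      }
      return null;
    }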