@govtechsg/oobee 0.10.42 → 0.10.45

This diff shows the published contents of these package versions as they appear in their public registries; it is provided for informational purposes only.
@@ -1,4 +1,4 @@
-import crawlee, { Request, RequestList } from 'crawlee';
+import crawlee, { LaunchContext, Request, RequestList } from 'crawlee';
 import printMessage from 'print-message';
 import fs from 'fs';
 import {
@@ -8,7 +8,7 @@ import {
   isUrlPdf,
 } from './commonCrawlerFunc.js';
 
-import constants, { guiInfoStatusTypes } from '../constants/constants.js';
+import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, UrlsCrawled } from '../constants/constants.js';
 import {
   getLinksFromSitemap,
   getPlaywrightLaunchOptions,
@@ -22,31 +22,32 @@ import {
 import { areLinksEqual, isWhitelistedContentType, isFollowStrategy } from '../utils.js';
 import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
 import { guiInfoLog } from '../logs.js';
+import { ViewportSettingsClass } from '../combine.js';
 
 const crawlSitemap = async (
-  sitemapUrl,
-  randomToken,
-  host,
-  viewportSettings,
-  maxRequestsPerCrawl,
-  browser,
-  userDataDirectory,
-  specifiedMaxConcurrency,
-  fileTypes,
-  blacklistedPatterns,
-  includeScreenshots,
-  extraHTTPHeaders,
+  sitemapUrl: string,
+  randomToken: string,
+  _host: string,
+  viewportSettings: ViewportSettingsClass,
+  maxRequestsPerCrawl: number,
+  browser: string,
+  userDataDirectory: string,
+  specifiedMaxConcurrency: number,
+  fileTypes: string,
+  blacklistedPatterns: string[],
+  includeScreenshots: boolean,
+  extraHTTPHeaders: Record<string, string>,
   fromCrawlIntelligentSitemap = false, // optional
-  userUrlInputFromIntelligent = null, // optional
-  datasetFromIntelligent = null, // optional
-  urlsCrawledFromIntelligent = null, // optional
+  userUrlInputFromIntelligent: string = null, // optional
+  datasetFromIntelligent: crawlee.Dataset = null, // optional
+  urlsCrawledFromIntelligent: UrlsCrawled = null, // optional
   crawledFromLocalFile = false, // optional
 ) => {
-  let dataset;
-  let urlsCrawled;
+  let dataset: crawlee.Dataset;
+  let urlsCrawled: UrlsCrawled;
 
   // Boolean to omit axe scan for basic auth URL
-  let isBasicAuth;
+  let isBasicAuth: boolean;
   let basicAuthPage = 0;
   let finalLinks = [];
   let authHeader = '';
@@ -119,8 +120,8 @@ const crawlSitemap = async (
     basicAuthPage = -2;
   }
 
-  const pdfDownloads = [];
-  const uuidToPdfMapping = {};
+  const pdfDownloads: Promise<void>[] = [];
+  const uuidToPdfMapping: Record<string, string> = {};
   const isScanHtml = ['all', 'html-only'].includes(fileTypes);
   const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
   const { playwrightDeviceDetailsObject } = viewportSettings;
@@ -152,7 +153,7 @@ const crawlSitemap = async (
     browserPoolOptions: {
       useFingerprints: false,
       preLaunchHooks: [
-        async (pageId, launchContext) => {
+        async (_pageId: string, launchContext: LaunchContext) => {
          launchContext.launchOptions = {
            ...launchContext.launchOptions,
            bypassCSP: true,
@@ -164,39 +165,43 @@ const crawlSitemap = async (
     },
     requestList,
     postNavigationHooks: [
-      async ({ page, request }) => {
+
+      async ({ page }) => {
         try {
           // Wait for a quiet period in the DOM, but with safeguards
           await page.evaluate(() => {
-            return new Promise((resolve) => {
+            return new Promise(resolve => {
               let timeout;
               let mutationCount = 0;
-              const MAX_MUTATIONS = 250; // Prevent infinite mutations
-              const OBSERVER_TIMEOUT = 5000; // Hard timeout to exit
-
+              const MAX_MUTATIONS = 250; // stop if things never quiet down
+              const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
+
               const observer = new MutationObserver(() => {
                 clearTimeout(timeout);
-
+
                 mutationCount++;
                 if (mutationCount > MAX_MUTATIONS) {
                   observer.disconnect();
-                  resolve('Too many mutations detected, exiting.');
+                  resolve('Too many mutations, exiting.');
                   return;
                 }
-
+
+                // restart quiet-period timer
                 timeout = setTimeout(() => {
                   observer.disconnect();
-                  resolve('DOM stabilized after mutations.');
+                  resolve('DOM stabilized.');
                 }, 1000);
               });
-
+
+              // overall timeout in case the page never settles
               timeout = setTimeout(() => {
                 observer.disconnect();
-                resolve('Observer timeout reached, exiting.');
-              }, OBSERVER_TIMEOUT); // Ensure the observer stops after X seconds
-
-              observer.observe(document.documentElement, { childList: true, subtree: true });
-
+                resolve('Observer timeout reached.');
+              }, OBSERVER_TIMEOUT);
+
+              // select the real DOM node inside evaluate
+              const root = document.documentElement;
+              observer.observe(root, { childList: true, subtree: true });
             });
           });
         } catch (err) {
@@ -207,6 +212,7 @@ const crawlSitemap = async (
           throw err; // Rethrow unknown errors
         }
       },
+
     ],
 
     preNavigationHooks: isBasicAuth
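
Note: the quiet-period wait added in this hook runs inside `page.evaluate`, so it can only use browser globals such as `MutationObserver`. A minimal standalone sketch of the same pattern (the function name, and keeping the hard timeout in its own handle so a mutation cannot clear it, are illustrative rather than taken from the package):

```ts
// Resolve once the DOM has been mutation-free for `quietMs`, with caps
// on total mutations and total wait time (thresholds mirror the diff).
function waitForDomToSettle(quietMs = 1000, maxMutations = 250, hardTimeoutMs = 5000): Promise<string> {
  return new Promise(resolve => {
    let quietTimer: ReturnType<typeof setTimeout> | undefined;
    let mutationCount = 0;

    const observer = new MutationObserver(() => {
      clearTimeout(quietTimer);
      mutationCount += 1;
      if (mutationCount > maxMutations) {
        observer.disconnect();
        resolve('Too many mutations, exiting.');
        return;
      }
      // Every mutation restarts the quiet-period timer.
      quietTimer = setTimeout(() => {
        observer.disconnect();
        resolve('DOM stabilized.');
      }, quietMs);
    });

    // Hard cap in a separate handle so a page that never settles cannot stall the crawl.
    setTimeout(() => {
      observer.disconnect();
      resolve('Observer timeout reached.');
    }, hardTimeoutMs);

    observer.observe(document.documentElement, { childList: true, subtree: true });
  });
}
```

Because a promise can only settle once, it is harmless if the hard timeout fires after the observer has already resolved.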
@@ -252,10 +258,12 @@ const crawlSitemap = async (
         numScanned: urlsCrawled.scanned.length,
         urlScanned: request.url,
       });
-      urlsCrawled.blacklisted.push({
+      urlsCrawled.userExcluded.push({
         url: request.url,
         pageTitle: request.url,
-        actualUrl: actualUrl, // i.e. actualUrl
+        actualUrl: request.url, // because about:blank is not useful
+        metadata: STATUS_CODE_METADATA[1],
+        httpStatusCode: 0,
       });
 
       return;
@@ -276,85 +284,64 @@ const crawlSitemap = async (
       const contentType = response?.headers?.()['content-type'] || '';
       const status = response ? response.status() : 0;
 
-      if (blacklistedPatterns && !isFollowStrategy(actualUrl, request.url, "same-hostname") && isSkippedUrl(actualUrl, blacklistedPatterns)) {
-        urlsCrawled.userExcluded.push({
-          url: request.url,
-          pageTitle: request.url,
-          actualUrl: actualUrl,
-        });
-
-        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
-          numScanned: urlsCrawled.scanned.length,
-          urlScanned: request.url,
-        });
-        return;
-      }
+      if (basicAuthPage < 0) {
+        basicAuthPage += 1;
+      } else if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
+        const isRedirected = !areLinksEqual(page.url(), request.url);
+        const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
+          item => (item.actualUrl || item.url) === page.url(),
+        );
 
-      if (status === 403) {
-        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
-          numScanned: urlsCrawled.scanned.length,
-          urlScanned: request.url,
-        });
-        urlsCrawled.forbidden.push({ url: request.url });
-        return;
-      }
+        if (isRedirected && isLoadedUrlInCrawledUrls) {
+          urlsCrawled.notScannedRedirects.push({
+            fromUrl: request.url,
+            toUrl: actualUrl, // i.e. actualUrl
+          });
+          return;
+        }
 
-      if (status !== 200) {
-        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
-          numScanned: urlsCrawled.scanned.length,
-          urlScanned: request.url,
-        });
-        urlsCrawled.invalid.push({
-          url: request.url,
-          pageTitle: request.url,
-          actualUrl: actualUrl, // i.e. actualUrl
-        });
+        // This logic is different from crawlDomain, as it also checks if the page is redirected before checking if it is excluded using exclusions.txt
+        if (
+          isRedirected &&
+          blacklistedPatterns &&
+          isSkippedUrl(actualUrl, blacklistedPatterns)
+        ) {
+          urlsCrawled.userExcluded.push({
+            url: request.url,
+            pageTitle: request.url,
+            actualUrl: actualUrl,
+            metadata: STATUS_CODE_METADATA[0],
+            httpStatusCode: 0,
+          });
 
-        return;
-      }
+          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+            numScanned: urlsCrawled.scanned.length,
+            urlScanned: request.url,
+          });
+          return;
+        }
 
-      if (basicAuthPage < 0) {
-        basicAuthPage += 1;
-      } else if (isScanHtml && status === 200 && isWhitelistedContentType(contentType)) {
         const results = await runAxeScript({ includeScreenshots, page, randomToken });
+
         guiInfoLog(guiInfoStatusTypes.SCANNED, {
           numScanned: urlsCrawled.scanned.length,
           urlScanned: request.url,
         });
 
-        const isRedirected = !areLinksEqual(page.url(), request.url);
-        if (isRedirected) {
-          const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
-            item => (item.actualUrl || item.url.href) === page,
-          );
-
-          if (isLoadedUrlInCrawledUrls) {
-            urlsCrawled.notScannedRedirects.push({
-              fromUrl: request.url,
-              toUrl: actualUrl, // i.e. actualUrl
-            });
-            return;
-          }
+        urlsCrawled.scanned.push({
+          url: urlWithoutAuth(request.url),
+          pageTitle: results.pageTitle,
+          actualUrl: actualUrl, // i.e. actualUrl
+        });
 
-          urlsCrawled.scanned.push({
-            url: urlWithoutAuth(request.url),
-            pageTitle: results.pageTitle,
-            actualUrl: actualUrl, // i.e. actualUrl
-          });
+        urlsCrawled.scannedRedirects.push({
+          fromUrl: urlWithoutAuth(request.url),
+          toUrl: actualUrl,
+        });
 
-          urlsCrawled.scannedRedirects.push({
-            fromUrl: urlWithoutAuth(request.url),
-            toUrl: actualUrl,
-          });
+        results.url = request.url;
+        results.actualUrl = actualUrl;
 
-          results.url = request.url;
-          results.actualUrl = actualUrl;
-        } else {
-          urlsCrawled.scanned.push({
-            url: urlWithoutAuth(request.url),
-            pageTitle: results.pageTitle,
-          });
-        }
         await dataset.pushData(results);
       } else {
         guiInfoLog(guiInfoStatusTypes.SKIPPED, {
@@ -363,11 +350,23 @@ const crawlSitemap = async (
         });
 
         if (isScanHtml) {
-          urlsCrawled.invalid.push(actualUrl);
+          // carry through the HTTP status metadata
+          const status = response?.status();
+          const metadata = typeof status === 'number'
+            ? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
+            : STATUS_CODE_METADATA[2];
+
+          urlsCrawled.invalid.push({
+            actualUrl,
+            url: request.url,
+            pageTitle: request.url,
+            metadata,
+            httpStatusCode: typeof status === 'number' ? status : 0
+          });
         }
       }
     },
-    failedRequestHandler: async ({ request }) => {
+    failedRequestHandler: async ({ request, response, error }) => {
       if (isBasicAuth && request.url) {
         request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}`;
       }
@@ -381,7 +380,19 @@ const crawlSitemap = async (
         numScanned: urlsCrawled.scanned.length,
         urlScanned: request.url,
       });
-      urlsCrawled.error.push(request.url);
+
+      const status = response?.status();
+      const metadata = typeof status === 'number'
+        ? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
+        : STATUS_CODE_METADATA[2];
+
+      urlsCrawled.error.push({
+        url: request.url,
+        pageTitle: request.url,
+        actualUrl: request.url,
+        metadata,
+        httpStatusCode: typeof status === 'number' ? status : 0
+      });
      crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
    },
    maxRequestsPerCrawl: Infinity,
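
Note: the skipped-page branch and the `failedRequestHandler` above now share the same status-to-metadata fallback chain. A sketch of that repeated lookup as a helper (the helper itself is hypothetical and not in the package; the keys follow the usage in the diff, where 2 covers a missing response and 599 an unrecognised status):

```ts
import { STATUS_CODE_METADATA } from '../constants/constants.js';

// Hypothetical helper mirroring the lookup repeated in requestHandler
// and failedRequestHandler above.
function resolveStatusMetadata(status: number | undefined) {
  if (typeof status === 'number') {
    return {
      metadata: STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599], // unknown status falls back to 599
      httpStatusCode: status,
    };
  }
  return { metadata: STATUS_CODE_METADATA[2], httpStatusCode: 0 }; // no response at all
}

// Usage shape: const { metadata, httpStatusCode } = resolveStatusMetadata(response?.status());
```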
@@ -16,7 +16,7 @@ export function findElementByCssSelector(cssSelector: string): string | null {
 
   // Handle Shadow DOM if the element is not found
   if (!element) {
-    const shadowRoots = [];
+    const shadowRoots: ShadowRoot[] = [];
     const allElements = document.querySelectorAll('*');
 
     // Look for elements with shadow roots
@@ -27,9 +27,9 @@ export async function flagUnlabelledClickableElements() {
   const loggingEnabled = false; // Set to true to enable console warnings
 
   let previousFlaggedXPathsByDocument = {}; // Object to hold previous flagged XPaths
-  const previousAllFlaggedElementsXPaths = []; // Array to store all flagged XPaths
+  const previousAllFlaggedElementsXPaths : {xpath: string, code: string }[] = []; // Array to store all flagged XPaths
 
-  function getXPath(element: Node) {
+  function getXPath(element: Node): string {
    if (!element) return null;
    if (element instanceof HTMLElement && element.id) {
      return `//*[@id="${element.id}"]`;
@@ -297,7 +297,7 @@ function hasPointerCursor(node: Node): boolean {
    return hasAccessibleChildElement || hasDirectAccessibleText;
  }
 
-  function hasAllChildrenAccessible(element: Element) {
+  function hasAllChildrenAccessible(element: Element): boolean {
    // If the element is aria-hidden, consider it accessible
    if (element.getAttribute('aria-hidden') === 'true') {
      return true;
@@ -331,7 +331,7 @@ function hasPointerCursor(node: Node): boolean {
  function hasChildNotANewInteractWithAccessibleText(element: Element) {
 
    // Helper function to check if the element is a link or button
-    const isBuildInInteractable = (child) => {
+    const isBuildInInteractable = (child: Element) => {
      return child.nodeName.toLowerCase() === "a" || child.nodeName.toLowerCase() === "button" || child.nodeName.toLowerCase() === "input" ||
        child.getAttribute('role') === 'link' || child.getAttribute('role') === 'button';
    };
@@ -376,7 +376,7 @@ function hasPointerCursor(node: Node): boolean {
    }
 
    // Recursively check for text content inside child nodes of elements that are not links or buttons
-    if (node.nodeType === Node.ELEMENT_NODE && !isBuildInInteractable(node)) {
+    if (node.nodeType === Node.ELEMENT_NODE && !isBuildInInteractable(node as Element)) {
      return Array.from(node.childNodes).some(innerNode => {
        if (innerNode.nodeType === Node.TEXT_NODE) {
          const innerTextContent = getTextContent(innerNode).trim();
@@ -440,7 +440,7 @@ function hasPointerCursor(node: Node): boolean {
    const beforeContent = window.getComputedStyle(element, '::before').getPropertyValue('content');
    const afterContent = window.getComputedStyle(element, '::after').getPropertyValue('content');
 
-    function isAccessibleContent(value) {
+    function isAccessibleContent(value: string) {
      if (!value || value === 'none' || value === 'normal') {
        return false;
      }
@@ -1126,11 +1126,11 @@ function hasPointerCursor(node: Node): boolean {
    });
 
    // Collect XPaths and outerHTMLs of flagged elements per document
-    const flaggedXPathsByDocument = {};
+    const flaggedXPathsByDocument: { [key: string]: { xpath: string; code: string }[] } = {};
 
    for (const docKey in currentFlaggedElementsByDocument) {
      const elements = currentFlaggedElementsByDocument[docKey];
-      const flaggedInfo = []; // Array to hold flagged element info
+      const flaggedInfo: { xpath: string; code: string }[] = []; // Array to hold flagged element info
      elements.forEach(flaggedElement => {
        const parentFlagged = flaggedElement.closest('[data-flagged="true"]');
        if (!parentFlagged || parentFlagged === flaggedElement) {
@@ -1,12 +1,12 @@
-export function xPathToCss(expr: string) {
-  const isValidXPath = expr =>
+export default function xPathToCss(expr: string) {
+  const isValidXPath = (expr: string) =>
    typeof expr !== 'undefined' &&
    expr.replace(/[\s-_=]/g, '') !== '' &&
    expr.length ===
-    expr.replace(
-      /[-_\w:.]+\(\)\s*=|=\s*[-_\w:.]+\(\)|\sor\s|\sand\s|\[(?:[^\/\]]+[\/\[]\/?.+)+\]|starts-with\(|\[.*last\(\)\s*[-\+<>=].+\]|number\(\)|not\(|count\(|text\(|first\(|normalize-space|[^\/]following-sibling|concat\(|descendant::|parent::|self::|child::|/gi,
-      '',
-    ).length;
+      expr.replace(
+        /[-_\w:.]+\(\)\s*=|=\s*[-_\w:.]+\(\)|\sor\s|\sand\s|\[(?:[^\/\]]+[\/\[]\/?.+)+\]|starts-with\(|\[.*last\(\)\s*[-\+<>=].+\]|number\(\)|not\(|count\(|text\(|first\(|normalize-space|[^\/]following-sibling|concat\(|descendant::|parent::|self::|child::|/gi,
+        '',
+      ).length;
 
  const getValidationRegex = () => {
    let regex =
@@ -30,7 +30,7 @@ export function xPathToCss(expr: string) {
      value: '\\s*[\\w/:][-/\\w\\s,:;.]*',
    };
 
-    Object.keys(subRegexes).forEach(key => {
+    Object.keys(subRegexes).forEach((key: keyof typeof subRegexes) => {
      regex = regex.replace(new RegExp(`%\\(${key}\\)s`, 'gi'), subRegexes[key]);
    });
 
@@ -42,14 +42,14 @@ export function xPathToCss(expr: string) {
    return new RegExp(regex, 'gi');
  };
 
-  const preParseXpath = expr =>
+  const preParseXpath = (expr: string) =>
    expr.replace(
      /contains\s*\(\s*concat\(["']\s+["']\s*,\s*@class\s*,\s*["']\s+["']\)\s*,\s*["']\s+([a-zA-Z0-9-_]+)\s+["']\)/gi,
      '@class="$1"',
    );
 
-  function escapeCssIdSelectors(cssSelector) {
-    return cssSelector.replace(/#([^ >]+)/g, (match, id) => {
+  function escapeCssIdSelectors(cssSelector: string) {
+    return cssSelector.replace(/#([^ >]+)/g, (_match, id) => {
      // Escape special characters in the id part
      return `#${id.replace(/[!"#$%&'()*+,./:;<=>?@[\\\]^`{|}~]/g, '\\$&')}`;
    });
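
Note: since `xPathToCss` is now a default export, importers must drop the braces. A minimal sketch (the module path is illustrative):

```ts
// Before (named export): import { xPathToCss } from './xPathToCss.js';
// After (default export):
import xPathToCss from './xPathToCss.js';

// Output shape depends on the converter; shown only as a usage example.
const css = xPathToCss('//*[@id="main"]/div[2]/a');
```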
@@ -48,7 +48,7 @@ const runCustom = async (
   includeScreenshots: boolean,
 ) => {
   // checks and delete datasets path if it already exists
-  await cleanUp(randomToken);
+  cleanUp(randomToken);
   process.env.CRAWLEE_STORAGE_DIR = randomToken;
 
   const urlsCrawled: UrlsCrawled = { ...constants.urlsCrawledObj };
package/src/index.ts CHANGED
@@ -1,6 +1,4 @@
 #!/usr/bin/env node
-/* eslint-disable func-names */
-/* eslint-disable no-param-reassign */
 import printMessage from 'print-message';
 import inquirer from 'inquirer';
 import { EnqueueStrategy } from 'crawlee';
@@ -22,6 +20,7 @@ import {
 import questions from './constants/questions.js';
 import combineRun from './combine.js';
 import { BrowserTypes, RuleFlags, ScannerTypes } from './constants/constants.js';
+import { DeviceDescriptor } from './types/types.js';
 
 export type Answers = {
   headless: boolean;
@@ -32,7 +31,7 @@ export type Answers = {
   scanner: ScannerTypes;
   url: string;
   clonedBrowserDataDir: string;
-  playwrightDeviceDetailsObject: object;
+  playwrightDeviceDetailsObject: DeviceDescriptor;
   nameEmail: string;
   fileTypes: string;
   metadata: string;
@@ -61,7 +60,7 @@ export type Data = {
   deviceChosen: string;
   customDevice: string;
   viewportWidth: number;
-  playwrightDeviceDetailsObject: object;
+  playwrightDeviceDetailsObject: DeviceDescriptor;
   maxRequestsPerCrawl: number;
   strategy: EnqueueStrategy;
   isLocalFileScan: boolean;
package/src/logs.ts CHANGED
@@ -40,7 +40,7 @@ const silentLogger = createLogger({
 });
 
 // guiInfoLogger feeds the gui information via console log and is mainly used for scanning process
-export const guiInfoLog = (status, data) => {
+export const guiInfoLog = (status: string, data: { numScanned?: number; urlScanned?: string }) => {
   if (process.env.RUNNING_FROM_PH_GUI || process.env.OOBEE_VERBOSE) {
     switch (status) {
       case guiInfoStatusTypes.COMPLETED:
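
Note: with the typed signature, both fields of `data` remain optional, so existing call sites throughout the diff compile unchanged. A conforming call would look like this (import paths and values illustrative):

```ts
import { guiInfoLog } from './logs.js';
import { guiInfoStatusTypes } from './constants/constants.js';

guiInfoLog(guiInfoStatusTypes.SKIPPED, {
  numScanned: 12,
  urlScanned: 'https://example.com/page',
});
```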