@govtechsg/oobee 0.10.86 → 0.10.88

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/.github/workflows/docker-push-ghcr.yml +49 -0
  2. package/.github/workflows/image.yml +2 -3
  3. package/DETAILS_OUTPUT_EXAMPLES.md +178 -0
  4. package/Dockerfile +6 -7
  5. package/dist/cli.js +18 -5
  6. package/dist/combine.js +3 -0
  7. package/dist/constants/cliFunctions.js +2 -2
  8. package/dist/constants/common.js +55 -13
  9. package/dist/crawlers/commonCrawlerFunc.js +523 -2
  10. package/dist/crawlers/crawlDomain.js +38 -13
  11. package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
  12. package/dist/crawlers/crawlLocalFile.js +2 -2
  13. package/dist/crawlers/crawlSitemap.js +44 -5
  14. package/dist/crawlers/custom/extractAndGradeText.js +1 -1
  15. package/dist/crawlers/custom/getAxeConfiguration.js +26 -21
  16. package/dist/crawlers/custom/gradeReadability.js +1 -1
  17. package/dist/crawlers/custom/utils.js +81 -40
  18. package/dist/generateHtmlReport.js +18 -11
  19. package/dist/mergeAxeResults/itemReferences.js +60 -25
  20. package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
  21. package/dist/mergeAxeResults.js +18 -9
  22. package/dist/npmIndex.js +16 -12
  23. package/dist/screenshotFunc/htmlScreenshotFunc.js +67 -0
  24. package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  25. package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +45 -6
  26. package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +8 -5
  27. package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  28. package/dist/static/ejs/partials/scripts/ruleModal/utilities.ejs +2 -1
  29. package/dist/static/ejs/summary.ejs +18 -12
  30. package/dist/utils.js +4 -3
  31. package/examples/oobee-test-details-runner.js +214 -0
  32. package/examples/test-violations.html +42 -0
  33. package/fix-summary-html-oom-pr.md +62 -0
  34. package/package.json +5 -5
  35. package/src/cli.ts +19 -5
  36. package/src/combine.ts +3 -0
  37. package/src/constants/cliFunctions.ts +2 -2
  38. package/src/constants/common.ts +65 -12
  39. package/src/crawlers/commonCrawlerFunc.ts +625 -2
  40. package/src/crawlers/crawlDomain.ts +39 -13
  41. package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
  42. package/src/crawlers/crawlLocalFile.ts +4 -1
  43. package/src/crawlers/crawlSitemap.ts +50 -3
  44. package/src/crawlers/custom/extractAndGradeText.ts +1 -1
  45. package/src/crawlers/custom/getAxeConfiguration.ts +25 -23
  46. package/src/crawlers/custom/gradeReadability.ts +1 -1
  47. package/src/crawlers/custom/utils.ts +99 -43
  48. package/src/generateHtmlReport.ts +21 -11
  49. package/src/mergeAxeResults/itemReferences.ts +70 -26
  50. package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
  51. package/src/mergeAxeResults.ts +21 -11
  52. package/src/npmIndex.ts +17 -12
  53. package/src/screenshotFunc/htmlScreenshotFunc.ts +81 -1
  54. package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  55. package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +45 -6
  56. package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +8 -5
  57. package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  58. package/src/static/ejs/partials/scripts/ruleModal/utilities.ejs +2 -1
  59. package/src/static/ejs/summary.ejs +18 -12
  60. package/src/utils.ts +4 -3
  61. package/testStaticJSScanner.html +1 -1
@@ -0,0 +1,49 @@
1
+ name: Build and Push Docker Image to GHCR
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ inputs:
6
+ release_tag:
7
+ description: 'Release tag for the image (e.g. 0.10.87)'
8
+ required: true
9
+ type: string
10
+
11
+ permissions:
12
+ contents: read
13
+ packages: write
14
+
15
+ jobs:
16
+ build-and-push:
17
+ runs-on: ubuntu-latest
18
+ steps:
19
+ - name: Checkout code
20
+ uses: actions/checkout@v4
21
+
22
+ - name: Set up QEMU
23
+ uses: docker/setup-qemu-action@v3
24
+
25
+ - name: Set up Docker Buildx
26
+ uses: docker/setup-buildx-action@v3
27
+
28
+ - name: Log in to GitHub Container Registry
29
+ uses: docker/login-action@v3
30
+ with:
31
+ registry: ghcr.io
32
+ username: ${{ github.actor }}
33
+ password: ${{ secrets.GITHUB_TOKEN }}
34
+
35
+ - name: Lowercase repository name
36
+ id: repo
37
+ run: echo "name=${GITHUB_REPOSITORY,,}" >> "$GITHUB_OUTPUT"
38
+
39
+ - name: Build and push multi-arch image
40
+ uses: docker/build-push-action@v6
41
+ with:
42
+ context: .
43
+ platforms: linux/amd64,linux/arm64
44
+ push: true
45
+ tags: |
46
+ ghcr.io/${{ steps.repo.outputs.name }}:${{ inputs.release_tag }}
47
+ ghcr.io/${{ steps.repo.outputs.name }}:latest
48
+ cache-from: type=gha
49
+ cache-to: type=gha,mode=max
@@ -146,18 +146,17 @@ jobs:
146
146
  chmod -R u+w "$GITHUB_WORKSPACE/oobee"
147
147
 
148
148
  # Sign all Mach-O (exec bits OR dylib OR node native addons)
149
- # Search $GITHUB_WORKSPACE (not just oobee/) to cover scripts copied to the parent dir
150
149
  while IFS= read -r f; do
151
150
  echo "Signing $f"
152
151
  codesign --force --options runtime --timestamp --sign "${CERTIFICATE_NAME}" "$f"
153
152
  done < <(
154
- find "$GITHUB_WORKSPACE" -type f \
153
+ find "$GITHUB_WORKSPACE/oobee" -type f \
155
154
  \( -perm -111 -o -name "*.dylib" -o -name "*.node" \) \
156
155
  ! -path "*/.git/*"
157
156
  )
158
157
 
159
158
  echo "Verifying signatures of Mach-O files..."
160
- find "$GITHUB_WORKSPACE" -type f \( -perm -111 -o -name "*.dylib" -o -name "*.node" \) \
159
+ find "$GITHUB_WORKSPACE/oobee" -type f \( -perm -111 -o -name "*.dylib" -o -name "*.node" \) \
161
160
  -exec codesign --verify --strict --verbose=2 {} \; || true
162
161
 
163
162
  - name: Cleanup keychain
@@ -0,0 +1,178 @@
1
+ # Enriched Details Output Examples
2
+
3
+ These are real outputs captured from `scanPage` against intentionally non-compliant test pages.
4
+ The **Details** panel in the HTML report renders the `message` field shown below.
5
+
6
+ ---
7
+
8
+ ## 1. `color-contrast` (WCAG AA — mustFix)
9
+
10
+ **Element:**
11
+ ```html
12
+ <p style="color: #999999; font-size: 14px;">This light gray text on white background fails AA contrast</p>
13
+ ```
14
+
15
+ **Details message:**
16
+ ```
17
+ Multiple text elements in this component fail WCAG 1.4.3 Color Contrast Minimum.
18
+ Audit all visible text in the snippet and update every failing foreground color so
19
+ normal text achieves at least 4.5:1 contrast against its actual background, with a
20
+ safety margin above the minimum where possible. Known failing combinations in this
21
+ snippet include foreground #999999 on #ffffff at 14px normal text (current contrast
22
+ 2.84, expected 4.5:1). Fix all failing text colors in the component, not just the
23
+ first reported element. Recommendation: To meet the required contrast ratio, for
24
+ foreground #999999 on background #ffffff (target 4.5:1), adjust foreground to #737373
25
+ (rgb(115, 115, 115)) or background to #2e2e2e (rgb(46, 46, 46)).
26
+ ```
27
+
28
+ **Element:**
29
+ ```html
30
+ <button style="background-color: #55aa99; color: #e8ffe8; font-size: 12px;">Low contrast button</button>
31
+ ```
32
+
33
+ **Details message:**
34
+ ```
35
+ Multiple text elements in this component fail WCAG 1.4.3 Color Contrast Minimum.
36
+ Audit all visible text in the snippet and update every failing foreground color so
37
+ normal text achieves at least 4.5:1 contrast against its actual background, with a
38
+ safety margin above the minimum where possible. Known failing combinations in this
39
+ snippet include foreground #e8ffe8 on #55aa99 at 12px normal text (current contrast
40
+ 2.61, expected 4.5:1). Fix all failing text colors in the component, not just the
41
+ first reported element. Recommendation: To meet the required contrast ratio, for
42
+ foreground #e8ffe8 on background #55aa99 (target 4.5:1), adjust foreground to #003a00
43
+ (rgb(0, 58, 0)) or background to #3d7a6e (rgb(61, 122, 110)).
44
+ ```
45
+
46
+ ---
47
+
48
+ ## 2. `color-contrast-enhanced` (WCAG AAA — goodToFix)
49
+
50
+ **Element:**
51
+ ```html
52
+ <p style="color: #757575; font-size: 14px;">This text passes AA but fails AAA needs 7 to 1</p>
53
+ ```
54
+
55
+ **Details message:**
56
+ ```
57
+ Multiple text elements in this component fail WCAG 1.4.3 Color Contrast Minimum.
58
+ Audit all visible text in the snippet and update every failing foreground color so
59
+ normal text achieves at least 7:1 contrast against its actual background, with a
60
+ safety margin above the minimum where possible. Known failing combinations in this
61
+ snippet include foreground #757575 on #ffffff at 14px normal text (current contrast
62
+ 4.6, expected 7:1). Fix all failing text colors in the component, not just the first
63
+ reported element. Recommendation: To meet the required contrast ratio, for foreground
64
+ #757575 on background #ffffff (target 7:1), adjust foreground to #555555
65
+ (rgb(85, 85, 85)).
66
+ ```
67
+
68
+ **Element:**
69
+ ```html
70
+ <p style="color: #6b6b6b; font-size: 12px;">Small text needs 7 to 1 for AAA</p>
71
+ ```
72
+
73
+ **Details message:**
74
+ ```
75
+ Multiple text elements in this component fail WCAG 1.4.3 Color Contrast Minimum.
76
+ Audit all visible text in the snippet and update every failing foreground color so
77
+ normal text achieves at least 7:1 contrast against its actual background, with a
78
+ safety margin above the minimum where possible. Known failing combinations in this
79
+ snippet include foreground #6b6b6b on #ffffff at 12px normal text (current contrast
80
+ 5.32, expected 7:1). Fix all failing text colors in the component, not just the first
81
+ reported element. Recommendation: To meet the required contrast ratio, for foreground
82
+ #6b6b6b on background #ffffff (target 7:1), adjust foreground to #555555
83
+ (rgb(85, 85, 85)).
84
+ ```
85
+
86
+ ---
87
+
88
+ ## 3. `target-size` (WCAG 2.5.8 — mustFix)
89
+
90
+ ### Example A: `content-box` elements (no Tip appended)
91
+
92
+ **Element:**
93
+ ```html
94
+ <a href="/a" class="icon-link" style="width: 16px; height: 16px;">A</a>
95
+ ```
96
+
97
+ **Details message:**
98
+ ```
99
+ Fix any of the following:
100
+ Target has insufficient size (16px by 16px, should be at least 24px by 24px)
101
+ Target has insufficient space to its closest neighbors. Safe clickable space has a
102
+ diameter of 17px instead of at least 24px.
103
+ Computed hit area: 16px × 16px (box-sizing: content-box).
104
+ ```
105
+
106
+ ### Example B: `border-box` element with explicit inline width/height (Tip appended)
107
+
108
+ **Element:**
109
+ ```html
110
+ <button style="width: 20px; height: 20px; padding: 0; box-sizing: border-box;">X</button>
111
+ ```
112
+
113
+ **Details message:**
114
+ ```
115
+ Fix any of the following:
116
+ Target has insufficient size (20px by 20px, should be at least 24px by 24px)
117
+ Target has insufficient space to its closest neighbors. Safe clickable space has a
118
+ diameter of 21px instead of at least 24px.
119
+ Computed hit area: 20px × 20px (box-sizing: border-box).
120
+ Tip: inline style sets width: 20px, height: 20px with box-sizing: border-box —
121
+ padding is included within those dimensions and will not increase the hit area. Fix:
122
+ remove the explicit width/height and use min-width: 24px; min-height: 24px instead,
123
+ or place the visual content in a child <span> element.
124
+ ```
125
+
126
+ ---
127
+
128
+ ## 4. `valid-lang` (mustFix)
129
+
130
+ **Element:**
131
+ ```html
132
+ <div lang="x-klingon">This section also has an invalid private-use lang tag with some sample text content for context.</div>
133
+ ```
134
+
135
+ **Details message:**
136
+ ```
137
+ Fix all of the following:
138
+ Value of lang attribute not included in the list of valid languages
139
+ Note: "x-klingon" uses a private-use "x-" prefix. axe-core's valid-lang rule also
140
+ rejects private-use subtags — you must use a registered IANA language code.
141
+ Original text: "This section also has an invalid private-use lang tag with some sample
142
+ text content for context.". Identify the actual language of this text and use its
143
+ registered BCP 47 code (e.g., lang="it" Italian, "es" Spanish, "fr" French,
144
+ "de" German, "zh" Chinese, "ja" Japanese, "ko" Korean, "pt" Portuguese, "ar" Arabic).
145
+ ```
146
+
147
+ ---
148
+
149
+ ## 5. `oobee-grading-text-contents` (WCAG AAA — needsReview)
150
+
151
+ **Element:**
152
+ ```html
153
+ <html lang="en">...</html>
154
+ ```
155
+
156
+ **Details message:**
157
+ ```
158
+ The text content is potentially difficult to read, with a Flesch-Kincaid Reading Ease
159
+ score of 33.92. Difficult — college level or above.
160
+ ```
161
+
162
+ ### Score interpretation table (appended inline after the numeric score)
163
+
164
+ Only scores in the range 1–50 trigger a violation (scores > 50 pass, scores ≤ 0 are filtered out).
165
+
166
+ | Score Range | Interpretation appended to message |
167
+ |---|---|
168
+ | 31–50 | Difficult — college level or above. |
169
+ | 1–30 | Very difficult — best understood by university graduates. |
170
+
171
+ ---
172
+
173
+ ## Notes
174
+
175
+ - **color-contrast** and **color-contrast-enhanced** messages include computed color recommendations using WCAG relative luminance math with a binary search on HSL lightness.
176
+ - **target-size** appends `Computed hit area` and, when `box-sizing: border-box` with explicit inline dimensions is detected, a `Tip` explaining why padding won't help.
177
+ - **valid-lang** appends a `Note` when the lang value uses a private-use `x-*` prefix, plus the element's text content (up to 120 chars) to help identify the correct language code.
178
+ - **oobee-grading-text-contents** now appends a plain-language interpretation of the Flesch-Kincaid score immediately after the numeric value.
package/Dockerfile CHANGED
@@ -2,13 +2,12 @@
2
2
  # Node version is v22
3
3
  FROM mcr.microsoft.com/playwright:v1.58.2-noble
4
4
 
5
- # Installation of packages for oobee and runner (locked versions from build log)
6
- RUN apt-get update && apt-get install -y \
7
- git=1:2.43.0-1ubuntu7.3 \
8
- git-man=1:2.43.0-1ubuntu7.3 \
9
- unzip=6.0-28ubuntu4.1 \
10
- zip=3.0-13ubuntu0.2 \
11
- && rm -rf /var/lib/apt/lists/*
5
+ # Installation of packages for oobee
6
+ RUN apt-get update && apt-get install -y --no-install-recommends \
7
+ git \
8
+ unzip \
9
+ zip && \
10
+ rm -rf /var/lib/apt/lists/*
12
11
 
13
12
  WORKDIR /app/oobee
14
13
 
package/dist/cli.js CHANGED
@@ -147,8 +147,11 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
147
147
  })
148
148
  .check(argvs => {
149
149
  const scanner = String(argvs.scanner ?? '');
150
- if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM) {
151
- throw new Error('-s or --strategy is only available in website and custom flow scans.');
150
+ if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM && scanner !== ScannerTypes.INTELLIGENT && scanner !== ScannerTypes.SITEMAP) {
151
+ throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
152
+ }
153
+ if (argvs.strategy === 'ignore' && scanner !== ScannerTypes.SITEMAP) {
154
+ throw new Error('-s ignore is only available for sitemap scans.');
152
155
  }
153
156
  return true;
154
157
  })
@@ -161,13 +164,19 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
161
164
  return duration;
162
165
  })
163
166
  .check(argvs => {
164
- if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.strategy) {
165
- throw new Error('-s or --strategy is only available in website scans.');
167
+ if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.scanner !== ScannerTypes.CUSTOM && argvs.scanner !== ScannerTypes.INTELLIGENT && argvs.scanner !== ScannerTypes.SITEMAP && argvs.strategy) {
168
+ throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
169
+ }
170
+ if (argvs.strategy === 'ignore' && argvs.scanner !== ScannerTypes.SITEMAP) {
171
+ throw new Error('-s ignore is only available for sitemap scans.');
166
172
  }
167
173
  return true;
168
174
  })
169
175
  .conflicts('d', 'w')
170
176
  .parse();
177
+ if (!options.strategy) {
178
+ options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
179
+ }
171
180
  const scanInit = async (argvs) => {
172
181
  const updatedArgvs = { ...argvs };
173
182
  // Cannot use data.browser and data.isHeadless as the connectivity check comes first before prepareData
@@ -187,7 +196,11 @@ const scanInit = async (argvs) => {
187
196
  if (res.httpStatus)
188
197
  consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
189
198
  if (res.status === statuses.success.code) {
190
- data.url = res.url;
199
+ // Custom flow should continue from the user-provided entry URL so auth redirects
200
+ // do not replace the original domain used for overlay gating and navigation.
201
+ if (data.type !== ScannerTypes.CUSTOM) {
202
+ data.url = res.url;
203
+ }
191
204
  if (process.env.OOBEE_VALIDATE_URL) {
192
205
  consoleLogger.info('Url is valid');
193
206
  cleanUpAndExit(0, data.randomToken);
package/dist/combine.js CHANGED
@@ -95,6 +95,8 @@ const combineRun = async (details, deviceToScan) => {
95
95
  blacklistedPatterns,
96
96
  includeScreenshots,
97
97
  extraHTTPHeaders,
98
+ strategy,
99
+ userUrl: url,
98
100
  scanDuration,
99
101
  });
100
102
  urlsCrawledObj = sitemapResult.urlsCrawled;
@@ -115,6 +117,7 @@ const combineRun = async (details, deviceToScan) => {
115
117
  includeScreenshots,
116
118
  extraHTTPHeaders,
117
119
  scanDuration,
120
+ ruleset,
118
121
  });
119
122
  if (localFileResult) {
120
123
  if ('urlsCrawled' in localFileResult) {
@@ -147,8 +147,8 @@ export const cliOptions = {
147
147
  },
148
148
  s: {
149
149
  alias: 'strategy',
150
- describe: 'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".',
151
- choices: ['same-domain', 'same-hostname'],
150
+ describe: 'Crawls up to general (same parent) domains, or only specific hostname. Use "ignore" to disable URL filtering (default for sitemap scans). Defaults to "same-domain".',
151
+ choices: ['same-domain', 'same-hostname', 'ignore'],
152
152
  requiresArg: true,
153
153
  demandOption: false,
154
154
  },
@@ -26,7 +26,7 @@ formDataFields,
26
26
  ScannerTypes, BrowserTypes, FileTypes, getEnumKey, } from './constants.js';
27
27
  import { consoleLogger } from '../logs.js';
28
28
  import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
29
- import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
29
+ import { cleanUpAndExit, isFollowStrategy, randomThreeDigitNumberString, register } from '../utils.js';
30
30
  import { getProxyInfo, proxyInfoToResolution } from '../proxyService.js';
31
31
  // validateDirPath validates a provided directory path
32
32
  // returns null if no error
@@ -592,7 +592,9 @@ export const prepareData = async (argv) => {
592
592
  viewportWidth,
593
593
  playwrightDeviceDetailsObject,
594
594
  maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
595
- strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname : EnqueueStrategy.SameDomain,
595
+ strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
596
+ : strategy === 'ignore' ? EnqueueStrategy.All
597
+ : EnqueueStrategy.SameDomain,
596
598
  isLocalFileScan,
597
599
  browser: browserToRun,
598
600
  nameEmail,
@@ -637,6 +639,10 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
637
639
  let shouldCapture = false;
638
640
  const disallowedUrls = [];
639
641
  const allowedUrls = [];
642
+ // Returns 1–2 minimatch glob patterns for a single robots.txt path pattern.
643
+ // Two patterns are returned for bare paths (no trailing wildcard) so that
644
+ // both the exact URL and all child paths are blocked, matching robots.txt
645
+ // prefix semantics.
640
646
  const sanitisePattern = (pattern) => {
641
647
  const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
642
648
  const subdirWildcardRegex = /\/\*\//g;
@@ -644,18 +650,29 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
644
650
  if (subdirWildcardRegex.test(pattern)) {
645
651
  pattern = pattern.replace(subdirWildcardRegex, '/**/');
646
652
  }
653
+ // Query-string patterns (e.g. /faq?faqItem= or /faq/?faq&faqItem=):
654
+ // '?' is the query separator in robots.txt but a single-char wildcard in
655
+ // minimatch. Escape it to a literal match and append '*' so any query
656
+ // value after the stated prefix is also blocked.
657
+ if (pattern.includes('?')) {
658
+ return [domain + pattern.replace('?', '\\?') + '*'];
659
+ }
647
660
  if (pattern.match(directoryRegex) && !pattern.match(filePathRegex)) {
648
661
  if (pattern.endsWith('*')) {
649
- pattern = pattern.concat('*');
662
+ // e.g. /ebook/* → /ebook/** (already covers all children)
663
+ return [domain + pattern.concat('*')];
650
664
  }
651
665
  else {
652
- if (!pattern.endsWith('/'))
653
- pattern = pattern.concat('/');
654
- pattern = pattern.concat('**');
666
+ // Bare path (e.g. /subscription/unsubscribe): robots.txt blocks the
667
+ // exact URL *and* every descendant. minimatch's '/**' glob does not
668
+ // match the bare path itself (no trailing slash), so we emit both the
669
+ // exact-path pattern and a children glob.
670
+ const base = domain + pattern;
671
+ const children = domain + (pattern.endsWith('/') ? pattern : pattern + '/') + '**';
672
+ return [base, children];
655
673
  }
656
674
  }
657
- const final = domain.concat(pattern);
658
- return final;
675
+ return [domain + pattern];
659
676
  };
660
677
  for (const line of lines) {
661
678
  if (line.toLowerCase().startsWith('user-agent: *')) {
@@ -667,15 +684,13 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
667
684
  else if (shouldCapture && line.toLowerCase().startsWith('disallow:')) {
668
685
  let disallowed = line.substring('disallow: '.length).trim();
669
686
  if (disallowed) {
670
- disallowed = sanitisePattern(disallowed);
671
- disallowedUrls.push(disallowed);
687
+ disallowedUrls.push(...sanitisePattern(disallowed));
672
688
  }
673
689
  }
674
690
  else if (shouldCapture && line.toLowerCase().startsWith('allow:')) {
675
691
  let allowed = line.substring('allow: '.length).trim();
676
692
  if (allowed) {
677
- allowed = sanitisePattern(allowed);
678
- allowedUrls.push(allowed);
693
+ allowedUrls.push(...sanitisePattern(allowed));
679
694
  }
680
695
  }
681
696
  }
@@ -726,6 +741,31 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
726
741
  }
727
742
  }
728
743
  };
744
+ export const getSitemapsFromRobotsTxt = async (url, browser, userDataDirectory, extraHTTPHeaders) => {
745
+ const domain = new URL(url).origin;
746
+ const robotsUrl = domain.concat('/robots.txt');
747
+ let robotsTxt;
748
+ try {
749
+ robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browser, userDataDirectory, extraHTTPHeaders);
750
+ }
751
+ catch (e) {
752
+ consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl} for sitemap discovery`);
753
+ return [];
754
+ }
755
+ if (!robotsTxt)
756
+ return [];
757
+ const sitemaps = [];
758
+ const lines = robotsTxt.split(/\r?\n/);
759
+ for (const line of lines) {
760
+ if (line.toLowerCase().startsWith('sitemap:')) {
761
+ const sitemapUrl = line.substring('sitemap:'.length).trim();
762
+ if (sitemapUrl) {
763
+ sitemaps.push(sitemapUrl);
764
+ }
765
+ }
766
+ }
767
+ return sitemaps;
768
+ };
729
769
  export const isDisallowedInRobotsTxt = (url) => {
730
770
  if (!constants.robotsTxtUrls)
731
771
  return;
@@ -744,7 +784,7 @@ export const isDisallowedInRobotsTxt = (url) => {
744
784
  }
745
785
  return false;
746
786
  };
747
- export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders) => {
787
+ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = userUrlInput) => {
748
788
  const scannedSitemaps = new Set();
749
789
  const urls = {}; // dictionary of requests to urls to be scanned
750
790
  const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
@@ -753,6 +793,8 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
753
793
  return;
754
794
  if (isDisallowedInRobotsTxt(url))
755
795
  return;
796
+ if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy))
797
+ return;
756
798
  url = convertPathToLocalFile(url);
757
799
  let request;
758
800
  try {