@govtechsg/oobee 0.10.86 → 0.10.88

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/.github/workflows/docker-push-ghcr.yml +49 -0
  2. package/.github/workflows/image.yml +2 -3
  3. package/DETAILS_OUTPUT_EXAMPLES.md +178 -0
  4. package/Dockerfile +6 -7
  5. package/dist/cli.js +18 -5
  6. package/dist/combine.js +3 -0
  7. package/dist/constants/cliFunctions.js +2 -2
  8. package/dist/constants/common.js +55 -13
  9. package/dist/crawlers/commonCrawlerFunc.js +523 -2
  10. package/dist/crawlers/crawlDomain.js +38 -13
  11. package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
  12. package/dist/crawlers/crawlLocalFile.js +2 -2
  13. package/dist/crawlers/crawlSitemap.js +44 -5
  14. package/dist/crawlers/custom/extractAndGradeText.js +1 -1
  15. package/dist/crawlers/custom/getAxeConfiguration.js +26 -21
  16. package/dist/crawlers/custom/gradeReadability.js +1 -1
  17. package/dist/crawlers/custom/utils.js +81 -40
  18. package/dist/generateHtmlReport.js +18 -11
  19. package/dist/mergeAxeResults/itemReferences.js +60 -25
  20. package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
  21. package/dist/mergeAxeResults.js +18 -9
  22. package/dist/npmIndex.js +16 -12
  23. package/dist/screenshotFunc/htmlScreenshotFunc.js +67 -0
  24. package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  25. package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +45 -6
  26. package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +8 -5
  27. package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  28. package/dist/static/ejs/partials/scripts/ruleModal/utilities.ejs +2 -1
  29. package/dist/static/ejs/summary.ejs +18 -12
  30. package/dist/utils.js +4 -3
  31. package/examples/oobee-test-details-runner.js +214 -0
  32. package/examples/test-violations.html +42 -0
  33. package/fix-summary-html-oom-pr.md +62 -0
  34. package/package.json +5 -5
  35. package/src/cli.ts +19 -5
  36. package/src/combine.ts +3 -0
  37. package/src/constants/cliFunctions.ts +2 -2
  38. package/src/constants/common.ts +65 -12
  39. package/src/crawlers/commonCrawlerFunc.ts +625 -2
  40. package/src/crawlers/crawlDomain.ts +39 -13
  41. package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
  42. package/src/crawlers/crawlLocalFile.ts +4 -1
  43. package/src/crawlers/crawlSitemap.ts +50 -3
  44. package/src/crawlers/custom/extractAndGradeText.ts +1 -1
  45. package/src/crawlers/custom/getAxeConfiguration.ts +25 -23
  46. package/src/crawlers/custom/gradeReadability.ts +1 -1
  47. package/src/crawlers/custom/utils.ts +99 -43
  48. package/src/generateHtmlReport.ts +21 -11
  49. package/src/mergeAxeResults/itemReferences.ts +70 -26
  50. package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
  51. package/src/mergeAxeResults.ts +21 -11
  52. package/src/npmIndex.ts +17 -12
  53. package/src/screenshotFunc/htmlScreenshotFunc.ts +81 -1
  54. package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
  55. package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +45 -6
  56. package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +8 -5
  57. package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
  58. package/src/static/ejs/partials/scripts/ruleModal/utilities.ejs +2 -1
  59. package/src/static/ejs/summary.ejs +18 -12
  60. package/src/utils.ts +4 -3
  61. package/testStaticJSScanner.html +1 -1
@@ -0,0 +1,214 @@
1
+ /**
2
+ * Details Output Demo
3
+ *
4
+ * Runs scanPage against intentionally non-compliant test pages to capture the
5
+ * enriched Details messages for: color-contrast, color-contrast-enhanced,
6
+ * target-size, valid-lang, and oobee-grading-text-contents.
7
+ *
8
+ * Usage: node examples/oobee-test-details-runner.js (requires the built dist/ output)
9
+ */
10
+ import { chromium } from 'playwright';
11
+ import { scanPage } from '../dist/npmIndex.js';
12
+ import { gradeReadability } from '../dist/crawlers/custom/gradeReadability.js';
13
+
14
+ // --- Test HTML pages ---
15
+
16
+ const colorContrastHTML = `
17
+ <!DOCTYPE html>
18
+ <html lang="en">
19
+ <head><title>Color Contrast Test</title></head>
20
+ <body style="background-color: #ffffff;">
21
+ <h1>Color Contrast Violations</h1>
22
+ <p style="color: #999999; font-size: 14px;">This light gray text on white background fails AA contrast</p>
23
+ <p style="color: #aaaaaa; font-size: 14px; background-color: #f0f0f0;">Very light gray on light gray</p>
24
+ <button style="background-color: #55aa99; color: #e8ffe8; font-size: 12px;">Low contrast button</button>
25
+ </body>
26
+ </html>
27
+ `;
28
+
29
+ const colorContrastEnhancedHTML = `
30
+ <!DOCTYPE html>
31
+ <html lang="en">
32
+ <head><title>Color Contrast Enhanced Test</title></head>
33
+ <body style="background-color: #ffffff;">
34
+ <h1>Color Contrast Enhanced AAA Violations</h1>
35
+ <p style="color: #757575; font-size: 14px;">This text passes AA but fails AAA needs 7 to 1</p>
36
+ <p style="color: #6b6b6b; font-size: 12px;">Small text needs 7 to 1 for AAA</p>
37
+ </body>
38
+ </html>
39
+ `;
40
+
41
+ const targetSizeHTML = `
42
+ <!DOCTYPE html>
43
+ <html lang="en">
44
+ <head><title>Target Size Test</title>
45
+ <style>
46
+ body { font-family: sans-serif; padding: 40px; }
47
+ .icon-link {
48
+ display: inline-block;
49
+ width: 16px;
50
+ height: 16px;
51
+ font-size: 10px;
52
+ line-height: 16px;
53
+ text-align: center;
54
+ text-decoration: none;
55
+ color: #333;
56
+ overflow: hidden;
57
+ }
58
+ </style>
59
+ </head>
60
+ <body>
61
+ <main>
62
+ <h1>Icon-sized interactive targets</h1>
63
+ <a href="/a" class="icon-link" style="width: 16px; height: 16px;">A</a>
64
+ <a href="/b" class="icon-link" style="width: 16px; height: 16px;">B</a>
65
+ <a href="/c" class="icon-link" style="width: 16px; height: 16px;">C</a>
66
+ </main>
67
+ </body>
68
+ </html>
69
+ `;
70
+
71
+ const validLangHTML = `
72
+ <!DOCTYPE html>
73
+ <html lang="x-sindarin">
74
+ <head><title>Valid Lang Test</title></head>
75
+ <body>
76
+ <main>
77
+ <h1>Valid Lang Violation</h1>
78
+ <p>This page uses a private-use language subtag that is not valid according to BCP 47.</p>
79
+ <div lang="x-klingon">This section also has an invalid private-use lang tag with some sample text content for context.</div>
80
+ </main>
81
+ </body>
82
+ </html>
83
+ `;
84
+
85
+ const readabilityHTML = `
86
+ <!DOCTYPE html>
87
+ <html lang="en">
88
+ <head><title>Readability Test</title></head>
89
+ <body>
90
+ <main>
91
+ <h1>Building Safety Standards</h1>
92
+ <p>The committee reviewed the proposed changes to the building safety standards last Thursday. Members noted that the current regulations do not address modern construction materials adequately. Several technical amendments were suggested to improve clarity for contractors and inspectors. The revised standards will require additional testing for fire resistance in commercial properties. Public consultation on these proposed changes will remain open until the end of next quarter. Building owners should review the draft guidelines to understand potential compliance requirements.</p>
93
+ </main>
94
+ </body>
95
+ </html>
96
+ `;
97
+
98
+ // --- Helpers ---
99
+
100
+ function extractMessages(result, ruleId) {
101
+ const messages = [];
102
+ for (const category of ['mustFix', 'goodToFix', 'needsReview']) {
103
+ const rules = result?.[category]?.rules;
104
+ if (rules && rules[ruleId]) {
105
+ const rule = rules[ruleId];
106
+ messages.push({
107
+ category,
108
+ rule: rule.rule || ruleId,
109
+ description: rule.description,
110
+ totalItems: rule.totalItems,
111
+ items: rule.items?.map(item => ({
112
+ html: item.html || item.element,
113
+ message: item.message,
114
+ })),
115
+ });
116
+ }
117
+ }
118
+ return messages;
119
+ }
120
+
121
+ // --- Main ---
122
+
123
+ (async () => {
124
+ console.log("Launching browser...");
125
+ const browser = await chromium.launch({ headless: true });
126
+ const output = {};
127
+
128
+ // 1. Color Contrast (AA)
129
+ console.log("Scanning: color-contrast...");
130
+ try {
131
+ const page = await browser.newPage();
132
+ await page.setContent(colorContrastHTML);
133
+ const result = await scanPage(page, {
134
+ name: "Test", email: "test@test.com", pageTitle: "Color Contrast Test",
135
+ });
136
+ output['color-contrast'] = extractMessages(result, 'color-contrast');
137
+ await page.close();
138
+ } catch (e) { console.error("color-contrast error:", e.message); }
139
+
140
+ // 2. Color Contrast Enhanced (AAA)
141
+ console.log("Scanning: color-contrast-enhanced...");
142
+ try {
143
+ const page = await browser.newPage();
144
+ await page.setContent(colorContrastEnhancedHTML);
145
+ const result = await scanPage(page, {
146
+ name: "Test", email: "test@test.com", pageTitle: "Color Contrast Enhanced Test",
147
+ ruleset: ['default', 'enable-wcag-aaa'],
148
+ });
149
+ output['color-contrast-enhanced'] = extractMessages(result, 'color-contrast-enhanced');
150
+ await page.close();
151
+ } catch (e) { console.error("color-contrast-enhanced error:", e.message); }
152
+
153
+ // 3. Target Size
154
+ console.log("Scanning: target-size...");
155
+ try {
156
+ const page = await browser.newPage();
157
+ await page.setContent(targetSizeHTML);
158
+ const result = await scanPage(page, {
159
+ name: "Test", email: "test@test.com", pageTitle: "Target Size Test",
160
+ });
161
+ output['target-size'] = extractMessages(result, 'target-size');
162
+ await page.close();
163
+ } catch (e) { console.error("target-size error:", e.message); }
164
+
165
+ // 4. Valid Lang
166
+ console.log("Scanning: valid-lang...");
167
+ try {
168
+ const page = await browser.newPage();
169
+ await page.setContent(validLangHTML);
170
+ const result = await scanPage(page, {
171
+ name: "Test", email: "test@test.com", pageTitle: "Valid Lang Test",
172
+ });
173
+ output['valid-lang'] = extractMessages(result, 'valid-lang');
174
+ await page.close();
175
+ } catch (e) { console.error("valid-lang error:", e.message); }
176
+
177
+ // 5. Readability (oobee-grading-text-contents)
178
+ console.log("Scanning: oobee-grading-text-contents...");
179
+ try {
180
+ const page = await browser.newPage();
181
+ await page.setContent(readabilityHTML);
182
+
183
+ // Simulate what the crawler does: extract text, grade readability
184
+ const textContent = readabilityHTML.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
185
+ const sentences = textContent.split(/(?<=[.!?])\s+/).filter(s => s.trim().length > 0);
186
+ const flag = gradeReadability(sentences);
187
+ console.log(` Readability flag: "${flag}"`);
188
+
189
+ if (flag) {
190
+ const score = parseFloat(flag);
191
+ let interpretation = '';
192
+ if (score > 30) interpretation = 'It is targeted for junior college (JC) level comprehension and above.';
193
+ else interpretation = 'It is targeted for university graduate level comprehension and above.';
194
+
195
+ output['oobee-grading-text-contents'] = [{
196
+ category: 'needsReview',
197
+ rule: 'oobee-grading-text-contents',
198
+ description: 'Page content must use clear, plain language',
199
+ items: [{
200
+ html: '<html lang="en">...</html>',
201
+ message: `Text content is potentially difficult to read. It scored ${flag} out of 50 on the Flesch-Kincaid Readability Test. ${interpretation}`,
202
+ }],
203
+ }];
204
+ } else {
205
+ console.log(" Score filtered out (<=0 or >50). No violation triggered.");
206
+ }
207
+ await page.close();
208
+ } catch (e) { console.error("readability error:", e.message); }
209
+
210
+ await browser.close();
211
+
212
+ // Print results
213
+ console.log("\n" + JSON.stringify(output, null, 2));
214
+ })();
@@ -0,0 +1,42 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <title>Combined Accessibility Test</title>
5
+ <style>
6
+ body { font-family: sans-serif; padding: 40px; }
7
+ .icon-link {
8
+ display: inline-block;
9
+ width: 16px;
10
+ height: 16px;
11
+ font-size: 10px;
12
+ line-height: 16px;
13
+ text-align: center;
14
+ text-decoration: none;
15
+ color: #333;
16
+ overflow: hidden;
17
+ }
18
+ </style>
19
+ </head>
20
+ <body>
21
+ <main>
22
+ <h1>Accessibility Violations Test Page</h1>
23
+
24
+ <h2>Color Contrast</h2>
25
+ <p style="color: #999999; font-size: 14px;">This light gray text on white background fails AA contrast</p>
26
+ <p style="color: #aaaaaa; font-size: 14px; background-color: #f0f0f0;">Very light gray on light gray</p>
27
+ <button style="background-color: #55aa99; color: #e8ffe8; font-size: 12px;">Low contrast button</button>
28
+
29
+ <h2>Target Size</h2>
30
+ <a href="/a" class="icon-link" style="width: 16px; height: 16px;">A</a>
31
+ <a href="/b" class="icon-link" style="width: 16px; height: 16px;">B</a>
32
+ <a href="/c" class="icon-link" style="width: 16px; height: 16px;">C</a>
33
+
34
+ <h2>Valid Lang</h2>
35
+ <div lang="x-klingon">This section has an invalid private-use lang tag with some sample text content for context.</div>
36
+
37
+ <h2>Readability</h2>
38
+ <p>The committee reviewed the proposed changes to the building safety standards last Thursday. Members noted that the current regulations do not address modern construction materials adequately. Several technical amendments were suggested to improve clarity for contractors and inspectors. The revised standards will require additional testing for fire resistance in commercial properties. Public consultation on these proposed changes will remain open until the end of next quarter. Building owners should review the draft guidelines to understand potential compliance requirements.</p>
39
+
40
+ </main>
41
+ </body>
42
+ </html>
@@ -0,0 +1,62 @@
1
+ # fix: prevent OOM and browser crash in report generation for large scans
2
+
3
+ ## Summary
4
+
5
+ - Fix `summary.ejs` inlining the entire scan items payload (2 GB+ for 1000-page scans) via `JSON.stringify`, causing V8 OOM and killing the process
6
+ - Fix `report.html` embedded scanItems exceeding browser memory limits (746 MB uncompressed JSON for 1000-page scans)
7
+ - Fix write stream backpressure handling when embedding chunked base64 data
8
+ - `writeSummaryHTML` crash also blocked `report.html` generation since it runs first
9
+
10
+ ## Problem 1: OOM in summary.html generation (server-side)
11
+
12
+ For large scans (e.g. 1000 pages, 2.5M+ passed occurrences), `summary.ejs` serialized the full `items` object — including every rule's `pagesAffected` array with all individual issue items — into an inline `<script>` tag. This produced a string exceeding V8's limits, crashing the process silently.
13
+
14
+ The result: neither `summary.html` nor `report.html` was generated, even though all JSON artifacts (`scanData.json`, `scanItems.json`, etc.) were written successfully.
15
+
16
+ ## Problem 2: Browser cannot parse embedded scanItems (client-side)
17
+
18
+ Even with report generation fixed, the browser failed to load the All Issues view:
19
+ ```
20
+ Failed to decode/unzip/parse: Unexpected end of JSON input
21
+ ```
22
+
23
+ Root cause: `convertItemsToReferences` stripped per-page `items` arrays but still embedded the full `pagesAffected` array (url, pageTitle, actualUrl, metadata, etc. for every page × every rule). For 1000-page scans this produced **746 MB of uncompressed JSON** after base64-decode and gunzip — exceeding browser string/memory limits during `JSON.parse()`.
24
+
25
+ ## Problem 3: Write stream backpressure (server-side)
26
+
27
+ The `writeHTML` function writes scan items as 2 MB base64 chunks via a `for await` loop over a read stream. The return value of `outputStream.write()` was not being checked for backpressure — when the write buffer filled up (`write()` returned `false`), the loop kept pushing chunks without waiting for `drain`, so the pending data piled up in memory and the embedded base64 could end up truncated.
28
+
29
+ ## Fix
30
+
31
+ ### summary.ejs (OOM fix)
32
+ Strip the inline JSON to only what `summaryTable.ejs` actually needs:
33
+ - Rule-level metadata: `description`, `helpUrl`, `conformance`, `totalItems`
34
+ - `pagesAffected: { length: N }` (just the count object, not the full array)
35
+
36
+ This reduces the serialized payload from potentially gigabytes to a few kilobytes regardless of scan size.
37
+
38
+ ### itemReferences.ts (browser payload fix)
39
+ `convertItemsToReferences` now strips each `pagesAffected` entry down to only `url`, `pageTitle`, and `itemsCount` — removing all per-item details (html snippets, screenshots, xpath, metadata, etc.) that constituted the bulk of the data. The All Issues list renders rule totals, and the "Group By Page" view in the rule modal still shows page URLs with occurrence counts.
40
+
41
+ This reduces the embedded payload from 746 MB (uncompressed) to ~11 MB for a 1000-page scan — well within browser memory limits.
42
+
43
+ ### mergeAxeResults.ts (backpressure fix)
44
+ Await the `drain` event on the output stream when `write()` returns `false` before writing the next chunk. This ensures all base64 data is fully written to the report regardless of payload size.
45
+
46
+ ## Files changed
47
+
48
+ | File | Change |
49
+ |------|--------|
50
+ | `src/static/ejs/summary.ejs` | Strip inline JSON to rule counts only |
51
+ | `src/mergeAxeResults/itemReferences.ts` | Strip `pagesAffected` to lightweight entries (url, pageTitle, itemsCount only) |
52
+ | `src/mergeAxeResults.ts` | Await drain on backpressure during chunked write |
53
+ | `src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs` | Fall back to `pagesAffectedCount` |
54
+ | `src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs` | Fall back to `pagesAffectedCount` |
55
+
56
+ ## Test plan
57
+
58
+ - [ ] Run a large scan (500+ pages) and verify both `summary.html` and `report.html` are generated
59
+ - [ ] Open `summary.html` in a browser and verify the summary table renders correctly (issue counts, page counts, help links)
60
+ - [ ] Open `report.html` and verify the All Issues list loads and displays rule counts correctly
61
+ - [ ] Verify the rule modal shows correct "Pages affected" count
62
+ - [ ] Verify small scans still produce correct reports (no regression)
package/package.json CHANGED
@@ -1,19 +1,19 @@
1
1
  {
2
2
  "name": "@govtechsg/oobee",
3
3
  "main": "dist/npmIndex.js",
4
- "version": "0.10.86",
4
+ "version": "0.10.88",
5
5
  "type": "module",
6
6
  "author": "Government Technology Agency <info@tech.gov.sg>",
7
7
  "bin": {
8
8
  "oobee": "./dist/cli.js"
9
9
  },
10
10
  "dependencies": {
11
- "@aws-sdk/client-s3": "^3.893.0",
11
+ "@aws-sdk/client-s3": "^3.1049.0",
12
12
  "@json2csv/node": "^7.0.3",
13
13
  "@napi-rs/canvas": "^0.1.53",
14
14
  "@sentry/node": "^9.13.0",
15
15
  "@types/aws-sdk": "^0.0.42",
16
- "axe-core": "^4.11.1",
16
+ "axe-core": "^4.11.4",
17
17
  "axios": "^1.8.2",
18
18
  "base64-stream": "^1.0.0",
19
19
  "cheerio": "^1.0.0-rc.12",
@@ -39,7 +39,7 @@
39
39
  "tldts": "^7.0.27",
40
40
  "typescript": "^5.4.5",
41
41
  "url": "^0.11.3",
42
- "uuid": "^11.0.3",
42
+ "uuid": "^14.0.0",
43
43
  "validator": "^13.11.0",
44
44
  "which": "^4.0.0",
45
45
  "winston": "^3.11.0",
@@ -86,7 +86,7 @@
86
86
  "fast-xml-parser": ">=5.3.8",
87
87
  "js-yaml": "^4.1.1",
88
88
  "minimatch": "^10.2.4",
89
- "brace-expansion": "^5.0.5",
89
+ "brace-expansion": "^5.0.6",
90
90
  "glob": "^13.0.6",
91
91
  "flatted": "^3.4.1",
92
92
  "file-type": "^21.3.3"
package/src/cli.ts CHANGED
@@ -193,8 +193,11 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
193
193
  .check(argvs => {
194
194
  const scanner = String(argvs.scanner ?? '');
195
195
 
196
- if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM) {
197
- throw new Error('-s or --strategy is only available in website and custom flow scans.');
196
+ if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM && scanner !== ScannerTypes.INTELLIGENT && scanner !== ScannerTypes.SITEMAP) {
197
+ throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
198
+ }
199
+ if (argvs.strategy === 'ignore' && scanner !== ScannerTypes.SITEMAP) {
200
+ throw new Error('-s ignore is only available for sitemap scans.');
198
201
  }
199
202
  return true;
200
203
  })
@@ -210,14 +213,21 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
210
213
  return duration;
211
214
  })
212
215
  .check(argvs => {
213
- if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.strategy) {
214
- throw new Error('-s or --strategy is only available in website scans.');
216
+ if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.scanner !== ScannerTypes.CUSTOM && argvs.scanner !== ScannerTypes.INTELLIGENT && argvs.scanner !== ScannerTypes.SITEMAP && argvs.strategy) {
217
+ throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
218
+ }
219
+ if (argvs.strategy === 'ignore' && argvs.scanner !== ScannerTypes.SITEMAP) {
220
+ throw new Error('-s ignore is only available for sitemap scans.');
215
221
  }
216
222
  return true;
217
223
  })
218
224
  .conflicts('d', 'w')
219
225
  .parse() as unknown as Answers;
220
226
 
227
+ if (!options.strategy) {
228
+ options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
229
+ }
230
+
221
231
  const scanInit = async (argvs: Answers): Promise<string> => {
222
232
  const updatedArgvs = { ...argvs };
223
233
 
@@ -250,7 +260,11 @@ const scanInit = async (argvs: Answers): Promise<string> => {
250
260
  consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
251
261
 
252
262
  if (res.status === statuses.success.code) {
253
- data.url = res.url;
263
+ // Custom flow should continue from the user-provided entry URL so auth redirects
264
+ // do not replace the original domain used for overlay gating and navigation.
265
+ if (data.type !== ScannerTypes.CUSTOM) {
266
+ data.url = res.url;
267
+ }
254
268
  if (process.env.OOBEE_VALIDATE_URL) {
255
269
  consoleLogger.info('Url is valid');
256
270
  cleanUpAndExit(0, data.randomToken);
package/src/combine.ts CHANGED
@@ -161,6 +161,8 @@ const combineRun = async (details: Data, deviceToScan: string) => {
161
161
  blacklistedPatterns,
162
162
  includeScreenshots,
163
163
  extraHTTPHeaders,
164
+ strategy,
165
+ userUrl: url,
164
166
  scanDuration,
165
167
  });
166
168
  urlsCrawledObj = sitemapResult.urlsCrawled;
@@ -182,6 +184,7 @@ const combineRun = async (details: Data, deviceToScan: string) => {
182
184
  includeScreenshots,
183
185
  extraHTTPHeaders,
184
186
  scanDuration,
187
+ ruleset,
185
188
  });
186
189
  if (localFileResult) {
187
190
  if ('urlsCrawled' in localFileResult) {
@@ -168,8 +168,8 @@ export const cliOptions: { [key: string]: Options } = {
168
168
  s: {
169
169
  alias: 'strategy',
170
170
  describe:
171
- 'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".',
172
- choices: ['same-domain', 'same-hostname'],
171
+ 'Crawls up to general (same parent) domains, or only specific hostname. Use "ignore" to disable URL filtering (default for sitemap scans). Defaults to "same-domain".',
172
+ choices: ['same-domain', 'same-hostname', 'ignore'],
173
173
  requiresArg: true,
174
174
  demandOption: false,
175
175
  },
@@ -33,7 +33,7 @@ import constants, {
33
33
  } from './constants.js';
34
34
  import { consoleLogger } from '../logs.js';
35
35
  import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
36
- import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
36
+ import { cleanUpAndExit, isFollowStrategy, randomThreeDigitNumberString, register } from '../utils.js';
37
37
  import { Answers, Data } from '../index.js';
38
38
  import { DeviceDescriptor } from '../types/types.js';
39
39
  import { getProxyInfo, proxyInfoToResolution, ProxySettings } from '../proxyService.js';
@@ -746,7 +746,9 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
746
746
  playwrightDeviceDetailsObject,
747
747
  maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
748
748
  strategy:
749
- strategy === 'same-hostname' ? EnqueueStrategy.SameHostname : EnqueueStrategy.SameDomain,
749
+ strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
750
+ : strategy === 'ignore' ? EnqueueStrategy.All
751
+ : EnqueueStrategy.SameDomain,
750
752
  isLocalFileScan,
751
753
  browser: browserToRun,
752
754
  nameEmail,
@@ -804,7 +806,11 @@ export const getUrlsFromRobotsTxt = async (
804
806
  const disallowedUrls = [];
805
807
  const allowedUrls = [];
806
808
 
807
- const sanitisePattern = (pattern: string): string => {
809
+ // Returns 1–2 minimatch glob patterns for a single robots.txt path pattern.
810
+ // Two patterns are returned for bare paths (no trailing wildcard) so that
811
+ // both the exact URL and all child paths are blocked, matching robots.txt
812
+ // prefix semantics.
813
+ const sanitisePattern = (pattern: string): string[] => {
808
814
  const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
809
815
  const subdirWildcardRegex = /\/\*\//g;
810
816
  const filePathRegex = /^\/(?:[^\/]+\/)*[^\/]+\.[a-zA-Z0-9]{1,6}$/;
@@ -812,16 +818,30 @@ export const getUrlsFromRobotsTxt = async (
812
818
  if (subdirWildcardRegex.test(pattern)) {
813
819
  pattern = pattern.replace(subdirWildcardRegex, '/**/');
814
820
  }
821
+
822
+ // Query-string patterns (e.g. /faq?faqItem= or /faq/?faq&faqItem=):
823
+ // '?' is the query separator in robots.txt but a single-char wildcard in
824
+ // minimatch. Escape it to a literal match and append '*' so any query
825
+ // value after the stated prefix is also blocked.
826
+ if (pattern.includes('?')) {
827
+ return [domain + pattern.replace('?', '\\?') + '*'];
828
+ }
829
+
815
830
  if (pattern.match(directoryRegex) && !pattern.match(filePathRegex)) {
816
831
  if (pattern.endsWith('*')) {
817
- pattern = pattern.concat('*');
832
+ // e.g. /ebook/* → /ebook/** (already covers all children)
833
+ return [domain + pattern.concat('*')];
818
834
  } else {
819
- if (!pattern.endsWith('/')) pattern = pattern.concat('/');
820
- pattern = pattern.concat('**');
835
+ // Bare path (e.g. /subscription/unsubscribe): robots.txt blocks the
836
+ // exact URL *and* every descendant. minimatch's '/**' glob does not
837
+ // match the bare path itself (no trailing slash), so we emit both the
838
+ // exact-path pattern and a children glob.
839
+ const base = domain + pattern;
840
+ const children = domain + (pattern.endsWith('/') ? pattern : pattern + '/') + '**';
841
+ return [base, children];
821
842
  }
822
843
  }
823
- const final = domain.concat(pattern);
824
- return final;
844
+ return [domain + pattern];
825
845
  };
826
846
 
827
847
  for (const line of lines) {
@@ -832,14 +852,12 @@ export const getUrlsFromRobotsTxt = async (
832
852
  } else if (shouldCapture && line.toLowerCase().startsWith('disallow:')) {
833
853
  let disallowed = line.substring('disallow: '.length).trim();
834
854
  if (disallowed) {
835
- disallowed = sanitisePattern(disallowed);
836
- disallowedUrls.push(disallowed);
855
+ disallowedUrls.push(...sanitisePattern(disallowed));
837
856
  }
838
857
  } else if (shouldCapture && line.toLowerCase().startsWith('allow:')) {
839
858
  let allowed = line.substring('allow: '.length).trim();
840
859
  if (allowed) {
841
- allowed = sanitisePattern(allowed);
842
- allowedUrls.push(allowed);
860
+ allowedUrls.push(...sanitisePattern(allowed));
843
861
  }
844
862
  }
845
863
  }
@@ -899,6 +917,38 @@ const getRobotsTxtViaPlaywright = async (
899
917
  }
900
918
  };
901
919
 
920
+ export const getSitemapsFromRobotsTxt = async (
921
+ url: string,
922
+ browser: string,
923
+ userDataDirectory: string,
924
+ extraHTTPHeaders: Record<string, string>,
925
+ ): Promise<string[]> => {
926
+ const domain = new URL(url).origin;
927
+ const robotsUrl = domain.concat('/robots.txt');
928
+
929
+ let robotsTxt: string;
930
+ try {
931
+ robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browser, userDataDirectory, extraHTTPHeaders);
932
+ } catch (e) {
933
+ consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl} for sitemap discovery`);
934
+ return [];
935
+ }
936
+
937
+ if (!robotsTxt) return [];
938
+
939
+ const sitemaps: string[] = [];
940
+ const lines = robotsTxt.split(/\r?\n/);
941
+ for (const line of lines) {
942
+ if (line.toLowerCase().startsWith('sitemap:')) {
943
+ const sitemapUrl = line.substring('sitemap:'.length).trim();
944
+ if (sitemapUrl) {
945
+ sitemaps.push(sitemapUrl);
946
+ }
947
+ }
948
+ }
949
+ return sitemaps;
950
+ };
951
+
902
952
  export const isDisallowedInRobotsTxt = (url: string): boolean => {
903
953
  if (!constants.robotsTxtUrls) return;
904
954
 
@@ -931,6 +981,8 @@ export const getLinksFromSitemap = async (
931
981
  userUrlInput: string,
932
982
  isIntelligent: boolean,
933
983
  extraHTTPHeaders: Record<string, string>,
984
+ strategy: EnqueueStrategy = EnqueueStrategy.All,
985
+ userUrl: string = userUrlInput,
934
986
  ) => {
935
987
  const scannedSitemaps = new Set<string>();
936
988
  const urls: Record<string, Request> = {}; // dictionary of requests to urls to be scanned
@@ -940,6 +992,7 @@ export const getLinksFromSitemap = async (
940
992
  const addToUrlList = (url: string) => {
941
993
  if (!url) return;
942
994
  if (isDisallowedInRobotsTxt(url)) return;
995
+ if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy)) return;
943
996
 
944
997
  url = convertPathToLocalFile(url);
945
998