@govtechsg/oobee 0.10.86 → 0.10.88
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/docker-push-ghcr.yml +49 -0
- package/.github/workflows/image.yml +2 -3
- package/DETAILS_OUTPUT_EXAMPLES.md +178 -0
- package/Dockerfile +6 -7
- package/dist/cli.js +18 -5
- package/dist/combine.js +3 -0
- package/dist/constants/cliFunctions.js +2 -2
- package/dist/constants/common.js +55 -13
- package/dist/crawlers/commonCrawlerFunc.js +523 -2
- package/dist/crawlers/crawlDomain.js +38 -13
- package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
- package/dist/crawlers/crawlLocalFile.js +2 -2
- package/dist/crawlers/crawlSitemap.js +44 -5
- package/dist/crawlers/custom/extractAndGradeText.js +1 -1
- package/dist/crawlers/custom/getAxeConfiguration.js +26 -21
- package/dist/crawlers/custom/gradeReadability.js +1 -1
- package/dist/crawlers/custom/utils.js +81 -40
- package/dist/generateHtmlReport.js +18 -11
- package/dist/mergeAxeResults/itemReferences.js +60 -25
- package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
- package/dist/mergeAxeResults.js +18 -9
- package/dist/npmIndex.js +16 -12
- package/dist/screenshotFunc/htmlScreenshotFunc.js +67 -0
- package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
- package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +45 -6
- package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +8 -5
- package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
- package/dist/static/ejs/partials/scripts/ruleModal/utilities.ejs +2 -1
- package/dist/static/ejs/summary.ejs +18 -12
- package/dist/utils.js +4 -3
- package/examples/oobee-test-details-runner.js +214 -0
- package/examples/test-violations.html +42 -0
- package/fix-summary-html-oom-pr.md +62 -0
- package/package.json +5 -5
- package/src/cli.ts +19 -5
- package/src/combine.ts +3 -0
- package/src/constants/cliFunctions.ts +2 -2
- package/src/constants/common.ts +65 -12
- package/src/crawlers/commonCrawlerFunc.ts +625 -2
- package/src/crawlers/crawlDomain.ts +39 -13
- package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
- package/src/crawlers/crawlLocalFile.ts +4 -1
- package/src/crawlers/crawlSitemap.ts +50 -3
- package/src/crawlers/custom/extractAndGradeText.ts +1 -1
- package/src/crawlers/custom/getAxeConfiguration.ts +25 -23
- package/src/crawlers/custom/gradeReadability.ts +1 -1
- package/src/crawlers/custom/utils.ts +99 -43
- package/src/generateHtmlReport.ts +21 -11
- package/src/mergeAxeResults/itemReferences.ts +70 -26
- package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
- package/src/mergeAxeResults.ts +21 -11
- package/src/npmIndex.ts +17 -12
- package/src/screenshotFunc/htmlScreenshotFunc.ts +81 -1
- package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
- package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +45 -6
- package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +8 -5
- package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
- package/src/static/ejs/partials/scripts/ruleModal/utilities.ejs +2 -1
- package/src/static/ejs/summary.ejs +18 -12
- package/src/utils.ts +4 -3
- package/testStaticJSScanner.html +1 -1
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
name: Build and Push Docker Image to GHCR
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
inputs:
|
|
6
|
+
release_tag:
|
|
7
|
+
description: 'Release tag for the image (e.g. 0.10.87)'
|
|
8
|
+
required: true
|
|
9
|
+
type: string
|
|
10
|
+
|
|
11
|
+
permissions:
|
|
12
|
+
contents: read
|
|
13
|
+
packages: write
|
|
14
|
+
|
|
15
|
+
jobs:
|
|
16
|
+
build-and-push:
|
|
17
|
+
runs-on: ubuntu-latest
|
|
18
|
+
steps:
|
|
19
|
+
- name: Checkout code
|
|
20
|
+
uses: actions/checkout@v4
|
|
21
|
+
|
|
22
|
+
- name: Set up QEMU
|
|
23
|
+
uses: docker/setup-qemu-action@v3
|
|
24
|
+
|
|
25
|
+
- name: Set up Docker Buildx
|
|
26
|
+
uses: docker/setup-buildx-action@v3
|
|
27
|
+
|
|
28
|
+
- name: Log in to GitHub Container Registry
|
|
29
|
+
uses: docker/login-action@v3
|
|
30
|
+
with:
|
|
31
|
+
registry: ghcr.io
|
|
32
|
+
username: ${{ github.actor }}
|
|
33
|
+
password: ${{ secrets.GITHUB_TOKEN }}
|
|
34
|
+
|
|
35
|
+
- name: Lowercase repository name
|
|
36
|
+
id: repo
|
|
37
|
+
run: echo "name=${GITHUB_REPOSITORY,,}" >> "$GITHUB_OUTPUT"
|
|
38
|
+
|
|
39
|
+
- name: Build and push multi-arch image
|
|
40
|
+
uses: docker/build-push-action@v6
|
|
41
|
+
with:
|
|
42
|
+
context: .
|
|
43
|
+
platforms: linux/amd64,linux/arm64
|
|
44
|
+
push: true
|
|
45
|
+
tags: |
|
|
46
|
+
ghcr.io/${{ steps.repo.outputs.name }}:${{ inputs.release_tag }}
|
|
47
|
+
ghcr.io/${{ steps.repo.outputs.name }}:latest
|
|
48
|
+
cache-from: type=gha
|
|
49
|
+
cache-to: type=gha,mode=max
|
|
@@ -146,18 +146,17 @@ jobs:
|
|
|
146
146
|
chmod -R u+w "$GITHUB_WORKSPACE/oobee"
|
|
147
147
|
|
|
148
148
|
# Sign all Mach-O (exec bits OR dylib OR node native addons)
|
|
149
|
-
# Search $GITHUB_WORKSPACE (not just oobee/) to cover scripts copied to the parent dir
|
|
150
149
|
while IFS= read -r f; do
|
|
151
150
|
echo "Signing $f"
|
|
152
151
|
codesign --force --options runtime --timestamp --sign "${CERTIFICATE_NAME}" "$f"
|
|
153
152
|
done < <(
|
|
154
|
-
find "$GITHUB_WORKSPACE" -type f \
|
|
153
|
+
find "$GITHUB_WORKSPACE/oobee" -type f \
|
|
155
154
|
\( -perm -111 -o -name "*.dylib" -o -name "*.node" \) \
|
|
156
155
|
! -path "*/.git/*"
|
|
157
156
|
)
|
|
158
157
|
|
|
159
158
|
echo "Verifying signatures of Mach-O files..."
|
|
160
|
-
find "$GITHUB_WORKSPACE" -type f \( -perm -111 -o -name "*.dylib" -o -name "*.node" \) \
|
|
159
|
+
find "$GITHUB_WORKSPACE/oobee" -type f \( -perm -111 -o -name "*.dylib" -o -name "*.node" \) \
|
|
161
160
|
-exec codesign --verify --strict --verbose=2 {} \; || true
|
|
162
161
|
|
|
163
162
|
- name: Cleanup keychain
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# Enriched Details Output Examples
|
|
2
|
+
|
|
3
|
+
These are real outputs captured from `scanPage` against intentionally non-compliant test pages.
|
|
4
|
+
The **Details** panel in the HTML report renders the `message` field shown below.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## 1. `color-contrast` (WCAG AA — mustFix)
|
|
9
|
+
|
|
10
|
+
**Element:**
|
|
11
|
+
```html
|
|
12
|
+
<p style="color: #999999; font-size: 14px;">This light gray text on white background fails AA contrast</p>
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
**Details message:**
|
|
16
|
+
```
|
|
17
|
+
Multiple text elements in this component fail WCAG 1.4.3 Color Contrast Minimum.
|
|
18
|
+
Audit all visible text in the snippet and update every failing foreground color so
|
|
19
|
+
normal text achieves at least 4.5:1 contrast against its actual background, with a
|
|
20
|
+
safety margin above the minimum where possible. Known failing combinations in this
|
|
21
|
+
snippet include foreground #999999 on #ffffff at 14px normal text (current contrast
|
|
22
|
+
2.84, expected 4.5:1). Fix all failing text colors in the component, not just the
|
|
23
|
+
first reported element. Recommendation: To meet the required contrast ratio, for
|
|
24
|
+
foreground #999999 on background #ffffff (target 4.5:1), adjust foreground to #737373
|
|
25
|
+
(rgb(115, 115, 115)) or background to #2e2e2e (rgb(46, 46, 46)).
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
**Element:**
|
|
29
|
+
```html
|
|
30
|
+
<button style="background-color: #55aa99; color: #e8ffe8; font-size: 12px;">Low contrast button</button>
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
**Details message:**
|
|
34
|
+
```
|
|
35
|
+
Multiple text elements in this component fail WCAG 1.4.3 Color Contrast Minimum.
|
|
36
|
+
Audit all visible text in the snippet and update every failing foreground color so
|
|
37
|
+
normal text achieves at least 4.5:1 contrast against its actual background, with a
|
|
38
|
+
safety margin above the minimum where possible. Known failing combinations in this
|
|
39
|
+
snippet include foreground #e8ffe8 on #55aa99 at 12px normal text (current contrast
|
|
40
|
+
2.61, expected 4.5:1). Fix all failing text colors in the component, not just the
|
|
41
|
+
first reported element. Recommendation: To meet the required contrast ratio, for
|
|
42
|
+
foreground #e8ffe8 on background #55aa99 (target 4.5:1), adjust foreground to #003a00
|
|
43
|
+
(rgb(0, 58, 0)) or background to #3d7a6e (rgb(61, 122, 110)).
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## 2. `color-contrast-enhanced` (WCAG AAA — goodToFix)
|
|
49
|
+
|
|
50
|
+
**Element:**
|
|
51
|
+
```html
|
|
52
|
+
<p style="color: #757575; font-size: 14px;">This text passes AA but fails AAA needs 7 to 1</p>
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
**Details message:**
|
|
56
|
+
```
|
|
57
|
+
Multiple text elements in this component fail WCAG 1.4.3 Color Contrast Minimum.
|
|
58
|
+
Audit all visible text in the snippet and update every failing foreground color so
|
|
59
|
+
normal text achieves at least 7:1 contrast against its actual background, with a
|
|
60
|
+
safety margin above the minimum where possible. Known failing combinations in this
|
|
61
|
+
snippet include foreground #757575 on #ffffff at 14px normal text (current contrast
|
|
62
|
+
4.6, expected 7:1). Fix all failing text colors in the component, not just the first
|
|
63
|
+
reported element. Recommendation: To meet the required contrast ratio, for foreground
|
|
64
|
+
#757575 on background #ffffff (target 7:1), adjust foreground to #555555
|
|
65
|
+
(rgb(85, 85, 85)).
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**Element:**
|
|
69
|
+
```html
|
|
70
|
+
<p style="color: #6b6b6b; font-size: 12px;">Small text needs 7 to 1 for AAA</p>
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
**Details message:**
|
|
74
|
+
```
|
|
75
|
+
Multiple text elements in this component fail WCAG 1.4.3 Color Contrast Minimum.
|
|
76
|
+
Audit all visible text in the snippet and update every failing foreground color so
|
|
77
|
+
normal text achieves at least 7:1 contrast against its actual background, with a
|
|
78
|
+
safety margin above the minimum where possible. Known failing combinations in this
|
|
79
|
+
snippet include foreground #6b6b6b on #ffffff at 12px normal text (current contrast
|
|
80
|
+
5.32, expected 7:1). Fix all failing text colors in the component, not just the first
|
|
81
|
+
reported element. Recommendation: To meet the required contrast ratio, for foreground
|
|
82
|
+
#6b6b6b on background #ffffff (target 7:1), adjust foreground to #555555
|
|
83
|
+
(rgb(85, 85, 85)).
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## 3. `target-size` (WCAG 2.5.8 — mustFix)
|
|
89
|
+
|
|
90
|
+
### Example A: `content-box` elements (no Tip appended)
|
|
91
|
+
|
|
92
|
+
**Element:**
|
|
93
|
+
```html
|
|
94
|
+
<a href="/a" class="icon-link" style="width: 16px; height: 16px;">A</a>
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**Details message:**
|
|
98
|
+
```
|
|
99
|
+
Fix any of the following:
|
|
100
|
+
Target has insufficient size (16px by 16px, should be at least 24px by 24px)
|
|
101
|
+
Target has insufficient space to its closest neighbors. Safe clickable space has a
|
|
102
|
+
diameter of 17px instead of at least 24px.
|
|
103
|
+
Computed hit area: 16px × 16px (box-sizing: content-box).
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Example B: `border-box` element with explicit inline width/height (Tip appended)
|
|
107
|
+
|
|
108
|
+
**Element:**
|
|
109
|
+
```html
|
|
110
|
+
<button style="width: 20px; height: 20px; padding: 0; box-sizing: border-box;">X</button>
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
**Details message:**
|
|
114
|
+
```
|
|
115
|
+
Fix any of the following:
|
|
116
|
+
Target has insufficient size (20px by 20px, should be at least 24px by 24px)
|
|
117
|
+
Target has insufficient space to its closest neighbors. Safe clickable space has a
|
|
118
|
+
diameter of 21px instead of at least 24px.
|
|
119
|
+
Computed hit area: 20px × 20px (box-sizing: border-box).
|
|
120
|
+
Tip: inline style sets width: 20px, height: 20px with box-sizing: border-box —
|
|
121
|
+
padding is included within those dimensions and will not increase the hit area. Fix:
|
|
122
|
+
remove the explicit width/height and use min-width: 24px; min-height: 24px instead,
|
|
123
|
+
or place the visual content in a child <span> element.
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## 4. `valid-lang` (mustFix)
|
|
129
|
+
|
|
130
|
+
**Element:**
|
|
131
|
+
```html
|
|
132
|
+
<div lang="x-klingon">This section also has an invalid private-use lang tag with some sample text content for context.</div>
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
**Details message:**
|
|
136
|
+
```
|
|
137
|
+
Fix all of the following:
|
|
138
|
+
Value of lang attribute not included in the list of valid languages
|
|
139
|
+
Note: "x-klingon" uses a private-use "x-" prefix. axe-core's valid-lang rule also
|
|
140
|
+
rejects private-use subtags — you must use a registered IANA language code.
|
|
141
|
+
Original text: "This section also has an invalid private-use lang tag with some sample
|
|
142
|
+
text content for context.". Identify the actual language of this text and use its
|
|
143
|
+
registered BCP 47 code (e.g., lang="it" Italian, "es" Spanish, "fr" French,
|
|
144
|
+
"de" German, "zh" Chinese, "ja" Japanese, "ko" Korean, "pt" Portuguese, "ar" Arabic).
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## 5. `oobee-grading-text-contents` (WCAG AAA — needsReview)
|
|
150
|
+
|
|
151
|
+
**Element:**
|
|
152
|
+
```html
|
|
153
|
+
<html lang="en">...</html>
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
**Details message:**
|
|
157
|
+
```
|
|
158
|
+
The text content is potentially difficult to read, with a Flesch-Kincaid Reading Ease
|
|
159
|
+
score of 33.92. Difficult — college level or above.
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Score interpretation table (appended inline after the numeric score)
|
|
163
|
+
|
|
164
|
+
Only scores in the range 1–50 trigger a violation (scores > 50 pass, scores ≤ 0 are filtered out).
|
|
165
|
+
|
|
166
|
+
| Score Range | Interpretation appended to message |
|
|
167
|
+
|---|---|
|
|
168
|
+
| 31–50 | Difficult — college level or above. |
|
|
169
|
+
| 1–30 | Very difficult — best understood by university graduates. |
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## Notes
|
|
174
|
+
|
|
175
|
+
- **color-contrast** and **color-contrast-enhanced** messages include computed color recommendations using WCAG relative luminance math with a binary search on HSL lightness.
|
|
176
|
+
- **target-size** appends `Computed hit area` and, when `box-sizing: border-box` with explicit inline dimensions is detected, a `Tip` explaining why padding won't help.
|
|
177
|
+
- **valid-lang** appends a `Note` when the lang value uses a private-use `x-*` prefix, plus the element's text content (up to 120 chars) to help identify the correct language code.
|
|
178
|
+
- **oobee-grading-text-contents** now appends a plain-language interpretation of the Flesch-Kincaid score immediately after the numeric value.
|
package/Dockerfile
CHANGED
|
@@ -2,13 +2,12 @@
|
|
|
2
2
|
# Node version is v22
|
|
3
3
|
FROM mcr.microsoft.com/playwright:v1.58.2-noble
|
|
4
4
|
|
|
5
|
-
# Installation of packages for oobee
|
|
6
|
-
RUN apt-get update && apt-get install -y \
|
|
7
|
-
git
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
&& rm -rf /var/lib/apt/lists/*
|
|
5
|
+
# Installation of packages for oobee
|
|
6
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
7
|
+
git \
|
|
8
|
+
unzip \
|
|
9
|
+
zip && \
|
|
10
|
+
rm -rf /var/lib/apt/lists/*
|
|
12
11
|
|
|
13
12
|
WORKDIR /app/oobee
|
|
14
13
|
|
package/dist/cli.js
CHANGED
|
@@ -147,8 +147,11 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
|
|
|
147
147
|
})
|
|
148
148
|
.check(argvs => {
|
|
149
149
|
const scanner = String(argvs.scanner ?? '');
|
|
150
|
-
if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM) {
|
|
151
|
-
throw new Error('-s or --strategy is only available in website
|
|
150
|
+
if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM && scanner !== ScannerTypes.INTELLIGENT && scanner !== ScannerTypes.SITEMAP) {
|
|
151
|
+
throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
|
|
152
|
+
}
|
|
153
|
+
if (argvs.strategy === 'ignore' && scanner !== ScannerTypes.SITEMAP) {
|
|
154
|
+
throw new Error('-s ignore is only available for sitemap scans.');
|
|
152
155
|
}
|
|
153
156
|
return true;
|
|
154
157
|
})
|
|
@@ -161,13 +164,19 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
|
|
|
161
164
|
return duration;
|
|
162
165
|
})
|
|
163
166
|
.check(argvs => {
|
|
164
|
-
if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.strategy) {
|
|
165
|
-
throw new Error('-s or --strategy is only available in website scans.');
|
|
167
|
+
if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.scanner !== ScannerTypes.CUSTOM && argvs.scanner !== ScannerTypes.INTELLIGENT && argvs.scanner !== ScannerTypes.SITEMAP && argvs.strategy) {
|
|
168
|
+
throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
|
|
169
|
+
}
|
|
170
|
+
if (argvs.strategy === 'ignore' && argvs.scanner !== ScannerTypes.SITEMAP) {
|
|
171
|
+
throw new Error('-s ignore is only available for sitemap scans.');
|
|
166
172
|
}
|
|
167
173
|
return true;
|
|
168
174
|
})
|
|
169
175
|
.conflicts('d', 'w')
|
|
170
176
|
.parse();
|
|
177
|
+
if (!options.strategy) {
|
|
178
|
+
options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
|
|
179
|
+
}
|
|
171
180
|
const scanInit = async (argvs) => {
|
|
172
181
|
const updatedArgvs = { ...argvs };
|
|
173
182
|
// Cannot use data.browser and data.isHeadless as the connectivity check comes first before prepareData
|
|
@@ -187,7 +196,11 @@ const scanInit = async (argvs) => {
|
|
|
187
196
|
if (res.httpStatus)
|
|
188
197
|
consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
|
|
189
198
|
if (res.status === statuses.success.code) {
|
|
190
|
-
|
|
199
|
+
// Custom flow should continue from the user-provided entry URL so auth redirects
|
|
200
|
+
// do not replace the original domain used for overlay gating and navigation.
|
|
201
|
+
if (data.type !== ScannerTypes.CUSTOM) {
|
|
202
|
+
data.url = res.url;
|
|
203
|
+
}
|
|
191
204
|
if (process.env.OOBEE_VALIDATE_URL) {
|
|
192
205
|
consoleLogger.info('Url is valid');
|
|
193
206
|
cleanUpAndExit(0, data.randomToken);
|
package/dist/combine.js
CHANGED
|
@@ -95,6 +95,8 @@ const combineRun = async (details, deviceToScan) => {
|
|
|
95
95
|
blacklistedPatterns,
|
|
96
96
|
includeScreenshots,
|
|
97
97
|
extraHTTPHeaders,
|
|
98
|
+
strategy,
|
|
99
|
+
userUrl: url,
|
|
98
100
|
scanDuration,
|
|
99
101
|
});
|
|
100
102
|
urlsCrawledObj = sitemapResult.urlsCrawled;
|
|
@@ -115,6 +117,7 @@ const combineRun = async (details, deviceToScan) => {
|
|
|
115
117
|
includeScreenshots,
|
|
116
118
|
extraHTTPHeaders,
|
|
117
119
|
scanDuration,
|
|
120
|
+
ruleset,
|
|
118
121
|
});
|
|
119
122
|
if (localFileResult) {
|
|
120
123
|
if ('urlsCrawled' in localFileResult) {
|
|
@@ -147,8 +147,8 @@ export const cliOptions = {
|
|
|
147
147
|
},
|
|
148
148
|
s: {
|
|
149
149
|
alias: 'strategy',
|
|
150
|
-
describe: 'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".',
|
|
151
|
-
choices: ['same-domain', 'same-hostname'],
|
|
150
|
+
describe: 'Crawls up to general (same parent) domains, or only specific hostname. Use "ignore" to disable URL filtering (default for sitemap scans). Defaults to "same-domain".',
|
|
151
|
+
choices: ['same-domain', 'same-hostname', 'ignore'],
|
|
152
152
|
requiresArg: true,
|
|
153
153
|
demandOption: false,
|
|
154
154
|
},
|
package/dist/constants/common.js
CHANGED
|
@@ -26,7 +26,7 @@ formDataFields,
|
|
|
26
26
|
ScannerTypes, BrowserTypes, FileTypes, getEnumKey, } from './constants.js';
|
|
27
27
|
import { consoleLogger } from '../logs.js';
|
|
28
28
|
import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
|
|
29
|
-
import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
|
|
29
|
+
import { cleanUpAndExit, isFollowStrategy, randomThreeDigitNumberString, register } from '../utils.js';
|
|
30
30
|
import { getProxyInfo, proxyInfoToResolution } from '../proxyService.js';
|
|
31
31
|
// validateDirPath validates a provided directory path
|
|
32
32
|
// returns null if no error
|
|
@@ -592,7 +592,9 @@ export const prepareData = async (argv) => {
|
|
|
592
592
|
viewportWidth,
|
|
593
593
|
playwrightDeviceDetailsObject,
|
|
594
594
|
maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
|
|
595
|
-
strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
|
|
595
|
+
strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
|
|
596
|
+
: strategy === 'ignore' ? EnqueueStrategy.All
|
|
597
|
+
: EnqueueStrategy.SameDomain,
|
|
596
598
|
isLocalFileScan,
|
|
597
599
|
browser: browserToRun,
|
|
598
600
|
nameEmail,
|
|
@@ -637,6 +639,10 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
|
|
|
637
639
|
let shouldCapture = false;
|
|
638
640
|
const disallowedUrls = [];
|
|
639
641
|
const allowedUrls = [];
|
|
642
|
+
// Returns 1–2 minimatch glob patterns for a single robots.txt path pattern.
|
|
643
|
+
// Two patterns are returned for bare paths (no trailing wildcard) so that
|
|
644
|
+
// both the exact URL and all child paths are blocked, matching robots.txt
|
|
645
|
+
// prefix semantics.
|
|
640
646
|
const sanitisePattern = (pattern) => {
|
|
641
647
|
const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
|
|
642
648
|
const subdirWildcardRegex = /\/\*\//g;
|
|
@@ -644,18 +650,29 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
|
|
|
644
650
|
if (subdirWildcardRegex.test(pattern)) {
|
|
645
651
|
pattern = pattern.replace(subdirWildcardRegex, '/**/');
|
|
646
652
|
}
|
|
653
|
+
// Query-string patterns (e.g. /faq?faqItem= or /faq/?faq&faqItem=):
|
|
654
|
+
// '?' is the query separator in robots.txt but a single-char wildcard in
|
|
655
|
+
// minimatch. Escape it to a literal match and append '*' so any query
|
|
656
|
+
// value after the stated prefix is also blocked.
|
|
657
|
+
if (pattern.includes('?')) {
|
|
658
|
+
return [domain + pattern.replace('?', '\\?') + '*'];
|
|
659
|
+
}
|
|
647
660
|
if (pattern.match(directoryRegex) && !pattern.match(filePathRegex)) {
|
|
648
661
|
if (pattern.endsWith('*')) {
|
|
649
|
-
|
|
662
|
+
// e.g. /ebook/* → /ebook/** (already covers all children)
|
|
663
|
+
return [domain + pattern.concat('*')];
|
|
650
664
|
}
|
|
651
665
|
else {
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
666
|
+
// Bare path (e.g. /subscription/unsubscribe): robots.txt blocks the
|
|
667
|
+
// exact URL *and* every descendant. minimatch's '/**' glob does not
|
|
668
|
+
// match the bare path itself (no trailing slash), so we emit both the
|
|
669
|
+
// exact-path pattern and a children glob.
|
|
670
|
+
const base = domain + pattern;
|
|
671
|
+
const children = domain + (pattern.endsWith('/') ? pattern : pattern + '/') + '**';
|
|
672
|
+
return [base, children];
|
|
655
673
|
}
|
|
656
674
|
}
|
|
657
|
-
|
|
658
|
-
return final;
|
|
675
|
+
return [domain + pattern];
|
|
659
676
|
};
|
|
660
677
|
for (const line of lines) {
|
|
661
678
|
if (line.toLowerCase().startsWith('user-agent: *')) {
|
|
@@ -667,15 +684,13 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
|
|
|
667
684
|
else if (shouldCapture && line.toLowerCase().startsWith('disallow:')) {
|
|
668
685
|
let disallowed = line.substring('disallow: '.length).trim();
|
|
669
686
|
if (disallowed) {
|
|
670
|
-
|
|
671
|
-
disallowedUrls.push(disallowed);
|
|
687
|
+
disallowedUrls.push(...sanitisePattern(disallowed));
|
|
672
688
|
}
|
|
673
689
|
}
|
|
674
690
|
else if (shouldCapture && line.toLowerCase().startsWith('allow:')) {
|
|
675
691
|
let allowed = line.substring('allow: '.length).trim();
|
|
676
692
|
if (allowed) {
|
|
677
|
-
|
|
678
|
-
allowedUrls.push(allowed);
|
|
693
|
+
allowedUrls.push(...sanitisePattern(allowed));
|
|
679
694
|
}
|
|
680
695
|
}
|
|
681
696
|
}
|
|
@@ -726,6 +741,31 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
|
|
|
726
741
|
}
|
|
727
742
|
}
|
|
728
743
|
};
|
|
744
|
+
export const getSitemapsFromRobotsTxt = async (url, browser, userDataDirectory, extraHTTPHeaders) => {
|
|
745
|
+
const domain = new URL(url).origin;
|
|
746
|
+
const robotsUrl = domain.concat('/robots.txt');
|
|
747
|
+
let robotsTxt;
|
|
748
|
+
try {
|
|
749
|
+
robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browser, userDataDirectory, extraHTTPHeaders);
|
|
750
|
+
}
|
|
751
|
+
catch (e) {
|
|
752
|
+
consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl} for sitemap discovery`);
|
|
753
|
+
return [];
|
|
754
|
+
}
|
|
755
|
+
if (!robotsTxt)
|
|
756
|
+
return [];
|
|
757
|
+
const sitemaps = [];
|
|
758
|
+
const lines = robotsTxt.split(/\r?\n/);
|
|
759
|
+
for (const line of lines) {
|
|
760
|
+
if (line.toLowerCase().startsWith('sitemap:')) {
|
|
761
|
+
const sitemapUrl = line.substring('sitemap:'.length).trim();
|
|
762
|
+
if (sitemapUrl) {
|
|
763
|
+
sitemaps.push(sitemapUrl);
|
|
764
|
+
}
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
return sitemaps;
|
|
768
|
+
};
|
|
729
769
|
export const isDisallowedInRobotsTxt = (url) => {
|
|
730
770
|
if (!constants.robotsTxtUrls)
|
|
731
771
|
return;
|
|
@@ -744,7 +784,7 @@ export const isDisallowedInRobotsTxt = (url) => {
|
|
|
744
784
|
}
|
|
745
785
|
return false;
|
|
746
786
|
};
|
|
747
|
-
export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders) => {
|
|
787
|
+
export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = userUrlInput) => {
|
|
748
788
|
const scannedSitemaps = new Set();
|
|
749
789
|
const urls = {}; // dictionary of requests to urls to be scanned
|
|
750
790
|
const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
|
|
@@ -753,6 +793,8 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
753
793
|
return;
|
|
754
794
|
if (isDisallowedInRobotsTxt(url))
|
|
755
795
|
return;
|
|
796
|
+
if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy))
|
|
797
|
+
return;
|
|
756
798
|
url = convertPathToLocalFile(url);
|
|
757
799
|
let request;
|
|
758
800
|
try {
|