@govtechsg/oobee 0.10.85 → 0.10.87
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/publish.yml +10 -0
- package/DETAILS.md +29 -0
- package/dist/cli.js +18 -5
- package/dist/combine.js +3 -1
- package/dist/constants/cliFunctions.js +2 -2
- package/dist/constants/common.js +70 -17
- package/dist/constants/constants.js +604 -1
- package/dist/crawlers/commonCrawlerFunc.js +3 -2
- package/dist/crawlers/crawlDomain.js +38 -13
- package/dist/crawlers/crawlIntelligentSitemap.js +62 -30
- package/dist/crawlers/crawlSitemap.js +141 -84
- package/dist/crawlers/custom/utils.js +218 -71
- package/dist/crawlers/guards/urlGuard.js +8 -15
- package/dist/crawlers/runCustom.js +18 -11
- package/dist/generateHtmlReport.js +18 -11
- package/dist/generateOobeeClientScanner.js +570 -0
- package/dist/mergeAxeResults/itemReferences.js +60 -25
- package/dist/mergeAxeResults/sentryTelemetry.js +4 -1
- package/dist/mergeAxeResults.js +23 -13
- package/dist/npmIndex.js +10 -2
- package/dist/proxyService.js +18 -3
- package/dist/services/s3Uploader.js +21 -10
- package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
- package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
- package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
- package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
- package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
- package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
- package/dist/static/ejs/summary.ejs +19 -8
- package/dist/utils.js +4 -3
- package/fix-summary-html-oom-pr.md +62 -0
- package/oobee-client-scanner.js +34992 -0
- package/package.json +5 -5
- package/src/cli.ts +19 -5
- package/src/combine.ts +5 -1
- package/src/constants/cliFunctions.ts +2 -2
- package/src/constants/common.ts +87 -22
- package/src/constants/constants.ts +602 -1
- package/src/crawlers/commonCrawlerFunc.ts +4 -3
- package/src/crawlers/crawlDomain.ts +39 -13
- package/src/crawlers/crawlIntelligentSitemap.ts +63 -30
- package/src/crawlers/crawlSitemap.ts +165 -100
- package/src/crawlers/custom/utils.ts +241 -80
- package/src/crawlers/guards/urlGuard.ts +24 -31
- package/src/crawlers/runCustom.ts +29 -11
- package/src/generateHtmlReport.ts +21 -11
- package/src/generateOobeeClientScanner.ts +591 -0
- package/src/mergeAxeResults/itemReferences.ts +70 -26
- package/src/mergeAxeResults/sentryTelemetry.ts +4 -1
- package/src/mergeAxeResults.ts +26 -14
- package/src/npmIndex.ts +12 -2
- package/src/proxyService.ts +25 -4
- package/src/services/s3Uploader.ts +23 -11
- package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +6 -3
- package/src/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +2 -2
- package/src/static/ejs/partials/scripts/ruleModal/constants.ejs +1 -761
- package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +38 -2
- package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +1 -1
- package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +4 -4
- package/src/static/ejs/summary.ejs +19 -8
- package/src/utils.ts +4 -3
- package/testStaticJSScanner.html +534 -0
|
@@ -3,6 +3,8 @@ on:
|
|
|
3
3
|
workflow_dispatch:
|
|
4
4
|
release:
|
|
5
5
|
types: [published]
|
|
6
|
+
permissions:
|
|
7
|
+
contents: write
|
|
6
8
|
jobs:
|
|
7
9
|
build:
|
|
8
10
|
runs-on: ubuntu-latest
|
|
@@ -20,6 +22,14 @@ jobs:
|
|
|
20
22
|
- run: npm run build
|
|
21
23
|
continue-on-error: false
|
|
22
24
|
|
|
25
|
+
- name: Create and push git tag
|
|
26
|
+
run: |
|
|
27
|
+
VERSION=$(node -p "require('./package.json').version")
|
|
28
|
+
git config user.name "github-actions[bot]"
|
|
29
|
+
git config user.email "github-actions[bot]@users.noreply.github.com"
|
|
30
|
+
git tag -af "v${VERSION}" -m "Version ${VERSION}"
|
|
31
|
+
git push origin "v${VERSION}" --force
|
|
32
|
+
|
|
23
33
|
- run: npm publish
|
|
24
34
|
env:
|
|
25
35
|
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
|
package/DETAILS.md
CHANGED
|
@@ -195,3 +195,32 @@ Note: Level AAA are disabled by default. Please specify `enable-wcag-aaa` in ru
|
|
|
195
195
|
| skip-link | Ensure all skip links have a focusable target | Good to Fix |
|
|
196
196
|
| tabindex | Ensures tabindex attribute values are not greater than 0 | Good to Fix |
|
|
197
197
|
| table-duplicate-name | Ensure the `<caption>` element does not contain the same text as the summary attribute | Good to Fix |
|
|
198
|
+
|
|
199
|
+
## Additional Information
|
|
200
|
+
### How the Readability Grading Works
|
|
201
|
+
|
|
202
|
+
#### 1. Text Extraction
|
|
203
|
+
|
|
204
|
+
During a page scan, Oobee extracts text from all `<p>` elements on the page (via extractAndGradeText.ts or extractText.ts). The raw text is split into individual **sentences** using the pattern `/[^.!?]*[.!?]+/g` — only text segments ending with `.`, `!`, or `?` are kept.
|
|
205
|
+
|
|
206
|
+
#### 2. Flesch Reading Ease Scoring
|
|
207
|
+
|
|
208
|
+
The extracted sentences are joined into a single string and the words are counted. If the page has **fewer than 20 words**, grading is skipped: the score is recorded as 0 and the page is treated as a pass (here 0 means "not graded", not "very difficult"). Otherwise, the [Flesch Reading Ease](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests) formula is applied via the `text-readability` library in gradeReadability.ts:
|
|
209
|
+
|
|
210
|
+
| Score Range | Interpretation |
|
|
211
|
+
|---|---|
|
|
212
|
+
| 90–100 | Very easy to read (5th grade) |
|
|
213
|
+
| 60–70 | Easily understood by 13–15 year olds |
|
|
214
|
+
| **≤ 50** | **Difficult — college level or above (Oobee's flagging threshold; includes the 0–30 band below)** |
|
|
215
|
+
| 0–30 | Very difficult — best understood by university graduates |
|
|
216
|
+
|
|
217
|
+
#### 3. Flagging Criteria
|
|
218
|
+
|
|
219
|
+
The `oobee-grading-text-contents` rule is **only enabled when WCAG AAA mode is on** (`enableWcagAaa = true`) and violations are flagged under **Manual Review Required** findings. It maps to **WCAG 3.1.5 (Reading Level)**.
|
|
220
|
+
|
|
221
|
+
A page is **flagged** (reported as incomplete, i.e. under Manual Review Required) when the Flesch Reading Ease score is **50 or below**, indicating that the text is potentially difficult to understand. The issue message reports the exact score and states that a passing score must be above 50.
|
|
222
|
+
|
|
223
|
+
A page **passes** when:
|
|
224
|
+
- The score is **above 50**, or
|
|
225
|
+
- There are fewer than 20 words of paragraph text, or
|
|
226
|
+
- No valid sentences (text segments ending with `.`, `!`, or `?`) are found
|
package/dist/cli.js
CHANGED
|
@@ -147,8 +147,11 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
|
|
|
147
147
|
})
|
|
148
148
|
.check(argvs => {
|
|
149
149
|
const scanner = String(argvs.scanner ?? '');
|
|
150
|
-
if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM) {
|
|
151
|
-
throw new Error('-s or --strategy is only available in website
|
|
150
|
+
if (argvs.strategy && scanner !== ScannerTypes.WEBSITE && scanner !== ScannerTypes.CUSTOM && scanner !== ScannerTypes.INTELLIGENT && scanner !== ScannerTypes.SITEMAP) {
|
|
151
|
+
throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
|
|
152
|
+
}
|
|
153
|
+
if (argvs.strategy === 'ignore' && scanner !== ScannerTypes.SITEMAP) {
|
|
154
|
+
throw new Error('-s ignore is only available for sitemap scans.');
|
|
152
155
|
}
|
|
153
156
|
return true;
|
|
154
157
|
})
|
|
@@ -161,13 +164,19 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`)
|
|
|
161
164
|
return duration;
|
|
162
165
|
})
|
|
163
166
|
.check(argvs => {
|
|
164
|
-
if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.strategy) {
|
|
165
|
-
throw new Error('-s or --strategy is only available in website scans.');
|
|
167
|
+
if (argvs.scanner !== ScannerTypes.WEBSITE && argvs.scanner !== ScannerTypes.CUSTOM && argvs.scanner !== ScannerTypes.INTELLIGENT && argvs.scanner !== ScannerTypes.SITEMAP && argvs.strategy) {
|
|
168
|
+
throw new Error('-s or --strategy is only available in website, custom flow, intelligent, and sitemap scans.');
|
|
169
|
+
}
|
|
170
|
+
if (argvs.strategy === 'ignore' && argvs.scanner !== ScannerTypes.SITEMAP) {
|
|
171
|
+
throw new Error('-s ignore is only available for sitemap scans.');
|
|
166
172
|
}
|
|
167
173
|
return true;
|
|
168
174
|
})
|
|
169
175
|
.conflicts('d', 'w')
|
|
170
176
|
.parse();
|
|
177
|
+
if (!options.strategy) {
|
|
178
|
+
options.strategy = options.scanner === ScannerTypes.SITEMAP ? 'ignore' : 'same-domain';
|
|
179
|
+
}
|
|
171
180
|
const scanInit = async (argvs) => {
|
|
172
181
|
const updatedArgvs = { ...argvs };
|
|
173
182
|
// Cannot use data.browser and data.isHeadless as the connectivity check comes first before prepareData
|
|
@@ -187,7 +196,11 @@ const scanInit = async (argvs) => {
|
|
|
187
196
|
if (res.httpStatus)
|
|
188
197
|
consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
|
|
189
198
|
if (res.status === statuses.success.code) {
|
|
190
|
-
|
|
199
|
+
// Custom flow should continue from the user-provided entry URL so auth redirects
|
|
200
|
+
// do not replace the original domain used for overlay gating and navigation.
|
|
201
|
+
if (data.type !== ScannerTypes.CUSTOM) {
|
|
202
|
+
data.url = res.url;
|
|
203
|
+
}
|
|
191
204
|
if (process.env.OOBEE_VALIDATE_URL) {
|
|
192
205
|
consoleLogger.info('Url is valid');
|
|
193
206
|
cleanUpAndExit(0, data.randomToken);
|
package/dist/combine.js
CHANGED
|
@@ -77,7 +77,7 @@ const combineRun = async (details, deviceToScan) => {
|
|
|
77
77
|
let durationExceeded = false;
|
|
78
78
|
switch (type) {
|
|
79
79
|
case ScannerTypes.CUSTOM:
|
|
80
|
-
const res = await runCustom(url, randomToken, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '');
|
|
80
|
+
const res = await runCustom(url, randomToken, browser, userDataDirectory, viewportSettings, blacklistedPatterns, includeScreenshots, customFlowLabel && customFlowLabel !== 'None' ? customFlowLabel : '');
|
|
81
81
|
urlsCrawledObj = res.urlsCrawled;
|
|
82
82
|
uiCustomFlowLabel = res.customFlowLabel;
|
|
83
83
|
break;
|
|
@@ -95,6 +95,8 @@ const combineRun = async (details, deviceToScan) => {
|
|
|
95
95
|
blacklistedPatterns,
|
|
96
96
|
includeScreenshots,
|
|
97
97
|
extraHTTPHeaders,
|
|
98
|
+
strategy,
|
|
99
|
+
userUrl: url,
|
|
98
100
|
scanDuration,
|
|
99
101
|
});
|
|
100
102
|
urlsCrawledObj = sitemapResult.urlsCrawled;
|
|
@@ -147,8 +147,8 @@ export const cliOptions = {
|
|
|
147
147
|
},
|
|
148
148
|
s: {
|
|
149
149
|
alias: 'strategy',
|
|
150
|
-
describe: 'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".',
|
|
151
|
-
choices: ['same-domain', 'same-hostname'],
|
|
150
|
+
describe: 'Crawls up to general (same parent) domains, or only specific hostname. Use "ignore" to disable URL filtering (default for sitemap scans). Defaults to "same-domain".',
|
|
151
|
+
choices: ['same-domain', 'same-hostname', 'ignore'],
|
|
152
152
|
requiresArg: true,
|
|
153
153
|
demandOption: false,
|
|
154
154
|
},
|
package/dist/constants/common.js
CHANGED
|
@@ -26,7 +26,7 @@ formDataFields,
|
|
|
26
26
|
ScannerTypes, BrowserTypes, FileTypes, getEnumKey, } from './constants.js';
|
|
27
27
|
import { consoleLogger } from '../logs.js';
|
|
28
28
|
import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
|
|
29
|
-
import { cleanUpAndExit, randomThreeDigitNumberString, register } from '../utils.js';
|
|
29
|
+
import { cleanUpAndExit, isFollowStrategy, randomThreeDigitNumberString, register } from '../utils.js';
|
|
30
30
|
import { getProxyInfo, proxyInfoToResolution } from '../proxyService.js';
|
|
31
31
|
// validateDirPath validates a provided directory path
|
|
32
32
|
// returns null if no error
|
|
@@ -175,6 +175,14 @@ export const validateXML = (content) => {
|
|
|
175
175
|
});
|
|
176
176
|
return { isValid, parsedContent };
|
|
177
177
|
};
|
|
178
|
+
export const validateTXT = (content) => {
|
|
179
|
+
// Strip HTML tags first — browsers wrap .txt files in HTML when fetched via Playwright
|
|
180
|
+
const plainText = content.replace(/<[^>]+>/g, '\n');
|
|
181
|
+
const lines = plainText.split(/\r?\n/).map(l => l.trim()).filter(l => l.length > 0);
|
|
182
|
+
// Allow http, https and relative paths (starting with /) for txt sitemaps, as some sitemaps use relative paths and some txt sitemaps are fetched as HTML by Playwright
|
|
183
|
+
const urlPattern = /^(https?:\/\/|\/)[^\s]+$/i;
|
|
184
|
+
return { isValid: lines.some(line => urlPattern.test(line)) };
|
|
185
|
+
};
|
|
178
186
|
export const isSkippedUrl = (pageUrl, whitelistedDomains) => {
|
|
179
187
|
const matched = whitelistedDomains.filter(p => {
|
|
180
188
|
const pattern = p.replace(/[\n\r]+/g, '');
|
|
@@ -464,13 +472,13 @@ export const isSitemapContent = (content) => {
|
|
|
464
472
|
}
|
|
465
473
|
const regexForHtml = new RegExp('<(?:!doctype html|html|head|body)+?>', 'gmi');
|
|
466
474
|
const regexForXmlSitemap = new RegExp('<(?:urlset|feed|rss)+?.*>', 'gmi');
|
|
467
|
-
const regexForUrl = new RegExp('^.*(http|https):/{2}.*$', 'gmi');
|
|
468
475
|
if (content.match(regexForHtml) && content.match(regexForXmlSitemap)) {
|
|
469
476
|
// is an XML sitemap wrapped in a HTML document
|
|
470
477
|
return true;
|
|
471
478
|
}
|
|
472
|
-
|
|
473
|
-
|
|
479
|
+
const { isValid: isTxtSitemap } = validateTXT(content);
|
|
480
|
+
if (isTxtSitemap) {
|
|
481
|
+
// treat this as a txt sitemap (plain text or browser-wrapped with HTML)
|
|
474
482
|
return true;
|
|
475
483
|
}
|
|
476
484
|
// is HTML webpage
|
|
@@ -584,7 +592,9 @@ export const prepareData = async (argv) => {
|
|
|
584
592
|
viewportWidth,
|
|
585
593
|
playwrightDeviceDetailsObject,
|
|
586
594
|
maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
|
|
587
|
-
strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
|
|
595
|
+
strategy: strategy === 'same-hostname' ? EnqueueStrategy.SameHostname
|
|
596
|
+
: strategy === 'ignore' ? EnqueueStrategy.All
|
|
597
|
+
: EnqueueStrategy.SameDomain,
|
|
588
598
|
isLocalFileScan,
|
|
589
599
|
browser: browserToRun,
|
|
590
600
|
nameEmail,
|
|
@@ -629,6 +639,10 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
|
|
|
629
639
|
let shouldCapture = false;
|
|
630
640
|
const disallowedUrls = [];
|
|
631
641
|
const allowedUrls = [];
|
|
642
|
+
// Returns 1–2 minimatch glob patterns for a single robots.txt path pattern.
|
|
643
|
+
// Two patterns are returned for bare paths (no trailing wildcard) so that
|
|
644
|
+
// both the exact URL and all child paths are blocked, matching robots.txt
|
|
645
|
+
// prefix semantics.
|
|
632
646
|
const sanitisePattern = (pattern) => {
|
|
633
647
|
const directoryRegex = /^\/(?:[^?#/]+\/)*[^?#]*$/;
|
|
634
648
|
const subdirWildcardRegex = /\/\*\//g;
|
|
@@ -636,18 +650,29 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
|
|
|
636
650
|
if (subdirWildcardRegex.test(pattern)) {
|
|
637
651
|
pattern = pattern.replace(subdirWildcardRegex, '/**/');
|
|
638
652
|
}
|
|
653
|
+
// Query-string patterns (e.g. /faq?faqItem= or /faq/?faq&faqItem=):
|
|
654
|
+
// '?' is the query separator in robots.txt but a single-char wildcard in
|
|
655
|
+
// minimatch. Escape it to a literal match and append '*' so any query
|
|
656
|
+
// value after the stated prefix is also blocked.
|
|
657
|
+
if (pattern.includes('?')) {
|
|
658
|
+
return [domain + pattern.replace('?', '\\?') + '*'];
|
|
659
|
+
}
|
|
639
660
|
if (pattern.match(directoryRegex) && !pattern.match(filePathRegex)) {
|
|
640
661
|
if (pattern.endsWith('*')) {
|
|
641
|
-
|
|
662
|
+
// e.g. /ebook/* → /ebook/** (already covers all children)
|
|
663
|
+
return [domain + pattern.concat('*')];
|
|
642
664
|
}
|
|
643
665
|
else {
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
666
|
+
// Bare path (e.g. /subscription/unsubscribe): robots.txt blocks the
|
|
667
|
+
// exact URL *and* every descendant. minimatch's '/**' glob does not
|
|
668
|
+
// match the bare path itself (no trailing slash), so we emit both the
|
|
669
|
+
// exact-path pattern and a children glob.
|
|
670
|
+
const base = domain + pattern;
|
|
671
|
+
const children = domain + (pattern.endsWith('/') ? pattern : pattern + '/') + '**';
|
|
672
|
+
return [base, children];
|
|
647
673
|
}
|
|
648
674
|
}
|
|
649
|
-
|
|
650
|
-
return final;
|
|
675
|
+
return [domain + pattern];
|
|
651
676
|
};
|
|
652
677
|
for (const line of lines) {
|
|
653
678
|
if (line.toLowerCase().startsWith('user-agent: *')) {
|
|
@@ -659,15 +684,13 @@ export const getUrlsFromRobotsTxt = async (url, browserToRun, userDataDirectory,
|
|
|
659
684
|
else if (shouldCapture && line.toLowerCase().startsWith('disallow:')) {
|
|
660
685
|
let disallowed = line.substring('disallow: '.length).trim();
|
|
661
686
|
if (disallowed) {
|
|
662
|
-
|
|
663
|
-
disallowedUrls.push(disallowed);
|
|
687
|
+
disallowedUrls.push(...sanitisePattern(disallowed));
|
|
664
688
|
}
|
|
665
689
|
}
|
|
666
690
|
else if (shouldCapture && line.toLowerCase().startsWith('allow:')) {
|
|
667
691
|
let allowed = line.substring('allow: '.length).trim();
|
|
668
692
|
if (allowed) {
|
|
669
|
-
|
|
670
|
-
allowedUrls.push(allowed);
|
|
693
|
+
allowedUrls.push(...sanitisePattern(allowed));
|
|
671
694
|
}
|
|
672
695
|
}
|
|
673
696
|
}
|
|
@@ -718,6 +741,31 @@ const getRobotsTxtViaPlaywright = async (robotsUrl, browser, userDataDirectory,
|
|
|
718
741
|
}
|
|
719
742
|
}
|
|
720
743
|
};
|
|
744
|
+
export const getSitemapsFromRobotsTxt = async (url, browser, userDataDirectory, extraHTTPHeaders) => {
|
|
745
|
+
const domain = new URL(url).origin;
|
|
746
|
+
const robotsUrl = domain.concat('/robots.txt');
|
|
747
|
+
let robotsTxt;
|
|
748
|
+
try {
|
|
749
|
+
robotsTxt = await getRobotsTxtViaPlaywright(robotsUrl, browser, userDataDirectory, extraHTTPHeaders);
|
|
750
|
+
}
|
|
751
|
+
catch (e) {
|
|
752
|
+
consoleLogger.info(`Unable to fetch robots.txt from ${robotsUrl} for sitemap discovery`);
|
|
753
|
+
return [];
|
|
754
|
+
}
|
|
755
|
+
if (!robotsTxt)
|
|
756
|
+
return [];
|
|
757
|
+
const sitemaps = [];
|
|
758
|
+
const lines = robotsTxt.split(/\r?\n/);
|
|
759
|
+
for (const line of lines) {
|
|
760
|
+
if (line.toLowerCase().startsWith('sitemap:')) {
|
|
761
|
+
const sitemapUrl = line.substring('sitemap:'.length).trim();
|
|
762
|
+
if (sitemapUrl) {
|
|
763
|
+
sitemaps.push(sitemapUrl);
|
|
764
|
+
}
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
return sitemaps;
|
|
768
|
+
};
|
|
721
769
|
export const isDisallowedInRobotsTxt = (url) => {
|
|
722
770
|
if (!constants.robotsTxtUrls)
|
|
723
771
|
return;
|
|
@@ -736,7 +784,7 @@ export const isDisallowedInRobotsTxt = (url) => {
|
|
|
736
784
|
}
|
|
737
785
|
return false;
|
|
738
786
|
};
|
|
739
|
-
export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders) => {
|
|
787
|
+
export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, userDataDirectory, userUrlInput, isIntelligent, extraHTTPHeaders, strategy = EnqueueStrategy.All, userUrl = userUrlInput) => {
|
|
740
788
|
const scannedSitemaps = new Set();
|
|
741
789
|
const urls = {}; // dictionary of requests to urls to be scanned
|
|
742
790
|
const isLimitReached = () => Object.keys(urls).length >= maxLinksCount;
|
|
@@ -745,6 +793,8 @@ export const getLinksFromSitemap = async (sitemapUrl, maxLinksCount, browser, us
|
|
|
745
793
|
return;
|
|
746
794
|
if (isDisallowedInRobotsTxt(url))
|
|
747
795
|
return;
|
|
796
|
+
if (!isFilePath(userUrl) && !isFollowStrategy(url, userUrl, strategy))
|
|
797
|
+
return;
|
|
748
798
|
url = convertPathToLocalFile(url);
|
|
749
799
|
let request;
|
|
750
800
|
try {
|
|
@@ -1603,6 +1653,7 @@ const cacheProxyInfo = getProxyInfo();
|
|
|
1603
1653
|
export const getPlaywrightLaunchOptions = (browser) => {
|
|
1604
1654
|
const channel = browser || undefined;
|
|
1605
1655
|
const resolution = proxyInfoToResolution(cacheProxyInfo);
|
|
1656
|
+
const shouldIgnoreMuteAudio = process.env.OOBEE_PLAYWRIGHT_IGNORE_DEFAULT_ARGS === '--mute-audio';
|
|
1606
1657
|
// Start with your base args and sanitise
|
|
1607
1658
|
const finalArgs = [...constants.launchOptionsArgs].filter(arg => !arg.startsWith('--headless') &&
|
|
1608
1659
|
!arg.startsWith('--user-agent=') &&
|
|
@@ -1630,7 +1681,9 @@ export const getPlaywrightLaunchOptions = (browser) => {
|
|
|
1630
1681
|
break;
|
|
1631
1682
|
}
|
|
1632
1683
|
const options = {
|
|
1633
|
-
ignoreDefaultArgs:
|
|
1684
|
+
ignoreDefaultArgs: shouldIgnoreMuteAudio
|
|
1685
|
+
? ['--use-mock-keychain', '--mute-audio']
|
|
1686
|
+
: ['--use-mock-keychain'],
|
|
1634
1687
|
args: finalArgs,
|
|
1635
1688
|
headless: process.env.CRAWLEE_HEADLESS === '1',
|
|
1636
1689
|
...(channel && { channel }),
|