milieu-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +200 -0
- package/README.md +153 -0
- package/dist/bridges/index.d.ts +5 -0
- package/dist/bridges/index.d.ts.map +1 -0
- package/dist/bridges/index.js +6 -0
- package/dist/bridges/index.js.map +1 -0
- package/dist/bridges/reachability/crawler-policy.d.ts +36 -0
- package/dist/bridges/reachability/crawler-policy.d.ts.map +1 -0
- package/dist/bridges/reachability/crawler-policy.js +110 -0
- package/dist/bridges/reachability/crawler-policy.js.map +1 -0
- package/dist/bridges/reachability/http-status.d.ts +7 -0
- package/dist/bridges/reachability/http-status.d.ts.map +1 -0
- package/dist/bridges/reachability/http-status.js +74 -0
- package/dist/bridges/reachability/http-status.js.map +1 -0
- package/dist/bridges/reachability/https-check.d.ts +14 -0
- package/dist/bridges/reachability/https-check.d.ts.map +1 -0
- package/dist/bridges/reachability/https-check.js +38 -0
- package/dist/bridges/reachability/https-check.js.map +1 -0
- package/dist/bridges/reachability/index.d.ts +13 -0
- package/dist/bridges/reachability/index.d.ts.map +1 -0
- package/dist/bridges/reachability/index.js +115 -0
- package/dist/bridges/reachability/index.js.map +1 -0
- package/dist/bridges/reachability/meta-robots.d.ts +16 -0
- package/dist/bridges/reachability/meta-robots.d.ts.map +1 -0
- package/dist/bridges/reachability/meta-robots.js +119 -0
- package/dist/bridges/reachability/meta-robots.js.map +1 -0
- package/dist/bridges/reachability/robots-parser.d.ts +26 -0
- package/dist/bridges/reachability/robots-parser.d.ts.map +1 -0
- package/dist/bridges/reachability/robots-parser.js +105 -0
- package/dist/bridges/reachability/robots-parser.js.map +1 -0
- package/dist/bridges/reachability/robots-txt.d.ts +14 -0
- package/dist/bridges/reachability/robots-txt.d.ts.map +1 -0
- package/dist/bridges/reachability/robots-txt.js +80 -0
- package/dist/bridges/reachability/robots-txt.js.map +1 -0
- package/dist/bridges/separation/api-presence.d.ts +14 -0
- package/dist/bridges/separation/api-presence.d.ts.map +1 -0
- package/dist/bridges/separation/api-presence.js +96 -0
- package/dist/bridges/separation/api-presence.js.map +1 -0
- package/dist/bridges/separation/developer-docs.d.ts +21 -0
- package/dist/bridges/separation/developer-docs.d.ts.map +1 -0
- package/dist/bridges/separation/developer-docs.js +81 -0
- package/dist/bridges/separation/developer-docs.js.map +1 -0
- package/dist/bridges/separation/index.d.ts +20 -0
- package/dist/bridges/separation/index.d.ts.map +1 -0
- package/dist/bridges/separation/index.js +63 -0
- package/dist/bridges/separation/index.js.map +1 -0
- package/dist/bridges/separation/sdk-references.d.ts +12 -0
- package/dist/bridges/separation/sdk-references.d.ts.map +1 -0
- package/dist/bridges/separation/sdk-references.js +93 -0
- package/dist/bridges/separation/sdk-references.js.map +1 -0
- package/dist/bridges/separation/webhook-support.d.ts +19 -0
- package/dist/bridges/separation/webhook-support.d.ts.map +1 -0
- package/dist/bridges/separation/webhook-support.js +94 -0
- package/dist/bridges/separation/webhook-support.js.map +1 -0
- package/dist/bridges/standards/index.d.ts +13 -0
- package/dist/bridges/standards/index.d.ts.map +1 -0
- package/dist/bridges/standards/index.js +79 -0
- package/dist/bridges/standards/index.js.map +1 -0
- package/dist/bridges/standards/json-ld.d.ts +16 -0
- package/dist/bridges/standards/json-ld.d.ts.map +1 -0
- package/dist/bridges/standards/json-ld.js +63 -0
- package/dist/bridges/standards/json-ld.js.map +1 -0
- package/dist/bridges/standards/llms-txt.d.ts +19 -0
- package/dist/bridges/standards/llms-txt.d.ts.map +1 -0
- package/dist/bridges/standards/llms-txt.js +64 -0
- package/dist/bridges/standards/llms-txt.js.map +1 -0
- package/dist/bridges/standards/mcp.d.ts +13 -0
- package/dist/bridges/standards/mcp.d.ts.map +1 -0
- package/dist/bridges/standards/mcp.js +72 -0
- package/dist/bridges/standards/mcp.js.map +1 -0
- package/dist/bridges/standards/openapi.d.ts +14 -0
- package/dist/bridges/standards/openapi.d.ts.map +1 -0
- package/dist/bridges/standards/openapi.js +424 -0
- package/dist/bridges/standards/openapi.js.map +1 -0
- package/dist/bridges/standards/schema-org.d.ts +12 -0
- package/dist/bridges/standards/schema-org.d.ts.map +1 -0
- package/dist/bridges/standards/schema-org.js +101 -0
- package/dist/bridges/standards/schema-org.js.map +1 -0
- package/dist/bridges/standards/well-known.d.ts +16 -0
- package/dist/bridges/standards/well-known.d.ts.map +1 -0
- package/dist/bridges/standards/well-known.js +77 -0
- package/dist/bridges/standards/well-known.js.map +1 -0
- package/dist/bridges/stubs.d.ts +4 -0
- package/dist/bridges/stubs.d.ts.map +1 -0
- package/dist/bridges/stubs.js +25 -0
- package/dist/bridges/stubs.js.map +1 -0
- package/dist/cli/index.d.ts +4 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +83 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/core/explanations.d.ts +11 -0
- package/dist/core/explanations.d.ts.map +1 -0
- package/dist/core/explanations.js +128 -0
- package/dist/core/explanations.js.map +1 -0
- package/dist/core/index.d.ts +6 -0
- package/dist/core/index.d.ts.map +1 -0
- package/dist/core/index.js +6 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/scan.d.ts +3 -0
- package/dist/core/scan.d.ts.map +1 -0
- package/dist/core/scan.js +89 -0
- package/dist/core/scan.js.map +1 -0
- package/dist/core/types.d.ts +119 -0
- package/dist/core/types.d.ts.map +1 -0
- package/dist/core/types.js +3 -0
- package/dist/core/types.js.map +1 -0
- package/dist/core/version.d.ts +2 -0
- package/dist/core/version.d.ts.map +1 -0
- package/dist/core/version.js +7 -0
- package/dist/core/version.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +4 -0
- package/dist/index.js.map +1 -0
- package/dist/render/colors.d.ts +7 -0
- package/dist/render/colors.d.ts.map +1 -0
- package/dist/render/colors.js +28 -0
- package/dist/render/colors.js.map +1 -0
- package/dist/render/format-bridge.d.ts +3 -0
- package/dist/render/format-bridge.d.ts.map +1 -0
- package/dist/render/format-bridge.js +39 -0
- package/dist/render/format-bridge.js.map +1 -0
- package/dist/render/format-scan.d.ts +3 -0
- package/dist/render/format-scan.d.ts.map +1 -0
- package/dist/render/format-scan.js +44 -0
- package/dist/render/format-scan.js.map +1 -0
- package/dist/render/format-verbose.d.ts +3 -0
- package/dist/render/format-verbose.d.ts.map +1 -0
- package/dist/render/format-verbose.js +14 -0
- package/dist/render/format-verbose.js.map +1 -0
- package/dist/render/index.d.ts +7 -0
- package/dist/render/index.d.ts.map +1 -0
- package/dist/render/index.js +8 -0
- package/dist/render/index.js.map +1 -0
- package/dist/render/progress-bar.d.ts +10 -0
- package/dist/render/progress-bar.d.ts.map +1 -0
- package/dist/render/progress-bar.js +21 -0
- package/dist/render/progress-bar.js.map +1 -0
- package/dist/render/symbols.d.ts +10 -0
- package/dist/render/symbols.d.ts.map +1 -0
- package/dist/render/symbols.js +21 -0
- package/dist/render/symbols.js.map +1 -0
- package/dist/utils/http-client.d.ts +25 -0
- package/dist/utils/http-client.d.ts.map +1 -0
- package/dist/utils/http-client.js +235 -0
- package/dist/utils/http-client.js.map +1 -0
- package/dist/utils/index.d.ts +6 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +7 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/ssrf.d.ts +29 -0
- package/dist/utils/ssrf.d.ts.map +1 -0
- package/dist/utils/ssrf.js +134 -0
- package/dist/utils/ssrf.js.map +1 -0
- package/dist/utils/url.d.ts +53 -0
- package/dist/utils/url.d.ts.map +1 -0
- package/dist/utils/url.js +64 -0
- package/dist/utils/url.js.map +1 -0
- package/package.json +74 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { httpGet } from "../../utils/http-client.js";
|
|
2
|
+
const ABORT_ERRORS = new Set(["dns", "connection_refused", "ssl_error"]);

/**
 * Probe a domain over HTTPS with a single HEAD request to https://<domain>.
 *
 * A result that httpGet reports as ok yields a passing "https_available"
 * check. Otherwise the check fails with the error message as its detail, and
 * the scan is flagged for abort when the error kind is one of
 * dns / connection_refused / ssl_error.
 *
 * @param domain  Host name to probe (appended to "https://").
 * @param timeout Timeout forwarded unchanged to httpGet.
 * @returns `{ check, abort }`, plus `abortReason` on the failure path
 *          (the error kind when aborting, otherwise undefined).
 */
export async function checkHttps(domain, timeout) {
    const response = await httpGet(`https://${domain}`, { method: "HEAD", timeout });
    const identity = { id: "https_available", label: "HTTPS Available" };

    if (!response.ok) {
        const failure = response.error;
        // Only a subset of error kinds makes the rest of the scan pointless.
        const fatal = ABORT_ERRORS.has(failure.kind);
        return {
            check: { ...identity, status: "fail", detail: failure.message },
            abort: fatal,
            abortReason: fatal ? failure.kind : undefined,
        };
    }

    return {
        check: { ...identity, status: "pass", detail: "HTTPS connection successful" },
        abort: false,
    };
}
|
|
38
|
+
//# sourceMappingURL=https-check.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"https-check.js","sourceRoot":"","sources":["../../../src/bridges/reachability/https-check.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,OAAO,EAAE,MAAM,4BAA4B,CAAC;AAQrD,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,CAAC,KAAK,EAAE,oBAAoB,EAAE,WAAW,CAAC,CAAC,CAAC;AAEzE;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,MAAc,EACd,OAAgB;IAEhB,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,UAAU,GAAG,MAAM,EAAE;QAChD,MAAM,EAAE,MAAM;QACd,OAAO;KACR,CAAC,CAAC;IAEH,IAAI,MAAM,CAAC,EAAE,EAAE,CAAC;QACd,OAAO;YACL,KAAK,EAAE;gBACL,EAAE,EAAE,iBAAiB;gBACrB,KAAK,EAAE,iBAAiB;gBACxB,MAAM,EAAE,MAAM;gBACd,MAAM,EAAE,6BAA6B;aACtC;YACD,KAAK,EAAE,KAAK;SACb,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,IAAI,EAAE,OAAO,EAAE,GAAG,MAAM,CAAC,KAAK,CAAC;IACvC,MAAM,KAAK,GAAG,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IAErC,OAAO;QACL,KAAK,EAAE;YACL,EAAE,EAAE,iBAAiB;YACrB,KAAK,EAAE,iBAAiB;YACxB,MAAM,EAAE,MAAM;YACd,MAAM,EAAE,OAAO;SAChB;QACD,KAAK;QACL,WAAW,EAAE,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS;KACtC,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { BridgeResult, ScanContext } from "../../core/types.js";
|
|
2
|
+
/**
 * Run Bridge 1: Reachability.
 *
 * Issues up to three HTTP requests, in order:
 *   1. HEAD https://<domain>           -- HTTPS availability
 *   2. GET  <baseUrl>                  -- page content for the meta robots checks
 *   3. GET  https://<domain>/robots.txt -- robots.txt fetch
 *
 * If the HTTPS check fails with dns, connection_refused or ssl_error, the
 * bridge aborts before making the remaining requests.
 *
 * @param ctx Scan context for the target being evaluated.
 * @returns The evaluated bridge result.
 */
export declare function runReachabilityBridge(ctx: ScanContext): Promise<BridgeResult>;
|
|
13
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/bridges/reachability/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAS,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAyC5E;;;;;;;;;GASG;AACH,wBAAsB,qBAAqB,CACzC,GAAG,EAAE,WAAW,GACf,OAAO,CAAC,YAAY,CAAC,CAwFvB"}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import { httpGet } from "../../utils/http-client.js";
|
|
2
|
+
import { checkHttps } from "./https-check.js";
|
|
3
|
+
import { checkHttpStatus } from "./http-status.js";
|
|
4
|
+
import { checkRobotsTxt } from "./robots-txt.js";
|
|
5
|
+
import { evaluateCrawlerPolicies } from "./crawler-policy.js";
|
|
6
|
+
import { checkMetaRobots, checkXRobotsTag } from "./meta-robots.js";
|
|
7
|
+
/**
 * Turn a list of check results into a 0-100 bridge score and a label.
 *
 * Scoring: "pass" earns 1 point, "partial" earns 0.5, anything else earns 0.
 * Checks whose `data.policy` is "skip" are left out of both the earned and
 * the possible points. With nothing left to score, the result is 0 / "fail".
 *
 * Labels: score >= 80 is "pass", score >= 40 is "partial", otherwise "fail".
 *
 * @param checks Check results to aggregate.
 * @returns `{ score, scoreLabel }`.
 */
function calculateScore(checks) {
    const weights = new Map([
        ["pass", 1],
        ["partial", 0.5],
    ]);

    // Drop checks that opted out of scoring (e.g. crawler policy with no robots.txt).
    const counted = checks.filter((entry) => !(entry.data && entry.data.policy === "skip"));

    const earned = counted.reduce((total, entry) => total + (weights.get(entry.status) || 0), 0);
    const possible = counted.length;

    let score = 0;
    if (possible !== 0) {
        score = Math.round((earned / possible) * 100);
    }

    let scoreLabel = "fail";
    if (score >= 80) {
        scoreLabel = "pass";
    }
    else if (score >= 40) {
        scoreLabel = "partial";
    }

    return { score, scoreLabel };
}
|
|
32
|
+
/**
 * Run Bridge 1: Reachability.
 *
 * Network steps run strictly one after another:
 *   1. HEAD https://<domain> via checkHttps (HTTPS availability)
 *   2. GET <baseUrl> (page content; reused by the status and robots-tag checks)
 *   3. robots.txt fetch + parse via checkRobotsTxt
 *
 * If the HTTPS check flags an abort (dns / connection_refused / ssl_error),
 * the bridge returns immediately with score 0 and only that single check.
 * After a successful page fetch, the body and headers are stored on
 * ctx.shared so later bridges can reuse them.
 *
 * @param ctx Scan context; reads domain, baseUrl, options.timeout and writes ctx.shared.
 * @returns The evaluated bridge result (id 1, "Reachability").
 */
export async function runReachabilityBridge(ctx) {
    const startedAt = performance.now();
    const elapsedMs = () => Math.round(performance.now() - startedAt);

    // Step 1: HTTPS probe. A fatal failure ends the bridge right here.
    const https = await checkHttps(ctx.domain, ctx.options.timeout);
    if (https.abort) {
        return {
            id: 1,
            name: "Reachability",
            status: "evaluated",
            score: 0,
            scoreLabel: "fail",
            checks: [https.check],
            durationMs: elapsedMs(),
            abort: true,
            abortReason: https.abortReason,
        };
    }

    // Step 2: fetch the page itself (normalized baseUrl).
    const page = await httpGet(ctx.baseUrl, { timeout: ctx.options.timeout });
    if (page.ok) {
        // Shared with Bridge 2 (JSON-LD, Schema.org extraction).
        ctx.shared.pageBody = page.body;
        ctx.shared.pageHeaders = page.headers;
    }

    // Derived from the page response -- no extra HTTP call.
    const statusCheck = checkHttpStatus(page);

    // Step 3: robots.txt fetch + parse.
    const robots = await checkRobotsTxt(ctx.domain, ctx.options.timeout);

    // Crawler policies are evaluated against the path of the scanned URL,
    // defaulting to the site root when baseUrl cannot be parsed.
    let targetPath = "/";
    try {
        targetPath = new URL(ctx.baseUrl).pathname;
    }
    catch {
        // keep "/"
    }
    const crawlerChecks = evaluateCrawlerPolicies(robots.parsed, targetPath);

    // Page unavailable -- pass empty content (absence of restrictive tags = pass).
    const pageOk = page.ok;
    const metaCheck = checkMetaRobots(pageOk ? page.body : "");
    const headerCheck = checkXRobotsTag(pageOk ? page.headers : {});

    const checks = [https.check, statusCheck, robots.check];
    checks.push(...crawlerChecks);
    checks.push(metaCheck, headerCheck);

    return {
        id: 1,
        name: "Reachability",
        status: "evaluated",
        ...calculateScore(checks),
        checks,
        durationMs: elapsedMs(),
    };
}
|
|
115
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/bridges/reachability/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,OAAO,EAAE,MAAM,4BAA4B,CAAC;AACrD,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AACnD,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACjD,OAAO,EAAE,uBAAuB,EAAE,MAAM,qBAAqB,CAAC;AAC9D,OAAO,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AAEpE;;;;GAIG;AACH,SAAS,cAAc,CAAC,MAAe;IAIrC,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,8EAA8E;QAC9E,IACE,KAAK,CAAC,IAAI;YACT,KAAK,CAAC,IAAgC,CAAC,MAAM,KAAK,MAAM,EACzD,CAAC;YACD,SAAS;QACX,CAAC;QAED,SAAS,IAAI,CAAC,CAAC;QACf,IAAI,KAAK,CAAC,MAAM,KAAK,MAAM;YAAE,MAAM,IAAI,CAAC,CAAC;aACpC,IAAI,KAAK,CAAC,MAAM,KAAK,SAAS;YAAE,MAAM,IAAI,GAAG,CAAC;QACnD,4BAA4B;IAC9B,CAAC;IAED,MAAM,KAAK,GAAG,SAAS,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,GAAG,SAAS,CAAC,GAAG,GAAG,CAAC,CAAC;IAC3E,MAAM,UAAU,GACd,KAAK,IAAI,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC;IAC1D,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,CAAC;AAC/B,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,qBAAqB,CACzC,GAAgB;IAEhB,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;IAEhC,gCAAgC;IAChC,MAAM,WAAW,GAAG,MAAM,UAAU,CAAC,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;IAEtE,8BAA8B;IAC9B,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;QACtB,OAAO;YACL,EAAE,EAAE,CAAC;YACL,IAAI,EAAE,cAAc;YACpB,MAAM,EAAE,WAAW;YACnB,KAAK,EAAE,CAAC;YACR,UAAU,EAAE,MAAM;YAClB,MAAM,EAAE,CAAC,WAAW,CAAC,KAAK,CAAC;YAC3B,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC;YACjD,KAAK,EAAE,IAAI;YACX,WAAW,EAAE,WAAW,CAAC,WAAW;SACrC,CAAC;IACJ,CAAC;IAED,wCAAwC;IACxC,MAAM,YAAY,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,EAAE;QAC9C,OAAO,EAAE,GAAG,CAAC,OAAO,CAAC,OAAO;KAC7B,CAAC,CAAC;IAEH,gEAAgE;IAChE,IAAI,YAAY,CAAC,EAAE,EAAE,CAAC;QACpB,GAAG,CAAC,MAAM,CAAC,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC;QACxC,GAAG,CAAC,MAAM,CAAC,WAAW,GAAG,YAAY,CAAC,OAAO,CAAC;IAChD,CAAC;IAED,2DAA2
D;IAC3D,MAAM,eAAe,GAAG,eAAe,CAAC,YAAY,CAAC,CAAC;IAEtD,8BAA8B;IAC9B,MAAM,YAAY,GAAG,MAAM,cAAc,CACvC,GAAG,CAAC,MAAM,EACV,GAAG,CAAC,OAAO,CAAC,OAAO,CACpB,CAAC;IAEF,oDAAoD;IACpD,IAAI,UAAkB,CAAC;IACvB,IAAI,CAAC;QACH,UAAU,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC;IAC7C,CAAC;IAAC,MAAM,CAAC;QACP,UAAU,GAAG,GAAG,CAAC;IACnB,CAAC;IACD,MAAM,aAAa,GAAG,uBAAuB,CAC3C,YAAY,CAAC,MAAM,EACnB,UAAU,CACX,CAAC;IAEF,sCAAsC;IACtC,IAAI,eAAsB,CAAC;IAC3B,IAAI,YAAmB,CAAC;IAExB,IAAI,YAAY,CAAC,EAAE,EAAE,CAAC;QACpB,eAAe,GAAG,eAAe,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;QACrD,YAAY,GAAG,eAAe,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC;IACvD,CAAC;SAAM,CAAC;QACN,8EAA8E;QAC9E,eAAe,GAAG,eAAe,CAAC,EAAE,CAAC,CAAC;QACtC,YAAY,GAAG,eAAe,CAAC,EAAE,CAAC,CAAC;IACrC,CAAC;IAED,wBAAwB;IACxB,MAAM,MAAM,GAAY;QACtB,WAAW,CAAC,KAAK;QACjB,eAAe;QACf,YAAY,CAAC,KAAK;QAClB,GAAG,aAAa;QAChB,eAAe;QACf,YAAY;KACb,CAAC;IAEF,qBAAqB;IACrB,MAAM,EAAE,KAAK,EAAE,UAAU,EAAE,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC;IAErD,OAAO;QACL,EAAE,EAAE,CAAC;QACL,IAAI,EAAE,cAAc;QACpB,MAAM,EAAE,WAAW;QACnB,KAAK;QACL,UAAU;QACV,MAAM;QACN,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC;KAClD,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import type { Check } from "../../core/types.js";
|
|
2
|
+
/**
 * Inspect an HTML document for restrictive meta robots tags.
 *
 * Matching is regex-based (no HTML parser dependency) and looks at
 * `<meta>` tags whose name is robots, googlebot or bingbot. Both attribute
 * orders, single or double quotes, self-closing tags and any letter case
 * are accepted.
 *
 * @param html Raw HTML of the page.
 * @returns A check whose status reflects restrictive directives (such as
 *          noindex / nofollow); the parsed directives are in `data.directives`.
 */
export declare function checkMetaRobots(html: string): Check;
|
|
10
|
+
/**
 * Inspect the X-Robots-Tag HTTP response header for restrictive directives.
 *
 * @param headers Response headers keyed by lowercase header name
 *                (as produced by httpGet).
 * @returns A check describing the header; `data` carries the parsed
 *          directives and the raw header value.
 */
export declare function checkXRobotsTag(headers: Record<string, string>): Check;
|
|
16
|
+
//# sourceMappingURL=meta-robots.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"meta-robots.d.ts","sourceRoot":"","sources":["../../../src/bridges/reachability/meta-robots.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,qBAAqB,CAAC;AAEjD;;;;;;GAMG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,KAAK,CA2EnD;AAED;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,KAAK,CAoDtE"}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
/**
 * Check for restrictive meta robots tags in an HTML document.
 * Uses regex only -- no HTML parser dependency.
 *
 * Scans `<meta>` tags named robots, googlebot or bingbot. Handles both
 * attribute orders, single/double quotes, self-closing tags and case
 * insensitivity. The `<head>` element is scanned when present; because HTML5
 * allows the head tags to be omitted, a document without a matching
 * `<head>...</head>` pair is scanned as a whole instead.
 *
 * Result status:
 *   - "fail"    when `noindex` or `none` (= noindex, nofollow) is present
 *   - "partial" when `nofollow` is present without the above
 *   - "pass"    otherwise
 *
 * @param html Raw HTML of the page (may be empty).
 * @returns A check with id "meta_robots"; `data.directives` lists every
 *          non-empty directive found, lowercased, in match order.
 */
export function checkMetaRobots(html) {
    const id = "meta_robots";
    const label = "Meta Robots Tags";

    // Prefer the explicit head section; fall back to the full document when
    // the author relied on HTML5 optional tags.
    const headMatch = html.match(/<head[^>]*>([\s\S]*?)<\/head>/i);
    const scope = headMatch ? headMatch[1] : html;

    // Each entry pairs a tag pattern with the capture group holding `content`.
    const extractors = [
        // name first, then content
        {
            pattern: /<meta\s+[^>]*name\s*=\s*["'](robots|googlebot|bingbot)["'][^>]*content\s*=\s*["']([^"']*)["'][^>]*\/?>/gi,
            contentGroup: 2,
        },
        // content first, then name
        {
            pattern: /<meta\s+[^>]*content\s*=\s*["']([^"']*)["'][^>]*name\s*=\s*["'](robots|googlebot|bingbot)["'][^>]*\/?>/gi,
            contentGroup: 1,
        },
    ];

    const directives = [];
    for (const { pattern, contentGroup } of extractors) {
        let match;
        while ((match = pattern.exec(scope)) !== null) {
            for (const piece of match[contentGroup].split(",")) {
                const directive = piece.trim().toLowerCase();
                if (directive.length > 0) {
                    directives.push(directive);
                }
            }
        }
    }

    // "none" is shorthand for "noindex, nofollow", so it blocks indexing too.
    if (directives.some((d) => d === "noindex" || d === "none")) {
        return {
            id,
            label,
            status: "fail",
            detail: `Restrictive meta robots directives found: ${directives.join(", ")}`,
            data: { directives },
        };
    }
    if (directives.some((d) => d === "nofollow")) {
        return {
            id,
            label,
            status: "partial",
            detail: `Restrictive meta robots directives found: ${directives.join(", ")}`,
            data: { directives },
        };
    }
    return {
        id,
        label,
        status: "pass",
        detail: "No restrictive meta robots tags found",
        data: { directives },
    };
}
|
|
71
|
+
/**
 * Check for X-Robots-Tag HTTP header directives.
 *
 * The header value is split on commas; each directive is trimmed and
 * lowercased. A directive may be scoped to a crawler
 * ("googlebot: noindex"); scoped and unscoped forms are treated alike.
 *
 * Result status:
 *   - "fail"    when `noindex` or `none` (= noindex, nofollow) is present
 *   - "partial" when `nofollow` or `noarchive` is present without the above
 *   - "pass"    when the header is missing or carries no such directive
 *
 * @param headers Response headers; expected to have lowercase keys (from httpGet).
 * @returns A check with id "x_robots_tag"; `data` holds the parsed
 *          directives and the raw header value ("" when absent).
 */
export function checkXRobotsTag(headers) {
    const id = "x_robots_tag";
    const label = "X-Robots-Tag Header";
    const headerValue = headers["x-robots-tag"];
    if (!headerValue) {
        return {
            id,
            label,
            status: "pass",
            detail: "No X-Robots-Tag header",
            data: { directives: [], raw: "" },
        };
    }
    const directives = headerValue
        .split(",")
        .map((d) => d.trim().toLowerCase())
        .filter((d) => d.length > 0);

    // Directives whose own syntax is "name: value". A colon after any other
    // token is read as a "<user-agent>: <directive>" scope, so that
    // "googlebot: none" is caught while "max-image-preview: none" is not.
    const valuedDirectives = [
        "max-snippet",
        "max-image-preview",
        "max-video-preview",
        "unavailable_after",
    ];
    const blocksIndexing = directives.some((d) => {
        if (d.includes("noindex")) {
            return true;
        }
        const colonIdx = d.indexOf(":");
        const scoped = colonIdx >= 0 && !valuedDirectives.includes(d.slice(0, colonIdx).trim());
        const rule = scoped ? d.slice(colonIdx + 1).trim() : d;
        // "none" is shorthand for "noindex, nofollow".
        return rule === "none";
    });
    if (blocksIndexing) {
        return {
            id,
            label,
            status: "fail",
            detail: `X-Robots-Tag contains restrictive directives: ${directives.join(", ")}`,
            data: { directives, raw: headerValue },
        };
    }
    if (directives.some((d) => d.includes("nofollow") || d.includes("noarchive"))) {
        return {
            id,
            label,
            status: "partial",
            detail: `X-Robots-Tag contains directives: ${directives.join(", ")}`,
            data: { directives, raw: headerValue },
        };
    }
    return {
        id,
        label,
        status: "pass",
        detail: `X-Robots-Tag header present with no restrictive directives`,
        data: { directives, raw: headerValue },
    };
}
|
|
119
|
+
//# sourceMappingURL=meta-robots.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"meta-robots.js","sourceRoot":"","sources":["../../../src/bridges/reachability/meta-robots.ts"],"names":[],"mappings":"AAEA;;;;;;GAMG;AACH,MAAM,UAAU,eAAe,CAAC,IAAY;IAC1C,MAAM,EAAE,GAAG,aAAa,CAAC;IACzB,MAAM,KAAK,GAAG,kBAAkB,CAAC;IAEjC,0CAA0C;IAC1C,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;IAC/D,IAAI,CAAC,SAAS,EAAE,CAAC;QACf,OAAO;YACL,EAAE;YACF,KAAK;YACL,MAAM,EAAE,MAAM;YACd,MAAM,EAAE,uCAAuC;YAC/C,IAAI,EAAE,EAAE,UAAU,EAAE,EAAE,EAAE;SACzB,CAAC;IACJ,CAAC;IAED,MAAM,WAAW,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;IACjC,MAAM,aAAa,GAAa,EAAE,CAAC;IAEnC,sCAAsC;IACtC,MAAM,QAAQ,GACZ,0GAA0G,CAAC;IAE7G,sCAAsC;IACtC,MAAM,QAAQ,GACZ,0GAA0G,CAAC;IAE7G,IAAI,KAA6B,CAAC;IAElC,OAAO,CAAC,KAAK,GAAG,QAAQ,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACrD,MAAM,YAAY,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAC9B,MAAM,UAAU,GAAG,YAAY;aAC5B,KAAK,CAAC,GAAG,CAAC;aACV,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,CAAC;QACtC,aAAa,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;IACpC,CAAC;IAED,OAAO,CAAC,KAAK,GAAG,QAAQ,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACrD,MAAM,YAAY,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAC9B,MAAM,UAAU,GAAG,YAAY;aAC5B,KAAK,CAAC,GAAG,CAAC;aACV,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,CAAC;QACtC,aAAa,CAAC,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC;IACpC,CAAC;IAED,2BAA2B;IAC3B,MAAM,QAAQ,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAE3D,IAAI,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,KAAK,SAAS,CAAC,EAAE,CAAC;QAC1C,OAAO;YACL,EAAE;YACF,KAAK;YACL,MAAM,EAAE,MAAM;YACd,MAAM,EAAE,6CAA6C,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;YAC1E,IAAI,EAAE,EAAE,UAAU,EAAE,QAAQ,EAAE;SAC/B,CAAC;IACJ,CAAC;IAED,IAAI,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,KAAK,UAAU,CAAC,EAAE,CAAC;QAC3C,OAAO;YACL,EAAE;YACF,KAAK;YACL,MAAM,EAAE,SAAS;YACjB,MAAM,EAAE,6CAA6C,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;YAC1E,IAAI,EAAE,EAAE,UAAU,EAAE,QAAQ,EAAE;SAC/B,CAAC;IACJ,CAAC;IAED,OAAO;QACL,EAAE;QACF,KAAK;QACL,MAAM,EAAE,MAAM;QACd,MAAM,
EAAE,uCAAuC;QAC/C,IAAI,EAAE,EAAE,UAAU,EAAE,QAAQ,EAAE;KAC/B,CAAC;AACJ,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,eAAe,CAAC,OAA+B;IAC7D,MAAM,EAAE,GAAG,cAAc,CAAC;IAC1B,MAAM,KAAK,GAAG,qBAAqB,CAAC;IAEpC,MAAM,WAAW,GAAG,OAAO,CAAC,cAAc,CAAC,CAAC;IAC5C,IAAI,CAAC,WAAW,EAAE,CAAC;QACjB,OAAO;YACL,EAAE;YACF,KAAK;YACL,MAAM,EAAE,MAAM;YACd,MAAM,EAAE,wBAAwB;YAChC,IAAI,EAAE,EAAE,UAAU,EAAE,EAAE,EAAE,GAAG,EAAE,EAAE,EAAE;SAClC,CAAC;IACJ,CAAC;IAED,MAAM,UAAU,GAAG,WAAW;SAC3B,KAAK,CAAC,GAAG,CAAC;SACV,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;SAClC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAE/B,IAAI,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,EAAE,CAAC;QAClD,OAAO;YACL,EAAE;YACF,KAAK;YACL,MAAM,EAAE,MAAM;YACd,MAAM,EAAE,iDAAiD,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;YAChF,IAAI,EAAE,EAAE,UAAU,EAAE,GAAG,EAAE,WAAW,EAAE;SACvC,CAAC;IACJ,CAAC;IAED,IACE,UAAU,CAAC,IAAI,CACb,CAAC,CAAC,EAAE,EAAE,CACJ,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,IAAI,CAAC,KAAK,MAAM,CACpE,EACD,CAAC;QACD,OAAO;YACL,EAAE;YACF,KAAK;YACL,MAAM,EAAE,SAAS;YACjB,MAAM,EAAE,qCAAqC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;YACpE,IAAI,EAAE,EAAE,UAAU,EAAE,GAAG,EAAE,WAAW,EAAE;SACvC,CAAC;IACJ,CAAC;IAED,OAAO;QACL,EAAE;QACF,KAAK;QACL,MAAM,EAAE,MAAM;QACd,MAAM,EAAE,4DAA4D;QACpE,IAAI,EAAE,EAAE,UAAU,EAAE,GAAG,EAAE,WAAW,EAAE;KACvC,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/** Structured view of a robots.txt file. */
export interface RobotsTxtResult {
    /** Whether the content could be parsed. */
    parseable: boolean;
    /** Total number of allow/disallow rules across all groups. */
    ruleCount: number;
    /** Rule groups in file order. */
    groups: Array<RobotsGroup>;
    /** Values of every Sitemap directive, in file order. */
    sitemaps: Array<string>;
}
|
|
7
|
+
/** One robots.txt group: a set of user agents and the rules that apply to them. */
export interface RobotsGroup {
    /** User-agent tokens this group applies to. */
    userAgents: Array<string>;
    /** Allow/disallow rules of the group, in file order. */
    rules: Array<RobotsRule>;
}
|
|
11
|
+
/** A single Allow or Disallow line from robots.txt. */
export interface RobotsRule {
    /** Which directive produced the rule. */
    type: "allow" | "disallow";
    /** Path pattern the rule applies to. */
    path: string;
}
|
|
15
|
+
/**
 * Parse the raw text of a robots.txt file into groups, rules and sitemaps.
 *
 * Covers the RFC 9309 edge cases: a leading BOM, CRLF / CR / LF line
 * endings, `#` comments, case-insensitive directive names, case-sensitive
 * paths, and group boundaries.
 *
 * @param content Raw robots.txt text.
 * @returns The parsed structure.
 */
export declare function parseRobotsTxt(content: string): RobotsTxtResult;
|
|
21
|
+
/**
 * Test a URL path against a robots.txt path pattern.
 *
 * Patterns may use `*` as a wildcard and a trailing `$` as an end anchor,
 * per RFC 9309.
 *
 * @param pattern Path pattern taken from an Allow/Disallow rule.
 * @param path    URL path to test.
 * @returns True when the pattern matches the path.
 */
export declare function matchesPath(pattern: string, path: string): boolean;
|
|
26
|
+
//# sourceMappingURL=robots-parser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"robots-parser.d.ts","sourceRoot":"","sources":["../../../src/bridges/reachability/robots-parser.ts"],"names":[],"mappings":"AAGA,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,OAAO,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,MAAM,WAAW,WAAW;IAC1B,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,KAAK,EAAE,UAAU,EAAE,CAAC;CACrB;AAED,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,OAAO,GAAG,UAAU,CAAC;IAC3B,IAAI,EAAE,MAAM,CAAC;CACd;AAED;;;;GAIG;AACH,wBAAgB,cAAc,CAAC,OAAO,EAAE,MAAM,GAAG,eAAe,CAmF/D;AAED;;;GAGG;AACH,wBAAgB,WAAW,CAAC,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CAyBlE"}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
// RFC 9309 robots.txt parser and path matcher
|
|
2
|
+
// Pure logic, no I/O
|
|
3
|
+
/**
 * Convert raw robots.txt text into groups, rules and sitemap entries.
 *
 * Parsing rules:
 *   - a leading BOM is removed (either U+FEFF or its mis-decoded "\xEF\xBB\xBF" form)
 *   - lines end on CRLF, CR or LF; everything after `#` is a comment
 *   - lines without a `:` and unknown directives are ignored
 *   - directive names are case-insensitive; user-agent values are lowercased,
 *     rule paths and sitemap values keep their case
 *   - consecutive User-agent lines share a group; a User-agent line that
 *     follows at least one rule opens a new group
 *   - rules seen before any User-agent line go into a group with no user agents
 *   - Sitemap lines are collected globally, independent of groups
 *
 * @param content Raw robots.txt text.
 * @returns `{ parseable: true, ruleCount, groups, sitemaps }`.
 */
export function parseRobotsTxt(content) {
    const groups = [];
    const sitemaps = [];

    // Both removals are no-ops when the corresponding BOM form is absent.
    const text = content.replace(/^\xEF\xBB\xBF/, "").replace(/^\uFEFF/, "");

    let active = null;
    for (const raw of text.split(/\r\n|\r|\n/)) {
        const hashAt = raw.indexOf("#");
        const stripped = (hashAt === -1 ? raw : raw.slice(0, hashAt)).trim();
        if (stripped === "") {
            continue;
        }

        // Directive name is everything before the first colon.
        const sep = stripped.indexOf(":");
        if (sep === -1) {
            continue;
        }
        const key = stripped.slice(0, sep).trim().toLowerCase();
        const value = stripped.slice(sep + 1).trim();

        if (key === "user-agent") {
            const agent = value.toLowerCase();
            if (active !== null && active.rules.length === 0) {
                // Still in the header of the current group.
                active.userAgents.push(agent);
            }
            else {
                // Either the first group, or the previous one already has rules.
                if (active !== null) {
                    groups.push(active);
                }
                active = { userAgents: [agent], rules: [] };
            }
        }
        else if (key === "allow" || key === "disallow") {
            if (active === null) {
                active = { userAgents: [], rules: [] };
            }
            active.rules.push({ type: key, path: value });
        }
        else if (key === "sitemap") {
            sitemaps.push(value);
        }
    }

    // Flush the group that was still open at end of input.
    if (active !== null) {
        groups.push(active);
    }

    let ruleCount = 0;
    for (const group of groups) {
        ruleCount += group.rules.length;
    }

    return { parseable: true, ruleCount, groups, sitemaps };
}
|
|
79
|
+
/**
 * Check whether a robots.txt path pattern matches a given URL path.
 *
 * Pattern syntax:
 *  - `*` matches any run of characters, including none;
 *  - a `$` as the very last character anchors the match to the end of the
 *    path; a `$` anywhere else is an ordinary character;
 *  - without a trailing `$` the pattern is a prefix match;
 *  - the empty pattern matches every path.
 *
 * The match is done on literal segments between wildcards instead of a
 * generated regular expression. Patterns come from remote robots.txt files,
 * so this avoids both regex-escaping gaps (an unescaped mid-pattern `$` made
 * the old regex unmatchable) and heavy backtracking on patterns with many `*`.
 *
 * @param {string} pattern Path pattern taken from an Allow/Disallow rule.
 * @param {string} path URL path to test (compared case-sensitively).
 * @returns {boolean} `true` when the pattern matches the path.
 */
export function matchesPath(pattern, path) {
    if (pattern === "") {
        return true;
    }
    const anchorEnd = pattern.endsWith("$");
    const body = anchorEnd ? pattern.slice(0, -1) : pattern;
    // Literal pieces between wildcards; N stars yield N + 1 segments.
    const segments = body.split("*");
    const head = segments[0];
    if (!path.startsWith(head)) {
        return false;
    }
    if (segments.length === 1) {
        // No wildcard: plain prefix match, or exact match when anchored.
        return !anchorEnd || path.length === head.length;
    }
    // Take the earliest occurrence of every middle segment; being greedy
    // from the left leaves the most room for the segments that follow.
    let cursor = head.length;
    const lastIndex = segments.length - 1;
    for (let i = 1; i < lastIndex; i++) {
        const segment = segments[i];
        const foundAt = path.indexOf(segment, cursor);
        if (foundAt === -1) {
            return false;
        }
        cursor = foundAt + segment.length;
    }
    const tail = segments[lastIndex];
    if (!anchorEnd) {
        return path.indexOf(tail, cursor) !== -1;
    }
    // Anchored: the tail must be the path's suffix and must not overlap
    // the part already consumed by earlier segments.
    return path.length - tail.length >= cursor && path.endsWith(tail);
}
|
|
105
|
+
//# sourceMappingURL=robots-parser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"robots-parser.js","sourceRoot":"","sources":["../../../src/bridges/reachability/robots-parser.ts"],"names":[],"mappings":"AAAA,8CAA8C;AAC9C,qBAAqB;AAmBrB;;;;GAIG;AACH,MAAM,UAAU,cAAc,CAAC,OAAe;IAC5C,MAAM,MAAM,GAAkB,EAAE,CAAC;IACjC,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,qBAAqB;IACrB,IAAI,IAAI,GAAG,OAAO,CAAC;IACnB,IAAI,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,KAAK,MAAM,IAAI,IAAI,CAAC,UAAU,CAAC,cAAc,CAAC,EAAE,CAAC;QACrE,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,eAAe,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IAClE,CAAC;IAED,8BAA8B;IAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;IAEvC,IAAI,YAAY,GACd,IAAI,CAAC;IAEP,KAAK,MAAM,OAAO,IAAI,KAAK,EAAE,CAAC;QAC5B,6BAA6B;QAC7B,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACxC,MAAM,IAAI,GAAG,CACX,UAAU,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC,CAAC,CAAC,OAAO,CAC7D,CAAC,IAAI,EAAE,CAAC;QAET,IAAI,IAAI,KAAK,EAAE;YAAE,SAAS;QAE1B,yCAAyC;QACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACnC,IAAI,QAAQ,GAAG,CAAC;YAAE,SAAS;QAE3B,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QACnE,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAElD,QAAQ,SAAS,EAAE,CAAC;YAClB,KAAK,YAAY,CAAC,CAAC,CAAC;gBAClB,kDAAkD;gBAClD,IAAI,YAAY,IAAI,YAAY,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAClD,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;oBAC1B,YAAY,GAAG,EAAE,UAAU,EAAE,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;gBAClE,CAAC;qBAAM,IAAI,YAAY,EAAE,CAAC;oBACxB,4CAA4C;oBAC5C,YAAY,CAAC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,CAAC;gBACpD,CAAC;qBAAM,CAAC;oBACN,YAAY,GAAG,EAAE,UAAU,EAAE,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;gBAClE,CAAC;gBACD,MAAM;YACR,CAAC;YAED,KAAK,OAAO,CAAC;YACb,KAAK,UAAU,CAAC,CAAC,CAAC;gBAChB,0DAA0D;gBAC1D,IAAI,CAAC,YAAY,EAAE,CAAC;oBAClB,YAAY,GAAG,EAAE,UAAU,EAAE,EAAE,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;gBAC/C,CAAC;gBACD,YAAY,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;gBAC1D
,MAAM;YACR,CAAC;YAED,KAAK,SAAS,CAAC,CAAC,CAAC;gBACf,wCAAwC;gBACxC,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBACrB,MAAM;YACR,CAAC;YAED;gBACE,gCAAgC;gBAChC,MAAM;QACV,CAAC;IACH,CAAC;IAED,uBAAuB;IACvB,IAAI,YAAY,EAAE,CAAC;QACjB,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;IAC5B,CAAC;IAED,0BAA0B;IAC1B,MAAM,SAAS,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAErE,OAAO;QACL,SAAS,EAAE,IAAI;QACf,SAAS;QACT,MAAM;QACN,QAAQ;KACT,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,WAAW,CAAC,OAAe,EAAE,IAAY;IACvD,sCAAsC;IACtC,IAAI,OAAO,KAAK,EAAE;QAAE,OAAO,IAAI,CAAC;IAEhC,4BAA4B;IAC5B,IAAI,SAAS,GAAG,KAAK,CAAC;IACtB,IAAI,GAAG,GAAG,OAAO,CAAC;IAClB,IAAI,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QACtB,SAAS,GAAG,IAAI,CAAC;QACjB,GAAG,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;IACzB,CAAC;IAED,wEAAwE;IACxE,oCAAoC;IACpC,MAAM,OAAO,GAAG,GAAG,CAAC,OAAO,CAAC,gCAAgC,EAAE,MAAM,CAAC,CAAC;IAEtE,uBAAuB;IACvB,MAAM,QAAQ,GAAG,OAAO,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IAE9C,+CAA+C;IAC/C,MAAM,SAAS,GAAG,SAAS;QACzB,CAAC,CAAC,IAAI,MAAM,CAAC,GAAG,GAAG,QAAQ,GAAG,GAAG,CAAC;QAClC,CAAC,CAAC,IAAI,MAAM,CAAC,GAAG,GAAG,QAAQ,CAAC,CAAC;IAE/B,OAAO,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC9B,CAAC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { Check } from "../../core/types.js";
|
|
2
|
+
import { type RobotsTxtResult } from "./robots-parser.js";
|
|
3
|
+
/** Outcome of the robots.txt check together with the parsed file, if any. */
export interface RobotsTxtCheckResult {
    /** The check entry reported for robots.txt. */
    check: Check;
    /** Parsed robots.txt data, or `null` when no parsed data is available. */
    parsed: RobotsTxtResult | null;
}
|
|
7
|
+
/**
 * Fetches robots.txt for a domain and parses it.
 *
 * @param domain - Domain whose robots.txt is requested.
 * @param timeout - Optional request timeout (unit not stated here; confirm
 *   against the HTTP client).
 * @returns The check entry plus the parsed data, which downstream crawler
 *   policy evaluation consumes.
 */
export declare function checkRobotsTxt(
    domain: string,
    timeout?: number,
): Promise<RobotsTxtCheckResult>;
|
|
14
|
+
//# sourceMappingURL=robots-txt.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"robots-txt.d.ts","sourceRoot":"","sources":["../../../src/bridges/reachability/robots-txt.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,qBAAqB,CAAC;AAEjD,OAAO,EAAkB,KAAK,eAAe,EAAE,MAAM,oBAAoB,CAAC;AAE1E,MAAM,WAAW,oBAAoB;IACnC,KAAK,EAAE,KAAK,CAAC;IACb,MAAM,EAAE,eAAe,GAAG,IAAI,CAAC;CAChC;AAID;;;;;GAKG;AACH,wBAAsB,cAAc,CAClC,MAAM,EAAE,MAAM,EACd,OAAO,CAAC,EAAE,MAAM,GACf,OAAO,CAAC,oBAAoB,CAAC,CA+E/B"}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { httpGet } from "../../utils/http-client.js";
|
|
2
|
+
import { parseRobotsTxt } from "./robots-parser.js";
|
|
3
|
+
// Heuristic for "this body looks like robots.txt": a user-agent, disallow or
// allow keyword followed by optional whitespace and a colon, case-insensitive.
const DIRECTIVE_PATTERN = /\b(user-agent|disallow|allow)\s*:/i;
|
|
4
|
+
/**
 * Request `https://<domain>/robots.txt`, classify the outcome and parse the body.
 *
 * Outcomes:
 *  - HTTP 404                      -> "partial", no parsed data
 *  - any other request failure     -> "fail" with the error message, no parsed data
 *  - body is neither served as text/plain nor contains a robots directive
 *                                  -> "fail", no parsed data
 *  - parsed file has no groups     -> "partial", parsed data included
 *  - otherwise                     -> "pass" with rule count and sitemaps
 *
 * The parsed data is returned next to the check so crawler policy evaluation
 * can reuse it.
 */
export async function checkRobotsTxt(domain, timeout) {
    // All outcomes share id and label; only status and detail vary.
    const makeCheck = (status, detail) => ({
        id: "robots_txt",
        label: "robots.txt",
        status,
        detail,
    });
    const response = await httpGet("https://" + domain + "/robots.txt", { timeout });
    if (!response.ok) {
        const isMissing = response.error.kind === "http_error" &&
            response.error.statusCode === 404;
        const check = isMissing
            ? makeCheck("partial", "No robots.txt found")
            : makeCheck("fail", response.error.message);
        return { check, parsed: null };
    }
    // Accept the body when it is declared as plain text OR at least looks
    // like robots.txt; reject only when both signals are absent.
    const declaredType = (response.headers["content-type"] ?? "").toLowerCase();
    const servedAsText = declaredType.includes("text/plain");
    const looksLikeRobots = DIRECTIVE_PATTERN.test(response.body);
    if (!(servedAsText || looksLikeRobots)) {
        return {
            check: makeCheck("fail", "robots.txt is not a text file"),
            parsed: null,
        };
    }
    const parsed = parseRobotsTxt(response.body);
    const isEmpty = parsed.ruleCount === 0 && parsed.groups.length === 0;
    if (isEmpty) {
        return {
            check: makeCheck("partial", "robots.txt exists but has no rules"),
            parsed,
        };
    }
    return {
        check: {
            ...makeCheck("pass", `robots.txt found with ${parsed.ruleCount} rules`),
            data: { ruleCount: parsed.ruleCount, sitemaps: parsed.sitemaps },
        },
        parsed,
    };
}
|
|
80
|
+
//# sourceMappingURL=robots-txt.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"robots-txt.js","sourceRoot":"","sources":["../../../src/bridges/reachability/robots-txt.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,OAAO,EAAE,MAAM,4BAA4B,CAAC;AACrD,OAAO,EAAE,cAAc,EAAwB,MAAM,oBAAoB,CAAC;AAO1E,MAAM,iBAAiB,GAAG,oCAAoC,CAAC;AAE/D;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,MAAc,EACd,OAAgB;IAEhB,MAAM,EAAE,GAAG,YAAY,CAAC;IACxB,MAAM,KAAK,GAAG,YAAY,CAAC;IAE3B,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,UAAU,GAAG,MAAM,GAAG,aAAa,EAAE;QAChE,OAAO;KACR,CAAC,CAAC;IAEH,IAAI,CAAC,MAAM,CAAC,EAAE,EAAE,CAAC;QACf,sBAAsB;QACtB,IACE,MAAM,CAAC,KAAK,CAAC,IAAI,KAAK,YAAY;YAClC,MAAM,CAAC,KAAK,CAAC,UAAU,KAAK,GAAG,EAC/B,CAAC;YACD,OAAO;gBACL,KAAK,EAAE;oBACL,EAAE;oBACF,KAAK;oBACL,MAAM,EAAE,SAAS;oBACjB,MAAM,EAAE,qBAAqB;iBAC9B;gBACD,MAAM,EAAE,IAAI;aACb,CAAC;QACJ,CAAC;QAED,eAAe;QACf,OAAO;YACL,KAAK,EAAE;gBACL,EAAE;gBACF,KAAK;gBACL,MAAM,EAAE,MAAM;gBACd,MAAM,EAAE,MAAM,CAAC,KAAK,CAAC,OAAO;aAC7B;YACD,MAAM,EAAE,IAAI;SACb,CAAC;IACJ,CAAC;IAED,mCAAmC;IACnC,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;IACzD,MAAM,WAAW,GAAG,WAAW,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC;IACrE,MAAM,aAAa,GAAG,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE1D,IAAI,CAAC,WAAW,IAAI,CAAC,aAAa,EAAE,CAAC;QACnC,OAAO;YACL,KAAK,EAAE;gBACL,EAAE;gBACF,KAAK;gBACL,MAAM,EAAE,MAAM;gBACd,MAAM,EAAE,+BAA+B;aACxC;YACD,MAAM,EAAE,IAAI;SACb,CAAC;IACJ,CAAC;IAED,QAAQ;IACR,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE3C,IAAI,MAAM,CAAC,SAAS,KAAK,CAAC,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzD,OAAO;YACL,KAAK,EAAE;gBACL,EAAE;gBACF,KAAK;gBACL,MAAM,EAAE,SAAS;gBACjB,MAAM,EAAE,oCAAoC;aAC7C;YACD,MAAM;SACP,CAAC;IACJ,CAAC;IAED,OAAO;QACL,KAAK,EAAE;YACL,EAAE;YACF,KAAK;YACL,MAAM,EAAE,MAAM;YACd,MAAM,EAAE,yBAAyB,MAAM,CAAC,SAAS,QAAQ;YACzD,IAAI,EAAE,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE;SACjE;QACD,MAAM;KACP,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { Check, ContentSource } from "../../core/types.js";
|
|
2
|
+
/**
 * Decides whether a site exposes an API by combining several signals.
 *
 * Signals considered:
 *  1. an OpenAPI spec already detected by Bridge 2 (`openApiDetected`,
 *     taken from `ctx.shared.openApiDetected`);
 *  2. API-related response headers such as `X-RateLimit-*` or `X-Request-Id`;
 *  3. HTML links with `/api/` paths, scanned across all content sources;
 *  4. Markdown links with `/api/` paths, scanned across all content sources.
 *
 * This is a pure function and performs no HTTP calls.
 *
 * @param openApiDetected - Whether an OpenAPI spec was detected earlier.
 * @param sources - Content sources scanned for `/api/` links.
 * @param headers - Response headers inspected for API-related entries.
 * @returns The resulting check entry.
 */
export declare function checkApiPresence(
    openApiDetected: boolean,
    sources: ContentSource[],
    headers: Record<string, string>,
): Check;
|
|
14
|
+
//# sourceMappingURL=api-presence.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"api-presence.d.ts","sourceRoot":"","sources":["../../../src/bridges/separation/api-presence.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AA8ChE;;;;;;;;;;GAUG;AACH,wBAAgB,gBAAgB,CAC9B,eAAe,EAAE,OAAO,EACxB,OAAO,EAAE,aAAa,EAAE,EACxB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAC9B,KAAK,CA0CP"}
|