webmcp-cli 1.2.2 → 1.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/analysis/form-to-tool-mapper.d.ts +61 -0
- package/dist/analysis/form-to-tool-mapper.js +360 -0
- package/dist/analysis/form-to-tool-mapper.js.map +1 -0
- package/dist/analysis/index.d.ts +84 -0
- package/dist/analysis/index.js +81 -0
- package/dist/analysis/index.js.map +1 -0
- package/dist/analysis/missing-tool-analyzer.d.ts +35 -0
- package/dist/analysis/missing-tool-analyzer.js +617 -0
- package/dist/analysis/missing-tool-analyzer.js.map +1 -0
- package/dist/audit/run-multi-page-audit.d.ts +34 -0
- package/dist/audit/run-multi-page-audit.js +233 -0
- package/dist/audit/run-multi-page-audit.js.map +1 -0
- package/dist/cli/commands/potential.d.ts +8 -0
- package/dist/cli/commands/potential.js +323 -0
- package/dist/cli/commands/potential.js.map +1 -0
- package/dist/cli/commands/report.d.ts +12 -0
- package/dist/cli/commands/report.js +89 -0
- package/dist/cli/commands/report.js.map +1 -0
- package/dist/cli/index.js +35 -0
- package/dist/cli/index.js.map +1 -1
- package/dist/config/defaults.d.ts +36 -0
- package/dist/config/defaults.js +33 -0
- package/dist/config/defaults.js.map +1 -0
- package/dist/config/index.d.ts +7 -0
- package/dist/config/index.js +7 -0
- package/dist/config/index.js.map +1 -0
- package/dist/config/loader.d.ts +22 -0
- package/dist/config/loader.js +91 -0
- package/dist/config/loader.js.map +1 -0
- package/dist/config/schema.d.ts +280 -0
- package/dist/config/schema.js +42 -0
- package/dist/config/schema.js.map +1 -0
- package/dist/core/types/audit.d.ts +1 -1
- package/dist/core/types/index.d.ts +1 -0
- package/dist/core/types/index.js +1 -0
- package/dist/core/types/index.js.map +1 -1
- package/dist/core/types/recon.d.ts +265 -0
- package/dist/core/types/recon.js +5 -0
- package/dist/core/types/recon.js.map +1 -0
- package/dist/core/types/rule.d.ts +1 -1
- package/dist/core/types/rule.js +7 -5
- package/dist/core/types/rule.js.map +1 -1
- package/dist/crawler/depth-crawler.d.ts +29 -0
- package/dist/crawler/depth-crawler.js +212 -0
- package/dist/crawler/depth-crawler.js.map +1 -0
- package/dist/crawler/index.d.ts +2 -0
- package/dist/crawler/index.js +3 -0
- package/dist/crawler/index.js.map +1 -0
- package/dist/crawler/link-extractor.d.ts +1 -0
- package/dist/crawler/link-extractor.js +49 -0
- package/dist/crawler/link-extractor.js.map +1 -0
- package/dist/generators/index.d.ts +10 -0
- package/dist/generators/index.js +8 -0
- package/dist/generators/index.js.map +1 -0
- package/dist/generators/report-html.d.ts +12 -0
- package/dist/generators/report-html.js +470 -0
- package/dist/generators/report-html.js.map +1 -0
- package/dist/generators/report-json.d.ts +95 -0
- package/dist/generators/report-json.js +144 -0
- package/dist/generators/report-json.js.map +1 -0
- package/dist/generators/report-manager.d.ts +31 -0
- package/dist/generators/report-manager.js +208 -0
- package/dist/generators/report-manager.js.map +1 -0
- package/dist/generators/tool-code-generator.d.ts +31 -0
- package/dist/generators/tool-code-generator.js +201 -0
- package/dist/generators/tool-code-generator.js.map +1 -0
- package/dist/potential/ai-recommender.d.ts +33 -0
- package/dist/potential/ai-recommender.js +414 -0
- package/dist/potential/ai-recommender.js.map +1 -0
- package/dist/potential/analyzer.d.ts +32 -0
- package/dist/potential/analyzer.js +383 -0
- package/dist/potential/analyzer.js.map +1 -0
- package/dist/potential/index.d.ts +3 -0
- package/dist/potential/index.js +4 -0
- package/dist/potential/index.js.map +1 -0
- package/dist/potential/prompts.d.ts +20 -0
- package/dist/potential/prompts.js +42 -0
- package/dist/potential/prompts.js.map +1 -0
- package/dist/potential/types.d.ts +40 -0
- package/dist/potential/types.js +2 -0
- package/dist/potential/types.js.map +1 -0
- package/dist/recon/index.d.ts +20 -0
- package/dist/recon/index.js +143 -0
- package/dist/recon/index.js.map +1 -0
- package/dist/recon/manifest.d.ts +16 -0
- package/dist/recon/manifest.js +108 -0
- package/dist/recon/manifest.js.map +1 -0
- package/dist/recon/meta-extractor.d.ts +11 -0
- package/dist/recon/meta-extractor.js +276 -0
- package/dist/recon/meta-extractor.js.map +1 -0
- package/dist/recon/robots.d.ts +16 -0
- package/dist/recon/robots.js +158 -0
- package/dist/recon/robots.js.map +1 -0
- package/dist/recon/route-discovery.d.ts +25 -0
- package/dist/recon/route-discovery.js +303 -0
- package/dist/recon/route-discovery.js.map +1 -0
- package/dist/recon/sitemap.d.ts +12 -0
- package/dist/recon/sitemap.js +177 -0
- package/dist/recon/sitemap.js.map +1 -0
- package/dist/rules/accessibility/AXE-001.d.ts +9 -0
- package/dist/rules/accessibility/AXE-001.js +109 -0
- package/dist/rules/accessibility/AXE-001.js.map +1 -0
- package/dist/rules/accessibility/AXE-002.d.ts +8 -0
- package/dist/rules/accessibility/AXE-002.js +85 -0
- package/dist/rules/accessibility/AXE-002.js.map +1 -0
- package/dist/rules/accessibility/AXE-003.d.ts +8 -0
- package/dist/rules/accessibility/AXE-003.js +94 -0
- package/dist/rules/accessibility/AXE-003.js.map +1 -0
- package/dist/rules/accessibility/AXE-004.d.ts +8 -0
- package/dist/rules/accessibility/AXE-004.js +101 -0
- package/dist/rules/accessibility/AXE-004.js.map +1 -0
- package/dist/rules/accessibility/AXE-005.d.ts +9 -0
- package/dist/rules/accessibility/AXE-005.js +89 -0
- package/dist/rules/accessibility/AXE-005.js.map +1 -0
- package/dist/rules/best-practices/BP-004.d.ts +9 -0
- package/dist/rules/best-practices/BP-004.js +96 -0
- package/dist/rules/best-practices/BP-004.js.map +1 -0
- package/dist/rules/best-practices/BP-005.d.ts +8 -0
- package/dist/rules/best-practices/BP-005.js +94 -0
- package/dist/rules/best-practices/BP-005.js.map +1 -0
- package/dist/rules/best-practices/BP-006.d.ts +8 -0
- package/dist/rules/best-practices/BP-006.js +80 -0
- package/dist/rules/best-practices/BP-006.js.map +1 -0
- package/dist/rules/best-practices/BP-007.d.ts +8 -0
- package/dist/rules/best-practices/BP-007.js +92 -0
- package/dist/rules/best-practices/BP-007.js.map +1 -0
- package/dist/rules/best-practices/BP-008.d.ts +12 -0
- package/dist/rules/best-practices/BP-008.js +86 -0
- package/dist/rules/best-practices/BP-008.js.map +1 -0
- package/dist/rules/best-practices/BP-009.d.ts +9 -0
- package/dist/rules/best-practices/BP-009.js +77 -0
- package/dist/rules/best-practices/BP-009.js.map +1 -0
- package/dist/rules/best-practices/BP-010.d.ts +8 -0
- package/dist/rules/best-practices/BP-010.js +85 -0
- package/dist/rules/best-practices/BP-010.js.map +1 -0
- package/dist/rules/coverage/COV-002.d.ts +8 -0
- package/dist/rules/coverage/COV-002.js +68 -0
- package/dist/rules/coverage/COV-002.js.map +1 -0
- package/dist/rules/coverage/COV-003.d.ts +8 -0
- package/dist/rules/coverage/COV-003.js +68 -0
- package/dist/rules/coverage/COV-003.js.map +1 -0
- package/dist/rules/coverage/COV-004.d.ts +8 -0
- package/dist/rules/coverage/COV-004.js +89 -0
- package/dist/rules/coverage/COV-004.js.map +1 -0
- package/dist/rules/coverage/COV-005.d.ts +8 -0
- package/dist/rules/coverage/COV-005.js +67 -0
- package/dist/rules/coverage/COV-005.js.map +1 -0
- package/dist/rules/coverage/COV-006.d.ts +9 -0
- package/dist/rules/coverage/COV-006.js +76 -0
- package/dist/rules/coverage/COV-006.js.map +1 -0
- package/dist/rules/coverage/COV-007.d.ts +8 -0
- package/dist/rules/coverage/COV-007.js +67 -0
- package/dist/rules/coverage/COV-007.js.map +1 -0
- package/dist/rules/coverage/COV-008.d.ts +9 -0
- package/dist/rules/coverage/COV-008.js +87 -0
- package/dist/rules/coverage/COV-008.js.map +1 -0
- package/dist/rules/coverage/COV-009.d.ts +8 -0
- package/dist/rules/coverage/COV-009.js +73 -0
- package/dist/rules/coverage/COV-009.js.map +1 -0
- package/dist/rules/coverage/COV-010.d.ts +9 -0
- package/dist/rules/coverage/COV-010.js +82 -0
- package/dist/rules/coverage/COV-010.js.map +1 -0
- package/dist/rules/description/DESC-001.d.ts +9 -0
- package/dist/rules/description/DESC-001.js +88 -0
- package/dist/rules/description/DESC-001.js.map +1 -0
- package/dist/rules/description/DESC-002.d.ts +10 -0
- package/dist/rules/description/DESC-002.js +99 -0
- package/dist/rules/description/DESC-002.js.map +1 -0
- package/dist/rules/description/DESC-006.d.ts +9 -0
- package/dist/rules/description/DESC-006.js +78 -0
- package/dist/rules/description/DESC-006.js.map +1 -0
- package/dist/rules/description/DESC-007.d.ts +9 -0
- package/dist/rules/description/DESC-007.js +70 -0
- package/dist/rules/description/DESC-007.js.map +1 -0
- package/dist/rules/description/DESC-008.d.ts +9 -0
- package/dist/rules/description/DESC-008.js +70 -0
- package/dist/rules/description/DESC-008.js.map +1 -0
- package/dist/rules/description/DESC-009.d.ts +8 -0
- package/dist/rules/description/DESC-009.js +55 -0
- package/dist/rules/description/DESC-009.js.map +1 -0
- package/dist/rules/description/DESC-010.d.ts +9 -0
- package/dist/rules/description/DESC-010.js +92 -0
- package/dist/rules/description/DESC-010.js.map +1 -0
- package/dist/rules/description/DESC-011.d.ts +9 -0
- package/dist/rules/description/DESC-011.js +81 -0
- package/dist/rules/description/DESC-011.js.map +1 -0
- package/dist/rules/description/DESC-012.d.ts +9 -0
- package/dist/rules/description/DESC-012.js +98 -0
- package/dist/rules/description/DESC-012.js.map +1 -0
- package/dist/rules/implementation/IMP-002.d.ts +9 -0
- package/dist/rules/implementation/IMP-002.js +59 -0
- package/dist/rules/implementation/IMP-002.js.map +1 -0
- package/dist/rules/implementation/IMP-006.d.ts +9 -0
- package/dist/rules/implementation/IMP-006.js +48 -0
- package/dist/rules/implementation/IMP-006.js.map +1 -0
- package/dist/rules/implementation/IMP-008.d.ts +9 -0
- package/dist/rules/implementation/IMP-008.js +46 -0
- package/dist/rules/implementation/IMP-008.js.map +1 -0
- package/dist/rules/implementation/IMP-009.d.ts +9 -0
- package/dist/rules/implementation/IMP-009.js +48 -0
- package/dist/rules/implementation/IMP-009.js.map +1 -0
- package/dist/rules/implementation/IMP-010.d.ts +9 -0
- package/dist/rules/implementation/IMP-010.js +66 -0
- package/dist/rules/implementation/IMP-010.js.map +1 -0
- package/dist/rules/implementation/IMP-011.d.ts +9 -0
- package/dist/rules/implementation/IMP-011.js +82 -0
- package/dist/rules/implementation/IMP-011.js.map +1 -0
- package/dist/rules/implementation/IMP-012.d.ts +9 -0
- package/dist/rules/implementation/IMP-012.js +88 -0
- package/dist/rules/implementation/IMP-012.js.map +1 -0
- package/dist/rules/implementation/IMP-014.d.ts +9 -0
- package/dist/rules/implementation/IMP-014.js +58 -0
- package/dist/rules/implementation/IMP-014.js.map +1 -0
- package/dist/rules/implementation/IMP-015.d.ts +9 -0
- package/dist/rules/implementation/IMP-015.js +64 -0
- package/dist/rules/implementation/IMP-015.js.map +1 -0
- package/dist/rules/implementation/IMP-016.d.ts +9 -0
- package/dist/rules/implementation/IMP-016.js +52 -0
- package/dist/rules/implementation/IMP-016.js.map +1 -0
- package/dist/rules/implementation/IMP-017.d.ts +8 -0
- package/dist/rules/implementation/IMP-017.js +51 -0
- package/dist/rules/implementation/IMP-017.js.map +1 -0
- package/dist/rules/implementation/IMP-018.d.ts +8 -0
- package/dist/rules/implementation/IMP-018.js +52 -0
- package/dist/rules/implementation/IMP-018.js.map +1 -0
- package/dist/rules/implementation/IMP-019.d.ts +8 -0
- package/dist/rules/implementation/IMP-019.js +53 -0
- package/dist/rules/implementation/IMP-019.js.map +1 -0
- package/dist/rules/implementation/IMP-020.d.ts +9 -0
- package/dist/rules/implementation/IMP-020.js +62 -0
- package/dist/rules/implementation/IMP-020.js.map +1 -0
- package/dist/rules/implementation/IMP-021.d.ts +8 -0
- package/dist/rules/implementation/IMP-021.js +64 -0
- package/dist/rules/implementation/IMP-021.js.map +1 -0
- package/dist/rules/implementation/IMP-022.d.ts +8 -0
- package/dist/rules/implementation/IMP-022.js +70 -0
- package/dist/rules/implementation/IMP-022.js.map +1 -0
- package/dist/rules/index.d.ts +73 -6
- package/dist/rules/index.js +141 -6
- package/dist/rules/index.js.map +1 -1
- package/dist/rules/schema/SCHEMA-004.d.ts +9 -0
- package/dist/rules/schema/SCHEMA-004.js +57 -0
- package/dist/rules/schema/SCHEMA-004.js.map +1 -0
- package/dist/rules/schema/SCHEMA-005.d.ts +9 -0
- package/dist/rules/schema/SCHEMA-005.js +61 -0
- package/dist/rules/schema/SCHEMA-005.js.map +1 -0
- package/dist/rules/schema/SCHEMA-006.d.ts +10 -0
- package/dist/rules/schema/SCHEMA-006.js +85 -0
- package/dist/rules/schema/SCHEMA-006.js.map +1 -0
- package/dist/rules/schema/SCHEMA-007.d.ts +9 -0
- package/dist/rules/schema/SCHEMA-007.js +73 -0
- package/dist/rules/schema/SCHEMA-007.js.map +1 -0
- package/dist/rules/schema/SCHEMA-008.d.ts +9 -0
- package/dist/rules/schema/SCHEMA-008.js +70 -0
- package/dist/rules/schema/SCHEMA-008.js.map +1 -0
- package/dist/rules/schema/SCHEMA-009.d.ts +10 -0
- package/dist/rules/schema/SCHEMA-009.js +80 -0
- package/dist/rules/schema/SCHEMA-009.js.map +1 -0
- package/dist/rules/schema/SCHEMA-010.d.ts +9 -0
- package/dist/rules/schema/SCHEMA-010.js +96 -0
- package/dist/rules/schema/SCHEMA-010.js.map +1 -0
- package/dist/rules/schema/SCHEMA-012.d.ts +9 -0
- package/dist/rules/schema/SCHEMA-012.js +65 -0
- package/dist/rules/schema/SCHEMA-012.js.map +1 -0
- package/dist/rules/security/SEC-002.d.ts +8 -0
- package/dist/rules/security/SEC-002.js +81 -0
- package/dist/rules/security/SEC-002.js.map +1 -0
- package/dist/rules/security/SEC-003.d.ts +8 -0
- package/dist/rules/security/SEC-003.js +85 -0
- package/dist/rules/security/SEC-003.js.map +1 -0
- package/dist/rules/security/SEC-004.d.ts +9 -0
- package/dist/rules/security/SEC-004.js +87 -0
- package/dist/rules/security/SEC-004.js.map +1 -0
- package/dist/rules/security/SEC-005.d.ts +8 -0
- package/dist/rules/security/SEC-005.js +87 -0
- package/dist/rules/security/SEC-005.js.map +1 -0
- package/dist/rules/security/SEC-006.d.ts +10 -0
- package/dist/rules/security/SEC-006.js +108 -0
- package/dist/rules/security/SEC-006.js.map +1 -0
- package/dist/rules/security/SEC-007.d.ts +9 -0
- package/dist/rules/security/SEC-007.js +108 -0
- package/dist/rules/security/SEC-007.js.map +1 -0
- package/dist/rules/security/SEC-008.d.ts +8 -0
- package/dist/rules/security/SEC-008.js +109 -0
- package/dist/rules/security/SEC-008.js.map +1 -0
- package/dist/rules/security/SEC-009.d.ts +9 -0
- package/dist/rules/security/SEC-009.js +93 -0
- package/dist/rules/security/SEC-009.js.map +1 -0
- package/dist/rules/security/SEC-010.d.ts +8 -0
- package/dist/rules/security/SEC-010.js +78 -0
- package/dist/rules/security/SEC-010.js.map +1 -0
- package/dist/rules/security/SEC-011.d.ts +8 -0
- package/dist/rules/security/SEC-011.js +93 -0
- package/dist/rules/security/SEC-011.js.map +1 -0
- package/dist/rules/security/SEC-012.d.ts +8 -0
- package/dist/rules/security/SEC-012.js +79 -0
- package/dist/rules/security/SEC-012.js.map +1 -0
- package/dist/rules/security/SEC-013.d.ts +9 -0
- package/dist/rules/security/SEC-013.js +107 -0
- package/dist/rules/security/SEC-013.js.map +1 -0
- package/dist/scoring/calculator.js +1 -0
- package/dist/scoring/calculator.js.map +1 -1
- package/dist/ui/ink/components/AIRecommendationCard.d.ts +11 -0
- package/dist/ui/ink/components/AIRecommendationCard.js +23 -0
- package/dist/ui/ink/components/AIRecommendationCard.js.map +1 -0
- package/dist/ui/ink/components/OpportunityList.d.ts +10 -0
- package/dist/ui/ink/components/OpportunityList.js +48 -0
- package/dist/ui/ink/components/OpportunityList.js.map +1 -0
- package/dist/ui/ink/components/PotentialPageCard.d.ts +13 -0
- package/dist/ui/ink/components/PotentialPageCard.js +43 -0
- package/dist/ui/ink/components/PotentialPageCard.js.map +1 -0
- package/dist/ui/ink/components/PotentialProgress.d.ts +16 -0
- package/dist/ui/ink/components/PotentialProgress.js +44 -0
- package/dist/ui/ink/components/PotentialProgress.js.map +1 -0
- package/dist/ui/ink/components/PotentialSummary.d.ts +10 -0
- package/dist/ui/ink/components/PotentialSummary.js +86 -0
- package/dist/ui/ink/components/PotentialSummary.js.map +1 -0
- package/dist/ui/ink/components/SuggestionCard.d.ts +34 -0
- package/dist/ui/ink/components/SuggestionCard.js +36 -0
- package/dist/ui/ink/components/SuggestionCard.js.map +1 -0
- package/dist/ui/ink/components/views/MultiPageCrawlView.d.ts +21 -0
- package/dist/ui/ink/components/views/MultiPageCrawlView.js +55 -0
- package/dist/ui/ink/components/views/MultiPageCrawlView.js.map +1 -0
- package/dist/ui/ink/components/views/PotentialView.d.ts +18 -0
- package/dist/ui/ink/components/views/PotentialView.js +74 -0
- package/dist/ui/ink/components/views/PotentialView.js.map +1 -0
- package/dist/ui/ink/components/views/ReconView.d.ts +22 -0
- package/dist/ui/ink/components/views/ReconView.js +30 -0
- package/dist/ui/ink/components/views/ReconView.js.map +1 -0
- package/package.json +2 -1
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Reconnaissance Module
|
|
3
|
+
*
|
|
4
|
+
* Orchestrates all recon sub-tasks: sitemap, robots.txt, manifest,
|
|
5
|
+
* meta extraction, and route discovery. Runs as many tasks in parallel
|
|
6
|
+
* as possible to minimize wall-clock time.
|
|
7
|
+
*/
|
|
8
|
+
import { parseSitemap } from './sitemap.js';
|
|
9
|
+
import { parseRobots } from './robots.js';
|
|
10
|
+
import { parseManifest } from './manifest.js';
|
|
11
|
+
import { extractMeta } from './meta-extractor.js';
|
|
12
|
+
import { discoverRoutes } from './route-discovery.js';
|
|
13
|
+
/** Fetch timeout (ms) */
|
|
14
|
+
const ROOT_FETCH_TIMEOUT_MS = 15_000;
|
|
15
|
+
/**
 * Download the root document with a plain HTTP request (no browser involved).
 *
 * Redirects are followed. The whole operation, including reading the body,
 * is aborted once ROOT_FETCH_TIMEOUT_MS has elapsed.
 *
 * @param url - Address to request.
 * @returns An object with:
 *   - resolvedUrl: the response's final URL (after redirects)
 *   - statusCode: the HTTP status of the response
 *   - responseTime: milliseconds from just before the request until fetch()
 *     resolved, i.e. measured before the body is read
 *   - html: the response body as text
 * @throws Whatever fetch() or response.text() rejects with, including the
 *   abort error raised when the timeout fires.
 */
async function fetchRootPage(url) {
    const requestHeaders = {
        'User-Agent': 'Mozilla/5.0 (compatible; WebMCP-CLI/1.0; +https://webmcp.org)',
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    };
    const abortSwitch = new AbortController();
    const deadline = setTimeout(() => abortSwitch.abort(), ROOT_FETCH_TIMEOUT_MS);
    const startedAt = Date.now();
    try {
        const res = await fetch(url, {
            signal: abortSwitch.signal,
            headers: requestHeaders,
            redirect: 'follow',
        });
        // Capture latency before the (possibly large) body is consumed.
        const elapsedMs = Date.now() - startedAt;
        const body = await res.text();
        return {
            resolvedUrl: res.url,
            statusCode: res.status,
            responseTime: elapsedMs,
            html: body,
        };
    }
    finally {
        // Always disarm the timer so it cannot keep the process alive.
        clearTimeout(deadline);
    }
}
|
|
45
|
+
/**
|
|
46
|
+
* Run full reconnaissance on a target URL.
|
|
47
|
+
*
|
|
48
|
+
* Gathers sitemap, robots.txt, manifest, meta tags, and discovers
|
|
49
|
+
* routes — all without launching a browser.
|
|
50
|
+
*/
|
|
51
|
+
export async function runReconnaissance(url) {
|
|
52
|
+
const errors = [];
|
|
53
|
+
// Step 1: Fetch root page HTML + robots + sitemap in parallel
|
|
54
|
+
// (robots.txt is needed for route filtering, and may reference sitemaps)
|
|
55
|
+
const [rootResult, robotsResult] = await Promise.allSettled([
|
|
56
|
+
fetchRootPage(url),
|
|
57
|
+
parseRobots(url),
|
|
58
|
+
]);
|
|
59
|
+
// Handle root page failure (fatal)
|
|
60
|
+
if (rootResult.status === 'rejected') {
|
|
61
|
+
const errMsg = rootResult.reason instanceof Error
|
|
62
|
+
? rootResult.reason.message
|
|
63
|
+
: 'Failed to fetch root page';
|
|
64
|
+
return {
|
|
65
|
+
url,
|
|
66
|
+
resolvedUrl: url,
|
|
67
|
+
statusCode: 0,
|
|
68
|
+
responseTime: 0,
|
|
69
|
+
sitemap: null,
|
|
70
|
+
robots: null,
|
|
71
|
+
manifest: null,
|
|
72
|
+
meta: null,
|
|
73
|
+
discoveredRoutes: [],
|
|
74
|
+
rootHtml: '',
|
|
75
|
+
errors: [errMsg],
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
const { resolvedUrl, statusCode, responseTime, html } = rootResult.value;
|
|
79
|
+
const robots = robotsResult.status === 'fulfilled'
|
|
80
|
+
? robotsResult.value
|
|
81
|
+
: null;
|
|
82
|
+
if (robotsResult.status === 'rejected') {
|
|
83
|
+
errors.push(`robots.txt: ${robotsResult.reason instanceof Error ? robotsResult.reason.message : 'Failed'}`);
|
|
84
|
+
}
|
|
85
|
+
// Step 2: Determine sitemap URL(s)
|
|
86
|
+
// Check robots.txt for sitemap references first
|
|
87
|
+
const sitemapUrlsFromRobots = robots?.sitemapUrls ?? [];
|
|
88
|
+
// Step 3: Fetch sitemap + manifest + extract meta in parallel
|
|
89
|
+
const [sitemapResult, manifestResult] = await Promise.allSettled([
|
|
90
|
+
sitemapUrlsFromRobots.length > 0
|
|
91
|
+
? parseSitemap(url, sitemapUrlsFromRobots[0])
|
|
92
|
+
: parseSitemap(url),
|
|
93
|
+
parseManifest(url, html),
|
|
94
|
+
]);
|
|
95
|
+
const sitemap = sitemapResult.status === 'fulfilled'
|
|
96
|
+
? sitemapResult.value
|
|
97
|
+
: null;
|
|
98
|
+
const manifest = manifestResult.status === 'fulfilled'
|
|
99
|
+
? manifestResult.value
|
|
100
|
+
: null;
|
|
101
|
+
if (sitemapResult.status === 'rejected') {
|
|
102
|
+
errors.push(`sitemap: ${sitemapResult.reason instanceof Error ? sitemapResult.reason.message : 'Failed'}`);
|
|
103
|
+
}
|
|
104
|
+
if (manifestResult.status === 'rejected') {
|
|
105
|
+
errors.push(`manifest: ${manifestResult.reason instanceof Error ? manifestResult.reason.message : 'Failed'}`);
|
|
106
|
+
}
|
|
107
|
+
// Step 4: Extract meta (synchronous cheerio operation)
|
|
108
|
+
let meta = null;
|
|
109
|
+
try {
|
|
110
|
+
meta = extractMeta(html, resolvedUrl);
|
|
111
|
+
}
|
|
112
|
+
catch (err) {
|
|
113
|
+
errors.push(`meta: ${err instanceof Error ? err.message : 'Failed'}`);
|
|
114
|
+
}
|
|
115
|
+
// Step 5: Discover and score routes
|
|
116
|
+
const discoveredRoutes = discoverRoutes({
|
|
117
|
+
baseUrl: resolvedUrl,
|
|
118
|
+
rootHtml: html,
|
|
119
|
+
sitemapEntries: sitemap?.entries ?? [],
|
|
120
|
+
robotsDirectives: robots?.directives ?? [],
|
|
121
|
+
respectRobotsTxt: true,
|
|
122
|
+
});
|
|
123
|
+
return {
|
|
124
|
+
url,
|
|
125
|
+
resolvedUrl,
|
|
126
|
+
statusCode,
|
|
127
|
+
responseTime,
|
|
128
|
+
sitemap,
|
|
129
|
+
robots,
|
|
130
|
+
manifest,
|
|
131
|
+
meta,
|
|
132
|
+
discoveredRoutes,
|
|
133
|
+
rootHtml: html,
|
|
134
|
+
errors,
|
|
135
|
+
};
|
|
136
|
+
}
|
|
137
|
+
// Re-export sub-modules for direct use
|
|
138
|
+
export { parseSitemap } from './sitemap.js';
|
|
139
|
+
export { parseRobots, isPathAllowed } from './robots.js';
|
|
140
|
+
export { parseManifest } from './manifest.js';
|
|
141
|
+
export { extractMeta } from './meta-extractor.js';
|
|
142
|
+
export { discoverRoutes } from './route-discovery.js';
|
|
143
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/recon/index.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAGH,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAC9C,OAAO,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAClD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AAEtD,yBAAyB;AACzB,MAAM,qBAAqB,GAAG,MAAM,CAAC;AAErC;;;GAGG;AACH,KAAK,UAAU,aAAa,CAAC,GAAW;IAMtC,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,qBAAqB,CAAC,CAAC;IAE1E,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAChC,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,OAAO,EAAE;gBACP,YAAY,EACV,+DAA+D;gBACjE,MAAM,EACJ,iEAAiE;aACpE;YACD,QAAQ,EAAE,QAAQ;SACnB,CAAC,CAAC;QACH,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC;QACxC,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QAEnC,OAAO;YACL,WAAW,EAAE,QAAQ,CAAC,GAAG;YACzB,UAAU,EAAE,QAAQ,CAAC,MAAM;YAC3B,YAAY;YACZ,IAAI;SACL,CAAC;IACJ,CAAC;YAAS,CAAC;QACT,YAAY,CAAC,KAAK,CAAC,CAAC;IACtB,CAAC;AACH,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,GAAW;IACjD,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,8DAA8D;IAC9D,yEAAyE;IACzE,MAAM,CAAC,UAAU,EAAE,YAAY,CAAC,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC;QAC1D,aAAa,CAAC,GAAG,CAAC;QAClB,WAAW,CAAC,GAAG,CAAC;KACjB,CAAC,CAAC;IAEH,mCAAmC;IACnC,IAAI,UAAU,CAAC,MAAM,KAAK,UAAU,EAAE,CAAC;QACrC,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,YAAY,KAAK;YAC/C,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,OAAO;YAC3B,CAAC,CAAC,2BAA2B,CAAC;QAChC,OAAO;YACL,GAAG;YACH,WAAW,EAAE,GAAG;YAChB,UAAU,EAAE,CAAC;YACb,YAAY,EAAE,CAAC;YACf,OAAO,EAAE,IAAI;YACb,MAAM,EAAE,IAAI;YACZ,QAAQ,EAAE,IAAI;YACd,IAAI,EAAE,IAAI;YACV,gBAAgB,EAAE,EAAE;YACpB,QAAQ,EAAE,EAAE;YACZ,MAAM,EAAE,CAAC,MAAM,CAAC;SACjB,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,WAAW,EAAE,UAAU,EAAE,YAAY,EAAE,IAAI,EAAE,GAAG,UAAU,CAAC,KAAK,CAAC;IACzE,MAAM,MAAM,GAAG,YAAY,CAAC,MAAM,KAAK,WAAW;QAChD,CAAC,CAAC,YAAY,CAAC,KAAK;QACpB,CAAC,CAAC,IAAI,CAAC;IAET,IAAI,YAAY,CAAC,MAAM,KAAK,UAAU,EAAE,CAAC;QACvC,MAAM,CAAC,IAAI,CA
AC,eAAe,YAAY,CAAC,MAAM,YAAY,KAAK,CAAC,CAAC,CAAC,YAAY,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;IAC9G,CAAC;IAED,mCAAmC;IACnC,gDAAgD;IAChD,MAAM,qBAAqB,GAAG,MAAM,EAAE,WAAW,IAAI,EAAE,CAAC;IAExD,8DAA8D;IAC9D,MAAM,CAAC,aAAa,EAAE,cAAc,CAAC,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC;QAC/D,qBAAqB,CAAC,MAAM,GAAG,CAAC;YAC9B,CAAC,CAAC,YAAY,CAAC,GAAG,EAAE,qBAAqB,CAAC,CAAC,CAAC,CAAC;YAC7C,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC;QACrB,aAAa,CAAC,GAAG,EAAE,IAAI,CAAC;KACzB,CAAC,CAAC;IAEH,MAAM,OAAO,GAAG,aAAa,CAAC,MAAM,KAAK,WAAW;QAClD,CAAC,CAAC,aAAa,CAAC,KAAK;QACrB,CAAC,CAAC,IAAI,CAAC;IACT,MAAM,QAAQ,GAAG,cAAc,CAAC,MAAM,KAAK,WAAW;QACpD,CAAC,CAAC,cAAc,CAAC,KAAK;QACtB,CAAC,CAAC,IAAI,CAAC;IAET,IAAI,aAAa,CAAC,MAAM,KAAK,UAAU,EAAE,CAAC;QACxC,MAAM,CAAC,IAAI,CAAC,YAAY,aAAa,CAAC,MAAM,YAAY,KAAK,CAAC,CAAC,CAAC,aAAa,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;IAC7G,CAAC;IACD,IAAI,cAAc,CAAC,MAAM,KAAK,UAAU,EAAE,CAAC;QACzC,MAAM,CAAC,IAAI,CAAC,aAAa,cAAc,CAAC,MAAM,YAAY,KAAK,CAAC,CAAC,CAAC,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;IAChH,CAAC;IAED,uDAAuD;IACvD,IAAI,IAAI,GAAG,IAAI,CAAC;IAChB,IAAI,CAAC;QACH,IAAI,GAAG,WAAW,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;IACxC,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,CAAC,IAAI,CAAC,SAAS,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;IACxE,CAAC;IAED,oCAAoC;IACpC,MAAM,gBAAgB,GAAG,cAAc,CAAC;QACtC,OAAO,EAAE,WAAW;QACpB,QAAQ,EAAE,IAAI;QACd,cAAc,EAAE,OAAO,EAAE,OAAO,IAAI,EAAE;QACtC,gBAAgB,EAAE,MAAM,EAAE,UAAU,IAAI,EAAE;QAC1C,gBAAgB,EAAE,IAAI;KACvB,CAAC,CAAC;IAEH,OAAO;QACL,GAAG;QACH,WAAW;QACX,UAAU;QACV,YAAY;QACZ,OAAO;QACP,MAAM;QACN,QAAQ;QACR,IAAI;QACJ,gBAAgB;QAChB,QAAQ,EAAE,IAAI;QACd,MAAM;KACP,CAAC;AACJ,CAAC;AAED,uCAAuC;AACvC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AACzD,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAC9C,OAAO,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAClD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web App Manifest Parser
|
|
3
|
+
*
|
|
4
|
+
* Extracts and parses the web app manifest from <link rel="manifest">
|
|
5
|
+
* or the default /manifest.json path. Handles missing manifests gracefully.
|
|
6
|
+
*/
|
|
7
|
+
import type { ManifestResult } from '../core/types/recon.js';
|
|
8
|
+
/**
|
|
9
|
+
* Fetch and parse the web app manifest for a site.
|
|
10
|
+
*
|
|
11
|
+
* Strategy:
|
|
12
|
+
* 1. If rootHtml provided, look for <link rel="manifest"> href
|
|
13
|
+
* 2. Then try /manifest.json (always queued, even when a <link> href was found)
|
|
14
|
+
* 3. Try /manifest.webmanifest as fallback
|
|
15
|
+
*/
|
|
16
|
+
export declare function parseManifest(baseUrl: string, rootHtml?: string): Promise<ManifestResult>;
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web App Manifest Parser
|
|
3
|
+
*
|
|
4
|
+
* Extracts and parses the web app manifest from <link rel="manifest">
|
|
5
|
+
* or the default /manifest.json path. Handles missing manifests gracefully.
|
|
6
|
+
*/
|
|
7
|
+
import * as cheerio from 'cheerio';
|
|
8
|
+
/** Fetch timeout (ms) */
|
|
9
|
+
const FETCH_TIMEOUT_MS = 10_000;
|
|
10
|
+
/**
 * Issue a request that is aborted when no response arrives within
 * `timeoutMs`.
 *
 * The abort timer is cleared as soon as fetch() settles, so reading the
 * body of the returned response is not covered by the timeout.
 *
 * @param url - Address to request.
 * @param timeoutMs - Abort delay in milliseconds (defaults to FETCH_TIMEOUT_MS).
 * @returns The response that fetch() resolved with.
 * @throws Whatever fetch() rejects with, including the abort error.
 */
async function fetchWithTimeout(url, timeoutMs = FETCH_TIMEOUT_MS) {
    const abortSwitch = new AbortController();
    const requestInit = {
        signal: abortSwitch.signal,
        headers: { 'User-Agent': 'WebMCP-CLI/1.0 (manifest-parser)' },
        redirect: 'follow',
    };
    const deadline = setTimeout(() => abortSwitch.abort(), timeoutMs);
    try {
        const res = await fetch(url, requestInit);
        return res;
    }
    finally {
        clearTimeout(deadline);
    }
}
|
|
27
|
+
/**
|
|
28
|
+
* Extract manifest URL from HTML <link rel="manifest"> tag
|
|
29
|
+
*/
|
|
30
|
+
function extractManifestUrl(html, baseUrl) {
|
|
31
|
+
const $ = cheerio.load(html);
|
|
32
|
+
const href = $('link[rel="manifest"]').attr('href');
|
|
33
|
+
if (!href)
|
|
34
|
+
return null;
|
|
35
|
+
try {
|
|
36
|
+
return new URL(href, baseUrl).href;
|
|
37
|
+
}
|
|
38
|
+
catch {
|
|
39
|
+
return null;
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
/**
|
|
43
|
+
* Fetch and parse the web app manifest for a site.
|
|
44
|
+
*
|
|
45
|
+
* Strategy:
|
|
46
|
+
* 1. If rootHtml provided, look for <link rel="manifest"> href
|
|
47
|
+
* 2. Otherwise try /manifest.json
|
|
48
|
+
* 3. Try /manifest.webmanifest as fallback
|
|
49
|
+
*/
|
|
50
|
+
export async function parseManifest(baseUrl, rootHtml) {
|
|
51
|
+
// Build list of candidate URLs to try
|
|
52
|
+
const candidates = [];
|
|
53
|
+
if (rootHtml) {
|
|
54
|
+
const fromLink = extractManifestUrl(rootHtml, baseUrl);
|
|
55
|
+
if (fromLink)
|
|
56
|
+
candidates.push(fromLink);
|
|
57
|
+
}
|
|
58
|
+
candidates.push(new URL('/manifest.json', baseUrl).href, new URL('/manifest.webmanifest', baseUrl).href);
|
|
59
|
+
// Deduplicate
|
|
60
|
+
const uniqueCandidates = [...new Set(candidates)];
|
|
61
|
+
for (const url of uniqueCandidates) {
|
|
62
|
+
try {
|
|
63
|
+
const response = await fetchWithTimeout(url);
|
|
64
|
+
if (!response.ok)
|
|
65
|
+
continue;
|
|
66
|
+
const contentType = response.headers.get('content-type') ?? '';
|
|
67
|
+
// Skip HTML responses (custom 404 pages)
|
|
68
|
+
if (contentType.includes('text/html'))
|
|
69
|
+
continue;
|
|
70
|
+
const text = await response.text();
|
|
71
|
+
let data;
|
|
72
|
+
try {
|
|
73
|
+
data = JSON.parse(text);
|
|
74
|
+
}
|
|
75
|
+
catch {
|
|
76
|
+
continue; // Not valid JSON
|
|
77
|
+
}
|
|
78
|
+
return {
|
|
79
|
+
found: true,
|
|
80
|
+
name: typeof data['name'] === 'string' ? data['name'] : undefined,
|
|
81
|
+
shortName: typeof data['short_name'] === 'string'
|
|
82
|
+
? data['short_name']
|
|
83
|
+
: undefined,
|
|
84
|
+
description: typeof data['description'] === 'string'
|
|
85
|
+
? data['description']
|
|
86
|
+
: undefined,
|
|
87
|
+
startUrl: typeof data['start_url'] === 'string'
|
|
88
|
+
? data['start_url']
|
|
89
|
+
: undefined,
|
|
90
|
+
display: typeof data['display'] === 'string' ? data['display'] : undefined,
|
|
91
|
+
themeColor: typeof data['theme_color'] === 'string'
|
|
92
|
+
? data['theme_color']
|
|
93
|
+
: undefined,
|
|
94
|
+
source: url,
|
|
95
|
+
raw: data,
|
|
96
|
+
};
|
|
97
|
+
}
|
|
98
|
+
catch {
|
|
99
|
+
// Try next candidate
|
|
100
|
+
continue;
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
return {
|
|
104
|
+
found: false,
|
|
105
|
+
error: 'No manifest found',
|
|
106
|
+
};
|
|
107
|
+
}
|
|
108
|
+
//# sourceMappingURL=manifest.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"manifest.js","sourceRoot":"","sources":["../../src/recon/manifest.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAGnC,yBAAyB;AACzB,MAAM,gBAAgB,GAAG,MAAM,CAAC;AAEhC;;GAEG;AACH,KAAK,UAAU,gBAAgB,CAC7B,GAAW,EACX,YAAoB,gBAAgB;IAEpC,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,CAAC,CAAC;IAC9D,IAAI,CAAC;QACH,OAAO,MAAM,KAAK,CAAC,GAAG,EAAE;YACtB,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,OAAO,EAAE,EAAE,YAAY,EAAE,kCAAkC,EAAE;YAC7D,QAAQ,EAAE,QAAQ;SACnB,CAAC,CAAC;IACL,CAAC;YAAS,CAAC;QACT,YAAY,CAAC,KAAK,CAAC,CAAC;IACtB,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB,CAAC,IAAY,EAAE,OAAe;IACvD,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,IAAI,GAAG,CAAC,CAAC,sBAAsB,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACpD,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAC;IAEvB,IAAI,CAAC;QACH,OAAO,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;IACrC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAAe,EACf,QAAiB;IAEjB,sCAAsC;IACtC,MAAM,UAAU,GAAa,EAAE,CAAC;IAEhC,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,QAAQ,GAAG,kBAAkB,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QACvD,IAAI,QAAQ;YAAE,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IAC1C,CAAC;IAED,UAAU,CAAC,IAAI,CACb,IAAI,GAAG,CAAC,gBAAgB,EAAE,OAAO,CAAC,CAAC,IAAI,EACvC,IAAI,GAAG,CAAC,uBAAuB,EAAE,OAAO,CAAC,CAAC,IAAI,CAC/C,CAAC;IAEF,cAAc;IACd,MAAM,gBAAgB,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC;IAElD,KAAK,MAAM,GAAG,IAAI,gBAAgB,EAAE,CAAC;QACnC,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,gBAAgB,CAAC,GAAG,CAAC,CAAC;YAC7C,IAAI,CAAC,QAAQ,CAAC,EAAE;gBAAE,SAAS;YAE3B,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;YAC/D,yCAAyC;YACzC,IAAI,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC;gBAAE,SAAS;YAEhD,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,IAAI,IAA6B,CAAC;YAClC,IAAI,CAAC;gBACH,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAA4B,CAAC;YACrD,CAAC;YAAC,MAAM,CAAC;gBACP,SAAS,CAAC,iBAAiB;YAC7B,CAAC;YAED,OAAO;gBACL,KAAK,EAAE,IAAI;gBACX,
IAAI,EAAE,OAAO,IAAI,CAAC,MAAM,CAAC,KAAK,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,SAAS;gBACjE,SAAS,EACP,OAAO,IAAI,CAAC,YAAY,CAAC,KAAK,QAAQ;oBACpC,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC;oBACpB,CAAC,CAAC,SAAS;gBACf,WAAW,EACT,OAAO,IAAI,CAAC,aAAa,CAAC,KAAK,QAAQ;oBACrC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC;oBACrB,CAAC,CAAC,SAAS;gBACf,QAAQ,EACN,OAAO,IAAI,CAAC,WAAW,CAAC,KAAK,QAAQ;oBACnC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC;oBACnB,CAAC,CAAC,SAAS;gBACf,OAAO,EACL,OAAO,IAAI,CAAC,SAAS,CAAC,KAAK,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;gBACnE,UAAU,EACR,OAAO,IAAI,CAAC,aAAa,CAAC,KAAK,QAAQ;oBACrC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC;oBACrB,CAAC,CAAC,SAAS;gBACf,MAAM,EAAE,GAAG;gBACX,GAAG,EAAE,IAAI;aACV,CAAC;QACJ,CAAC;QAAC,MAAM,CAAC;YACP,qBAAqB;YACrB,SAAS;QACX,CAAC;IACH,CAAC;IAED,OAAO;QACL,KAAK,EAAE,KAAK;QACZ,KAAK,EAAE,mBAAmB;KAC3B,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Meta Tag & Schema.org Extractor
|
|
3
|
+
*
|
|
4
|
+
* Extracts OpenGraph, Twitter Card, Schema.org JSON-LD, and generic
|
|
5
|
+
* meta tags from root page HTML (parsed with cheerio) and classifies the site type.
|
|
6
|
+
*/
|
|
7
|
+
import type { MetaExtractResult } from '../core/types/recon.js';
|
|
8
|
+
/**
|
|
9
|
+
* Extract meta tags, OpenGraph, Twitter Card, Schema.org, and classify site
|
|
10
|
+
*/
|
|
11
|
+
export declare function extractMeta(html: string, pageUrl: string): MetaExtractResult;
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Meta Tag & Schema.org Extractor
|
|
3
|
+
*
|
|
4
|
+
* Extracts OpenGraph, Twitter Card, Schema.org JSON-LD, and generic
|
|
5
|
+
* meta tags from root page HTML (parsed with cheerio) and classifies the site type.
|
|
6
|
+
*/
|
|
7
|
+
import * as cheerio from 'cheerio';
|
|
8
|
+
/**
|
|
9
|
+
* Site classification keywords mapped to category
|
|
10
|
+
*/
|
|
11
|
+
const CLASSIFICATION_SIGNALS = {
|
|
12
|
+
'e-commerce': [
|
|
13
|
+
/shop|store|product|cart|checkout|buy|price|add.to.cart/i,
|
|
14
|
+
/e-?commerce|retail|merchant|catalog/i,
|
|
15
|
+
],
|
|
16
|
+
travel: [
|
|
17
|
+
/travel|flight|hotel|booking|reservation|trip|tour|airline/i,
|
|
18
|
+
/destination|cruise|vacation|itinerary/i,
|
|
19
|
+
],
|
|
20
|
+
healthcare: [
|
|
21
|
+
/health|medical|doctor|patient|hospital|clinic|pharma/i,
|
|
22
|
+
/diagnosis|treatment|symptom|appointment/i,
|
|
23
|
+
],
|
|
24
|
+
finance: [
|
|
25
|
+
/bank|finance|invest|trading|loan|mortgage|insurance|credit/i,
|
|
26
|
+
/portfolio|stock|fund|payment/i,
|
|
27
|
+
],
|
|
28
|
+
education: [
|
|
29
|
+
/education|course|learn|school|university|student|teacher/i,
|
|
30
|
+
/curriculum|enrollment|lecture|tutorial/i,
|
|
31
|
+
],
|
|
32
|
+
'news-media': [
|
|
33
|
+
/news|article|blog|press|journal|media|publish|editorial/i,
|
|
34
|
+
/breaking|headline|reporter|column/i,
|
|
35
|
+
],
|
|
36
|
+
social: [
|
|
37
|
+
/social|community|profile|friend|follow|post|feed|message/i,
|
|
38
|
+
/network|connect|share|like|comment/i,
|
|
39
|
+
],
|
|
40
|
+
saas: [
|
|
41
|
+
/dashboard|analytics|platform|subscription|api|integration/i,
|
|
42
|
+
/workspace|team|workflow|automation|saas/i,
|
|
43
|
+
],
|
|
44
|
+
government: [
|
|
45
|
+
/government|gov\.|public.service|citizen|municipality|permit/i,
|
|
46
|
+
/regulation|compliance|legislation|agency/i,
|
|
47
|
+
],
|
|
48
|
+
entertainment: [
|
|
49
|
+
/entertainment|game|movie|music|stream|watch|play|video/i,
|
|
50
|
+
/podcast|episode|series|concert/i,
|
|
51
|
+
],
|
|
52
|
+
'food-restaurant': [
|
|
53
|
+
/restaurant|food|menu|order|delivery|recipe|cuisine/i,
|
|
54
|
+
/dining|chef|meal|reservation/i,
|
|
55
|
+
],
|
|
56
|
+
'real-estate': [
|
|
57
|
+
/real.estate|property|listing|rent|lease|apartment|house/i,
|
|
58
|
+
/mortgage|realtor|condo|housing/i,
|
|
59
|
+
],
|
|
60
|
+
general: [], // fallback
|
|
61
|
+
};
|
|
62
|
+
/**
|
|
63
|
+
* Schema.org type to site classification mapping
|
|
64
|
+
*/
|
|
65
|
+
const SCHEMA_TYPE_MAP = {
|
|
66
|
+
Product: 'e-commerce',
|
|
67
|
+
Offer: 'e-commerce',
|
|
68
|
+
ShoppingCenter: 'e-commerce',
|
|
69
|
+
Store: 'e-commerce',
|
|
70
|
+
Airline: 'travel',
|
|
71
|
+
Flight: 'travel',
|
|
72
|
+
Hotel: 'travel',
|
|
73
|
+
TouristAttraction: 'travel',
|
|
74
|
+
LodgingBusiness: 'travel',
|
|
75
|
+
TravelAgency: 'travel',
|
|
76
|
+
Hospital: 'healthcare',
|
|
77
|
+
MedicalClinic: 'healthcare',
|
|
78
|
+
Physician: 'healthcare',
|
|
79
|
+
Pharmacy: 'healthcare',
|
|
80
|
+
BankOrCreditUnion: 'finance',
|
|
81
|
+
FinancialService: 'finance',
|
|
82
|
+
InsuranceAgency: 'finance',
|
|
83
|
+
EducationalOrganization: 'education',
|
|
84
|
+
Course: 'education',
|
|
85
|
+
School: 'education',
|
|
86
|
+
University: 'education',
|
|
87
|
+
NewsArticle: 'news-media',
|
|
88
|
+
NewsMediaOrganization: 'news-media',
|
|
89
|
+
BlogPosting: 'news-media',
|
|
90
|
+
SocialMediaPosting: 'social',
|
|
91
|
+
SoftwareApplication: 'saas',
|
|
92
|
+
WebApplication: 'saas',
|
|
93
|
+
GovernmentOrganization: 'government',
|
|
94
|
+
GovernmentService: 'government',
|
|
95
|
+
Movie: 'entertainment',
|
|
96
|
+
MusicRecording: 'entertainment',
|
|
97
|
+
VideoGame: 'entertainment',
|
|
98
|
+
Restaurant: 'food-restaurant',
|
|
99
|
+
FoodEstablishment: 'food-restaurant',
|
|
100
|
+
Menu: 'food-restaurant',
|
|
101
|
+
RealEstateAgent: 'real-estate',
|
|
102
|
+
Apartment: 'real-estate',
|
|
103
|
+
House: 'real-estate',
|
|
104
|
+
};
|
|
105
|
+
/**
|
|
106
|
+
* Extract all meta tags from HTML
|
|
107
|
+
*/
|
|
108
|
+
function extractMetaTags($) {
|
|
109
|
+
const tags = [];
|
|
110
|
+
$('meta').each((_i, el) => {
|
|
111
|
+
const $el = $(el);
|
|
112
|
+
const property = $el.attr('property') ?? '';
|
|
113
|
+
const name = $el.attr('name') ?? '';
|
|
114
|
+
const content = $el.attr('content') ?? '';
|
|
115
|
+
if (!content)
|
|
116
|
+
return;
|
|
117
|
+
if (property.startsWith('og:')) {
|
|
118
|
+
tags.push({ name: property, content, category: 'og' });
|
|
119
|
+
}
|
|
120
|
+
else if (property.startsWith('twitter:') ||
|
|
121
|
+
name.startsWith('twitter:')) {
|
|
122
|
+
tags.push({
|
|
123
|
+
name: property || name,
|
|
124
|
+
content,
|
|
125
|
+
category: 'twitter',
|
|
126
|
+
});
|
|
127
|
+
}
|
|
128
|
+
else if (name) {
|
|
129
|
+
tags.push({ name, content, category: 'generic' });
|
|
130
|
+
}
|
|
131
|
+
});
|
|
132
|
+
return tags;
|
|
133
|
+
}
|
|
134
|
+
/**
|
|
135
|
+
* Extract OpenGraph data as a flat record
|
|
136
|
+
*/
|
|
137
|
+
function extractOpenGraph(tags) {
|
|
138
|
+
const og = {};
|
|
139
|
+
for (const tag of tags) {
|
|
140
|
+
if (tag.category === 'og') {
|
|
141
|
+
og[tag.name] = tag.content;
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
return og;
|
|
145
|
+
}
|
|
146
|
+
/**
|
|
147
|
+
* Extract Twitter Card data as a flat record
|
|
148
|
+
*/
|
|
149
|
+
function extractTwitterCard(tags) {
|
|
150
|
+
const twitter = {};
|
|
151
|
+
for (const tag of tags) {
|
|
152
|
+
if (tag.category === 'twitter') {
|
|
153
|
+
twitter[tag.name] = tag.content;
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
return twitter;
|
|
157
|
+
}
|
|
158
|
+
/**
|
|
159
|
+
* Extract Schema.org JSON-LD entries from <script type="application/ld+json">
|
|
160
|
+
*/
|
|
161
|
+
function extractSchemaOrg($) {
|
|
162
|
+
const schemas = [];
|
|
163
|
+
$('script[type="application/ld+json"]').each((_i, el) => {
|
|
164
|
+
const raw = $(el).html();
|
|
165
|
+
if (!raw)
|
|
166
|
+
return;
|
|
167
|
+
try {
|
|
168
|
+
const data = JSON.parse(raw);
|
|
169
|
+
// Handle @graph arrays
|
|
170
|
+
if (Array.isArray(data['@graph'])) {
|
|
171
|
+
for (const item of data['@graph']) {
|
|
172
|
+
if (item &&
|
|
173
|
+
typeof item === 'object' &&
|
|
174
|
+
'@type' in item) {
|
|
175
|
+
schemas.push(item);
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
else if (data['@type']) {
|
|
180
|
+
schemas.push(data);
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
catch {
|
|
184
|
+
// Invalid JSON-LD — skip
|
|
185
|
+
}
|
|
186
|
+
});
|
|
187
|
+
return schemas;
|
|
188
|
+
}
|
|
189
|
+
/**
 * Classify the site type based on all available signals.
 *
 * Scoring:
 *  - +10 per Schema.org node for each distinct category its `@type` maps to
 *    (`@type` may be a string or an array of strings)
 *  - +3 for every keyword pattern that matches the combined text of the page
 *    title, all meta tag contents, and the first 5000 characters of the HTML
 * The highest-scoring category wins; ties go to the category listed first.
 *
 * @param metaTags  tags as produced by extractMetaTags
 * @param schemaOrg JSON-LD nodes as produced by extractSchemaOrg
 * @param pageTitle text of the page title
 * @param html      raw page HTML
 * @returns the winning category name, or `'general'` when nothing scored
 */
function classifySite(metaTags, schemaOrg, pageTitle, html) {
    const scores = {
        'e-commerce': 0,
        travel: 0,
        healthcare: 0,
        finance: 0,
        education: 0,
        'news-media': 0,
        social: 0,
        saas: 0,
        government: 0,
        entertainment: 0,
        'food-restaurant': 0,
        'real-estate': 0,
        general: 0,
    };
    // Signal 1: Schema.org @type (strongest signal)
    for (const schema of schemaOrg) {
        const declared = schema['@type'];
        // JSON-LD allows several types per node, e.g. ["Product", "Offer"].
        const typeNames = Array.isArray(declared) ? declared : [declared];
        const mappedCategories = typeNames
            // Own-property check: a type such as "constructor" must not
            // resolve to something inherited from Object.prototype.
            .filter((typeName) => typeof typeName === 'string' &&
            Object.prototype.hasOwnProperty.call(SCHEMA_TYPE_MAP, typeName))
            .map((typeName) => SCHEMA_TYPE_MAP[typeName]);
        // Count each category once per node so a multi-typed node is not
        // weighted more heavily than a single-typed one.
        for (const category of new Set(mappedCategories)) {
            scores[category] += 10;
        }
    }
    // Signal 2: Meta tags + title + HTML content
    const textSignal = [
        pageTitle,
        ...metaTags.map((t) => t.content),
        // Sample a small portion of the HTML for keyword signals
        html.substring(0, 5000),
    ].join(' ');
    for (const [category, patterns] of Object.entries(CLASSIFICATION_SIGNALS)) {
        for (const pattern of patterns) {
            if (pattern.test(textSignal)) {
                scores[category] += 3;
            }
        }
    }
    // Find highest score
    let bestCategory = 'general';
    let bestScore = 0;
    for (const [category, score] of Object.entries(scores)) {
        if (score > bestScore) {
            bestScore = score;
            bestCategory = category;
        }
    }
    return bestScore >= 3 ? bestCategory : 'general';
}
|
|
242
|
+
/**
 * Extract meta tags, OpenGraph, Twitter Card, Schema.org, and classify site.
 *
 * @param html    raw HTML of the page
 * @param pageUrl URL the HTML was loaded from; used as the base when
 *                resolving a relative canonical link
 * @returns an object with `metaTags`, `openGraph`, `twitterCard`, `schemaOrg`,
 *          `siteClassification`, `pageTitle`, `pageDescription` (empty string
 *          when absent) and `canonicalUrl` (undefined when absent or invalid)
 */
export function extractMeta(html, pageUrl) {
    const $ = cheerio.load(html);
    const metaTags = extractMetaTags($);
    const openGraph = extractOpenGraph(metaTags);
    const twitterCard = extractTwitterCard(metaTags);
    const schemaOrg = extractSchemaOrg($);
    // `.text()` concatenates the text of every matched element, and inline
    // <svg><title> elements also match `title`; read the first match only.
    const pageTitle = $('title').first().text().trim();
    // Prefer <meta name="description">, then og:description, then empty.
    const descriptionTag = metaTags.find((t) => t.name === 'description' && t.category === 'generic');
    const pageDescription = descriptionTag?.content ?? openGraph['og:description'] ?? '';
    const canonicalEl = $('link[rel="canonical"]').attr('href');
    let canonicalUrl;
    if (canonicalEl) {
        try {
            // Resolve against the page URL so relative hrefs become absolute.
            canonicalUrl = new URL(canonicalEl, pageUrl).href;
        }
        catch {
            // Invalid canonical — leave canonicalUrl undefined
        }
    }
    const siteClassification = classifySite(metaTags, schemaOrg, pageTitle, html);
    return {
        metaTags,
        openGraph,
        twitterCard,
        schemaOrg,
        siteClassification,
        pageTitle,
        pageDescription,
        canonicalUrl,
    };
}
|
|
276
|
+
//# sourceMappingURL=meta-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"meta-extractor.js","sourceRoot":"","sources":["../../src/recon/meta-extractor.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAQnC;;GAEG;AACH,MAAM,sBAAsB,GAAyC;IACnE,YAAY,EAAE;QACZ,yDAAyD;QACzD,sCAAsC;KACvC;IACD,MAAM,EAAE;QACN,4DAA4D;QAC5D,wCAAwC;KACzC;IACD,UAAU,EAAE;QACV,uDAAuD;QACvD,0CAA0C;KAC3C;IACD,OAAO,EAAE;QACP,6DAA6D;QAC7D,+BAA+B;KAChC;IACD,SAAS,EAAE;QACT,2DAA2D;QAC3D,yCAAyC;KAC1C;IACD,YAAY,EAAE;QACZ,0DAA0D;QAC1D,oCAAoC;KACrC;IACD,MAAM,EAAE;QACN,2DAA2D;QAC3D,qCAAqC;KACtC;IACD,IAAI,EAAE;QACJ,4DAA4D;QAC5D,0CAA0C;KAC3C;IACD,UAAU,EAAE;QACV,8DAA8D;QAC9D,2CAA2C;KAC5C;IACD,aAAa,EAAE;QACb,yDAAyD;QACzD,iCAAiC;KAClC;IACD,iBAAiB,EAAE;QACjB,qDAAqD;QACrD,+BAA+B;KAChC;IACD,aAAa,EAAE;QACb,0DAA0D;QAC1D,iCAAiC;KAClC;IACD,OAAO,EAAE,EAAE,EAAE,WAAW;CACzB,CAAC;AAEF;;GAEG;AACH,MAAM,eAAe,GAAuC;IAC1D,OAAO,EAAE,YAAY;IACrB,KAAK,EAAE,YAAY;IACnB,cAAc,EAAE,YAAY;IAC5B,KAAK,EAAE,YAAY;IACnB,OAAO,EAAE,QAAQ;IACjB,MAAM,EAAE,QAAQ;IAChB,KAAK,EAAE,QAAQ;IACf,iBAAiB,EAAE,QAAQ;IAC3B,eAAe,EAAE,QAAQ;IACzB,YAAY,EAAE,QAAQ;IACtB,QAAQ,EAAE,YAAY;IACtB,aAAa,EAAE,YAAY;IAC3B,SAAS,EAAE,YAAY;IACvB,QAAQ,EAAE,YAAY;IACtB,iBAAiB,EAAE,SAAS;IAC5B,gBAAgB,EAAE,SAAS;IAC3B,eAAe,EAAE,SAAS;IAC1B,uBAAuB,EAAE,WAAW;IACpC,MAAM,EAAE,WAAW;IACnB,MAAM,EAAE,WAAW;IACnB,UAAU,EAAE,WAAW;IACvB,WAAW,EAAE,YAAY;IACzB,qBAAqB,EAAE,YAAY;IACnC,WAAW,EAAE,YAAY;IACzB,kBAAkB,EAAE,QAAQ;IAC5B,mBAAmB,EAAE,MAAM;IAC3B,cAAc,EAAE,MAAM;IACtB,sBAAsB,EAAE,YAAY;IACpC,iBAAiB,EAAE,YAAY;IAC/B,KAAK,EAAE,eAAe;IACtB,cAAc,EAAE,eAAe;IAC/B,SAAS,EAAE,eAAe;IAC1B,UAAU,EAAE,iBAAiB;IAC7B,iBAAiB,EAAE,iBAAiB;IACpC,IAAI,EAAE,iBAAiB;IACvB,eAAe,EAAE,aAAa;IAC9B,SAAS,EAAE,aAAa;IACxB,KAAK,EAAE,aAAa;CACrB,CAAC;AAEF;;GAEG;AACH,SAAS,eAAe,CAAC,CAAqB;IAC5C,MAAM,IAAI,GAAoB,EAAE,CAAC;IAEjC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QACxB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,QAAQ,GAAG,GAAG,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;QAC5C,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;QACpC,MA
AM,OAAO,GAAG,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;QAE1C,IAAI,CAAC,OAAO;YAAE,OAAO;QAErB,IAAI,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;YAC/B,IAAI,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;QACzD,CAAC;aAAM,IACL,QAAQ,CAAC,UAAU,CAAC,UAAU,CAAC;YAC/B,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,EAC3B,CAAC;YACD,IAAI,CAAC,IAAI,CAAC;gBACR,IAAI,EAAE,QAAQ,IAAI,IAAI;gBACtB,OAAO;gBACP,QAAQ,EAAE,SAAS;aACpB,CAAC,CAAC;QACL,CAAC;aAAM,IAAI,IAAI,EAAE,CAAC;YAChB,IAAI,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,CAAC,CAAC;QACpD,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,IAAqB;IAC7C,MAAM,EAAE,GAA2B,EAAE,CAAC;IACtC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,GAAG,CAAC,QAAQ,KAAK,IAAI,EAAE,CAAC;YAC1B,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,OAAO,CAAC;QAC7B,CAAC;IACH,CAAC;IACD,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB,CAAC,IAAqB;IAC/C,MAAM,OAAO,GAA2B,EAAE,CAAC;IAC3C,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,GAAG,CAAC,QAAQ,KAAK,SAAS,EAAE,CAAC;YAC/B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,OAAO,CAAC;QAClC,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,CAAqB;IAC7C,MAAM,OAAO,GAAoB,EAAE,CAAC;IAEpC,CAAC,CAAC,oCAAoC,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QACtD,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACzB,IAAI,CAAC,GAAG;YAAE,OAAO;QAEjB,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAA4B,CAAC;YAExD,uBAAuB;YACvB,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;gBAClC,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;oBAClC,IACE,IAAI;wBACJ,OAAO,IAAI,KAAK,QAAQ;wBACxB,OAAO,IAAK,IAAgC,EAC5C,CAAC;wBACD,OAAO,CAAC,IAAI,CAAC,IAAqB,CAAC,CAAC;oBACtC,CAAC;gBACH,CAAC;YACH,CAAC;iBAAM,IAAI,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;gBACzB,OAAO,CAAC,IAAI,CAAC,IAAgC,CAAC,CAAC;YACjD,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,yBAAyB;QAC3B,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,SAAS,YAAY,CACnB,QAAyB,EACzB,SAA0B,EAC1B,SA
AiB,EACjB,IAAY;IAEZ,MAAM,MAAM,GAAuC;QACjD,YAAY,EAAE,CAAC;QACf,MAAM,EAAE,CAAC;QACT,UAAU,EAAE,CAAC;QACb,OAAO,EAAE,CAAC;QACV,SAAS,EAAE,CAAC;QACZ,YAAY,EAAE,CAAC;QACf,MAAM,EAAE,CAAC;QACT,IAAI,EAAE,CAAC;QACP,UAAU,EAAE,CAAC;QACb,aAAa,EAAE,CAAC;QAChB,iBAAiB,EAAE,CAAC;QACpB,aAAa,EAAE,CAAC;QAChB,OAAO,EAAE,CAAC;KACX,CAAC;IAEF,gDAAgD;IAChD,KAAK,MAAM,MAAM,IAAI,SAAS,EAAE,CAAC;QAC/B,MAAM,UAAU,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC;QACnC,IAAI,OAAO,UAAU,KAAK,QAAQ,EAAE,CAAC;YACnC,MAAM,MAAM,GAAG,eAAe,CAAC,UAAU,CAAC,CAAC;YAC3C,IAAI,MAAM;gBAAE,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;QACnC,CAAC;IACH,CAAC;IAED,6CAA6C;IAC7C,MAAM,UAAU,GAAG;QACjB,SAAS;QACT,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;QACjC,yDAAyD;QACzD,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC;KACxB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAEZ,KAAK,MAAM,CAAC,QAAQ,EAAE,QAAQ,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,sBAAsB,CAAC,EAAE,CAAC;QAC1E,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,IAAI,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,EAAE,CAAC;gBAC7B,MAAM,CAAC,QAA8B,CAAC,IAAI,CAAC,CAAC;YAC9C,CAAC;QACH,CAAC;IACH,CAAC;IAED,qBAAqB;IACrB,IAAI,YAAY,GAAuB,SAAS,CAAC;IACjD,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,KAAK,MAAM,CAAC,QAAQ,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QACvD,IAAI,KAAK,GAAG,SAAS,EAAE,CAAC;YACtB,SAAS,GAAG,KAAK,CAAC;YAClB,YAAY,GAAG,QAA8B,CAAC;QAChD,CAAC;IACH,CAAC;IAED,OAAO,SAAS,IAAI,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,SAAS,CAAC;AACnD,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CACzB,IAAY,EACZ,OAAe;IAEf,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,MAAM,QAAQ,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;IACpC,MAAM,SAAS,GAAG,gBAAgB,CAAC,QAAQ,CAAC,CAAC;IAC7C,MAAM,WAAW,GAAG,kBAAkB,CAAC,QAAQ,CAAC,CAAC;IACjD,MAAM,SAAS,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC;IAEtC,MAAM,SAAS,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAC3C,MAAM,cAAc,GAAG,QAAQ,CAAC,IAAI,CAClC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,aAAa,IAAI,CAAC,CAAC,QAAQ,KAAK,SAAS,CAC5D,CAAC;IACF,MAAM,eAAe,GACnB,cAAc,EAAE,OAAO,IAAI,SAAS,CAAC,gBAAgB,CAAC,IAAI,EAAE,CAAC;IAE/D,MAAM,WAAW,GAAG,
CAAC,CAAC,uBAAuB,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC5D,IAAI,YAAgC,CAAC;IACrC,IAAI,WAAW,EAAE,CAAC;QAChB,IAAI,CAAC;YACH,YAAY,GAAG,IAAI,GAAG,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;QACpD,CAAC;QAAC,MAAM,CAAC;YACP,oBAAoB;QACtB,CAAC;IACH,CAAC;IAED,MAAM,kBAAkB,GAAG,YAAY,CAAC,QAAQ,EAAE,SAAS,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;IAE9E,OAAO;QACL,QAAQ;QACR,SAAS;QACT,WAAW;QACX,SAAS;QACT,kBAAkB;QAClB,SAAS;QACT,eAAe;QACf,YAAY;KACb,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Robots.txt Parser
|
|
3
|
+
*
|
|
4
|
+
* Fetches and parses robots.txt. Extracts allowed/disallowed paths,
|
|
5
|
+
* crawl-delay, and sitemap references. Handles missing robots.txt gracefully.
|
|
6
|
+
*/
|
|
7
|
+
import type { RobotsDirective, RobotsResult } from '../core/types/recon.js';
|
|
8
|
+
/**
|
|
9
|
+
* Check whether a path is allowed by the robots.txt directives.
|
|
10
|
+
* Falls back to the wildcard user-agent (*) rules if there is no specific match.
|
|
11
|
+
*/
|
|
12
|
+
export declare function isPathAllowed(path: string, directives: RobotsDirective[], userAgent?: string): boolean;
|
|
13
|
+
/**
|
|
14
|
+
* Fetch and parse robots.txt for a site
|
|
15
|
+
*/
|
|
16
|
+
export declare function parseRobots(baseUrl: string): Promise<RobotsResult>;
|