webmcp-cli 1.2.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. package/dist/analysis/form-to-tool-mapper.d.ts +61 -0
  2. package/dist/analysis/form-to-tool-mapper.js +360 -0
  3. package/dist/analysis/form-to-tool-mapper.js.map +1 -0
  4. package/dist/analysis/index.d.ts +84 -0
  5. package/dist/analysis/index.js +81 -0
  6. package/dist/analysis/index.js.map +1 -0
  7. package/dist/analysis/missing-tool-analyzer.d.ts +35 -0
  8. package/dist/analysis/missing-tool-analyzer.js +617 -0
  9. package/dist/analysis/missing-tool-analyzer.js.map +1 -0
  10. package/dist/audit/run-multi-page-audit.d.ts +34 -0
  11. package/dist/audit/run-multi-page-audit.js +233 -0
  12. package/dist/audit/run-multi-page-audit.js.map +1 -0
  13. package/dist/cli/commands/potential.d.ts +8 -0
  14. package/dist/cli/commands/potential.js +323 -0
  15. package/dist/cli/commands/potential.js.map +1 -0
  16. package/dist/cli/commands/report.d.ts +12 -0
  17. package/dist/cli/commands/report.js +89 -0
  18. package/dist/cli/commands/report.js.map +1 -0
  19. package/dist/cli/index.js +35 -0
  20. package/dist/cli/index.js.map +1 -1
  21. package/dist/config/defaults.d.ts +36 -0
  22. package/dist/config/defaults.js +33 -0
  23. package/dist/config/defaults.js.map +1 -0
  24. package/dist/config/index.d.ts +7 -0
  25. package/dist/config/index.js +7 -0
  26. package/dist/config/index.js.map +1 -0
  27. package/dist/config/loader.d.ts +22 -0
  28. package/dist/config/loader.js +91 -0
  29. package/dist/config/loader.js.map +1 -0
  30. package/dist/config/schema.d.ts +280 -0
  31. package/dist/config/schema.js +42 -0
  32. package/dist/config/schema.js.map +1 -0
  33. package/dist/core/types/audit.d.ts +1 -1
  34. package/dist/core/types/index.d.ts +1 -0
  35. package/dist/core/types/index.js +1 -0
  36. package/dist/core/types/index.js.map +1 -1
  37. package/dist/core/types/recon.d.ts +265 -0
  38. package/dist/core/types/recon.js +5 -0
  39. package/dist/core/types/recon.js.map +1 -0
  40. package/dist/core/types/rule.d.ts +1 -1
  41. package/dist/core/types/rule.js +7 -5
  42. package/dist/core/types/rule.js.map +1 -1
  43. package/dist/crawler/depth-crawler.d.ts +29 -0
  44. package/dist/crawler/depth-crawler.js +212 -0
  45. package/dist/crawler/depth-crawler.js.map +1 -0
  46. package/dist/crawler/index.d.ts +2 -0
  47. package/dist/crawler/index.js +3 -0
  48. package/dist/crawler/index.js.map +1 -0
  49. package/dist/crawler/link-extractor.d.ts +1 -0
  50. package/dist/crawler/link-extractor.js +49 -0
  51. package/dist/crawler/link-extractor.js.map +1 -0
  52. package/dist/generators/index.d.ts +10 -0
  53. package/dist/generators/index.js +8 -0
  54. package/dist/generators/index.js.map +1 -0
  55. package/dist/generators/report-html.d.ts +12 -0
  56. package/dist/generators/report-html.js +470 -0
  57. package/dist/generators/report-html.js.map +1 -0
  58. package/dist/generators/report-json.d.ts +95 -0
  59. package/dist/generators/report-json.js +144 -0
  60. package/dist/generators/report-json.js.map +1 -0
  61. package/dist/generators/report-manager.d.ts +31 -0
  62. package/dist/generators/report-manager.js +208 -0
  63. package/dist/generators/report-manager.js.map +1 -0
  64. package/dist/generators/tool-code-generator.d.ts +31 -0
  65. package/dist/generators/tool-code-generator.js +201 -0
  66. package/dist/generators/tool-code-generator.js.map +1 -0
  67. package/dist/potential/ai-recommender.d.ts +33 -0
  68. package/dist/potential/ai-recommender.js +414 -0
  69. package/dist/potential/ai-recommender.js.map +1 -0
  70. package/dist/potential/analyzer.d.ts +32 -0
  71. package/dist/potential/analyzer.js +383 -0
  72. package/dist/potential/analyzer.js.map +1 -0
  73. package/dist/potential/index.d.ts +3 -0
  74. package/dist/potential/index.js +4 -0
  75. package/dist/potential/index.js.map +1 -0
  76. package/dist/potential/prompts.d.ts +20 -0
  77. package/dist/potential/prompts.js +42 -0
  78. package/dist/potential/prompts.js.map +1 -0
  79. package/dist/potential/types.d.ts +40 -0
  80. package/dist/potential/types.js +2 -0
  81. package/dist/potential/types.js.map +1 -0
  82. package/dist/recon/index.d.ts +20 -0
  83. package/dist/recon/index.js +143 -0
  84. package/dist/recon/index.js.map +1 -0
  85. package/dist/recon/manifest.d.ts +16 -0
  86. package/dist/recon/manifest.js +108 -0
  87. package/dist/recon/manifest.js.map +1 -0
  88. package/dist/recon/meta-extractor.d.ts +11 -0
  89. package/dist/recon/meta-extractor.js +276 -0
  90. package/dist/recon/meta-extractor.js.map +1 -0
  91. package/dist/recon/robots.d.ts +16 -0
  92. package/dist/recon/robots.js +158 -0
  93. package/dist/recon/robots.js.map +1 -0
  94. package/dist/recon/route-discovery.d.ts +25 -0
  95. package/dist/recon/route-discovery.js +303 -0
  96. package/dist/recon/route-discovery.js.map +1 -0
  97. package/dist/recon/sitemap.d.ts +12 -0
  98. package/dist/recon/sitemap.js +177 -0
  99. package/dist/recon/sitemap.js.map +1 -0
  100. package/dist/rules/accessibility/AXE-001.d.ts +9 -0
  101. package/dist/rules/accessibility/AXE-001.js +109 -0
  102. package/dist/rules/accessibility/AXE-001.js.map +1 -0
  103. package/dist/rules/accessibility/AXE-002.d.ts +8 -0
  104. package/dist/rules/accessibility/AXE-002.js +85 -0
  105. package/dist/rules/accessibility/AXE-002.js.map +1 -0
  106. package/dist/rules/accessibility/AXE-003.d.ts +8 -0
  107. package/dist/rules/accessibility/AXE-003.js +94 -0
  108. package/dist/rules/accessibility/AXE-003.js.map +1 -0
  109. package/dist/rules/accessibility/AXE-004.d.ts +8 -0
  110. package/dist/rules/accessibility/AXE-004.js +101 -0
  111. package/dist/rules/accessibility/AXE-004.js.map +1 -0
  112. package/dist/rules/accessibility/AXE-005.d.ts +9 -0
  113. package/dist/rules/accessibility/AXE-005.js +89 -0
  114. package/dist/rules/accessibility/AXE-005.js.map +1 -0
  115. package/dist/rules/best-practices/BP-004.d.ts +9 -0
  116. package/dist/rules/best-practices/BP-004.js +96 -0
  117. package/dist/rules/best-practices/BP-004.js.map +1 -0
  118. package/dist/rules/best-practices/BP-005.d.ts +8 -0
  119. package/dist/rules/best-practices/BP-005.js +94 -0
  120. package/dist/rules/best-practices/BP-005.js.map +1 -0
  121. package/dist/rules/best-practices/BP-006.d.ts +8 -0
  122. package/dist/rules/best-practices/BP-006.js +80 -0
  123. package/dist/rules/best-practices/BP-006.js.map +1 -0
  124. package/dist/rules/best-practices/BP-007.d.ts +8 -0
  125. package/dist/rules/best-practices/BP-007.js +92 -0
  126. package/dist/rules/best-practices/BP-007.js.map +1 -0
  127. package/dist/rules/best-practices/BP-008.d.ts +12 -0
  128. package/dist/rules/best-practices/BP-008.js +86 -0
  129. package/dist/rules/best-practices/BP-008.js.map +1 -0
  130. package/dist/rules/best-practices/BP-009.d.ts +9 -0
  131. package/dist/rules/best-practices/BP-009.js +77 -0
  132. package/dist/rules/best-practices/BP-009.js.map +1 -0
  133. package/dist/rules/best-practices/BP-010.d.ts +8 -0
  134. package/dist/rules/best-practices/BP-010.js +85 -0
  135. package/dist/rules/best-practices/BP-010.js.map +1 -0
  136. package/dist/rules/coverage/COV-002.d.ts +8 -0
  137. package/dist/rules/coverage/COV-002.js +68 -0
  138. package/dist/rules/coverage/COV-002.js.map +1 -0
  139. package/dist/rules/coverage/COV-003.d.ts +8 -0
  140. package/dist/rules/coverage/COV-003.js +68 -0
  141. package/dist/rules/coverage/COV-003.js.map +1 -0
  142. package/dist/rules/coverage/COV-004.d.ts +8 -0
  143. package/dist/rules/coverage/COV-004.js +89 -0
  144. package/dist/rules/coverage/COV-004.js.map +1 -0
  145. package/dist/rules/coverage/COV-005.d.ts +8 -0
  146. package/dist/rules/coverage/COV-005.js +67 -0
  147. package/dist/rules/coverage/COV-005.js.map +1 -0
  148. package/dist/rules/coverage/COV-006.d.ts +9 -0
  149. package/dist/rules/coverage/COV-006.js +76 -0
  150. package/dist/rules/coverage/COV-006.js.map +1 -0
  151. package/dist/rules/coverage/COV-007.d.ts +8 -0
  152. package/dist/rules/coverage/COV-007.js +67 -0
  153. package/dist/rules/coverage/COV-007.js.map +1 -0
  154. package/dist/rules/coverage/COV-008.d.ts +9 -0
  155. package/dist/rules/coverage/COV-008.js +87 -0
  156. package/dist/rules/coverage/COV-008.js.map +1 -0
  157. package/dist/rules/coverage/COV-009.d.ts +8 -0
  158. package/dist/rules/coverage/COV-009.js +73 -0
  159. package/dist/rules/coverage/COV-009.js.map +1 -0
  160. package/dist/rules/coverage/COV-010.d.ts +9 -0
  161. package/dist/rules/coverage/COV-010.js +82 -0
  162. package/dist/rules/coverage/COV-010.js.map +1 -0
  163. package/dist/rules/description/DESC-001.d.ts +9 -0
  164. package/dist/rules/description/DESC-001.js +88 -0
  165. package/dist/rules/description/DESC-001.js.map +1 -0
  166. package/dist/rules/description/DESC-002.d.ts +10 -0
  167. package/dist/rules/description/DESC-002.js +99 -0
  168. package/dist/rules/description/DESC-002.js.map +1 -0
  169. package/dist/rules/description/DESC-006.d.ts +9 -0
  170. package/dist/rules/description/DESC-006.js +78 -0
  171. package/dist/rules/description/DESC-006.js.map +1 -0
  172. package/dist/rules/description/DESC-007.d.ts +9 -0
  173. package/dist/rules/description/DESC-007.js +70 -0
  174. package/dist/rules/description/DESC-007.js.map +1 -0
  175. package/dist/rules/description/DESC-008.d.ts +9 -0
  176. package/dist/rules/description/DESC-008.js +70 -0
  177. package/dist/rules/description/DESC-008.js.map +1 -0
  178. package/dist/rules/description/DESC-009.d.ts +8 -0
  179. package/dist/rules/description/DESC-009.js +55 -0
  180. package/dist/rules/description/DESC-009.js.map +1 -0
  181. package/dist/rules/description/DESC-010.d.ts +9 -0
  182. package/dist/rules/description/DESC-010.js +92 -0
  183. package/dist/rules/description/DESC-010.js.map +1 -0
  184. package/dist/rules/description/DESC-011.d.ts +9 -0
  185. package/dist/rules/description/DESC-011.js +81 -0
  186. package/dist/rules/description/DESC-011.js.map +1 -0
  187. package/dist/rules/description/DESC-012.d.ts +9 -0
  188. package/dist/rules/description/DESC-012.js +98 -0
  189. package/dist/rules/description/DESC-012.js.map +1 -0
  190. package/dist/rules/implementation/IMP-002.d.ts +9 -0
  191. package/dist/rules/implementation/IMP-002.js +59 -0
  192. package/dist/rules/implementation/IMP-002.js.map +1 -0
  193. package/dist/rules/implementation/IMP-006.d.ts +9 -0
  194. package/dist/rules/implementation/IMP-006.js +48 -0
  195. package/dist/rules/implementation/IMP-006.js.map +1 -0
  196. package/dist/rules/implementation/IMP-008.d.ts +9 -0
  197. package/dist/rules/implementation/IMP-008.js +46 -0
  198. package/dist/rules/implementation/IMP-008.js.map +1 -0
  199. package/dist/rules/implementation/IMP-009.d.ts +9 -0
  200. package/dist/rules/implementation/IMP-009.js +48 -0
  201. package/dist/rules/implementation/IMP-009.js.map +1 -0
  202. package/dist/rules/implementation/IMP-010.d.ts +9 -0
  203. package/dist/rules/implementation/IMP-010.js +66 -0
  204. package/dist/rules/implementation/IMP-010.js.map +1 -0
  205. package/dist/rules/implementation/IMP-011.d.ts +9 -0
  206. package/dist/rules/implementation/IMP-011.js +82 -0
  207. package/dist/rules/implementation/IMP-011.js.map +1 -0
  208. package/dist/rules/implementation/IMP-012.d.ts +9 -0
  209. package/dist/rules/implementation/IMP-012.js +88 -0
  210. package/dist/rules/implementation/IMP-012.js.map +1 -0
  211. package/dist/rules/implementation/IMP-014.d.ts +9 -0
  212. package/dist/rules/implementation/IMP-014.js +58 -0
  213. package/dist/rules/implementation/IMP-014.js.map +1 -0
  214. package/dist/rules/implementation/IMP-015.d.ts +9 -0
  215. package/dist/rules/implementation/IMP-015.js +64 -0
  216. package/dist/rules/implementation/IMP-015.js.map +1 -0
  217. package/dist/rules/implementation/IMP-016.d.ts +9 -0
  218. package/dist/rules/implementation/IMP-016.js +52 -0
  219. package/dist/rules/implementation/IMP-016.js.map +1 -0
  220. package/dist/rules/implementation/IMP-017.d.ts +8 -0
  221. package/dist/rules/implementation/IMP-017.js +51 -0
  222. package/dist/rules/implementation/IMP-017.js.map +1 -0
  223. package/dist/rules/implementation/IMP-018.d.ts +8 -0
  224. package/dist/rules/implementation/IMP-018.js +52 -0
  225. package/dist/rules/implementation/IMP-018.js.map +1 -0
  226. package/dist/rules/implementation/IMP-019.d.ts +8 -0
  227. package/dist/rules/implementation/IMP-019.js +53 -0
  228. package/dist/rules/implementation/IMP-019.js.map +1 -0
  229. package/dist/rules/implementation/IMP-020.d.ts +9 -0
  230. package/dist/rules/implementation/IMP-020.js +62 -0
  231. package/dist/rules/implementation/IMP-020.js.map +1 -0
  232. package/dist/rules/implementation/IMP-021.d.ts +8 -0
  233. package/dist/rules/implementation/IMP-021.js +64 -0
  234. package/dist/rules/implementation/IMP-021.js.map +1 -0
  235. package/dist/rules/implementation/IMP-022.d.ts +8 -0
  236. package/dist/rules/implementation/IMP-022.js +70 -0
  237. package/dist/rules/implementation/IMP-022.js.map +1 -0
  238. package/dist/rules/index.d.ts +73 -6
  239. package/dist/rules/index.js +141 -6
  240. package/dist/rules/index.js.map +1 -1
  241. package/dist/rules/schema/SCHEMA-004.d.ts +9 -0
  242. package/dist/rules/schema/SCHEMA-004.js +57 -0
  243. package/dist/rules/schema/SCHEMA-004.js.map +1 -0
  244. package/dist/rules/schema/SCHEMA-005.d.ts +9 -0
  245. package/dist/rules/schema/SCHEMA-005.js +61 -0
  246. package/dist/rules/schema/SCHEMA-005.js.map +1 -0
  247. package/dist/rules/schema/SCHEMA-006.d.ts +10 -0
  248. package/dist/rules/schema/SCHEMA-006.js +85 -0
  249. package/dist/rules/schema/SCHEMA-006.js.map +1 -0
  250. package/dist/rules/schema/SCHEMA-007.d.ts +9 -0
  251. package/dist/rules/schema/SCHEMA-007.js +73 -0
  252. package/dist/rules/schema/SCHEMA-007.js.map +1 -0
  253. package/dist/rules/schema/SCHEMA-008.d.ts +9 -0
  254. package/dist/rules/schema/SCHEMA-008.js +70 -0
  255. package/dist/rules/schema/SCHEMA-008.js.map +1 -0
  256. package/dist/rules/schema/SCHEMA-009.d.ts +10 -0
  257. package/dist/rules/schema/SCHEMA-009.js +80 -0
  258. package/dist/rules/schema/SCHEMA-009.js.map +1 -0
  259. package/dist/rules/schema/SCHEMA-010.d.ts +9 -0
  260. package/dist/rules/schema/SCHEMA-010.js +96 -0
  261. package/dist/rules/schema/SCHEMA-010.js.map +1 -0
  262. package/dist/rules/schema/SCHEMA-012.d.ts +9 -0
  263. package/dist/rules/schema/SCHEMA-012.js +65 -0
  264. package/dist/rules/schema/SCHEMA-012.js.map +1 -0
  265. package/dist/rules/security/SEC-002.d.ts +8 -0
  266. package/dist/rules/security/SEC-002.js +81 -0
  267. package/dist/rules/security/SEC-002.js.map +1 -0
  268. package/dist/rules/security/SEC-003.d.ts +8 -0
  269. package/dist/rules/security/SEC-003.js +85 -0
  270. package/dist/rules/security/SEC-003.js.map +1 -0
  271. package/dist/rules/security/SEC-004.d.ts +9 -0
  272. package/dist/rules/security/SEC-004.js +87 -0
  273. package/dist/rules/security/SEC-004.js.map +1 -0
  274. package/dist/rules/security/SEC-005.d.ts +8 -0
  275. package/dist/rules/security/SEC-005.js +87 -0
  276. package/dist/rules/security/SEC-005.js.map +1 -0
  277. package/dist/rules/security/SEC-006.d.ts +10 -0
  278. package/dist/rules/security/SEC-006.js +108 -0
  279. package/dist/rules/security/SEC-006.js.map +1 -0
  280. package/dist/rules/security/SEC-007.d.ts +9 -0
  281. package/dist/rules/security/SEC-007.js +108 -0
  282. package/dist/rules/security/SEC-007.js.map +1 -0
  283. package/dist/rules/security/SEC-008.d.ts +8 -0
  284. package/dist/rules/security/SEC-008.js +109 -0
  285. package/dist/rules/security/SEC-008.js.map +1 -0
  286. package/dist/rules/security/SEC-009.d.ts +9 -0
  287. package/dist/rules/security/SEC-009.js +93 -0
  288. package/dist/rules/security/SEC-009.js.map +1 -0
  289. package/dist/rules/security/SEC-010.d.ts +8 -0
  290. package/dist/rules/security/SEC-010.js +78 -0
  291. package/dist/rules/security/SEC-010.js.map +1 -0
  292. package/dist/rules/security/SEC-011.d.ts +8 -0
  293. package/dist/rules/security/SEC-011.js +93 -0
  294. package/dist/rules/security/SEC-011.js.map +1 -0
  295. package/dist/rules/security/SEC-012.d.ts +8 -0
  296. package/dist/rules/security/SEC-012.js +79 -0
  297. package/dist/rules/security/SEC-012.js.map +1 -0
  298. package/dist/rules/security/SEC-013.d.ts +9 -0
  299. package/dist/rules/security/SEC-013.js +107 -0
  300. package/dist/rules/security/SEC-013.js.map +1 -0
  301. package/dist/scoring/calculator.js +1 -0
  302. package/dist/scoring/calculator.js.map +1 -1
  303. package/dist/ui/ink/components/AIRecommendationCard.d.ts +11 -0
  304. package/dist/ui/ink/components/AIRecommendationCard.js +23 -0
  305. package/dist/ui/ink/components/AIRecommendationCard.js.map +1 -0
  306. package/dist/ui/ink/components/OpportunityList.d.ts +10 -0
  307. package/dist/ui/ink/components/OpportunityList.js +48 -0
  308. package/dist/ui/ink/components/OpportunityList.js.map +1 -0
  309. package/dist/ui/ink/components/PotentialPageCard.d.ts +13 -0
  310. package/dist/ui/ink/components/PotentialPageCard.js +43 -0
  311. package/dist/ui/ink/components/PotentialPageCard.js.map +1 -0
  312. package/dist/ui/ink/components/PotentialProgress.d.ts +16 -0
  313. package/dist/ui/ink/components/PotentialProgress.js +44 -0
  314. package/dist/ui/ink/components/PotentialProgress.js.map +1 -0
  315. package/dist/ui/ink/components/PotentialSummary.d.ts +10 -0
  316. package/dist/ui/ink/components/PotentialSummary.js +86 -0
  317. package/dist/ui/ink/components/PotentialSummary.js.map +1 -0
  318. package/dist/ui/ink/components/SuggestionCard.d.ts +34 -0
  319. package/dist/ui/ink/components/SuggestionCard.js +36 -0
  320. package/dist/ui/ink/components/SuggestionCard.js.map +1 -0
  321. package/dist/ui/ink/components/views/MultiPageCrawlView.d.ts +21 -0
  322. package/dist/ui/ink/components/views/MultiPageCrawlView.js +55 -0
  323. package/dist/ui/ink/components/views/MultiPageCrawlView.js.map +1 -0
  324. package/dist/ui/ink/components/views/PotentialView.d.ts +18 -0
  325. package/dist/ui/ink/components/views/PotentialView.js +74 -0
  326. package/dist/ui/ink/components/views/PotentialView.js.map +1 -0
  327. package/dist/ui/ink/components/views/ReconView.d.ts +22 -0
  328. package/dist/ui/ink/components/views/ReconView.js +30 -0
  329. package/dist/ui/ink/components/views/ReconView.js.map +1 -0
  330. package/package.json +2 -1
@@ -0,0 +1,143 @@
1
+ /**
2
+ * Reconnaissance Module
3
+ *
4
+ * Orchestrates all recon sub-tasks: sitemap, robots.txt, manifest,
5
+ * meta extraction, and route discovery. Runs as many tasks in parallel
6
+ * as possible to minimize wall-clock time.
7
+ */
8
+ import { parseSitemap } from './sitemap.js';
9
+ import { parseRobots } from './robots.js';
10
+ import { parseManifest } from './manifest.js';
11
+ import { extractMeta } from './meta-extractor.js';
12
+ import { discoverRoutes } from './route-discovery.js';
13
+ /** Fetch timeout (ms) */
14
+ const ROOT_FETCH_TIMEOUT_MS = 15_000;
/**
 * Download the root page with a plain HTTP GET (no browser involved).
 *
 * The request follows redirects and is aborted if it has not completed,
 * body included, within ROOT_FETCH_TIMEOUT_MS.
 *
 * @param url - Address of the page to fetch.
 * @returns An object with:
 *   - `resolvedUrl`: the response URL after redirects,
 *   - `statusCode`: the HTTP status (non-2xx statuses are returned, not thrown),
 *   - `responseTime`: milliseconds elapsed until the response headers arrived,
 *   - `html`: the response body as text.
 */
async function fetchRootPage(url) {
    const abortCtl = new AbortController();
    const timeoutHandle = setTimeout(() => abortCtl.abort(), ROOT_FETCH_TIMEOUT_MS);
    const startedAt = Date.now();
    try {
        const requestInit = {
            signal: abortCtl.signal,
            headers: {
                'User-Agent': 'Mozilla/5.0 (compatible; WebMCP-CLI/1.0; +https://webmcp.org)',
                Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            },
            redirect: 'follow',
        };
        const res = await fetch(url, requestInit);
        // Measured before the body is read, so it reflects time-to-headers only.
        const elapsedMs = Date.now() - startedAt;
        const body = await res.text();
        return {
            resolvedUrl: res.url,
            statusCode: res.status,
            responseTime: elapsedMs,
            html: body,
        };
    }
    finally {
        // Runs after the body read as well, so the timeout covers the whole request.
        clearTimeout(timeoutHandle);
    }
}
/**
 * Format the rejection reason of a settled promise as "<label>: <message>".
 * Reasons that are not Error instances are reported as "Failed".
 */
function formatReconError(label, reason) {
    return `${label}: ${reason instanceof Error ? reason.message : 'Failed'}`;
}
/**
 * Run full reconnaissance on a target URL.
 *
 * Gathers robots.txt, sitemap, manifest, meta tags, and discovers
 * routes — all without launching a browser.
 *
 * Failure handling:
 * - If the root page cannot be fetched, the result is returned immediately
 *   with `statusCode: 0`, empty/null fields, and the failure in `errors`.
 * - Failures of robots.txt, sitemap, manifest, or meta extraction are
 *   non-fatal: the corresponding field is `null` and a message is appended
 *   to `errors`.
 * - The HTTP status of the root page is reported as-is; a non-2xx status does
 *   not stop the remaining steps.
 *
 * @param url - Target URL as supplied by the caller.
 * @returns An object with `url`, `resolvedUrl`, `statusCode`, `responseTime`,
 *   `sitemap`, `robots`, `manifest`, `meta`, `discoveredRoutes`, `rootHtml`
 *   and `errors`.
 */
export async function runReconnaissance(url) {
    const errors = [];
    // Step 1: Fetch root page HTML + robots.txt in parallel
    // (robots.txt is needed for route filtering, and may reference sitemaps)
    const [rootResult, robotsResult] = await Promise.allSettled([
        fetchRootPage(url),
        parseRobots(url),
    ]);
    // Handle root page failure (fatal)
    if (rootResult.status === 'rejected') {
        const errMsg = rootResult.reason instanceof Error
            ? rootResult.reason.message
            : 'Failed to fetch root page';
        return {
            url,
            resolvedUrl: url,
            statusCode: 0,
            responseTime: 0,
            sitemap: null,
            robots: null,
            manifest: null,
            meta: null,
            discoveredRoutes: [],
            rootHtml: '',
            errors: [errMsg],
        };
    }
    const { resolvedUrl, statusCode, responseTime, html } = rootResult.value;
    let robots = null;
    if (robotsResult.status === 'fulfilled') {
        robots = robotsResult.value;
    }
    else {
        errors.push(formatReconError('robots.txt', robotsResult.reason));
    }
    // Step 2: Determine sitemap URL
    // Prefer a sitemap referenced by robots.txt; only the first reference is used.
    const sitemapUrlsFromRobots = robots?.sitemapUrls ?? [];
    // Step 3: Fetch sitemap + manifest in parallel
    const [sitemapResult, manifestResult] = await Promise.allSettled([
        sitemapUrlsFromRobots.length > 0
            ? parseSitemap(url, sitemapUrlsFromRobots[0])
            : parseSitemap(url),
        // Resolve manifest links against the post-redirect URL: that is the
        // document the HTML was actually served from, so relative hrefs in it
        // are relative to resolvedUrl, not to the originally requested url.
        parseManifest(resolvedUrl, html),
    ]);
    let sitemap = null;
    if (sitemapResult.status === 'fulfilled') {
        sitemap = sitemapResult.value;
    }
    else {
        errors.push(formatReconError('sitemap', sitemapResult.reason));
    }
    let manifest = null;
    if (manifestResult.status === 'fulfilled') {
        manifest = manifestResult.value;
    }
    else {
        errors.push(formatReconError('manifest', manifestResult.reason));
    }
    // Step 4: Extract meta (synchronous operation on the already-fetched HTML)
    let meta = null;
    try {
        meta = extractMeta(html, resolvedUrl);
    }
    catch (err) {
        errors.push(`meta: ${err instanceof Error ? err.message : 'Failed'}`);
    }
    // Step 5: Discover and score routes
    const discoveredRoutes = discoverRoutes({
        baseUrl: resolvedUrl,
        rootHtml: html,
        sitemapEntries: sitemap?.entries ?? [],
        robotsDirectives: robots?.directives ?? [],
        respectRobotsTxt: true,
    });
    return {
        url,
        resolvedUrl,
        statusCode,
        responseTime,
        sitemap,
        robots,
        manifest,
        meta,
        discoveredRoutes,
        rootHtml: html,
        errors,
    };
}
137
+ // Re-export sub-modules for direct use
138
+ export { parseSitemap } from './sitemap.js';
139
+ export { parseRobots, isPathAllowed } from './robots.js';
140
+ export { parseManifest } from './manifest.js';
141
+ export { extractMeta } from './meta-extractor.js';
142
+ export { discoverRoutes } from './route-discovery.js';
143
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/recon/index.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAGH,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAC9C,OAAO,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAClD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AAEtD,yBAAyB;AACzB,MAAM,qBAAqB,GAAG,MAAM,CAAC;AAErC;;;GAGG;AACH,KAAK,UAAU,aAAa,CAAC,GAAW;IAMtC,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,qBAAqB,CAAC,CAAC;IAE1E,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IACzB,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;YAChC,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,OAAO,EAAE;gBACP,YAAY,EACV,+DAA+D;gBACjE,MAAM,EACJ,iEAAiE;aACpE;YACD,QAAQ,EAAE,QAAQ;SACnB,CAAC,CAAC;QACH,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC;QACxC,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;QAEnC,OAAO;YACL,WAAW,EAAE,QAAQ,CAAC,GAAG;YACzB,UAAU,EAAE,QAAQ,CAAC,MAAM;YAC3B,YAAY;YACZ,IAAI;SACL,CAAC;IACJ,CAAC;YAAS,CAAC;QACT,YAAY,CAAC,KAAK,CAAC,CAAC;IACtB,CAAC;AACH,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,GAAW;IACjD,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,8DAA8D;IAC9D,yEAAyE;IACzE,MAAM,CAAC,UAAU,EAAE,YAAY,CAAC,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC;QAC1D,aAAa,CAAC,GAAG,CAAC;QAClB,WAAW,CAAC,GAAG,CAAC;KACjB,CAAC,CAAC;IAEH,mCAAmC;IACnC,IAAI,UAAU,CAAC,MAAM,KAAK,UAAU,EAAE,CAAC;QACrC,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,YAAY,KAAK;YAC/C,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,OAAO;YAC3B,CAAC,CAAC,2BAA2B,CAAC;QAChC,OAAO;YACL,GAAG;YACH,WAAW,EAAE,GAAG;YAChB,UAAU,EAAE,CAAC;YACb,YAAY,EAAE,CAAC;YACf,OAAO,EAAE,IAAI;YACb,MAAM,EAAE,IAAI;YACZ,QAAQ,EAAE,IAAI;YACd,IAAI,EAAE,IAAI;YACV,gBAAgB,EAAE,EAAE;YACpB,QAAQ,EAAE,EAAE;YACZ,MAAM,EAAE,CAAC,MAAM,CAAC;SACjB,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,WAAW,EAAE,UAAU,EAAE,YAAY,EAAE,IAAI,EAAE,GAAG,UAAU,CAAC,KAAK,CAAC;IACzE,MAAM,MAAM,GAAG,YAAY,CAAC,MAAM,KAAK,WAAW;QAChD,CAAC,CAAC,YAAY,CAAC,KAAK;QACpB,CAAC,CAAC,IAAI,CAAC;IAET,IAAI,YAAY,CAAC,MAAM,KAAK,UAAU,EAAE,CAAC;QACvC,MAAM,CAAC,IAAI,
CAAC,eAAe,YAAY,CAAC,MAAM,YAAY,KAAK,CAAC,CAAC,CAAC,YAAY,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;IAC9G,CAAC;IAED,mCAAmC;IACnC,gDAAgD;IAChD,MAAM,qBAAqB,GAAG,MAAM,EAAE,WAAW,IAAI,EAAE,CAAC;IAExD,8DAA8D;IAC9D,MAAM,CAAC,aAAa,EAAE,cAAc,CAAC,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC;QAC/D,qBAAqB,CAAC,MAAM,GAAG,CAAC;YAC9B,CAAC,CAAC,YAAY,CAAC,GAAG,EAAE,qBAAqB,CAAC,CAAC,CAAC,CAAC;YAC7C,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC;QACrB,aAAa,CAAC,GAAG,EAAE,IAAI,CAAC;KACzB,CAAC,CAAC;IAEH,MAAM,OAAO,GAAG,aAAa,CAAC,MAAM,KAAK,WAAW;QAClD,CAAC,CAAC,aAAa,CAAC,KAAK;QACrB,CAAC,CAAC,IAAI,CAAC;IACT,MAAM,QAAQ,GAAG,cAAc,CAAC,MAAM,KAAK,WAAW;QACpD,CAAC,CAAC,cAAc,CAAC,KAAK;QACtB,CAAC,CAAC,IAAI,CAAC;IAET,IAAI,aAAa,CAAC,MAAM,KAAK,UAAU,EAAE,CAAC;QACxC,MAAM,CAAC,IAAI,CAAC,YAAY,aAAa,CAAC,MAAM,YAAY,KAAK,CAAC,CAAC,CAAC,aAAa,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;IAC7G,CAAC;IACD,IAAI,cAAc,CAAC,MAAM,KAAK,UAAU,EAAE,CAAC;QACzC,MAAM,CAAC,IAAI,CAAC,aAAa,cAAc,CAAC,MAAM,YAAY,KAAK,CAAC,CAAC,CAAC,cAAc,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;IAChH,CAAC;IAED,uDAAuD;IACvD,IAAI,IAAI,GAAG,IAAI,CAAC;IAChB,IAAI,CAAC;QACH,IAAI,GAAG,WAAW,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;IACxC,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,CAAC,IAAI,CAAC,SAAS,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;IACxE,CAAC;IAED,oCAAoC;IACpC,MAAM,gBAAgB,GAAG,cAAc,CAAC;QACtC,OAAO,EAAE,WAAW;QACpB,QAAQ,EAAE,IAAI;QACd,cAAc,EAAE,OAAO,EAAE,OAAO,IAAI,EAAE;QACtC,gBAAgB,EAAE,MAAM,EAAE,UAAU,IAAI,EAAE;QAC1C,gBAAgB,EAAE,IAAI;KACvB,CAAC,CAAC;IAEH,OAAO;QACL,GAAG;QACH,WAAW;QACX,UAAU;QACV,YAAY;QACZ,OAAO;QACP,MAAM;QACN,QAAQ;QACR,IAAI;QACJ,gBAAgB;QAChB,QAAQ,EAAE,IAAI;QACd,MAAM;KACP,CAAC;AACJ,CAAC;AAED,uCAAuC;AACvC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AACzD,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAC9C,OAAO,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAClD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC"}
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Web App Manifest Parser
3
+ *
4
+ * Extracts and parses the web app manifest from <link rel="manifest">
5
+ * or the default /manifest.json path. Handles missing manifests gracefully.
6
+ */
7
+ import type { ManifestResult } from '../core/types/recon.js';
8
+ /**
9
+ * Fetch and parse the web app manifest for a site.
10
+ *
11
+ * Strategy:
12
+ * 1. If rootHtml provided, look for <link rel="manifest"> href
13
+ * 2. Then try /manifest.json
14
+ * 3. Try /manifest.webmanifest as fallback
15
+ */
16
+ export declare function parseManifest(baseUrl: string, rootHtml?: string): Promise<ManifestResult>;
@@ -0,0 +1,108 @@
1
+ /**
2
+ * Web App Manifest Parser
3
+ *
4
+ * Extracts and parses the web app manifest from <link rel="manifest">
5
+ * or the default /manifest.json path. Handles missing manifests gracefully.
6
+ */
7
+ import * as cheerio from 'cheerio';
8
+ /** Fetch timeout (ms) */
9
+ const FETCH_TIMEOUT_MS = 10_000;
/**
 * Fetch a URL with an overall time limit.
 *
 * The abort signal comes from AbortSignal.timeout(), so it stays attached to
 * the returned Response: once the limit elapses, a still-pending body read
 * (e.g. `response.text()`) is aborted too, instead of only the wait for the
 * response headers being bounded.
 *
 * @param url - URL to request; redirects are followed.
 * @param timeoutMs - Time limit in milliseconds, measured from this call.
 *   Defaults to FETCH_TIMEOUT_MS.
 * @returns The fetch Response. Non-2xx statuses are returned, not thrown.
 *   The promise rejects on network failure or when the time limit is hit.
 */
async function fetchWithTimeout(url, timeoutMs = FETCH_TIMEOUT_MS) {
    return fetch(url, {
        signal: AbortSignal.timeout(timeoutMs),
        headers: { 'User-Agent': 'WebMCP-CLI/1.0 (manifest-parser)' },
        redirect: 'follow',
    });
}
/**
 * Extract the manifest URL from the first <link> element whose `rel`
 * attribute contains the `manifest` token.
 *
 * `rel` is treated as a whitespace-separated, case-insensitive token list,
 * so `rel="Manifest"` and `rel="manifest preload"` are both recognised.
 *
 * @param html - HTML document to search.
 * @param baseUrl - URL used to resolve a relative href.
 * @returns The absolute manifest URL, or `null` when there is no such link,
 *   the link has no (or an empty) href, or the href cannot be resolved.
 */
function extractManifestUrl(html, baseUrl) {
    const $ = cheerio.load(html);
    const href = $('link[rel]')
        .filter((_, el) => ($(el).attr('rel') ?? '')
        .toLowerCase()
        .split(/\s+/)
        .includes('manifest'))
        .first()
        .attr('href');
    if (!href)
        return null;
    try {
        return new URL(href, baseUrl).href;
    }
    catch {
        // href/baseUrl combination is not a valid URL
        return null;
    }
}
/**
 * Return `record[key]` when it is a string, otherwise `undefined`.
 */
function manifestString(record, key) {
    const value = record[key];
    return typeof value === 'string' ? value : undefined;
}
/**
 * Fetch and parse the web app manifest for a site.
 *
 * Candidate URLs are tried in order and the first usable one wins:
 * 1. The <link rel="manifest"> href found in rootHtml (when rootHtml is given)
 * 2. /manifest.json
 * 3. /manifest.webmanifest
 *
 * A candidate is skipped when the request fails or times out, the status is
 * not OK, the content type is HTML (typically a custom 404 page), the body is
 * not valid JSON, or the JSON value is not an object.
 *
 * @param baseUrl - Site URL; used to resolve the link href and the default paths.
 * @param rootHtml - Optional HTML of the root page to search for a manifest link.
 * @returns `{ found: true, ... }` with the string-valued manifest members
 *   (`name`, `shortName`, `description`, `startUrl`, `display`, `themeColor`),
 *   the `source` URL and the `raw` parsed object; or
 *   `{ found: false, error: 'No manifest found' }` when no candidate is usable.
 *   The promise rejects if `baseUrl` is not a valid URL.
 */
export async function parseManifest(baseUrl, rootHtml) {
    // Build list of candidate URLs to try
    const candidates = [];
    if (rootHtml) {
        const fromLink = extractManifestUrl(rootHtml, baseUrl);
        if (fromLink)
            candidates.push(fromLink);
    }
    candidates.push(new URL('/manifest.json', baseUrl).href, new URL('/manifest.webmanifest', baseUrl).href);
    // A Set drops duplicates while keeping first-seen order
    for (const url of new Set(candidates)) {
        try {
            const response = await fetchWithTimeout(url);
            if (!response.ok)
                continue;
            // Media types are case-insensitive, so normalise before comparing
            const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
            // Skip HTML responses (custom 404 pages)
            if (contentType.includes('text/html'))
                continue;
            // Invalid JSON throws here and falls through to the catch below
            const data = JSON.parse(await response.text());
            // A manifest is a JSON object; null, arrays and primitives are not manifests
            if (data === null || typeof data !== 'object' || Array.isArray(data))
                continue;
            return {
                found: true,
                name: manifestString(data, 'name'),
                shortName: manifestString(data, 'short_name'),
                description: manifestString(data, 'description'),
                startUrl: manifestString(data, 'start_url'),
                display: manifestString(data, 'display'),
                themeColor: manifestString(data, 'theme_color'),
                source: url,
                raw: data,
            };
        }
        catch {
            // Network error, timeout or unparsable body: try next candidate
        }
    }
    return {
        found: false,
        error: 'No manifest found',
    };
}
108
+ //# sourceMappingURL=manifest.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"manifest.js","sourceRoot":"","sources":["../../src/recon/manifest.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAGnC,yBAAyB;AACzB,MAAM,gBAAgB,GAAG,MAAM,CAAC;AAEhC;;GAEG;AACH,KAAK,UAAU,gBAAgB,CAC7B,GAAW,EACX,YAAoB,gBAAgB;IAEpC,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;IACzC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,SAAS,CAAC,CAAC;IAC9D,IAAI,CAAC;QACH,OAAO,MAAM,KAAK,CAAC,GAAG,EAAE;YACtB,MAAM,EAAE,UAAU,CAAC,MAAM;YACzB,OAAO,EAAE,EAAE,YAAY,EAAE,kCAAkC,EAAE;YAC7D,QAAQ,EAAE,QAAQ;SACnB,CAAC,CAAC;IACL,CAAC;YAAS,CAAC;QACT,YAAY,CAAC,KAAK,CAAC,CAAC;IACtB,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB,CAAC,IAAY,EAAE,OAAe;IACvD,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7B,MAAM,IAAI,GAAG,CAAC,CAAC,sBAAsB,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACpD,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAC;IAEvB,IAAI,CAAC;QACH,OAAO,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;IACrC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,OAAe,EACf,QAAiB;IAEjB,sCAAsC;IACtC,MAAM,UAAU,GAAa,EAAE,CAAC;IAEhC,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,QAAQ,GAAG,kBAAkB,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QACvD,IAAI,QAAQ;YAAE,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IAC1C,CAAC;IAED,UAAU,CAAC,IAAI,CACb,IAAI,GAAG,CAAC,gBAAgB,EAAE,OAAO,CAAC,CAAC,IAAI,EACvC,IAAI,GAAG,CAAC,uBAAuB,EAAE,OAAO,CAAC,CAAC,IAAI,CAC/C,CAAC;IAEF,cAAc;IACd,MAAM,gBAAgB,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC;IAElD,KAAK,MAAM,GAAG,IAAI,gBAAgB,EAAE,CAAC;QACnC,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,gBAAgB,CAAC,GAAG,CAAC,CAAC;YAC7C,IAAI,CAAC,QAAQ,CAAC,EAAE;gBAAE,SAAS;YAE3B,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;YAC/D,yCAAyC;YACzC,IAAI,WAAW,CAAC,QAAQ,CAAC,WAAW,CAAC;gBAAE,SAAS;YAEhD,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACnC,IAAI,IAA6B,CAAC;YAClC,IAAI,CAAC;gBACH,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAA4B,CAAC;YACrD,CAAC;YAAC,MAAM,CAAC;gBACP,SAAS,CAAC,iBAAiB;YAC7B,CAAC;YAED,OAAO;gBACL,KAAK,EAAE,IAAI;gBAC
X,IAAI,EAAE,OAAO,IAAI,CAAC,MAAM,CAAC,KAAK,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,SAAS;gBACjE,SAAS,EACP,OAAO,IAAI,CAAC,YAAY,CAAC,KAAK,QAAQ;oBACpC,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC;oBACpB,CAAC,CAAC,SAAS;gBACf,WAAW,EACT,OAAO,IAAI,CAAC,aAAa,CAAC,KAAK,QAAQ;oBACrC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC;oBACrB,CAAC,CAAC,SAAS;gBACf,QAAQ,EACN,OAAO,IAAI,CAAC,WAAW,CAAC,KAAK,QAAQ;oBACnC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC;oBACnB,CAAC,CAAC,SAAS;gBACf,OAAO,EACL,OAAO,IAAI,CAAC,SAAS,CAAC,KAAK,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS;gBACnE,UAAU,EACR,OAAO,IAAI,CAAC,aAAa,CAAC,KAAK,QAAQ;oBACrC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC;oBACrB,CAAC,CAAC,SAAS;gBACf,MAAM,EAAE,GAAG;gBACX,GAAG,EAAE,IAAI;aACV,CAAC;QACJ,CAAC;QAAC,MAAM,CAAC;YACP,qBAAqB;YACrB,SAAS;QACX,CAAC;IACH,CAAC;IAED,OAAO;QACL,KAAK,EAAE,KAAK;QACZ,KAAK,EAAE,mBAAmB;KAC3B,CAAC;AACJ,CAAC"}
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Meta Tag & Schema.org Extractor
3
+ *
4
+ * Extracts OpenGraph, Twitter Card, Schema.org JSON-LD, and generic
5
+ * meta tags from root page HTML (parsed with cheerio), then classifies the site type.
6
+ */
7
+ import type { MetaExtractResult } from '../core/types/recon.js';
8
+ /**
9
+ * Extract meta tags, OpenGraph, Twitter Card, Schema.org, and classify site
10
+ */
11
+ export declare function extractMeta(html: string, pageUrl: string): MetaExtractResult;
@@ -0,0 +1,276 @@
1
+ /**
2
+ * Meta Tag & Schema.org Extractor
3
+ *
4
+ * Extracts OpenGraph, Twitter Card, Schema.org JSON-LD, and generic
5
+ * meta tags from root page HTML (parsed with cheerio), then classifies the site type.
6
+ */
7
+ import * as cheerio from 'cheerio';
8
/**
 * Keyword heuristics for guessing a site's category.
 *
 * Every category maps to a list of case-insensitive patterns. The patterns
 * are plain alternations without word boundaries, so they also match inside
 * longer words. `general` has no patterns of its own: it is the fallback.
 */
const CLASSIFICATION_SIGNALS = {
    'e-commerce': [
        /shop|store|product|cart|checkout|buy|price|add.to.cart/i,
        /e-?commerce|retail|merchant|catalog/i,
    ],
    'travel': [
        /travel|flight|hotel|booking|reservation|trip|tour|airline/i,
        /destination|cruise|vacation|itinerary/i,
    ],
    'healthcare': [
        /health|medical|doctor|patient|hospital|clinic|pharma/i,
        /diagnosis|treatment|symptom|appointment/i,
    ],
    'finance': [
        /bank|finance|invest|trading|loan|mortgage|insurance|credit/i,
        /portfolio|stock|fund|payment/i,
    ],
    'education': [
        /education|course|learn|school|university|student|teacher/i,
        /curriculum|enrollment|lecture|tutorial/i,
    ],
    'news-media': [
        /news|article|blog|press|journal|media|publish|editorial/i,
        /breaking|headline|reporter|column/i,
    ],
    'social': [
        /social|community|profile|friend|follow|post|feed|message/i,
        /network|connect|share|like|comment/i,
    ],
    'saas': [
        /dashboard|analytics|platform|subscription|api|integration/i,
        /workspace|team|workflow|automation|saas/i,
    ],
    'government': [
        /government|gov\.|public.service|citizen|municipality|permit/i,
        /regulation|compliance|legislation|agency/i,
    ],
    'entertainment': [
        /entertainment|game|movie|music|stream|watch|play|video/i,
        /podcast|episode|series|concert/i,
    ],
    'food-restaurant': [
        /restaurant|food|menu|order|delivery|recipe|cuisine/i,
        /dining|chef|meal|reservation/i,
    ],
    'real-estate': [
        /real.estate|property|listing|rent|lease|apartment|house/i,
        /mortgage|realtor|condo|housing/i,
    ],
    // Fallback category: intentionally carries no keyword patterns.
    'general': [],
};
62
/**
 * Lookup from a Schema.org `@type` name to the site category it indicates.
 *
 * The table is written grouped by category and then flattened into a
 * `{ typeName: category }` object, keeping the order in which the type
 * names are listed below.
 */
const SCHEMA_TYPE_MAP = Object.fromEntries(Object.entries({
    'e-commerce': ['Product', 'Offer', 'ShoppingCenter', 'Store'],
    'travel': [
        'Airline',
        'Flight',
        'Hotel',
        'TouristAttraction',
        'LodgingBusiness',
        'TravelAgency',
    ],
    'healthcare': ['Hospital', 'MedicalClinic', 'Physician', 'Pharmacy'],
    'finance': ['BankOrCreditUnion', 'FinancialService', 'InsuranceAgency'],
    'education': ['EducationalOrganization', 'Course', 'School', 'University'],
    'news-media': ['NewsArticle', 'NewsMediaOrganization', 'BlogPosting'],
    'social': ['SocialMediaPosting'],
    'saas': ['SoftwareApplication', 'WebApplication'],
    'government': ['GovernmentOrganization', 'GovernmentService'],
    'entertainment': ['Movie', 'MusicRecording', 'VideoGame'],
    'food-restaurant': ['Restaurant', 'FoodEstablishment', 'Menu'],
    'real-estate': ['RealEstateAgent', 'Apartment', 'House'],
}).flatMap(([category, typeNames]) => typeNames.map((typeName) => [typeName, category])));
105
+ /**
106
+ * Extract all meta tags from HTML
107
+ */
108
+ function extractMetaTags($) {
109
+ const tags = [];
110
+ $('meta').each((_i, el) => {
111
+ const $el = $(el);
112
+ const property = $el.attr('property') ?? '';
113
+ const name = $el.attr('name') ?? '';
114
+ const content = $el.attr('content') ?? '';
115
+ if (!content)
116
+ return;
117
+ if (property.startsWith('og:')) {
118
+ tags.push({ name: property, content, category: 'og' });
119
+ }
120
+ else if (property.startsWith('twitter:') ||
121
+ name.startsWith('twitter:')) {
122
+ tags.push({
123
+ name: property || name,
124
+ content,
125
+ category: 'twitter',
126
+ });
127
+ }
128
+ else if (name) {
129
+ tags.push({ name, content, category: 'generic' });
130
+ }
131
+ });
132
+ return tags;
133
+ }
134
/**
 * Flatten the OpenGraph tags into a `{ name: content }` record.
 *
 * Only tags whose category is `og` are copied. When a name occurs more than
 * once, the last occurrence wins.
 */
function extractOpenGraph(tags) {
    const record = {};
    for (const { category, name, content } of tags) {
        if (category !== 'og') {
            continue;
        }
        record[name] = content;
    }
    return record;
}
146
/**
 * Flatten the Twitter Card tags into a `{ name: content }` record.
 *
 * Only tags whose category is `twitter` are copied. When a name occurs more
 * than once, the last occurrence wins.
 */
function extractTwitterCard(tags) {
    const record = {};
    for (const { category, name, content } of tags) {
        if (category !== 'twitter') {
            continue;
        }
        record[name] = content;
    }
    return record;
}
158
+ /**
159
+ * Extract Schema.org JSON-LD entries from <script type="application/ld+json">
160
+ */
161
+ function extractSchemaOrg($) {
162
+ const schemas = [];
163
+ $('script[type="application/ld+json"]').each((_i, el) => {
164
+ const raw = $(el).html();
165
+ if (!raw)
166
+ return;
167
+ try {
168
+ const data = JSON.parse(raw);
169
+ // Handle @graph arrays
170
+ if (Array.isArray(data['@graph'])) {
171
+ for (const item of data['@graph']) {
172
+ if (item &&
173
+ typeof item === 'object' &&
174
+ '@type' in item) {
175
+ schemas.push(item);
176
+ }
177
+ }
178
+ }
179
+ else if (data['@type']) {
180
+ schemas.push(data);
181
+ }
182
+ }
183
+ catch {
184
+ // Invalid JSON-LD — skip
185
+ }
186
+ });
187
+ return schemas;
188
+ }
189
/**
 * Classify the site type by scoring every category against the page signals.
 *
 * Scoring:
 *  - Schema.org: each JSON-LD node adds 10 to every distinct category that its
 *    `@type` maps to in SCHEMA_TYPE_MAP. `@type` may be a single string or an
 *    array of strings; non-string entries are ignored.
 *  - Keywords: each CLASSIFICATION_SIGNALS pattern that matches the combined
 *    text (page title, all meta tag contents, first 5000 characters of the
 *    raw HTML) adds 3 to its category.
 *
 * The highest-scoring category wins; on a tie the category listed first in
 * the score table is kept. With no signal at all the result is `general`.
 *
 * @param metaTags  Tags produced by extractMetaTags.
 * @param schemaOrg JSON-LD nodes produced by extractSchemaOrg.
 * @param pageTitle Trimmed page title.
 * @param html      Raw page HTML.
 * @returns The winning category name.
 */
function classifySite(metaTags, schemaOrg, pageTitle, html) {
    const scores = {
        'e-commerce': 0,
        travel: 0,
        healthcare: 0,
        finance: 0,
        education: 0,
        'news-media': 0,
        social: 0,
        saas: 0,
        government: 0,
        entertainment: 0,
        'food-restaurant': 0,
        'real-estate': 0,
        general: 0,
    };
    // Signal 1: Schema.org @type (strongest signal)
    for (const schema of schemaOrg) {
        const declaredType = schema['@type'];
        // JSON-LD allows several types per node: "@type": ["Hotel", "Restaurant"].
        const typeNames = Array.isArray(declaredType) ? declaredType : [declaredType];
        const matchedCategories = new Set();
        for (const typeName of typeNames) {
            // Own-property check: a type called "constructor" or "toString" must
            // not resolve to something inherited from Object.prototype.
            if (typeof typeName === 'string' &&
                Object.prototype.hasOwnProperty.call(SCHEMA_TYPE_MAP, typeName)) {
                matchedCategories.add(SCHEMA_TYPE_MAP[typeName]);
            }
        }
        // A node counts once per category, however many of its types map there.
        for (const category of matchedCategories) {
            scores[category] += 10;
        }
    }
    // Signal 2: Meta tags + title + HTML content
    const textSignal = [
        pageTitle,
        ...metaTags.map((tag) => tag.content),
        // Sample a small portion of the HTML for keyword signals
        html.substring(0, 5000),
    ].join(' ');
    for (const [category, patterns] of Object.entries(CLASSIFICATION_SIGNALS)) {
        for (const pattern of patterns) {
            if (pattern.test(textSignal)) {
                scores[category] += 3;
            }
        }
    }
    // Find highest score; the strict comparison keeps the first of any tie.
    let bestCategory = 'general';
    let bestScore = 0;
    for (const [category, score] of Object.entries(scores)) {
        if (score > bestScore) {
            bestScore = score;
            bestCategory = category;
        }
    }
    return bestScore >= 3 ? bestCategory : 'general';
}
242
/**
 * Extract meta tags, OpenGraph, Twitter Card, Schema.org, and classify site.
 *
 * @param html    Raw HTML of the page.
 * @param pageUrl URL of the page; used as the base when resolving a relative
 *                canonical link.
 * @returns An object with:
 *  - `metaTags`, `openGraph`, `twitterCard`, `schemaOrg`: the extracted data;
 *  - `siteClassification`: the category chosen by classifySite;
 *  - `pageTitle`: trimmed text of the first <title> element ('' if none);
 *  - `pageDescription`: the generic `description` meta tag, else
 *    `og:description`, else '';
 *  - `canonicalUrl`: absolute canonical URL, or undefined when the link is
 *    missing or cannot be resolved.
 */
export function extractMeta(html, pageUrl) {
    const $ = cheerio.load(html);
    const metaTags = extractMetaTags($);
    const openGraph = extractOpenGraph(metaTags);
    const twitterCard = extractTwitterCard(metaTags);
    const schemaOrg = extractSchemaOrg($);
    // Only the first <title> is used: .text() on the whole match set would
    // concatenate the document title with any inline-SVG <title> elements.
    const pageTitle = $('title').first().text().trim();
    // Meta names are compared case-insensitively so <meta name="Description">
    // is recognised too.
    const descriptionTag = metaTags.find((tag) => tag.category === 'generic' && tag.name.toLowerCase() === 'description');
    const pageDescription = descriptionTag?.content ?? openGraph['og:description'] ?? '';
    const canonicalHref = $('link[rel="canonical"]').attr('href');
    let canonicalUrl;
    if (canonicalHref) {
        try {
            canonicalUrl = new URL(canonicalHref, pageUrl).href;
        }
        catch {
            // Invalid canonical — leave canonicalUrl undefined.
        }
    }
    const siteClassification = classifySite(metaTags, schemaOrg, pageTitle, html);
    return {
        metaTags,
        openGraph,
        twitterCard,
        schemaOrg,
        siteClassification,
        pageTitle,
        pageDescription,
        canonicalUrl,
    };
}
276
+ //# sourceMappingURL=meta-extractor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"meta-extractor.js","sourceRoot":"","sources":["../../src/recon/meta-extractor.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAQnC;;GAEG;AACH,MAAM,sBAAsB,GAAyC;IACnE,YAAY,EAAE;QACZ,yDAAyD;QACzD,sCAAsC;KACvC;IACD,MAAM,EAAE;QACN,4DAA4D;QAC5D,wCAAwC;KACzC;IACD,UAAU,EAAE;QACV,uDAAuD;QACvD,0CAA0C;KAC3C;IACD,OAAO,EAAE;QACP,6DAA6D;QAC7D,+BAA+B;KAChC;IACD,SAAS,EAAE;QACT,2DAA2D;QAC3D,yCAAyC;KAC1C;IACD,YAAY,EAAE;QACZ,0DAA0D;QAC1D,oCAAoC;KACrC;IACD,MAAM,EAAE;QACN,2DAA2D;QAC3D,qCAAqC;KACtC;IACD,IAAI,EAAE;QACJ,4DAA4D;QAC5D,0CAA0C;KAC3C;IACD,UAAU,EAAE;QACV,8DAA8D;QAC9D,2CAA2C;KAC5C;IACD,aAAa,EAAE;QACb,yDAAyD;QACzD,iCAAiC;KAClC;IACD,iBAAiB,EAAE;QACjB,qDAAqD;QACrD,+BAA+B;KAChC;IACD,aAAa,EAAE;QACb,0DAA0D;QAC1D,iCAAiC;KAClC;IACD,OAAO,EAAE,EAAE,EAAE,WAAW;CACzB,CAAC;AAEF;;GAEG;AACH,MAAM,eAAe,GAAuC;IAC1D,OAAO,EAAE,YAAY;IACrB,KAAK,EAAE,YAAY;IACnB,cAAc,EAAE,YAAY;IAC5B,KAAK,EAAE,YAAY;IACnB,OAAO,EAAE,QAAQ;IACjB,MAAM,EAAE,QAAQ;IAChB,KAAK,EAAE,QAAQ;IACf,iBAAiB,EAAE,QAAQ;IAC3B,eAAe,EAAE,QAAQ;IACzB,YAAY,EAAE,QAAQ;IACtB,QAAQ,EAAE,YAAY;IACtB,aAAa,EAAE,YAAY;IAC3B,SAAS,EAAE,YAAY;IACvB,QAAQ,EAAE,YAAY;IACtB,iBAAiB,EAAE,SAAS;IAC5B,gBAAgB,EAAE,SAAS;IAC3B,eAAe,EAAE,SAAS;IAC1B,uBAAuB,EAAE,WAAW;IACpC,MAAM,EAAE,WAAW;IACnB,MAAM,EAAE,WAAW;IACnB,UAAU,EAAE,WAAW;IACvB,WAAW,EAAE,YAAY;IACzB,qBAAqB,EAAE,YAAY;IACnC,WAAW,EAAE,YAAY;IACzB,kBAAkB,EAAE,QAAQ;IAC5B,mBAAmB,EAAE,MAAM;IAC3B,cAAc,EAAE,MAAM;IACtB,sBAAsB,EAAE,YAAY;IACpC,iBAAiB,EAAE,YAAY;IAC/B,KAAK,EAAE,eAAe;IACtB,cAAc,EAAE,eAAe;IAC/B,SAAS,EAAE,eAAe;IAC1B,UAAU,EAAE,iBAAiB;IAC7B,iBAAiB,EAAE,iBAAiB;IACpC,IAAI,EAAE,iBAAiB;IACvB,eAAe,EAAE,aAAa;IAC9B,SAAS,EAAE,aAAa;IACxB,KAAK,EAAE,aAAa;CACrB,CAAC;AAEF;;GAEG;AACH,SAAS,eAAe,CAAC,CAAqB;IAC5C,MAAM,IAAI,GAAoB,EAAE,CAAC;IAEjC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QACxB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,QAAQ,GAAG,GAAG,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;QAC5C,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;QACpC,
MAAM,OAAO,GAAG,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC;QAE1C,IAAI,CAAC,OAAO;YAAE,OAAO;QAErB,IAAI,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;YAC/B,IAAI,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;QACzD,CAAC;aAAM,IACL,QAAQ,CAAC,UAAU,CAAC,UAAU,CAAC;YAC/B,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,EAC3B,CAAC;YACD,IAAI,CAAC,IAAI,CAAC;gBACR,IAAI,EAAE,QAAQ,IAAI,IAAI;gBACtB,OAAO;gBACP,QAAQ,EAAE,SAAS;aACpB,CAAC,CAAC;QACL,CAAC;aAAM,IAAI,IAAI,EAAE,CAAC;YAChB,IAAI,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,CAAC,CAAC;QACpD,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,IAAqB;IAC7C,MAAM,EAAE,GAA2B,EAAE,CAAC;IACtC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,GAAG,CAAC,QAAQ,KAAK,IAAI,EAAE,CAAC;YAC1B,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,OAAO,CAAC;QAC7B,CAAC;IACH,CAAC;IACD,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB,CAAC,IAAqB;IAC/C,MAAM,OAAO,GAA2B,EAAE,CAAC;IAC3C,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,GAAG,CAAC,QAAQ,KAAK,SAAS,EAAE,CAAC;YAC/B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,OAAO,CAAC;QAClC,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,CAAqB;IAC7C,MAAM,OAAO,GAAoB,EAAE,CAAC;IAEpC,CAAC,CAAC,oCAAoC,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE;QACtD,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACzB,IAAI,CAAC,GAAG;YAAE,OAAO;QAEjB,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAA4B,CAAC;YAExD,uBAAuB;YACvB,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;gBAClC,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;oBAClC,IACE,IAAI;wBACJ,OAAO,IAAI,KAAK,QAAQ;wBACxB,OAAO,IAAK,IAAgC,EAC5C,CAAC;wBACD,OAAO,CAAC,IAAI,CAAC,IAAqB,CAAC,CAAC;oBACtC,CAAC;gBACH,CAAC;YACH,CAAC;iBAAM,IAAI,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;gBACzB,OAAO,CAAC,IAAI,CAAC,IAAgC,CAAC,CAAC;YACjD,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,yBAAyB;QAC3B,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,SAAS,YAAY,CACnB,QAAyB,EACzB,SAA0B,EAC1B,
SAAiB,EACjB,IAAY;IAEZ,MAAM,MAAM,GAAuC;QACjD,YAAY,EAAE,CAAC;QACf,MAAM,EAAE,CAAC;QACT,UAAU,EAAE,CAAC;QACb,OAAO,EAAE,CAAC;QACV,SAAS,EAAE,CAAC;QACZ,YAAY,EAAE,CAAC;QACf,MAAM,EAAE,CAAC;QACT,IAAI,EAAE,CAAC;QACP,UAAU,EAAE,CAAC;QACb,aAAa,EAAE,CAAC;QAChB,iBAAiB,EAAE,CAAC;QACpB,aAAa,EAAE,CAAC;QAChB,OAAO,EAAE,CAAC;KACX,CAAC;IAEF,gDAAgD;IAChD,KAAK,MAAM,MAAM,IAAI,SAAS,EAAE,CAAC;QAC/B,MAAM,UAAU,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC;QACnC,IAAI,OAAO,UAAU,KAAK,QAAQ,EAAE,CAAC;YACnC,MAAM,MAAM,GAAG,eAAe,CAAC,UAAU,CAAC,CAAC;YAC3C,IAAI,MAAM;gBAAE,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;QACnC,CAAC;IACH,CAAC;IAED,6CAA6C;IAC7C,MAAM,UAAU,GAAG;QACjB,SAAS;QACT,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;QACjC,yDAAyD;QACzD,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC;KACxB,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAEZ,KAAK,MAAM,CAAC,QAAQ,EAAE,QAAQ,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,sBAAsB,CAAC,EAAE,CAAC;QAC1E,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,IAAI,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,EAAE,CAAC;gBAC7B,MAAM,CAAC,QAA8B,CAAC,IAAI,CAAC,CAAC;YAC9C,CAAC;QACH,CAAC;IACH,CAAC;IAED,qBAAqB;IACrB,IAAI,YAAY,GAAuB,SAAS,CAAC;IACjD,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,KAAK,MAAM,CAAC,QAAQ,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QACvD,IAAI,KAAK,GAAG,SAAS,EAAE,CAAC;YACtB,SAAS,GAAG,KAAK,CAAC;YAClB,YAAY,GAAG,QAA8B,CAAC;QAChD,CAAC;IACH,CAAC;IAED,OAAO,SAAS,IAAI,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,SAAS,CAAC;AACnD,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CACzB,IAAY,EACZ,OAAe;IAEf,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,MAAM,QAAQ,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;IACpC,MAAM,SAAS,GAAG,gBAAgB,CAAC,QAAQ,CAAC,CAAC;IAC7C,MAAM,WAAW,GAAG,kBAAkB,CAAC,QAAQ,CAAC,CAAC;IACjD,MAAM,SAAS,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC;IAEtC,MAAM,SAAS,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAC3C,MAAM,cAAc,GAAG,QAAQ,CAAC,IAAI,CAClC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,aAAa,IAAI,CAAC,CAAC,QAAQ,KAAK,SAAS,CAC5D,CAAC;IACF,MAAM,eAAe,GACnB,cAAc,EAAE,OAAO,IAAI,SAAS,CAAC,gBAAgB,CAAC,IAAI,EAAE,CAAC;IAE/D,MAAM,WAAW,GAA
G,CAAC,CAAC,uBAAuB,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC5D,IAAI,YAAgC,CAAC;IACrC,IAAI,WAAW,EAAE,CAAC;QAChB,IAAI,CAAC;YACH,YAAY,GAAG,IAAI,GAAG,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;QACpD,CAAC;QAAC,MAAM,CAAC;YACP,oBAAoB;QACtB,CAAC;IACH,CAAC;IAED,MAAM,kBAAkB,GAAG,YAAY,CAAC,QAAQ,EAAE,SAAS,EAAE,SAAS,EAAE,IAAI,CAAC,CAAC;IAE9E,OAAO;QACL,QAAQ;QACR,SAAS;QACT,WAAW;QACX,SAAS;QACT,kBAAkB;QAClB,SAAS;QACT,eAAe;QACf,YAAY;KACb,CAAC;AACJ,CAAC"}
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Robots.txt Parser
3
+ *
4
+ * Fetches and parses robots.txt. Extracts allowed/disallowed paths,
5
+ * crawl-delay, and sitemap references. Handles missing robots.txt gracefully.
6
+ */
7
+ import type { RobotsDirective, RobotsResult } from '../core/types/recon.js';
8
+ /**
9
+ * Check whether a path is allowed by the robots.txt directives.
10
+ * Uses wildcard user-agent (*) rules if no specific match.
11
+ */
12
+ export declare function isPathAllowed(path: string, directives: RobotsDirective[], userAgent?: string): boolean;
13
+ /**
14
+ * Fetch and parse robots.txt for a site
15
+ */
16
+ export declare function parseRobots(baseUrl: string): Promise<RobotsResult>;