recker 1.0.43 → 1.0.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (459)
  1. package/README.md +47 -0
  2. package/dist/bin/recker-linux-x64 +0 -0
  3. package/dist/bin/recker-macos-x64 +0 -0
  4. package/dist/bin/recker-win-x64.exe +0 -0
  5. package/dist/bin/rek.cjs +85152 -100207
  6. package/dist/browser/ai/adaptive-timeout.d.ts +50 -0
  7. package/dist/browser/ai/adaptive-timeout.js +208 -0
  8. package/dist/browser/ai/client.d.ts +22 -0
  9. package/dist/browser/ai/client.js +294 -0
  10. package/dist/browser/ai/index.d.ts +14 -0
  11. package/dist/browser/ai/index.js +11 -0
  12. package/dist/browser/ai/providers/anthropic.d.ts +63 -0
  13. package/dist/browser/ai/providers/anthropic.js +370 -0
  14. package/dist/browser/ai/providers/base.d.ts +48 -0
  15. package/dist/browser/ai/providers/base.js +150 -0
  16. package/dist/browser/ai/providers/google.d.ts +59 -0
  17. package/dist/browser/ai/providers/google.js +305 -0
  18. package/dist/browser/ai/providers/ollama.d.ts +44 -0
  19. package/dist/browser/ai/providers/ollama.js +240 -0
  20. package/dist/browser/ai/providers/openai.d.ts +64 -0
  21. package/dist/browser/ai/providers/openai.js +298 -0
  22. package/dist/browser/ai/rate-limiter.d.ts +43 -0
  23. package/dist/browser/ai/rate-limiter.js +215 -0
  24. package/dist/browser/ai/vector/index.d.ts +2 -0
  25. package/dist/browser/ai/vector/index.js +2 -0
  26. package/dist/browser/ai/vector/similarity.d.ts +2 -0
  27. package/dist/browser/ai/vector/similarity.js +27 -0
  28. package/dist/browser/ai/vector/store.d.ts +27 -0
  29. package/dist/browser/ai/vector/store.js +82 -0
  30. package/dist/browser/browser/cache.d.ts +2 -40
  31. package/dist/browser/browser/cache.js +2 -199
  32. package/dist/browser/browser/index.d.ts +8 -0
  33. package/dist/browser/browser/index.js +8 -0
  34. package/dist/browser/browser/recker.d.ts +8 -1
  35. package/dist/browser/browser/recker.js +8 -2
  36. package/dist/browser/cache/indexed-db.d.ts +10 -0
  37. package/dist/browser/cache/indexed-db.js +88 -0
  38. package/dist/browser/cache/service-worker-cache.d.ts +18 -0
  39. package/dist/browser/cache/service-worker-cache.js +103 -0
  40. package/dist/browser/cache.d.ts +2 -40
  41. package/dist/browser/cache.js +2 -199
  42. package/dist/browser/constants/user-agents.d.ts +7 -0
  43. package/dist/browser/constants/user-agents.js +7 -0
  44. package/dist/browser/core/client.d.ts +2 -0
  45. package/dist/browser/core/client.js +19 -1
  46. package/dist/browser/index.d.ts +8 -0
  47. package/dist/browser/index.js +8 -0
  48. package/dist/browser/plugins/har-recorder.d.ts +40 -0
  49. package/dist/browser/plugins/har-recorder.js +120 -0
  50. package/dist/browser/plugins/network-simulation.d.ts +7 -0
  51. package/dist/browser/plugins/network-simulation.js +13 -0
  52. package/dist/browser/presets/android.d.ts +2 -0
  53. package/dist/browser/presets/android.js +16 -0
  54. package/dist/browser/presets/anthropic.d.ts +8 -0
  55. package/dist/browser/presets/anthropic.js +27 -0
  56. package/dist/browser/presets/aws.d.ts +19 -0
  57. package/dist/browser/presets/aws.js +68 -0
  58. package/dist/browser/presets/azure-openai.d.ts +10 -0
  59. package/dist/browser/presets/azure-openai.js +35 -0
  60. package/dist/browser/presets/azure.d.ts +41 -0
  61. package/dist/browser/presets/azure.js +104 -0
  62. package/dist/browser/presets/chaturbate.d.ts +2 -0
  63. package/dist/browser/presets/chaturbate.js +17 -0
  64. package/dist/browser/presets/cloudflare.d.ts +12 -0
  65. package/dist/browser/presets/cloudflare.js +39 -0
  66. package/dist/browser/presets/cohere.d.ts +7 -0
  67. package/dist/browser/presets/cohere.js +22 -0
  68. package/dist/browser/presets/deepseek.d.ts +7 -0
  69. package/dist/browser/presets/deepseek.js +22 -0
  70. package/dist/browser/presets/digitalocean.d.ts +5 -0
  71. package/dist/browser/presets/digitalocean.js +16 -0
  72. package/dist/browser/presets/discord.d.ts +6 -0
  73. package/dist/browser/presets/discord.js +17 -0
  74. package/dist/browser/presets/elevenlabs.d.ts +6 -0
  75. package/dist/browser/presets/elevenlabs.js +20 -0
  76. package/dist/browser/presets/enhancers.d.ts +20 -0
  77. package/dist/browser/presets/enhancers.js +85 -0
  78. package/dist/browser/presets/fireworks.d.ts +7 -0
  79. package/dist/browser/presets/fireworks.js +22 -0
  80. package/dist/browser/presets/gcp.d.ts +34 -0
  81. package/dist/browser/presets/gcp.js +91 -0
  82. package/dist/browser/presets/gemini.d.ts +7 -0
  83. package/dist/browser/presets/gemini.js +23 -0
  84. package/dist/browser/presets/github.d.ts +6 -0
  85. package/dist/browser/presets/github.js +17 -0
  86. package/dist/browser/presets/gitlab.d.ts +6 -0
  87. package/dist/browser/presets/gitlab.js +16 -0
  88. package/dist/browser/presets/groq.d.ts +7 -0
  89. package/dist/browser/presets/groq.js +22 -0
  90. package/dist/browser/presets/hubspot.d.ts +9 -0
  91. package/dist/browser/presets/hubspot.js +28 -0
  92. package/dist/browser/presets/huggingface.d.ts +7 -0
  93. package/dist/browser/presets/huggingface.js +23 -0
  94. package/dist/browser/presets/index.d.ts +47 -0
  95. package/dist/browser/presets/index.js +47 -0
  96. package/dist/browser/presets/ios.d.ts +2 -0
  97. package/dist/browser/presets/ios.js +13 -0
  98. package/dist/browser/presets/linear.d.ts +5 -0
  99. package/dist/browser/presets/linear.js +16 -0
  100. package/dist/browser/presets/mailgun.d.ts +7 -0
  101. package/dist/browser/presets/mailgun.js +20 -0
  102. package/dist/browser/presets/meta.d.ts +10 -0
  103. package/dist/browser/presets/meta.js +33 -0
  104. package/dist/browser/presets/mistral.d.ts +7 -0
  105. package/dist/browser/presets/mistral.js +22 -0
  106. package/dist/browser/presets/notion.d.ts +6 -0
  107. package/dist/browser/presets/notion.js +17 -0
  108. package/dist/browser/presets/openai.d.ts +9 -0
  109. package/dist/browser/presets/openai.js +30 -0
  110. package/dist/browser/presets/oracle.d.ts +19 -0
  111. package/dist/browser/presets/oracle.js +117 -0
  112. package/dist/browser/presets/perplexity.d.ts +7 -0
  113. package/dist/browser/presets/perplexity.js +22 -0
  114. package/dist/browser/presets/pinecone.d.ts +8 -0
  115. package/dist/browser/presets/pinecone.js +42 -0
  116. package/dist/browser/presets/registry.d.ts +23 -0
  117. package/dist/browser/presets/registry.js +519 -0
  118. package/dist/browser/presets/replicate.d.ts +7 -0
  119. package/dist/browser/presets/replicate.js +23 -0
  120. package/dist/browser/presets/sendgrid.d.ts +6 -0
  121. package/dist/browser/presets/sendgrid.js +20 -0
  122. package/dist/browser/presets/sentry.d.ts +11 -0
  123. package/dist/browser/presets/sentry.js +48 -0
  124. package/dist/browser/presets/sinch.d.ts +9 -0
  125. package/dist/browser/presets/sinch.js +39 -0
  126. package/dist/browser/presets/slack.d.ts +5 -0
  127. package/dist/browser/presets/slack.js +16 -0
  128. package/dist/browser/presets/square.d.ts +10 -0
  129. package/dist/browser/presets/square.js +33 -0
  130. package/dist/browser/presets/stripe.d.ts +7 -0
  131. package/dist/browser/presets/stripe.js +23 -0
  132. package/dist/browser/presets/supabase.d.ts +6 -0
  133. package/dist/browser/presets/supabase.js +18 -0
  134. package/dist/browser/presets/tiktok.d.ts +10 -0
  135. package/dist/browser/presets/tiktok.js +38 -0
  136. package/dist/browser/presets/together.d.ts +7 -0
  137. package/dist/browser/presets/together.js +22 -0
  138. package/dist/browser/presets/twilio.d.ts +6 -0
  139. package/dist/browser/presets/twilio.js +17 -0
  140. package/dist/browser/presets/vercel.d.ts +6 -0
  141. package/dist/browser/presets/vercel.js +23 -0
  142. package/dist/browser/presets/vultr.d.ts +5 -0
  143. package/dist/browser/presets/vultr.js +16 -0
  144. package/dist/browser/presets/xai.d.ts +8 -0
  145. package/dist/browser/presets/xai.js +23 -0
  146. package/dist/browser/presets/youtube.d.ts +5 -0
  147. package/dist/browser/presets/youtube.js +20 -0
  148. package/dist/browser/recker.d.ts +8 -1
  149. package/dist/browser/recker.js +8 -2
  150. package/dist/browser/scrape/document.d.ts +5 -4
  151. package/dist/browser/scrape/document.js +89 -76
  152. package/dist/browser/scrape/element.d.ts +10 -8
  153. package/dist/browser/scrape/element.js +295 -81
  154. package/dist/browser/scrape/extractors.d.ts +11 -11
  155. package/dist/browser/scrape/extractors.js +145 -113
  156. package/dist/browser/scrape/parser/back.d.ts +1 -0
  157. package/dist/browser/scrape/parser/back.js +3 -0
  158. package/dist/browser/scrape/parser/index.d.ts +20 -0
  159. package/dist/browser/scrape/parser/index.js +19 -0
  160. package/dist/browser/scrape/parser/matcher.d.ts +30 -0
  161. package/dist/browser/scrape/parser/matcher.js +99 -0
  162. package/dist/browser/scrape/parser/nodes/comment.d.ts +12 -0
  163. package/dist/browser/scrape/parser/nodes/comment.js +21 -0
  164. package/dist/browser/scrape/parser/nodes/html.d.ts +110 -0
  165. package/dist/browser/scrape/parser/nodes/html.js +978 -0
  166. package/dist/browser/scrape/parser/nodes/node.d.ts +18 -0
  167. package/dist/browser/scrape/parser/nodes/node.js +31 -0
  168. package/dist/browser/scrape/parser/nodes/text.d.ts +14 -0
  169. package/dist/browser/scrape/parser/nodes/text.js +30 -0
  170. package/dist/browser/scrape/parser/nodes/type.d.ts +6 -0
  171. package/dist/browser/scrape/parser/nodes/type.js +7 -0
  172. package/dist/browser/scrape/parser/parse.d.ts +1 -0
  173. package/dist/browser/scrape/parser/parse.js +1 -0
  174. package/dist/browser/scrape/parser/valid.d.ts +2 -0
  175. package/dist/browser/scrape/parser/valid.js +5 -0
  176. package/dist/browser/scrape/parser/void-tag.d.ts +7 -0
  177. package/dist/browser/scrape/parser/void-tag.js +43 -0
  178. package/dist/browser/scrape/types.d.ts +7 -0
  179. package/dist/browser/seo/analyzer.d.ts +59 -0
  180. package/dist/browser/seo/analyzer.js +1399 -0
  181. package/dist/browser/seo/keywords.d.ts +16 -0
  182. package/dist/browser/seo/keywords.js +55 -0
  183. package/dist/browser/seo/rules/accessibility.d.ts +2 -0
  184. package/dist/browser/seo/rules/accessibility.js +733 -0
  185. package/dist/browser/seo/rules/ai-search.d.ts +2 -0
  186. package/dist/browser/seo/rules/ai-search.js +436 -0
  187. package/dist/browser/seo/rules/analytics.d.ts +2 -0
  188. package/dist/browser/seo/rules/analytics.js +306 -0
  189. package/dist/browser/seo/rules/best-practices.d.ts +2 -0
  190. package/dist/browser/seo/rules/best-practices.js +195 -0
  191. package/dist/browser/seo/rules/canonical.d.ts +12 -0
  192. package/dist/browser/seo/rules/canonical.js +270 -0
  193. package/dist/browser/seo/rules/content.d.ts +2 -0
  194. package/dist/browser/seo/rules/content.js +522 -0
  195. package/dist/browser/seo/rules/crawl.d.ts +2 -0
  196. package/dist/browser/seo/rules/crawl.js +435 -0
  197. package/dist/browser/seo/rules/cwv.d.ts +2 -0
  198. package/dist/browser/seo/rules/cwv.js +248 -0
  199. package/dist/browser/seo/rules/ecommerce.d.ts +2 -0
  200. package/dist/browser/seo/rules/ecommerce.js +312 -0
  201. package/dist/browser/seo/rules/i18n.d.ts +2 -0
  202. package/dist/browser/seo/rules/i18n.js +288 -0
  203. package/dist/browser/seo/rules/images.d.ts +2 -0
  204. package/dist/browser/seo/rules/images.js +255 -0
  205. package/dist/browser/seo/rules/index.d.ts +52 -0
  206. package/dist/browser/seo/rules/index.js +159 -0
  207. package/dist/browser/seo/rules/internal-linking.d.ts +2 -0
  208. package/dist/browser/seo/rules/internal-linking.js +394 -0
  209. package/dist/browser/seo/rules/links.d.ts +2 -0
  210. package/dist/browser/seo/rules/links.js +498 -0
  211. package/dist/browser/seo/rules/local.d.ts +2 -0
  212. package/dist/browser/seo/rules/local.js +289 -0
  213. package/dist/browser/seo/rules/meta.d.ts +2 -0
  214. package/dist/browser/seo/rules/meta.js +805 -0
  215. package/dist/browser/seo/rules/mobile.d.ts +2 -0
  216. package/dist/browser/seo/rules/mobile.js +161 -0
  217. package/dist/browser/seo/rules/performance.d.ts +2 -0
  218. package/dist/browser/seo/rules/performance.js +738 -0
  219. package/dist/browser/seo/rules/pwa.d.ts +2 -0
  220. package/dist/browser/seo/rules/pwa.js +299 -0
  221. package/dist/browser/seo/rules/readability.d.ts +2 -0
  222. package/dist/browser/seo/rules/readability.js +264 -0
  223. package/dist/browser/seo/rules/redirects.d.ts +16 -0
  224. package/dist/browser/seo/rules/redirects.js +199 -0
  225. package/dist/browser/seo/rules/resources.d.ts +2 -0
  226. package/dist/browser/seo/rules/resources.js +390 -0
  227. package/dist/browser/seo/rules/schema.d.ts +2 -0
  228. package/dist/browser/seo/rules/schema.js +379 -0
  229. package/dist/browser/seo/rules/security.d.ts +2 -0
  230. package/dist/browser/seo/rules/security.js +877 -0
  231. package/dist/browser/seo/rules/social.d.ts +2 -0
  232. package/dist/browser/seo/rules/social.js +603 -0
  233. package/dist/browser/seo/rules/structural.d.ts +2 -0
  234. package/dist/browser/seo/rules/structural.js +223 -0
  235. package/dist/browser/seo/rules/technical-advanced.d.ts +10 -0
  236. package/dist/browser/seo/rules/technical-advanced.js +289 -0
  237. package/dist/browser/seo/rules/technical.d.ts +2 -0
  238. package/dist/browser/seo/rules/technical.js +480 -0
  239. package/dist/browser/seo/rules/thresholds.d.ts +196 -0
  240. package/dist/browser/seo/rules/thresholds.js +118 -0
  241. package/dist/browser/seo/rules/types.d.ts +498 -0
  242. package/dist/browser/seo/rules/types.js +11 -0
  243. package/dist/browser/seo/types.d.ts +211 -0
  244. package/dist/browser/seo/types.js +1 -0
  245. package/dist/browser/transport/curl.d.ts +4 -0
  246. package/dist/browser/transport/curl.js +101 -0
  247. package/dist/browser/transport/undici.js +1 -2
  248. package/dist/browser/transport/worker.d.ts +18 -0
  249. package/dist/browser/transport/worker.js +278 -0
  250. package/dist/browser/types/index.d.ts +4 -1
  251. package/dist/browser/utils/binary-manager.d.ts +4 -0
  252. package/dist/browser/utils/binary-manager.js +72 -0
  253. package/dist/browser/utils/user-agent.js +2 -13
  254. package/dist/cache/indexed-db.d.ts +10 -0
  255. package/dist/cache/indexed-db.js +88 -0
  256. package/dist/cache/service-worker-cache.d.ts +18 -0
  257. package/dist/cache/service-worker-cache.js +103 -0
  258. package/dist/cli/commands/ai.d.ts +2 -0
  259. package/dist/cli/commands/ai.js +162 -0
  260. package/dist/cli/commands/bench.d.ts +2 -0
  261. package/dist/cli/commands/bench.js +51 -0
  262. package/dist/cli/commands/dns.d.ts +2 -0
  263. package/dist/cli/commands/dns.js +295 -0
  264. package/dist/cli/commands/har.d.ts +2 -0
  265. package/dist/cli/commands/har.js +171 -0
  266. package/dist/cli/commands/hls.d.ts +2 -0
  267. package/dist/cli/commands/hls.js +192 -0
  268. package/dist/cli/commands/network.d.ts +2 -0
  269. package/dist/cli/commands/network.js +288 -0
  270. package/dist/cli/commands/protocols.d.ts +2 -0
  271. package/dist/cli/commands/protocols.js +344 -0
  272. package/dist/cli/commands/scrape.d.ts +2 -0
  273. package/dist/cli/commands/scrape.js +176 -0
  274. package/dist/cli/commands/security.d.ts +2 -0
  275. package/dist/cli/commands/security.js +57 -0
  276. package/dist/cli/commands/seo.d.ts +2 -0
  277. package/dist/cli/commands/seo.js +125 -0
  278. package/dist/cli/commands/serve.d.ts +2 -0
  279. package/dist/cli/commands/serve.js +531 -0
  280. package/dist/cli/commands/spider.d.ts +3 -0
  281. package/dist/cli/commands/spider.js +456 -0
  282. package/dist/cli/commands/utils.d.ts +2 -0
  283. package/dist/cli/commands/utils.js +176 -0
  284. package/dist/cli/commands/vector.d.ts +2 -0
  285. package/dist/cli/commands/vector.js +158 -0
  286. package/dist/cli/handler.d.ts +2 -2
  287. package/dist/cli/handler.js +6 -6
  288. package/dist/cli/helpers.d.ts +7 -0
  289. package/dist/cli/helpers.js +128 -0
  290. package/dist/cli/index.js +96 -5228
  291. package/dist/cli/parser/help.d.ts +2 -0
  292. package/dist/cli/parser/help.js +52 -0
  293. package/dist/cli/parser/index.d.ts +3 -0
  294. package/dist/cli/parser/index.js +3 -0
  295. package/dist/cli/parser/parser.d.ts +4 -0
  296. package/dist/cli/parser/parser.js +146 -0
  297. package/dist/cli/parser/types.d.ts +41 -0
  298. package/dist/cli/parser/types.js +1 -0
  299. package/dist/cli/presets.d.ts +1 -1
  300. package/dist/cli/presets.js +1 -1
  301. package/dist/cli/router.d.ts +36 -0
  302. package/dist/cli/router.js +195 -0
  303. package/dist/cli/tui/ai-chat.js +1 -1
  304. package/dist/cli/tui/commands/context.d.ts +9 -0
  305. package/dist/cli/tui/commands/context.js +1 -0
  306. package/dist/cli/tui/commands/dns.d.ts +10 -0
  307. package/dist/cli/tui/commands/dns.js +461 -0
  308. package/dist/cli/tui/commands/hls.d.ts +2 -0
  309. package/dist/cli/tui/commands/hls.js +162 -0
  310. package/dist/cli/tui/commands/ip.d.ts +2 -0
  311. package/dist/cli/tui/commands/ip.js +45 -0
  312. package/dist/cli/tui/commands/network.d.ts +3 -0
  313. package/dist/cli/tui/commands/network.js +81 -0
  314. package/dist/cli/tui/commands/protocols.d.ts +6 -0
  315. package/dist/cli/tui/commands/protocols.js +531 -0
  316. package/dist/cli/tui/commands/security.d.ts +2 -0
  317. package/dist/cli/tui/commands/security.js +48 -0
  318. package/dist/cli/tui/commands/seo.d.ts +2 -0
  319. package/dist/cli/tui/commands/seo.js +74 -0
  320. package/dist/cli/tui/context.d.ts +12 -0
  321. package/dist/cli/tui/context.js +1 -0
  322. package/dist/cli/tui/shell.d.ts +11 -20
  323. package/dist/cli/tui/shell.js +216 -1873
  324. package/dist/constants/user-agents.d.ts +7 -0
  325. package/dist/constants/user-agents.js +7 -0
  326. package/dist/core/client.d.ts +2 -0
  327. package/dist/core/client.js +19 -1
  328. package/dist/index.d.ts +1 -0
  329. package/dist/index.js +1 -0
  330. package/dist/mcp/cli.js +2 -3
  331. package/dist/mcp/data/embeddings.json +1 -1
  332. package/dist/mcp/tools/network.js +298 -158
  333. package/dist/plugins/har-player.d.ts +23 -0
  334. package/dist/plugins/har-player.js +49 -0
  335. package/dist/plugins/har-recorder.d.ts +37 -3
  336. package/dist/plugins/har-recorder.js +116 -63
  337. package/dist/plugins/network-simulation.d.ts +7 -0
  338. package/dist/plugins/network-simulation.js +13 -0
  339. package/dist/presets/android.d.ts +2 -0
  340. package/dist/presets/android.js +16 -0
  341. package/dist/presets/chaturbate.d.ts +2 -0
  342. package/dist/presets/chaturbate.js +17 -0
  343. package/dist/presets/elevenlabs.d.ts +6 -0
  344. package/dist/presets/elevenlabs.js +20 -0
  345. package/dist/presets/enhancers.d.ts +20 -0
  346. package/dist/presets/enhancers.js +85 -0
  347. package/dist/presets/hubspot.d.ts +9 -0
  348. package/dist/presets/hubspot.js +28 -0
  349. package/dist/presets/index.d.ts +10 -0
  350. package/dist/presets/index.js +10 -0
  351. package/dist/presets/ios.d.ts +2 -0
  352. package/dist/presets/ios.js +13 -0
  353. package/dist/presets/pinecone.d.ts +8 -0
  354. package/dist/presets/pinecone.js +42 -0
  355. package/dist/presets/registry.js +60 -0
  356. package/dist/presets/sendgrid.d.ts +6 -0
  357. package/dist/presets/sendgrid.js +20 -0
  358. package/dist/presets/sentry.d.ts +11 -0
  359. package/dist/presets/sentry.js +48 -0
  360. package/dist/presets/square.d.ts +10 -0
  361. package/dist/presets/square.js +33 -0
  362. package/dist/recker.d.ts +3 -0
  363. package/dist/recker.js +4 -0
  364. package/dist/scrape/document.d.ts +5 -4
  365. package/dist/scrape/document.js +89 -76
  366. package/dist/scrape/element.d.ts +10 -8
  367. package/dist/scrape/element.js +295 -81
  368. package/dist/scrape/extractors.d.ts +11 -11
  369. package/dist/scrape/extractors.js +145 -113
  370. package/dist/scrape/index.d.ts +2 -0
  371. package/dist/scrape/index.js +1 -0
  372. package/dist/scrape/parser/back.d.ts +1 -0
  373. package/dist/scrape/parser/back.js +3 -0
  374. package/dist/scrape/parser/index.d.ts +20 -0
  375. package/dist/scrape/parser/index.js +19 -0
  376. package/dist/scrape/parser/matcher.d.ts +30 -0
  377. package/dist/scrape/parser/matcher.js +99 -0
  378. package/dist/scrape/parser/nodes/comment.d.ts +12 -0
  379. package/dist/scrape/parser/nodes/comment.js +21 -0
  380. package/dist/scrape/parser/nodes/html.d.ts +110 -0
  381. package/dist/scrape/parser/nodes/html.js +978 -0
  382. package/dist/scrape/parser/nodes/node.d.ts +18 -0
  383. package/dist/scrape/parser/nodes/node.js +31 -0
  384. package/dist/scrape/parser/nodes/text.d.ts +14 -0
  385. package/dist/scrape/parser/nodes/text.js +30 -0
  386. package/dist/scrape/parser/nodes/type.d.ts +6 -0
  387. package/dist/scrape/parser/nodes/type.js +7 -0
  388. package/dist/scrape/parser/parse.d.ts +1 -0
  389. package/dist/scrape/parser/parse.js +1 -0
  390. package/dist/scrape/parser/valid.d.ts +2 -0
  391. package/dist/scrape/parser/valid.js +5 -0
  392. package/dist/scrape/parser/void-tag.d.ts +7 -0
  393. package/dist/scrape/parser/void-tag.js +43 -0
  394. package/dist/scrape/spider.d.ts +19 -0
  395. package/dist/scrape/spider.js +28 -3
  396. package/dist/scrape/types.d.ts +7 -0
  397. package/dist/seo/analyzer.d.ts +15 -5
  398. package/dist/seo/analyzer.js +636 -175
  399. package/dist/seo/formatter.d.ts +16 -0
  400. package/dist/seo/formatter.js +228 -0
  401. package/dist/seo/index.d.ts +2 -0
  402. package/dist/seo/index.js +1 -0
  403. package/dist/seo/keywords.d.ts +16 -0
  404. package/dist/seo/keywords.js +55 -0
  405. package/dist/seo/rules/accessibility.js +96 -57
  406. package/dist/seo/rules/ai-search.js +44 -31
  407. package/dist/seo/rules/analytics.d.ts +2 -0
  408. package/dist/seo/rules/analytics.js +306 -0
  409. package/dist/seo/rules/best-practices.js +21 -14
  410. package/dist/seo/rules/canonical.js +53 -32
  411. package/dist/seo/rules/content.js +317 -31
  412. package/dist/seo/rules/crawl.js +55 -40
  413. package/dist/seo/rules/cwv.js +21 -15
  414. package/dist/seo/rules/ecommerce.js +82 -22
  415. package/dist/seo/rules/i18n.js +75 -36
  416. package/dist/seo/rules/images.js +109 -30
  417. package/dist/seo/rules/index.js +2 -0
  418. package/dist/seo/rules/internal-linking.js +58 -39
  419. package/dist/seo/rules/links.js +79 -52
  420. package/dist/seo/rules/local.js +49 -25
  421. package/dist/seo/rules/meta.js +339 -81
  422. package/dist/seo/rules/mobile.js +112 -2
  423. package/dist/seo/rules/performance.js +434 -66
  424. package/dist/seo/rules/pwa.js +36 -39
  425. package/dist/seo/rules/readability.js +31 -22
  426. package/dist/seo/rules/redirects.js +21 -15
  427. package/dist/seo/rules/resources.js +59 -42
  428. package/dist/seo/rules/schema.js +333 -8
  429. package/dist/seo/rules/security.js +142 -80
  430. package/dist/seo/rules/social.js +277 -47
  431. package/dist/seo/rules/structural.js +87 -19
  432. package/dist/seo/rules/technical-advanced.js +30 -24
  433. package/dist/seo/rules/technical.js +243 -42
  434. package/dist/seo/rules/types.d.ts +53 -1
  435. package/dist/seo/seo-spider.d.ts +22 -0
  436. package/dist/seo/seo-spider.js +77 -13
  437. package/dist/seo/types.d.ts +8 -1
  438. package/dist/seo/validators/llms-txt.js +19 -0
  439. package/dist/seo/validators/rss.d.ts +11 -0
  440. package/dist/seo/validators/rss.js +93 -0
  441. package/dist/seo/validators/sitemap.js +36 -26
  442. package/dist/transport/curl.d.ts +4 -0
  443. package/dist/transport/curl.js +101 -0
  444. package/dist/transport/udp.js +0 -1
  445. package/dist/transport/undici.js +1 -2
  446. package/dist/transport/worker.d.ts +18 -0
  447. package/dist/transport/worker.js +278 -0
  448. package/dist/types/index.d.ts +4 -1
  449. package/dist/utils/binary-manager.d.ts +4 -0
  450. package/dist/utils/binary-manager.js +72 -0
  451. package/dist/utils/optional-require.d.ts +7 -8
  452. package/dist/utils/optional-require.js +2 -21
  453. package/dist/utils/upload.d.ts +6 -0
  454. package/dist/utils/upload.js +11 -0
  455. package/dist/utils/user-agent.js +2 -13
  456. package/dist/version.js +1 -1
  457. package/package.json +12 -6
  458. package/dist/browser/utils/optional-require.d.ts +0 -19
  459. package/dist/browser/utils/optional-require.js +0 -105
@@ -3,6 +3,23 @@ import type { ExtractedLink } from '../../scrape/types.js';
3
3
  export type RuleSeverity = 'error' | 'warning' | 'info';
4
4
  export type RuleCategory = 'title' | 'meta' | 'og' | 'twitter' | 'headings' | 'images' | 'links' | 'content' | 'technical' | 'security' | 'mobile' | 'structured-data' | 'performance' | 'accessibility' | 'ai-search' | 'resources' | 'crawlability' | 'canonicalization';
5
5
  export interface RuleContext {
6
+ keywordsInTitle?: boolean;
7
+ keywordsInDescription?: boolean;
8
+ keywordsInH1?: boolean;
9
+ keywordsInUrl?: boolean;
10
+ keywordsInFirstParagraph?: boolean;
11
+ keywordsInAltText?: boolean;
12
+ keywordConsistencyScore?: number;
13
+ keywordConsistencyDetails?: {
14
+ inTitle: boolean;
15
+ inDescription: boolean;
16
+ inH1: boolean;
17
+ inUrl: boolean;
18
+ inFirstParagraph: boolean;
19
+ inAltText: boolean;
20
+ };
21
+ topKeywords?: string[];
22
+ mainKeyword?: string;
6
23
  title?: string;
7
24
  titleLength?: number;
8
25
  metaDescription?: string;
@@ -34,9 +51,12 @@ export interface RuleContext {
34
51
  imagesWithDimensions?: number;
35
52
  imagesMissingDimensions?: number;
36
53
  imagesWithEmptyAlt?: number;
54
+ imagesWithSrcset?: number;
55
+ largeBase64ImagesCount?: number;
37
56
  imagesDecorativeCount?: number;
38
57
  imagesUsingModernFormats?: number;
39
58
  altTextLengths?: number[];
59
+ imageAltTexts?: string[];
40
60
  imageFilenames?: string[];
41
61
  imagesWithAsyncDecoding?: number;
42
62
  brokenExternalImages?: number;
@@ -70,6 +90,8 @@ export interface RuleContext {
70
90
  totalLinks?: number;
71
91
  internalLinks?: number;
72
92
  externalLinks?: number;
93
+ internalHttpLinks?: number;
94
+ internalHttpLinkUrls?: string[];
73
95
  linksWithoutText?: number;
74
96
  nofollowLinks?: number;
75
97
  sponsoredLinks?: number;
@@ -89,6 +111,8 @@ export interface RuleContext {
89
111
  missingNoreferrer?: ExtractedLink[];
90
112
  };
91
113
  wordCount?: number;
114
+ emailsFound?: string[];
115
+ socialLinksFound?: string[];
92
116
  characterCount?: number;
93
117
  sentenceCount?: number;
94
118
  paragraphCount?: number;
@@ -102,7 +126,6 @@ export interface RuleContext {
102
126
  avgSentenceLength?: number;
103
127
  faqCount?: number;
104
128
  imagePerWordRatio?: number;
105
- mainKeyword?: string;
106
129
  keywordDensity?: number;
107
130
  fleschReadingEase?: number;
108
131
  hasQuestionHeadings?: boolean;
@@ -120,6 +143,7 @@ export interface RuleContext {
120
143
  hasBreadcrumbsSchema?: boolean;
121
144
  videoCount?: number;
122
145
  audioCount?: number;
146
+ hasAutoplay?: boolean;
123
147
  hasCanonical?: boolean;
124
148
  canonicalUrl?: string;
125
149
  hasViewport?: boolean;
@@ -134,7 +158,10 @@ export interface RuleContext {
134
158
  textHtmlRatio?: number;
135
159
  hasDeprecatedPlugins?: boolean;
136
160
  deprecatedPluginTypes?: string[];
161
+ deprecatedTagsCount?: number;
162
+ deprecatedTagsFound?: string[];
137
163
  hasFrameTags?: boolean;
164
+ iframeCount?: number;
138
165
  hasFavicon?: boolean;
139
166
  faviconUrl?: string;
140
167
  hasPreconnect?: boolean;
@@ -321,6 +348,21 @@ export interface RuleContext {
321
348
  pinterestRichPinSupport?: boolean;
322
349
  hasPinterestNopin?: boolean;
323
350
  fbAppId?: string;
351
+ totalSocialLinks?: number;
352
+ socialLinksInHeader?: number;
353
+ socialLinksInFooter?: number;
354
+ socialLinksWithoutAccessibility?: number;
355
+ socialLinksWithoutNewTab?: number;
356
+ socialLinksWithoutNoopener?: number;
357
+ platformsFound?: string[];
358
+ socialLinkDetails?: Array<{
359
+ href: string;
360
+ platform: string;
361
+ hasAccessibility: boolean;
362
+ hasNewTab: boolean;
363
+ hasNoopener: boolean;
364
+ location: 'header' | 'footer' | 'body';
365
+ }>;
324
366
  navLinkCount?: number;
325
367
  footerLinkCount?: number;
326
368
  contextualLinkCount?: number;
@@ -408,6 +450,16 @@ export interface RuleContext {
408
450
  tlsVersion?: string;
409
451
  hasPasswordField?: boolean;
410
452
  formsOnHttp?: number;
453
+ analyticsDetected?: boolean;
454
+ analyticsProviders?: string[];
455
+ hasRssFeed?: boolean;
456
+ rssFeedUrl?: string;
457
+ hasAtomFeed?: boolean;
458
+ atomFeedUrl?: string;
459
+ ctaButtonsCount?: number;
460
+ formCount?: number;
461
+ hasWhatsAppLink?: boolean;
462
+ pageInSitemap?: boolean;
411
463
  }
412
464
  export interface RuleEvidence {
413
465
  found?: string | number | string[];
@@ -1,5 +1,6 @@
1
1
  import { SpiderOptions, SpiderResult, SpiderPageResult } from '../scrape/spider.js';
2
2
  import type { SeoReport } from './types.js';
3
+ import { type SitemapValidationResult } from './validators/sitemap.js';
3
4
  export interface SeoSpiderOptions extends SpiderOptions {
4
5
  seo?: boolean;
5
6
  output?: string;
@@ -20,6 +21,25 @@ export interface SiteWideIssue {
20
21
  export interface SeoSpiderResult extends Omit<SpiderResult, 'pages'> {
21
22
  pages: SeoPageResult[];
22
23
  siteWideIssues: SiteWideIssue[];
24
+ txtFiles?: {
25
+ humans: {
26
+ found: boolean;
27
+ content?: string;
28
+ url: string;
29
+ };
30
+ llms: {
31
+ found: boolean;
32
+ content?: string;
33
+ url: string;
34
+ };
35
+ };
36
+ rssFeeds?: Array<{
37
+ url: string;
38
+ type: 'rss' | 'atom' | 'unknown';
39
+ title?: string;
40
+ itemCount: number;
41
+ }>;
42
+ sitemapValidation?: SitemapValidationResult;
23
43
  summary: {
24
44
  totalPages: number;
25
45
  pagesWithErrors: number;
@@ -37,6 +57,8 @@ export declare class SeoSpider {
37
57
  private seoResults;
38
58
  constructor(options?: SeoSpiderOptions);
39
59
  crawl(startUrl: string): Promise<SeoSpiderResult>;
60
+ private checkTextFiles;
61
+ private validateSitemap;
40
62
  private analyzePages;
41
63
  private createReportFromPageData;
42
64
  private detectSiteWideIssues;
@@ -1,6 +1,8 @@
1
1
  import { Spider } from '../scrape/spider.js';
2
2
  import { analyzeSeo } from './analyzer.js';
3
3
  import { createClient } from '../core/client.js';
4
+ import { discoverFeeds } from './validators/rss.js';
5
+ import { fetchAndValidateSitemap } from './validators/sitemap.js';
4
6
  import * as fs from 'fs/promises';
5
7
  export class SeoSpider {
6
8
  spider;
@@ -32,10 +34,23 @@ export class SeoSpider {
32
34
  const seoPages = await this.analyzePages(result.pages);
33
35
  const siteWideIssues = this.detectSiteWideIssues(seoPages);
34
36
  const summary = this.calculateSummary(seoPages, siteWideIssues);
37
+ const txtFiles = await this.checkTextFiles(startUrl);
38
+ let homeHtml = '';
39
+ try {
40
+ const client = createClient({ timeout: 10000 });
41
+ const res = await client.get(startUrl);
42
+ homeHtml = await res.text();
43
+ }
44
+ catch { }
45
+ const rssFeeds = await discoverFeeds(new URL(startUrl).origin, homeHtml);
46
+ const sitemapValidation = await this.validateSitemap(startUrl);
35
47
  const seoResult = {
36
48
  ...result,
37
49
  pages: seoPages,
38
50
  siteWideIssues,
51
+ txtFiles,
52
+ rssFeeds,
53
+ sitemapValidation,
39
54
  summary,
40
55
  };
41
56
  if (this.options.output) {
@@ -43,6 +58,57 @@ export class SeoSpider {
43
58
  }
44
59
  return seoResult;
45
60
  }
61
+ async checkTextFiles(startUrl) {
62
+ try {
63
+ const baseUrl = new URL(startUrl).origin;
64
+ const client = createClient({ timeout: 5000 });
65
+ const results = {
66
+ humans: { found: false, content: undefined, url: `${baseUrl}/humans.txt` },
67
+ llms: { found: false, content: undefined, url: `${baseUrl}/llms.txt` },
68
+ };
69
+ try {
70
+ const res = await client.get(results.humans.url);
71
+ if (res.status === 200) {
72
+ results.humans.found = true;
73
+ results.humans.content = await res.text();
74
+ }
75
+ }
76
+ catch { }
77
+ try {
78
+ const res = await client.get(results.llms.url);
79
+ if (res.status === 200) {
80
+ results.llms.found = true;
81
+ results.llms.content = await res.text();
82
+ }
83
+ }
84
+ catch { }
85
+ return results;
86
+ }
87
+ catch {
88
+ return undefined;
89
+ }
90
+ }
91
+ async validateSitemap(startUrl) {
92
+ try {
93
+ const baseUrl = new URL(startUrl).origin;
94
+ const sitemapUrl = `${baseUrl}/sitemap.xml`;
95
+ const client = createClient({ timeout: this.options.timeout || 15000 });
96
+ const fetcher = async (url) => {
97
+ const res = await client.get(url);
98
+ const text = await res.text();
99
+ return {
100
+ status: res.status,
101
+ text,
102
+ headers: Object.fromEntries([...res.headers.entries()]),
103
+ };
104
+ };
105
+ const result = await fetchAndValidateSitemap(sitemapUrl, fetcher);
106
+ return result;
107
+ }
108
+ catch {
109
+ return undefined;
110
+ }
111
+ }
46
112
  async analyzePages(pages) {
47
113
  const results = [];
48
114
  const client = createClient({
@@ -93,8 +159,9 @@ export class SeoSpider {
93
159
  if (titleLength < 30) {
94
160
  checks.push({
95
161
  name: 'Title Length',
162
+ category: 'title',
96
163
  status: 'warn',
97
- message: `Title is too short (${titleLength} chars)`,
164
+ message: `Title is ${titleLength} characters`,
98
165
  value: titleLength,
99
166
  recommendation: 'Title should be 50-60 characters',
100
167
  });
@@ -102,6 +169,7 @@ export class SeoSpider {
102
169
  else if (titleLength > 60) {
103
170
  checks.push({
104
171
  name: 'Title Length',
172
+ category: 'title',
105
173
  status: 'warn',
106
174
  message: `Title is too long (${titleLength} chars)`,
107
175
  value: titleLength,
@@ -111,6 +179,7 @@ export class SeoSpider {
111
179
  else {
112
180
  checks.push({
113
181
  name: 'Title Length',
182
+ category: 'title',
114
183
  status: 'pass',
115
184
  message: `Good title length (${titleLength} chars)`,
116
185
  value: titleLength,
@@ -120,6 +189,7 @@ export class SeoSpider {
120
189
  else {
121
190
  checks.push({
122
191
  name: 'Title',
192
+ category: 'title',
123
193
  status: 'fail',
124
194
  message: 'Page has no title',
125
195
  recommendation: 'Add a descriptive <title> tag',
@@ -130,6 +200,7 @@ export class SeoSpider {
130
200
  if (internalLinks === 0) {
131
201
  checks.push({
132
202
  name: 'Internal Links',
203
+ category: 'links',
133
204
  status: 'warn',
134
205
  message: 'No internal links found',
135
206
  recommendation: 'Add internal links to improve site structure',
@@ -138,6 +209,7 @@ export class SeoSpider {
138
209
  else {
139
210
  checks.push({
140
211
  name: 'Internal Links',
212
+ category: 'links',
141
213
  status: 'pass',
142
214
  message: `${internalLinks} internal links found`,
143
215
  value: internalLinks,
@@ -232,27 +304,19 @@ export class SeoSpider {
232
304
  missingDimensions: 0,
233
305
  modernFormats: 0,
234
306
  altTextLengths: [],
307
+ imageAltTexts: [],
235
308
  imageFilenames: [],
236
309
  imagesWithAsyncDecoding: 0,
237
310
  },
238
311
  social: {
239
312
  openGraph: {
240
- present: false,
241
- hasTitle: false,
242
- hasDescription: false,
243
- hasImage: false,
244
- hasUrl: false,
245
- issues: [],
313
+ present: false, hasTitle: false, hasDescription: false, hasImage: false, hasUrl: false, issues: []
246
314
  },
247
315
  twitterCard: {
248
- present: false,
249
- hasCard: false,
250
- hasTitle: false,
251
- hasDescription: false,
252
- hasImage: false,
253
- issues: [],
316
+ present: false, hasCard: false, hasTitle: false, hasDescription: false, hasImage: false, issues: []
254
317
  },
255
318
  },
319
+ keywords: { totalWords: 0, uniqueWords: 0, topKeywords: [] },
256
320
  technical: {
257
321
  hasCanonical: false,
258
322
  hasRobotsMeta: false,
@@ -1,3 +1,5 @@
1
+ import { KeywordCloud } from './keywords.js';
2
+ export type { KeywordCloud, KeywordItem } from './keywords.js';
1
3
  export type SeoStatus = 'pass' | 'warn' | 'fail' | 'info';
2
4
  export interface SeoCheckEvidence {
3
5
  found?: string | number | string[];
@@ -10,6 +12,7 @@ export interface SeoCheckEvidence {
10
12
  }
11
13
  export interface SeoCheckResult {
12
14
  name: string;
15
+ category: string;
13
16
  status: SeoStatus;
14
17
  message: string;
15
18
  value?: string | number;
@@ -50,6 +53,8 @@ export interface LinkAnalysis {
50
53
  withoutText: number;
51
54
  sponsoredLinks: number;
52
55
  ugcLinks: number;
56
+ internalHttpLinks?: number;
57
+ internalHttpLinkUrls?: string[];
53
58
  }
54
59
  export interface ImageAnalysis {
55
60
  total: number;
@@ -59,6 +64,7 @@ export interface ImageAnalysis {
59
64
  missingDimensions: number;
60
65
  modernFormats: number;
61
66
  altTextLengths: number[];
67
+ imageAltTexts: string[];
62
68
  imageFilenames: string[];
63
69
  imagesWithAsyncDecoding: number;
64
70
  }
@@ -172,8 +178,9 @@ export interface SeoReport {
172
178
  types: string[];
173
179
  items: Record<string, unknown>[];
174
180
  };
175
- headings: HeadingAnalysis;
176
181
  content: ContentMetrics;
182
+ headings: HeadingAnalysis;
183
+ keywords: KeywordCloud;
177
184
  links: LinkAnalysis;
178
185
  images: ImageAnalysis;
179
186
  social: SocialMetaAnalysis;
@@ -1,6 +1,8 @@
1
1
  const MAX_FILE_SIZE = 100 * 1024;
2
2
  const MIN_DESCRIPTION_LENGTH = 50;
3
3
  const MAX_DESCRIPTION_LENGTH = 500;
4
+ const OPTIMAL_MIN_LINKS = 10;
5
+ const OPTIMAL_MAX_LINKS = 30;
4
6
  export function parseLlmsTxt(content) {
5
7
  const errors = [];
6
8
  const warnings = [];
@@ -181,6 +183,23 @@ export function validateLlmsTxt(content, baseUrl) {
181
183
  seenUrls.add(normalized);
182
184
  }
183
185
  }
186
+ const linkCount = parseResult.links.length;
187
+ if (linkCount > 0 && linkCount < OPTIMAL_MIN_LINKS) {
188
+ issues.push({
189
+ type: 'info',
190
+ code: 'FEW_LINKS',
191
+ message: `Only ${linkCount} link(s) found in llms.txt`,
192
+ recommendation: `Consider adding ${OPTIMAL_MIN_LINKS}-${OPTIMAL_MAX_LINKS} of your most valuable pages for better AI coverage`,
193
+ });
194
+ }
195
+ else if (linkCount > OPTIMAL_MAX_LINKS) {
196
+ issues.push({
197
+ type: 'info',
198
+ code: 'MANY_LINKS',
199
+ message: `${linkCount} links found in llms.txt`,
200
+ recommendation: `Focus on quality over quantity. ${OPTIMAL_MIN_LINKS}-${OPTIMAL_MAX_LINKS} high-value links are recommended to help AI systems identify your truly important content`,
201
+ });
202
+ }
184
203
  return {
185
204
  valid: issues.filter(i => i.type === 'error').length === 0,
186
205
  issues,
@@ -0,0 +1,11 @@
1
/** Metadata describing one feed reported by `discoverFeeds`. */
export interface RssFeed {
  /** URL the feed document was requested from. */
  url: string;
  /** Detected feed flavour. */
  type: 'rss' | 'atom' | 'unknown';
  /** Feed title, when one could be extracted from the document. */
  title?: string;
  /** Feed description, when one could be extracted from the document. */
  description?: string;
  /** Number of items/entries counted in the feed document. */
  itemCount: number;
  /** Raw text of the feed's `<lastBuildDate>` element, when present. */
  lastBuildDate?: string;
  /** Whether the document was recognised as a feed. */
  isValid: boolean;
  /** NOTE(review): presumably a failure description; confirm which code path populates it. */
  error?: string;
}
/**
 * Looks for feeds belonging to a site.
 *
 * @param baseUrl - Base URL that relative feed locations are resolved against.
 * @param html - Optional HTML of a page of the site to scan for feed links.
 * @returns A promise resolving to the feeds that were discovered.
 */
export declare function discoverFeeds(baseUrl: string, html?: string): Promise<RssFeed[]>;
@@ -0,0 +1,93 @@
1
+ import { createClient } from '../../core/client.js';
2
// Conventional feed locations that are probed when the page HTML does not
// advertise any feed through a <link> tag.
const COMMON_PATHS = ['/rss.xml', '/feed.xml', '/rss', '/feed', '/atom.xml', '/feeds/posts/default', '/index.xml'];
11
+ export async function discoverFeeds(baseUrl, html) {
12
+ const candidateUrls = new Set();
13
+ const feeds = [];
14
+ if (html) {
15
+ const linkRegex = /<link[^>]+?type=["']application\/(rss\+xml|atom\+xml)["'][^>]*?>/gi;
16
+ const hrefRegex = /href=["']([^"']+)["']/;
17
+ const titleRegex = /title=["']([^"']+)["']/;
18
+ let match;
19
+ while ((match = linkRegex.exec(html)) !== null) {
20
+ const tag = match[0];
21
+ const hrefMatch = hrefRegex.exec(tag);
22
+ if (hrefMatch) {
23
+ let href = hrefMatch[1];
24
+ try {
25
+ href = new URL(href, baseUrl).toString();
26
+ candidateUrls.add(href);
27
+ }
28
+ catch { }
29
+ }
30
+ }
31
+ }
32
+ if (candidateUrls.size === 0) {
33
+ for (const path of COMMON_PATHS) {
34
+ try {
35
+ const url = new URL(path, baseUrl).toString();
36
+ candidateUrls.add(url);
37
+ }
38
+ catch { }
39
+ }
40
+ }
41
+ const client = createClient({ timeout: 8000 });
42
+ await Promise.all(Array.from(candidateUrls).map(async (url) => {
43
+ try {
44
+ const response = await client.get(url);
45
+ if (response.status !== 200)
46
+ return;
47
+ const contentType = response.headers.get('content-type') || '';
48
+ const text = await response.text();
49
+ let type = 'unknown';
50
+ let isValid = false;
51
+ let itemCount = 0;
52
+ let title;
53
+ let description;
54
+ let lastBuildDate;
55
+ if (text.includes('<rss') && text.includes('version="2.0"')) {
56
+ type = 'rss';
57
+ isValid = true;
58
+ itemCount = (text.match(/<item>/g) || []).length;
59
+ const titleMatch = text.match(/<channel>[\s\S]*?<title>(.*?)<\/title>/);
60
+ if (titleMatch)
61
+ title = titleMatch[1].replace(/<!\[CDATA\[(.*?)\]\]>/g, '$1').trim();
62
+ const descMatch = text.match(/<channel>[\s\S]*?<description>(.*?)<\/description>/);
63
+ if (descMatch)
64
+ description = descMatch[1].replace(/<!\[CDATA\[(.*?)\]\]>/g, '$1').trim();
65
+ const dateMatch = text.match(/<lastBuildDate>(.*?)<\/lastBuildDate>/);
66
+ if (dateMatch)
67
+ lastBuildDate = dateMatch[1];
68
+ }
69
+ else if (text.includes('<feed') && text.includes('xmlns="http://www.w3.org/2005/Atom"')) {
70
+ type = 'atom';
71
+ isValid = true;
72
+ itemCount = (text.match(/<entry>/g) || []).length;
73
+ const titleMatch = text.match(/<title>(.*?)<\/title>/);
74
+ if (titleMatch)
75
+ title = titleMatch[1].trim();
76
+ }
77
+ if (isValid) {
78
+ feeds.push({
79
+ url,
80
+ type,
81
+ isValid,
82
+ title,
83
+ description,
84
+ itemCount,
85
+ lastBuildDate
86
+ });
87
+ }
88
+ }
89
+ catch (err) {
90
+ }
91
+ }));
92
+ return feeds;
93
+ }
@@ -1,4 +1,4 @@
1
- import * as cheerio from 'cheerio';
1
+ import { parse } from '../../scrape/parser/index.js';
2
2
  const VALID_CHANGEFREQ = ['always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'];
3
3
  const MAX_URLS_PER_SITEMAP = 50000;
4
4
  const MAX_SITEMAP_SIZE = 50 * 1024 * 1024;
@@ -9,12 +9,12 @@ export function parseSitemap(content, compressed = false) {
9
9
  const sitemaps = [];
10
10
  let type = 'unknown';
11
11
  try {
12
- const $ = cheerio.load(content, { xmlMode: true });
13
- if ($('urlset').length > 0) {
12
+ const root = parse(content, { lowerCaseTagName: false });
13
+ if (root.querySelectorAll('urlset').length > 0) {
14
14
  type = 'urlset';
15
- $('url').each((_, elem) => {
16
- const $url = $(elem);
17
- const loc = $url.find('loc').first().text().trim();
15
+ root.querySelectorAll('url').forEach((urlElem) => {
16
+ const locElem = urlElem.querySelector('loc');
17
+ const loc = locElem ? locElem.text.trim() : '';
18
18
  if (!loc) {
19
19
  errors.push('URL entry missing <loc> element');
20
20
  return;
@@ -27,7 +27,8 @@ export function parseSitemap(content, compressed = false) {
27
27
  return;
28
28
  }
29
29
  const url = { loc };
30
- const lastmod = $url.find('lastmod').first().text().trim();
30
+ const lastmodElem = urlElem.querySelector('lastmod');
31
+ const lastmod = lastmodElem ? lastmodElem.text.trim() : '';
31
32
  if (lastmod) {
32
33
  if (isValidDate(lastmod)) {
33
34
  url.lastmod = lastmod;
@@ -36,7 +37,8 @@ export function parseSitemap(content, compressed = false) {
36
37
  warnings.push(`Invalid lastmod date for ${loc}: ${lastmod}`);
37
38
  }
38
39
  }
39
- const changefreq = $url.find('changefreq').first().text().trim().toLowerCase();
40
+ const changefreqElem = urlElem.querySelector('changefreq');
41
+ const changefreq = changefreqElem ? changefreqElem.text.trim().toLowerCase() : '';
40
42
  if (changefreq) {
41
43
  if (VALID_CHANGEFREQ.includes(changefreq)) {
42
44
  url.changefreq = changefreq;
@@ -45,7 +47,8 @@ export function parseSitemap(content, compressed = false) {
45
47
  warnings.push(`Invalid changefreq for ${loc}: ${changefreq}`);
46
48
  }
47
49
  }
48
- const priority = $url.find('priority').first().text().trim();
50
+ const priorityElem = urlElem.querySelector('priority');
51
+ const priority = priorityElem ? priorityElem.text.trim() : '';
49
52
  if (priority) {
50
53
  const p = parseFloat(priority);
51
54
  if (isNaN(p) || p < 0 || p > 1) {
@@ -56,14 +59,17 @@ export function parseSitemap(content, compressed = false) {
56
59
  }
57
60
  }
58
61
  const images = [];
59
- $url.find('image\\:image, image').each((_, imgElem) => {
60
- const $img = $(imgElem);
61
- const imgLoc = $img.find('image\\:loc, loc').first().text().trim();
62
+ const imageElems = urlElem.querySelectorAll('image\:image, image');
63
+ imageElems.forEach((imgElem) => {
64
+ const locEl = imgElem.querySelector('image\:loc, loc');
65
+ const imgLoc = locEl ? locEl.text.trim() : '';
62
66
  if (imgLoc) {
67
+ const captionEl = imgElem.querySelector('image\:caption, caption');
68
+ const titleEl = imgElem.querySelector('image\:title, title');
63
69
  images.push({
64
70
  loc: imgLoc,
65
- caption: $img.find('image\\:caption, caption').first().text().trim() || undefined,
66
- title: $img.find('image\\:title, title').first().text().trim() || undefined,
71
+ caption: captionEl ? captionEl.text.trim() || undefined : undefined,
72
+ title: titleEl ? titleEl.text.trim() || undefined : undefined,
67
73
  });
68
74
  }
69
75
  });
@@ -71,10 +77,10 @@ export function parseSitemap(content, compressed = false) {
71
77
  url.images = images;
72
78
  }
73
79
  const alternates = [];
74
- $url.find('xhtml\\:link[rel="alternate"], link[rel="alternate"]').each((_, linkElem) => {
75
- const $link = $(linkElem);
76
- const hreflang = $link.attr('hreflang');
77
- const href = $link.attr('href');
80
+ const linkElems = urlElem.querySelectorAll('xhtml\:link[rel="alternate"], link[rel="alternate"]');
81
+ linkElems.forEach((linkElem) => {
82
+ const hreflang = linkElem.getAttribute('hreflang');
83
+ const href = linkElem.getAttribute('href');
78
84
  if (hreflang && href) {
79
85
  alternates.push({ hreflang, href });
80
86
  }
@@ -85,17 +91,18 @@ export function parseSitemap(content, compressed = false) {
85
91
  urls.push(url);
86
92
  });
87
93
  }
88
- else if ($('sitemapindex').length > 0) {
94
+ else if (root.querySelectorAll('sitemapindex').length > 0) {
89
95
  type = 'sitemapindex';
90
- $('sitemap').each((_, elem) => {
91
- const $sitemap = $(elem);
92
- const loc = $sitemap.find('loc').first().text().trim();
96
+ root.querySelectorAll('sitemap').forEach((sitemapElem) => {
97
+ const locElem = sitemapElem.querySelector('loc');
98
+ const loc = locElem ? locElem.text.trim() : '';
93
99
  if (!loc) {
94
100
  errors.push('Sitemap entry missing <loc> element');
95
101
  return;
96
102
  }
97
103
  const sitemap = { loc };
98
- const lastmod = $sitemap.find('lastmod').first().text().trim();
104
+ const lastmodElem = sitemapElem.querySelector('lastmod');
105
+ const lastmod = lastmodElem ? lastmodElem.text.trim() : '';
99
106
  if (lastmod) {
100
107
  if (isValidDate(lastmod)) {
101
108
  sitemap.lastmod = lastmod;
@@ -285,6 +292,9 @@ export async function discoverSitemaps(baseUrl, robotsTxtContent, fetcher) {
285
292
  }
286
293
  }
287
294
  }
295
+ if (discovered.size > 0) {
296
+ return Array.from(discovered);
297
+ }
288
298
  for (const path of commonLocations) {
289
299
  const url = new URL(path, base).href;
290
300
  try {
@@ -399,9 +409,9 @@ export async function fetchAndValidateSitemap(url, fetcher) {
399
409
  }
400
410
  function isValidDate(dateString) {
401
411
  const patterns = [
402
- /^\d{4}-\d{2}-\d{2}$/,
403
- /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}([+-]\d{2}:\d{2}|Z)$/,
404
- /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}([+-]\d{2}:\d{2}|Z)$/,
412
+ /\d{4}-\d{2}-\d{2}$/,
413
+ /\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}([+-]\d{2}:\d{2}|Z)$/,
414
+ /\d{4}-\d{2}-\d{2}T\d{2}:\d{2}([+-]\d{2}:\d{2}|Z)$/,
405
415
  ];
406
416
  if (!patterns.some(p => p.test(dateString))) {
407
417
  return false;
@@ -0,0 +1,4 @@
1
+ import { Transport, ReckerRequest, ReckerResponse } from '../types/index.js';
2
/**
 * Declaration of the `CurlTransport` class, an implementation of `Transport`.
 *
 * NOTE(review): the name suggests requests are carried out via curl; the
 * implementation is not visible here, so confirm before relying on that.
 */
export declare class CurlTransport implements Transport {
  /**
   * Sends the given request.
   *
   * @param req - Request to dispatch.
   * @returns A promise resolving to the response for `req`.
   */
  dispatch(req: ReckerRequest): Promise<ReckerResponse>;
}