@staticn0va/wigolo 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1003):
  1. package/LICENSE +1 -1
  2. package/README.md +146 -227
  3. package/SKILL.md +382 -0
  4. package/assets/blocks/claude-code/CLAUDE.md.block +20 -0
  5. package/assets/blocks/claude-code/wigolo-command.md +40 -0
  6. package/assets/blocks/cursor/wigolo.mdc +46 -0
  7. package/assets/blocks/gemini-cli/GEMINI.md.block +18 -0
  8. package/assets/blocks/vscode/copilot-instructions.md.block +18 -0
  9. package/assets/skills/wigolo/SKILL.md +50 -0
  10. package/assets/skills/wigolo/rules/cache-first.md +30 -0
  11. package/assets/skills/wigolo/rules/synthesis.md +43 -0
  12. package/assets/skills/wigolo-agent/SKILL.md +73 -0
  13. package/assets/skills/wigolo-crawl/SKILL.md +60 -0
  14. package/assets/skills/wigolo-extract/SKILL.md +59 -0
  15. package/assets/skills/wigolo-fetch/SKILL.md +65 -0
  16. package/assets/skills/wigolo-find-similar/SKILL.md +72 -0
  17. package/assets/skills/wigolo-research/SKILL.md +77 -0
  18. package/assets/skills/wigolo-search/SKILL.md +78 -0
  19. package/dist/agent/executor.d.ts +33 -0
  20. package/dist/agent/executor.d.ts.map +1 -0
  21. package/dist/agent/executor.js +233 -0
  22. package/dist/agent/executor.js.map +1 -0
  23. package/dist/agent/pipeline.d.ts +5 -0
  24. package/dist/agent/pipeline.d.ts.map +1 -0
  25. package/dist/agent/pipeline.js +238 -0
  26. package/dist/agent/pipeline.js.map +1 -0
  27. package/dist/agent/planner.d.ts +13 -0
  28. package/dist/agent/planner.d.ts.map +1 -0
  29. package/dist/agent/planner.js +271 -0
  30. package/dist/agent/planner.js.map +1 -0
  31. package/dist/agent/relevance.d.ts +15 -0
  32. package/dist/agent/relevance.d.ts.map +1 -0
  33. package/dist/agent/relevance.js +60 -0
  34. package/dist/agent/relevance.js.map +1 -0
  35. package/dist/cache/backfill-embeddings.d.ts +23 -0
  36. package/dist/cache/backfill-embeddings.d.ts.map +1 -0
  37. package/dist/cache/backfill-embeddings.js +105 -0
  38. package/dist/cache/backfill-embeddings.js.map +1 -0
  39. package/dist/cache/change-detector.d.ts +7 -0
  40. package/dist/cache/change-detector.d.ts.map +1 -0
  41. package/dist/cache/change-detector.js +43 -0
  42. package/dist/cache/change-detector.js.map +1 -0
  43. package/dist/cache/db.d.ts +1 -0
  44. package/dist/cache/db.d.ts.map +1 -1
  45. package/dist/cache/db.js +94 -22
  46. package/dist/cache/db.js.map +1 -1
  47. package/dist/cache/diff-summary.d.ts +2 -0
  48. package/dist/cache/diff-summary.d.ts.map +1 -0
  49. package/dist/cache/diff-summary.js +82 -0
  50. package/dist/cache/diff-summary.js.map +1 -0
  51. package/dist/cache/migrations/runner.d.ts +29 -0
  52. package/dist/cache/migrations/runner.d.ts.map +1 -0
  53. package/dist/cache/migrations/runner.js +147 -0
  54. package/dist/cache/migrations/runner.js.map +1 -0
  55. package/dist/cache/sqlite-vec-store.d.ts +42 -0
  56. package/dist/cache/sqlite-vec-store.d.ts.map +1 -0
  57. package/dist/cache/sqlite-vec-store.js +176 -0
  58. package/dist/cache/sqlite-vec-store.js.map +1 -0
  59. package/dist/cache/store.d.ts +47 -1
  60. package/dist/cache/store.d.ts.map +1 -1
  61. package/dist/cache/store.js +364 -168
  62. package/dist/cache/store.js.map +1 -1
  63. package/dist/cli/agents/antigravity.d.ts +20 -0
  64. package/dist/cli/agents/antigravity.d.ts.map +1 -0
  65. package/dist/cli/agents/antigravity.js +49 -0
  66. package/dist/cli/agents/antigravity.js.map +1 -0
  67. package/dist/cli/agents/claude-code.d.ts +25 -0
  68. package/dist/cli/agents/claude-code.d.ts.map +1 -0
  69. package/dist/cli/agents/claude-code.js +111 -0
  70. package/dist/cli/agents/claude-code.js.map +1 -0
  71. package/dist/cli/agents/cursor.d.ts +21 -0
  72. package/dist/cli/agents/cursor.d.ts.map +1 -0
  73. package/dist/cli/agents/cursor.js +58 -0
  74. package/dist/cli/agents/cursor.js.map +1 -0
  75. package/dist/cli/agents/gemini-cli.d.ts +21 -0
  76. package/dist/cli/agents/gemini-cli.d.ts.map +1 -0
  77. package/dist/cli/agents/gemini-cli.js +55 -0
  78. package/dist/cli/agents/gemini-cli.js.map +1 -0
  79. package/dist/cli/agents/registry.d.ts +21 -0
  80. package/dist/cli/agents/registry.d.ts.map +1 -0
  81. package/dist/cli/agents/registry.js +27 -0
  82. package/dist/cli/agents/registry.js.map +1 -0
  83. package/dist/cli/agents/utils.d.ts +26 -0
  84. package/dist/cli/agents/utils.d.ts.map +1 -0
  85. package/dist/cli/agents/utils.js +136 -0
  86. package/dist/cli/agents/utils.js.map +1 -0
  87. package/dist/cli/agents/vscode.d.ts +21 -0
  88. package/dist/cli/agents/vscode.d.ts.map +1 -0
  89. package/dist/cli/agents/vscode.js +62 -0
  90. package/dist/cli/agents/vscode.js.map +1 -0
  91. package/dist/cli/auth.d.ts +2 -0
  92. package/dist/cli/auth.d.ts.map +1 -0
  93. package/dist/cli/auth.js +94 -0
  94. package/dist/cli/auth.js.map +1 -0
  95. package/dist/cli/backfill.d.ts +2 -0
  96. package/dist/cli/backfill.d.ts.map +1 -0
  97. package/dist/cli/backfill.js +58 -0
  98. package/dist/cli/backfill.js.map +1 -0
  99. package/dist/cli/daemon.d.ts +6 -1
  100. package/dist/cli/daemon.d.ts.map +1 -1
  101. package/dist/cli/daemon.js +61 -3
  102. package/dist/cli/daemon.js.map +1 -1
  103. package/dist/cli/doctor.d.ts +8 -0
  104. package/dist/cli/doctor.d.ts.map +1 -0
  105. package/dist/cli/doctor.js +344 -0
  106. package/dist/cli/doctor.js.map +1 -0
  107. package/dist/cli/health.d.ts +1 -1
  108. package/dist/cli/health.d.ts.map +1 -1
  109. package/dist/cli/health.js +42 -3
  110. package/dist/cli/health.js.map +1 -1
  111. package/dist/cli/help.d.ts +6 -0
  112. package/dist/cli/help.d.ts.map +1 -0
  113. package/dist/cli/help.js +63 -0
  114. package/dist/cli/help.js.map +1 -0
  115. package/dist/cli/index.d.ts +1 -1
  116. package/dist/cli/index.d.ts.map +1 -1
  117. package/dist/cli/index.js +35 -7
  118. package/dist/cli/index.js.map +1 -1
  119. package/dist/cli/init.d.ts +2 -0
  120. package/dist/cli/init.d.ts.map +1 -0
  121. package/dist/cli/init.js +201 -0
  122. package/dist/cli/init.js.map +1 -0
  123. package/dist/cli/plugin.d.ts +5 -0
  124. package/dist/cli/plugin.d.ts.map +1 -0
  125. package/dist/cli/plugin.js +185 -0
  126. package/dist/cli/plugin.js.map +1 -0
  127. package/dist/cli/setup-mcp.d.ts +2 -0
  128. package/dist/cli/setup-mcp.d.ts.map +1 -0
  129. package/dist/cli/setup-mcp.js +114 -0
  130. package/dist/cli/setup-mcp.js.map +1 -0
  131. package/dist/cli/shell.d.ts +2 -0
  132. package/dist/cli/shell.d.ts.map +1 -0
  133. package/dist/cli/shell.js +86 -0
  134. package/dist/cli/shell.js.map +1 -0
  135. package/dist/cli/shutdown.d.ts +2 -0
  136. package/dist/cli/shutdown.d.ts.map +1 -0
  137. package/dist/cli/shutdown.js +26 -0
  138. package/dist/cli/shutdown.js.map +1 -0
  139. package/dist/cli/status.d.ts +2 -0
  140. package/dist/cli/status.d.ts.map +1 -0
  141. package/dist/cli/status.js +31 -0
  142. package/dist/cli/status.js.map +1 -0
  143. package/dist/cli/telemetry.d.ts +10 -0
  144. package/dist/cli/telemetry.d.ts.map +1 -0
  145. package/dist/cli/telemetry.js +56 -0
  146. package/dist/cli/telemetry.js.map +1 -0
  147. package/dist/cli/tui/agents-types.d.ts +28 -0
  148. package/dist/cli/tui/agents-types.d.ts.map +1 -0
  149. package/dist/cli/tui/agents-types.js +1 -0
  150. package/dist/cli/tui/agents-types.js.map +1 -0
  151. package/dist/cli/tui/agents.d.ts +11 -0
  152. package/dist/cli/tui/agents.d.ts.map +1 -0
  153. package/dist/cli/tui/agents.js +93 -0
  154. package/dist/cli/tui/agents.js.map +1 -0
  155. package/dist/cli/tui/banner.d.ts +3 -0
  156. package/dist/cli/tui/banner.d.ts.map +1 -0
  157. package/dist/cli/tui/banner.js +30 -0
  158. package/dist/cli/tui/banner.js.map +1 -0
  159. package/dist/cli/tui/components/AgentSelect.d.ts +13 -0
  160. package/dist/cli/tui/components/AgentSelect.d.ts.map +1 -0
  161. package/dist/cli/tui/components/AgentSelect.js +116 -0
  162. package/dist/cli/tui/components/AgentSelect.js.map +1 -0
  163. package/dist/cli/tui/components/Banner.d.ts +6 -0
  164. package/dist/cli/tui/components/Banner.d.ts.map +1 -0
  165. package/dist/cli/tui/components/Banner.js +25 -0
  166. package/dist/cli/tui/components/Banner.js.map +1 -0
  167. package/dist/cli/tui/components/BrowserSelect.d.ts +7 -0
  168. package/dist/cli/tui/components/BrowserSelect.d.ts.map +1 -0
  169. package/dist/cli/tui/components/BrowserSelect.js +19 -0
  170. package/dist/cli/tui/components/BrowserSelect.js.map +1 -0
  171. package/dist/cli/tui/components/InstallProgress.d.ts +9 -0
  172. package/dist/cli/tui/components/InstallProgress.d.ts.map +1 -0
  173. package/dist/cli/tui/components/InstallProgress.js +67 -0
  174. package/dist/cli/tui/components/InstallProgress.js.map +1 -0
  175. package/dist/cli/tui/components/SkillInstall.d.ts +14 -0
  176. package/dist/cli/tui/components/SkillInstall.d.ts.map +1 -0
  177. package/dist/cli/tui/components/SkillInstall.js +94 -0
  178. package/dist/cli/tui/components/SkillInstall.js.map +1 -0
  179. package/dist/cli/tui/components/Summary.d.ts +22 -0
  180. package/dist/cli/tui/components/Summary.d.ts.map +1 -0
  181. package/dist/cli/tui/components/Summary.js +135 -0
  182. package/dist/cli/tui/components/Summary.js.map +1 -0
  183. package/dist/cli/tui/components/SystemCheck.d.ts +8 -0
  184. package/dist/cli/tui/components/SystemCheck.d.ts.map +1 -0
  185. package/dist/cli/tui/components/SystemCheck.js +71 -0
  186. package/dist/cli/tui/components/SystemCheck.js.map +1 -0
  187. package/dist/cli/tui/components/Verification.d.ts +8 -0
  188. package/dist/cli/tui/components/Verification.d.ts.map +1 -0
  189. package/dist/cli/tui/components/Verification.js +63 -0
  190. package/dist/cli/tui/components/Verification.js.map +1 -0
  191. package/dist/cli/tui/config-writer-cli.d.ts +12 -0
  192. package/dist/cli/tui/config-writer-cli.d.ts.map +1 -0
  193. package/dist/cli/tui/config-writer-cli.js +39 -0
  194. package/dist/cli/tui/config-writer-cli.js.map +1 -0
  195. package/dist/cli/tui/config-writer-json.d.ts +16 -0
  196. package/dist/cli/tui/config-writer-json.d.ts.map +1 -0
  197. package/dist/cli/tui/config-writer-json.js +86 -0
  198. package/dist/cli/tui/config-writer-json.js.map +1 -0
  199. package/dist/cli/tui/config-writer-toml.d.ts +16 -0
  200. package/dist/cli/tui/config-writer-toml.d.ts.map +1 -0
  201. package/dist/cli/tui/config-writer-toml.js +83 -0
  202. package/dist/cli/tui/config-writer-toml.js.map +1 -0
  203. package/dist/cli/tui/config-writer.d.ts +25 -0
  204. package/dist/cli/tui/config-writer.d.ts.map +1 -0
  205. package/dist/cli/tui/config-writer.js +101 -0
  206. package/dist/cli/tui/config-writer.js.map +1 -0
  207. package/dist/cli/tui/detect-helpers.d.ts +6 -0
  208. package/dist/cli/tui/detect-helpers.d.ts.map +1 -0
  209. package/dist/cli/tui/detect-helpers.js +45 -0
  210. package/dist/cli/tui/detect-helpers.js.map +1 -0
  211. package/dist/cli/tui/extras-prompt.d.ts +7 -0
  212. package/dist/cli/tui/extras-prompt.d.ts.map +1 -0
  213. package/dist/cli/tui/extras-prompt.js +42 -0
  214. package/dist/cli/tui/extras-prompt.js.map +1 -0
  215. package/dist/cli/tui/flags-types.d.ts +19 -0
  216. package/dist/cli/tui/flags-types.d.ts.map +1 -0
  217. package/dist/cli/tui/flags-types.js +23 -0
  218. package/dist/cli/tui/flags-types.js.map +1 -0
  219. package/dist/cli/tui/flags.d.ts +5 -0
  220. package/dist/cli/tui/flags.d.ts.map +1 -0
  221. package/dist/cli/tui/flags.js +132 -0
  222. package/dist/cli/tui/flags.js.map +1 -0
  223. package/dist/cli/tui/format.d.ts +14 -0
  224. package/dist/cli/tui/format.d.ts.map +1 -0
  225. package/dist/cli/tui/format.js +37 -0
  226. package/dist/cli/tui/format.js.map +1 -0
  227. package/dist/cli/tui/hooks/useAgentDetect.d.ts +6 -0
  228. package/dist/cli/tui/hooks/useAgentDetect.d.ts.map +1 -0
  229. package/dist/cli/tui/hooks/useAgentDetect.js +19 -0
  230. package/dist/cli/tui/hooks/useAgentDetect.js.map +1 -0
  231. package/dist/cli/tui/hooks/useInstall.d.ts +14 -0
  232. package/dist/cli/tui/hooks/useInstall.d.ts.map +1 -0
  233. package/dist/cli/tui/hooks/useInstall.js +90 -0
  234. package/dist/cli/tui/hooks/useInstall.js.map +1 -0
  235. package/dist/cli/tui/hooks/useSystemCheck.d.ts +13 -0
  236. package/dist/cli/tui/hooks/useSystemCheck.d.ts.map +1 -0
  237. package/dist/cli/tui/hooks/useSystemCheck.js +95 -0
  238. package/dist/cli/tui/hooks/useSystemCheck.js.map +1 -0
  239. package/dist/cli/tui/hooks/useVerify.d.ts +14 -0
  240. package/dist/cli/tui/hooks/useVerify.d.ts.map +1 -0
  241. package/dist/cli/tui/hooks/useVerify.js +71 -0
  242. package/dist/cli/tui/hooks/useVerify.js.map +1 -0
  243. package/dist/cli/tui/ink-init.d.ts +2 -0
  244. package/dist/cli/tui/ink-init.d.ts.map +1 -0
  245. package/dist/cli/tui/ink-init.js +198 -0
  246. package/dist/cli/tui/ink-init.js.map +1 -0
  247. package/dist/cli/tui/reporter-auto.d.ts +7 -0
  248. package/dist/cli/tui/reporter-auto.d.ts.map +1 -0
  249. package/dist/cli/tui/reporter-auto.js +15 -0
  250. package/dist/cli/tui/reporter-auto.js.map +1 -0
  251. package/dist/cli/tui/reporter.d.ts +26 -0
  252. package/dist/cli/tui/reporter.d.ts.map +1 -0
  253. package/dist/cli/tui/reporter.js +32 -0
  254. package/dist/cli/tui/reporter.js.map +1 -0
  255. package/dist/cli/tui/run-command.d.ts +14 -0
  256. package/dist/cli/tui/run-command.d.ts.map +1 -0
  257. package/dist/cli/tui/run-command.js +72 -0
  258. package/dist/cli/tui/run-command.js.map +1 -0
  259. package/dist/cli/tui/select-agents.d.ts +6 -0
  260. package/dist/cli/tui/select-agents.d.ts.map +1 -0
  261. package/dist/cli/tui/select-agents.js +32 -0
  262. package/dist/cli/tui/select-agents.js.map +1 -0
  263. package/dist/cli/tui/status-agents.d.ts +11 -0
  264. package/dist/cli/tui/status-agents.d.ts.map +1 -0
  265. package/dist/cli/tui/status-agents.js +53 -0
  266. package/dist/cli/tui/status-agents.js.map +1 -0
  267. package/dist/cli/tui/status-cache.d.ts +6 -0
  268. package/dist/cli/tui/status-cache.d.ts.map +1 -0
  269. package/dist/cli/tui/status-cache.js +39 -0
  270. package/dist/cli/tui/status-cache.js.map +1 -0
  271. package/dist/cli/tui/status-format.d.ts +14 -0
  272. package/dist/cli/tui/status-format.d.ts.map +1 -0
  273. package/dist/cli/tui/status-format.js +41 -0
  274. package/dist/cli/tui/status-format.js.map +1 -0
  275. package/dist/cli/tui/status-python.d.ts +6 -0
  276. package/dist/cli/tui/status-python.d.ts.map +1 -0
  277. package/dist/cli/tui/status-python.js +30 -0
  278. package/dist/cli/tui/status-python.js.map +1 -0
  279. package/dist/cli/tui/system-check.d.ts +24 -0
  280. package/dist/cli/tui/system-check.d.ts.map +1 -0
  281. package/dist/cli/tui/system-check.js +103 -0
  282. package/dist/cli/tui/system-check.js.map +1 -0
  283. package/dist/cli/tui/tui-reporter.d.ts +19 -0
  284. package/dist/cli/tui/tui-reporter.d.ts.map +1 -0
  285. package/dist/cli/tui/tui-reporter.js +95 -0
  286. package/dist/cli/tui/tui-reporter.js.map +1 -0
  287. package/dist/cli/tui/utils/config-writer.d.ts +3 -0
  288. package/dist/cli/tui/utils/config-writer.d.ts.map +1 -0
  289. package/dist/cli/tui/utils/config-writer.js +22 -0
  290. package/dist/cli/tui/utils/config-writer.js.map +1 -0
  291. package/dist/cli/tui/utils/suppress-logs.d.ts +3 -0
  292. package/dist/cli/tui/utils/suppress-logs.d.ts.map +1 -0
  293. package/dist/cli/tui/utils/suppress-logs.js +11 -0
  294. package/dist/cli/tui/utils/suppress-logs.js.map +1 -0
  295. package/dist/cli/tui/verify-suggestions.d.ts +5 -0
  296. package/dist/cli/tui/verify-suggestions.d.ts.map +1 -0
  297. package/dist/cli/tui/verify-suggestions.js +20 -0
  298. package/dist/cli/tui/verify-suggestions.js.map +1 -0
  299. package/dist/cli/tui/verify.d.ts +14 -0
  300. package/dist/cli/tui/verify.d.ts.map +1 -0
  301. package/dist/cli/tui/verify.js +101 -0
  302. package/dist/cli/tui/verify.js.map +1 -0
  303. package/dist/cli/tui/version.d.ts +2 -0
  304. package/dist/cli/tui/version.d.ts.map +1 -0
  305. package/dist/cli/tui/version.js +14 -0
  306. package/dist/cli/tui/version.js.map +1 -0
  307. package/dist/cli/uninstall.d.ts +2 -0
  308. package/dist/cli/uninstall.d.ts.map +1 -0
  309. package/dist/cli/uninstall.js +57 -0
  310. package/dist/cli/uninstall.js.map +1 -0
  311. package/dist/cli/warmup.d.ts +10 -2
  312. package/dist/cli/warmup.d.ts.map +1 -1
  313. package/dist/cli/warmup.js +226 -93
  314. package/dist/cli/warmup.js.map +1 -1
  315. package/dist/config.d.ts +28 -2
  316. package/dist/config.d.ts.map +1 -1
  317. package/dist/config.js +106 -56
  318. package/dist/config.js.map +1 -1
  319. package/dist/crawl/crawler.d.ts +6 -0
  320. package/dist/crawl/crawler.d.ts.map +1 -1
  321. package/dist/crawl/crawler.js +210 -209
  322. package/dist/crawl/crawler.js.map +1 -1
  323. package/dist/crawl/dedup.d.ts +1 -0
  324. package/dist/crawl/dedup.d.ts.map +1 -1
  325. package/dist/crawl/dedup.js +124 -81
  326. package/dist/crawl/dedup.js.map +1 -1
  327. package/dist/crawl/etag-incremental.d.ts +43 -0
  328. package/dist/crawl/etag-incremental.d.ts.map +1 -0
  329. package/dist/crawl/etag-incremental.js +94 -0
  330. package/dist/crawl/etag-incremental.js.map +1 -0
  331. package/dist/crawl/index-to-vec.d.ts +10 -0
  332. package/dist/crawl/index-to-vec.d.ts.map +1 -0
  333. package/dist/crawl/index-to-vec.js +44 -0
  334. package/dist/crawl/index-to-vec.js.map +1 -0
  335. package/dist/crawl/mapper.js +136 -164
  336. package/dist/crawl/mapper.js.map +1 -1
  337. package/dist/crawl/rate-limiter.js +63 -66
  338. package/dist/crawl/rate-limiter.js.map +1 -1
  339. package/dist/crawl/robots.js +58 -57
  340. package/dist/crawl/robots.js.map +1 -1
  341. package/dist/crawl/sitemap-first.d.ts +12 -0
  342. package/dist/crawl/sitemap-first.d.ts.map +1 -0
  343. package/dist/crawl/sitemap-first.js +47 -0
  344. package/dist/crawl/sitemap-first.js.map +1 -0
  345. package/dist/crawl/sitemap.js +33 -32
  346. package/dist/crawl/sitemap.js.map +1 -1
  347. package/dist/crawl/url-utils.d.ts +1 -0
  348. package/dist/crawl/url-utils.d.ts.map +1 -1
  349. package/dist/crawl/url-utils.js +49 -37
  350. package/dist/crawl/url-utils.js.map +1 -1
  351. package/dist/daemon/health-check.d.ts +16 -0
  352. package/dist/daemon/health-check.d.ts.map +1 -0
  353. package/dist/daemon/health-check.js +33 -0
  354. package/dist/daemon/health-check.js.map +1 -0
  355. package/dist/daemon/http-server.d.ts +26 -0
  356. package/dist/daemon/http-server.d.ts.map +1 -0
  357. package/dist/daemon/http-server.js +275 -0
  358. package/dist/daemon/http-server.js.map +1 -0
  359. package/dist/daemon/proxy.d.ts +10 -0
  360. package/dist/daemon/proxy.d.ts.map +1 -0
  361. package/dist/daemon/proxy.js +93 -0
  362. package/dist/daemon/proxy.js.map +1 -0
  363. package/dist/embedding/embed.d.ts +59 -0
  364. package/dist/embedding/embed.d.ts.map +1 -0
  365. package/dist/embedding/embed.js +233 -0
  366. package/dist/embedding/embed.js.map +1 -0
  367. package/dist/embedding/fastembed-provider.d.ts +19 -0
  368. package/dist/embedding/fastembed-provider.d.ts.map +1 -0
  369. package/dist/embedding/fastembed-provider.js +51 -0
  370. package/dist/embedding/fastembed-provider.js.map +1 -0
  371. package/dist/embedding/key-terms.d.ts +12 -0
  372. package/dist/embedding/key-terms.d.ts.map +1 -0
  373. package/dist/embedding/key-terms.js +234 -0
  374. package/dist/embedding/key-terms.js.map +1 -0
  375. package/dist/extraction/boilerplate.d.ts +15 -0
  376. package/dist/extraction/boilerplate.d.ts.map +1 -0
  377. package/dist/extraction/boilerplate.js +52 -0
  378. package/dist/extraction/boilerplate.js.map +1 -0
  379. package/dist/extraction/defuddle.d.ts.map +1 -1
  380. package/dist/extraction/defuddle.js +27 -23
  381. package/dist/extraction/defuddle.js.map +1 -1
  382. package/dist/extraction/extract.d.ts.map +1 -1
  383. package/dist/extraction/extract.js +76 -76
  384. package/dist/extraction/extract.js.map +1 -1
  385. package/dist/extraction/jsonld.js +50 -54
  386. package/dist/extraction/jsonld.js.map +1 -1
  387. package/dist/extraction/lang-hints.d.ts +2 -0
  388. package/dist/extraction/lang-hints.d.ts.map +1 -0
  389. package/dist/extraction/lang-hints.js +30 -0
  390. package/dist/extraction/lang-hints.js.map +1 -0
  391. package/dist/extraction/llm-fallback.d.ts +17 -0
  392. package/dist/extraction/llm-fallback.d.ts.map +1 -0
  393. package/dist/extraction/llm-fallback.js +130 -0
  394. package/dist/extraction/llm-fallback.js.map +1 -0
  395. package/dist/extraction/markdown-sanitize.d.ts +2 -0
  396. package/dist/extraction/markdown-sanitize.d.ts.map +1 -0
  397. package/dist/extraction/markdown-sanitize.js +151 -0
  398. package/dist/extraction/markdown-sanitize.js.map +1 -0
  399. package/dist/extraction/markdown.d.ts +11 -0
  400. package/dist/extraction/markdown.d.ts.map +1 -1
  401. package/dist/extraction/markdown.js +195 -91
  402. package/dist/extraction/markdown.js.map +1 -1
  403. package/dist/extraction/pipeline.d.ts +8 -0
  404. package/dist/extraction/pipeline.d.ts.map +1 -1
  405. package/dist/extraction/pipeline.js +57 -91
  406. package/dist/extraction/pipeline.js.map +1 -1
  407. package/dist/extraction/readability.d.ts +1 -1
  408. package/dist/extraction/readability.d.ts.map +1 -1
  409. package/dist/extraction/readability.js +28 -29
  410. package/dist/extraction/readability.js.map +1 -1
  411. package/dist/extraction/schema.d.ts +12 -0
  412. package/dist/extraction/schema.d.ts.map +1 -1
  413. package/dist/extraction/schema.js +135 -72
  414. package/dist/extraction/schema.js.map +1 -1
  415. package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -1
  416. package/dist/extraction/site-extractors/docs-generic.js +81 -91
  417. package/dist/extraction/site-extractors/docs-generic.js.map +1 -1
  418. package/dist/extraction/site-extractors/github.d.ts.map +1 -1
  419. package/dist/extraction/site-extractors/github.js +87 -95
  420. package/dist/extraction/site-extractors/github.js.map +1 -1
  421. package/dist/extraction/site-extractors/mdn.d.ts.map +1 -1
  422. package/dist/extraction/site-extractors/mdn.js +46 -54
  423. package/dist/extraction/site-extractors/mdn.js.map +1 -1
  424. package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -1
  425. package/dist/extraction/site-extractors/stackoverflow.js +71 -80
  426. package/dist/extraction/site-extractors/stackoverflow.js.map +1 -1
  427. package/dist/extraction/structured-data.d.ts +4 -0
  428. package/dist/extraction/structured-data.d.ts.map +1 -0
  429. package/dist/extraction/structured-data.js +173 -0
  430. package/dist/extraction/structured-data.js.map +1 -0
  431. package/dist/extraction/structured.d.ts +4 -0
  432. package/dist/extraction/structured.d.ts.map +1 -0
  433. package/dist/extraction/structured.js +163 -0
  434. package/dist/extraction/structured.js.map +1 -0
  435. package/dist/extraction/v1/classifier.d.ts +3 -0
  436. package/dist/extraction/v1/classifier.d.ts.map +1 -0
  437. package/dist/extraction/v1/classifier.js +110 -0
  438. package/dist/extraction/v1/classifier.js.map +1 -0
  439. package/dist/extraction/v1/extract-provider.d.ts +16 -0
  440. package/dist/extraction/v1/extract-provider.d.ts.map +1 -0
  441. package/dist/extraction/v1/extract-provider.js +43 -0
  442. package/dist/extraction/v1/extract-provider.js.map +1 -0
  443. package/dist/extraction/v1/local-llm.d.ts +8 -0
  444. package/dist/extraction/v1/local-llm.d.ts.map +1 -0
  445. package/dist/extraction/v1/local-llm.js +34 -0
  446. package/dist/extraction/v1/local-llm.js.map +1 -0
  447. package/dist/extraction/v1/news.d.ts +3 -0
  448. package/dist/extraction/v1/news.d.ts.map +1 -0
  449. package/dist/extraction/v1/news.js +61 -0
  450. package/dist/extraction/v1/news.js.map +1 -0
  451. package/dist/extraction/v1/product.d.ts +3 -0
  452. package/dist/extraction/v1/product.d.ts.map +1 -0
  453. package/dist/extraction/v1/product.js +166 -0
  454. package/dist/extraction/v1/product.js.map +1 -0
  455. package/dist/extraction/v1/recipe.d.ts +3 -0
  456. package/dist/extraction/v1/recipe.d.ts.map +1 -0
  457. package/dist/extraction/v1/recipe.js +136 -0
  458. package/dist/extraction/v1/recipe.js.map +1 -0
  459. package/dist/extraction/v1/routed.d.ts +17 -0
  460. package/dist/extraction/v1/routed.d.ts.map +1 -0
  461. package/dist/extraction/v1/routed.js +68 -0
  462. package/dist/extraction/v1/routed.js.map +1 -0
  463. package/dist/extraction/v1/schemas/Article.d.ts +11 -0
  464. package/dist/extraction/v1/schemas/Article.d.ts.map +1 -0
  465. package/dist/extraction/v1/schemas/Article.js +23 -0
  466. package/dist/extraction/v1/schemas/Article.js.map +1 -0
  467. package/dist/extraction/v1/schemas/CodeSnippet.d.ts +9 -0
  468. package/dist/extraction/v1/schemas/CodeSnippet.d.ts.map +1 -0
  469. package/dist/extraction/v1/schemas/CodeSnippet.js +90 -0
  470. package/dist/extraction/v1/schemas/CodeSnippet.js.map +1 -0
  471. package/dist/extraction/v1/schemas/EventListing.d.ts +10 -0
  472. package/dist/extraction/v1/schemas/EventListing.d.ts.map +1 -0
  473. package/dist/extraction/v1/schemas/EventListing.js +122 -0
  474. package/dist/extraction/v1/schemas/EventListing.js.map +1 -0
  475. package/dist/extraction/v1/schemas/Paper.d.ts +10 -0
  476. package/dist/extraction/v1/schemas/Paper.d.ts.map +1 -0
  477. package/dist/extraction/v1/schemas/Paper.js +156 -0
  478. package/dist/extraction/v1/schemas/Paper.js.map +1 -0
  479. package/dist/extraction/v1/schemas/Product.d.ts +17 -0
  480. package/dist/extraction/v1/schemas/Product.d.ts.map +1 -0
  481. package/dist/extraction/v1/schemas/Product.js +149 -0
  482. package/dist/extraction/v1/schemas/Product.js.map +1 -0
  483. package/dist/extraction/v1/schemas/Recipe.d.ts +14 -0
  484. package/dist/extraction/v1/schemas/Recipe.d.ts.map +1 -0
  485. package/dist/extraction/v1/schemas/Recipe.js +160 -0
  486. package/dist/extraction/v1/schemas/Recipe.js.map +1 -0
  487. package/dist/extraction/v1/schemas/index.d.ts +13 -0
  488. package/dist/extraction/v1/schemas/index.d.ts.map +1 -0
  489. package/dist/extraction/v1/schemas/index.js +44 -0
  490. package/dist/extraction/v1/schemas/index.js.map +1 -0
  491. package/dist/extraction/v1/site-extractors.d.ts +5 -0
  492. package/dist/extraction/v1/site-extractors.d.ts.map +1 -0
  493. package/dist/extraction/v1/site-extractors.js +31 -0
  494. package/dist/extraction/v1/site-extractors.js.map +1 -0
  495. package/dist/fetch/action-executor.d.ts +28 -0
  496. package/dist/fetch/action-executor.d.ts.map +1 -0
  497. package/dist/fetch/action-executor.js +88 -0
  498. package/dist/fetch/action-executor.js.map +1 -0
  499. package/dist/fetch/auth.d.ts +2 -1
  500. package/dist/fetch/auth.d.ts.map +1 -1
  501. package/dist/fetch/auth.js +56 -26
  502. package/dist/fetch/auth.js.map +1 -1
  503. package/dist/fetch/browser-pool.d.ts +30 -11
  504. package/dist/fetch/browser-pool.d.ts.map +1 -1
  505. package/dist/fetch/browser-pool.js +303 -127
  506. package/dist/fetch/browser-pool.js.map +1 -1
  507. package/dist/fetch/browser-selector.d.ts +17 -0
  508. package/dist/fetch/browser-selector.d.ts.map +1 -0
  509. package/dist/fetch/browser-selector.js +72 -0
  510. package/dist/fetch/browser-selector.js.map +1 -0
  511. package/dist/fetch/browser-types.d.ts +3 -0
  512. package/dist/fetch/browser-types.d.ts.map +1 -0
  513. package/dist/fetch/browser-types.js +45 -0
  514. package/dist/fetch/browser-types.js.map +1 -0
  515. package/dist/fetch/cdp-client.d.ts +9 -0
  516. package/dist/fetch/cdp-client.d.ts.map +1 -0
  517. package/dist/fetch/cdp-client.js +89 -0
  518. package/dist/fetch/cdp-client.js.map +1 -0
  519. package/dist/fetch/content-check.js +39 -46
  520. package/dist/fetch/content-check.js.map +1 -1
  521. package/dist/fetch/error-describe.d.ts +7 -0
  522. package/dist/fetch/error-describe.d.ts.map +1 -0
  523. package/dist/fetch/error-describe.js +37 -0
  524. package/dist/fetch/error-describe.js.map +1 -0
  525. package/dist/fetch/http-client.d.ts +4 -0
  526. package/dist/fetch/http-client.d.ts.map +1 -1
  527. package/dist/fetch/http-client.js +147 -128
  528. package/dist/fetch/http-client.js.map +1 -1
  529. package/dist/fetch/lightpanda.d.ts +28 -0
  530. package/dist/fetch/lightpanda.d.ts.map +1 -0
  531. package/dist/fetch/lightpanda.js +174 -0
  532. package/dist/fetch/lightpanda.js.map +1 -0
  533. package/dist/fetch/playwright-tier.d.ts +19 -0
  534. package/dist/fetch/playwright-tier.d.ts.map +1 -0
  535. package/dist/fetch/playwright-tier.js +76 -0
  536. package/dist/fetch/playwright-tier.js.map +1 -0
  537. package/dist/fetch/router.d.ts +49 -3
  538. package/dist/fetch/router.d.ts.map +1 -1
  539. package/dist/fetch/router.js +187 -81
  540. package/dist/fetch/router.js.map +1 -1
  541. package/dist/index.js +102 -17
  542. package/dist/index.js.map +1 -1
  543. package/dist/instructions.d.ts +31 -0
  544. package/dist/instructions.d.ts.map +1 -0
  545. package/dist/instructions.js +245 -0
  546. package/dist/instructions.js.map +1 -0
  547. package/dist/integrations/cloud/llm/anthropic.d.ts +3 -0
  548. package/dist/integrations/cloud/llm/anthropic.d.ts.map +1 -0
  549. package/dist/integrations/cloud/llm/anthropic.js +41 -0
  550. package/dist/integrations/cloud/llm/anthropic.js.map +1 -0
  551. package/dist/integrations/cloud/llm/cache.d.ts +5 -0
  552. package/dist/integrations/cloud/llm/cache.d.ts.map +1 -0
  553. package/dist/integrations/cloud/llm/cache.js +49 -0
  554. package/dist/integrations/cloud/llm/cache.js.map +1 -0
  555. package/dist/integrations/cloud/llm/gemini.d.ts +3 -0
  556. package/dist/integrations/cloud/llm/gemini.d.ts.map +1 -0
  557. package/dist/integrations/cloud/llm/gemini.js +37 -0
  558. package/dist/integrations/cloud/llm/gemini.js.map +1 -0
  559. package/dist/integrations/cloud/llm/groq.d.ts +3 -0
  560. package/dist/integrations/cloud/llm/groq.d.ts.map +1 -0
  561. package/dist/integrations/cloud/llm/groq.js +74 -0
  562. package/dist/integrations/cloud/llm/groq.js.map +1 -0
  563. package/dist/integrations/cloud/llm/hash.d.ts +3 -0
  564. package/dist/integrations/cloud/llm/hash.d.ts.map +1 -0
  565. package/dist/integrations/cloud/llm/hash.js +26 -0
  566. package/dist/integrations/cloud/llm/hash.js.map +1 -0
  567. package/dist/integrations/cloud/llm/model-select.d.ts +5 -0
  568. package/dist/integrations/cloud/llm/model-select.d.ts.map +1 -0
  569. package/dist/integrations/cloud/llm/model-select.js +32 -0
  570. package/dist/integrations/cloud/llm/model-select.js.map +1 -0
  571. package/dist/integrations/cloud/llm/openai.d.ts +3 -0
  572. package/dist/integrations/cloud/llm/openai.d.ts.map +1 -0
  573. package/dist/integrations/cloud/llm/openai.js +43 -0
  574. package/dist/integrations/cloud/llm/openai.js.map +1 -0
  575. package/dist/integrations/cloud/llm/run.d.ts +27 -0
  576. package/dist/integrations/cloud/llm/run.d.ts.map +1 -0
  577. package/dist/integrations/cloud/llm/run.js +99 -0
  578. package/dist/integrations/cloud/llm/run.js.map +1 -0
  579. package/dist/integrations/cloud/llm/select.d.ts +5 -0
  580. package/dist/integrations/cloud/llm/select.d.ts.map +1 -0
  581. package/dist/integrations/cloud/llm/select.js +30 -0
  582. package/dist/integrations/cloud/llm/select.js.map +1 -0
  583. package/dist/integrations/cloud/llm/text-adapters.d.ts +19 -0
  584. package/dist/integrations/cloud/llm/text-adapters.d.ts.map +1 -0
  585. package/dist/integrations/cloud/llm/text-adapters.js +103 -0
  586. package/dist/integrations/cloud/llm/text-adapters.js.map +1 -0
  587. package/dist/integrations/cloud/llm/types.d.ts +24 -0
  588. package/dist/integrations/cloud/llm/types.d.ts.map +1 -0
  589. package/dist/integrations/cloud/llm/types.js +1 -0
  590. package/dist/integrations/cloud/llm/types.js.map +1 -0
  591. package/dist/integrations/cloud/llm/validate.d.ts +6 -0
  592. package/dist/integrations/cloud/llm/validate.d.ts.map +1 -0
  593. package/dist/integrations/cloud/llm/validate.js +63 -0
  594. package/dist/integrations/cloud/llm/validate.js.map +1 -0
  595. package/dist/logger.d.ts +4 -1
  596. package/dist/logger.d.ts.map +1 -1
  597. package/dist/logger.js +71 -30
  598. package/dist/logger.js.map +1 -1
  599. package/dist/pdf-parse.d.js +1 -0
  600. package/dist/pdf-parse.d.js.map +1 -0
  601. package/dist/plugins/loader.d.ts +20 -0
  602. package/dist/plugins/loader.d.ts.map +1 -0
  603. package/dist/plugins/loader.js +157 -0
  604. package/dist/plugins/loader.js.map +1 -0
  605. package/dist/plugins/registry.d.ts +26 -0
  606. package/dist/plugins/registry.d.ts.map +1 -0
  607. package/dist/plugins/registry.js +71 -0
  608. package/dist/plugins/registry.js.map +1 -0
  609. package/dist/plugins/validate.d.ts +9 -0
  610. package/dist/plugins/validate.d.ts.map +1 -0
  611. package/dist/plugins/validate.js +79 -0
  612. package/dist/plugins/validate.js.map +1 -0
  613. package/dist/providers/embed-provider.d.ts +11 -0
  614. package/dist/providers/embed-provider.d.ts.map +1 -0
  615. package/dist/providers/embed-provider.js +24 -0
  616. package/dist/providers/embed-provider.js.map +1 -0
  617. package/dist/providers/extract-provider.d.ts +23 -0
  618. package/dist/providers/extract-provider.d.ts.map +1 -0
  619. package/dist/providers/extract-provider.js +25 -0
  620. package/dist/providers/extract-provider.js.map +1 -0
  621. package/dist/providers/rerank-provider.d.ts +17 -0
  622. package/dist/providers/rerank-provider.d.ts.map +1 -0
  623. package/dist/providers/rerank-provider.js +41 -0
  624. package/dist/providers/rerank-provider.js.map +1 -0
  625. package/dist/providers/search-provider.d.ts +25 -0
  626. package/dist/providers/search-provider.d.ts.map +1 -0
  627. package/dist/providers/search-provider.js +44 -0
  628. package/dist/providers/search-provider.js.map +1 -0
  629. package/dist/providers/vector-store.d.ts +27 -0
  630. package/dist/providers/vector-store.d.ts.map +1 -0
  631. package/dist/providers/vector-store.js +27 -0
  632. package/dist/providers/vector-store.js.map +1 -0
  633. package/dist/python-env.d.ts +9 -0
  634. package/dist/python-env.d.ts.map +1 -0
  635. package/dist/python-env.js +13 -0
  636. package/dist/python-env.js.map +1 -0
  637. package/dist/repl/commands/agent.d.ts +5 -0
  638. package/dist/repl/commands/agent.d.ts.map +1 -0
  639. package/dist/repl/commands/agent.js +62 -0
  640. package/dist/repl/commands/agent.js.map +1 -0
  641. package/dist/repl/commands/cache.d.ts +4 -0
  642. package/dist/repl/commands/cache.d.ts.map +1 -0
  643. package/dist/repl/commands/cache.js +43 -0
  644. package/dist/repl/commands/cache.js.map +1 -0
  645. package/dist/repl/commands/crawl.d.ts +7 -0
  646. package/dist/repl/commands/crawl.d.ts.map +1 -0
  647. package/dist/repl/commands/crawl.js +44 -0
  648. package/dist/repl/commands/crawl.js.map +1 -0
  649. package/dist/repl/commands/extract.d.ts +5 -0
  650. package/dist/repl/commands/extract.d.ts.map +1 -0
  651. package/dist/repl/commands/extract.js +47 -0
  652. package/dist/repl/commands/extract.js.map +1 -0
  653. package/dist/repl/commands/fetch.d.ts +5 -0
  654. package/dist/repl/commands/fetch.d.ts.map +1 -0
  655. package/dist/repl/commands/fetch.js +67 -0
  656. package/dist/repl/commands/fetch.js.map +1 -0
  657. package/dist/repl/commands/find-similar.d.ts +5 -0
  658. package/dist/repl/commands/find-similar.d.ts.map +1 -0
  659. package/dist/repl/commands/find-similar.js +74 -0
  660. package/dist/repl/commands/find-similar.js.map +1 -0
  661. package/dist/repl/commands/research.d.ts +5 -0
  662. package/dist/repl/commands/research.d.ts.map +1 -0
  663. package/dist/repl/commands/research.js +65 -0
  664. package/dist/repl/commands/research.js.map +1 -0
  665. package/dist/repl/commands/search.d.ts +5 -0
  666. package/dist/repl/commands/search.d.ts.map +1 -0
  667. package/dist/repl/commands/search.js +74 -0
  668. package/dist/repl/commands/search.js.map +1 -0
  669. package/dist/repl/commands/types.d.ts +9 -0
  670. package/dist/repl/commands/types.d.ts.map +1 -0
  671. package/dist/repl/commands/types.js +1 -0
  672. package/dist/repl/commands/types.js.map +1 -0
  673. package/dist/repl/formatters.d.ts +13 -0
  674. package/dist/repl/formatters.d.ts.map +1 -0
  675. package/dist/repl/formatters.js +283 -0
  676. package/dist/repl/formatters.js.map +1 -0
  677. package/dist/repl/parser.d.ts +9 -0
  678. package/dist/repl/parser.d.ts.map +1 -0
  679. package/dist/repl/parser.js +86 -0
  680. package/dist/repl/parser.js.map +1 -0
  681. package/dist/repl/shell.d.ts +8 -0
  682. package/dist/repl/shell.d.ts.map +1 -0
  683. package/dist/repl/shell.js +184 -0
  684. package/dist/repl/shell.js.map +1 -0
  685. package/dist/research/branch-exploration.d.ts +14 -0
  686. package/dist/research/branch-exploration.d.ts.map +1 -0
  687. package/dist/research/branch-exploration.js +100 -0
  688. package/dist/research/branch-exploration.js.map +1 -0
  689. package/dist/research/brief.d.ts +6 -0
  690. package/dist/research/brief.d.ts.map +1 -0
  691. package/dist/research/brief.js +246 -0
  692. package/dist/research/brief.js.map +1 -0
  693. package/dist/research/citation-graph.d.ts +9 -0
  694. package/dist/research/citation-graph.d.ts.map +1 -0
  695. package/dist/research/citation-graph.js +114 -0
  696. package/dist/research/citation-graph.js.map +1 -0
  697. package/dist/research/decompose.d.ts +14 -0
  698. package/dist/research/decompose.d.ts.map +1 -0
  699. package/dist/research/decompose.js +439 -0
  700. package/dist/research/decompose.js.map +1 -0
  701. package/dist/research/pipeline.d.ts +5 -0
  702. package/dist/research/pipeline.d.ts.map +1 -0
  703. package/dist/research/pipeline.js +269 -0
  704. package/dist/research/pipeline.js.map +1 -0
  705. package/dist/research/synthesis-local.d.ts +19 -0
  706. package/dist/research/synthesis-local.d.ts.map +1 -0
  707. package/dist/research/synthesis-local.js +62 -0
  708. package/dist/research/synthesis-local.js.map +1 -0
  709. package/dist/research/synthesize.d.ts +10 -0
  710. package/dist/research/synthesize.d.ts.map +1 -0
  711. package/dist/research/synthesize.js +137 -0
  712. package/dist/research/synthesize.js.map +1 -0
  713. package/dist/search/answer-synthesis.d.ts +33 -0
  714. package/dist/search/answer-synthesis.d.ts.map +1 -0
  715. package/dist/search/answer-synthesis.js +244 -0
  716. package/dist/search/answer-synthesis.js.map +1 -0
  717. package/dist/search/context-formatter.d.ts +3 -0
  718. package/dist/search/context-formatter.d.ts.map +1 -0
  719. package/dist/search/context-formatter.js +56 -0
  720. package/dist/search/context-formatter.js.map +1 -0
  721. package/dist/search/dedup.d.ts +1 -0
  722. package/dist/search/dedup.d.ts.map +1 -1
  723. package/dist/search/dedup.js +40 -32
  724. package/dist/search/dedup.js.map +1 -1
  725. package/dist/search/engines/arxiv.d.ts +7 -0
  726. package/dist/search/engines/arxiv.d.ts.map +1 -0
  727. package/dist/search/engines/arxiv.js +70 -0
  728. package/dist/search/engines/arxiv.js.map +1 -0
  729. package/dist/search/engines/bing-news.d.ts +7 -0
  730. package/dist/search/engines/bing-news.d.ts.map +1 -0
  731. package/dist/search/engines/bing-news.js +97 -0
  732. package/dist/search/engines/bing-news.js.map +1 -0
  733. package/dist/search/engines/bing.d.ts +1 -0
  734. package/dist/search/engines/bing.d.ts.map +1 -1
  735. package/dist/search/engines/bing.js +100 -44
  736. package/dist/search/engines/bing.js.map +1 -1
  737. package/dist/search/engines/devdocs.d.ts +6 -0
  738. package/dist/search/engines/devdocs.d.ts.map +1 -0
  739. package/dist/search/engines/devdocs.js +56 -0
  740. package/dist/search/engines/devdocs.js.map +1 -0
  741. package/dist/search/engines/duckduckgo.d.ts.map +1 -1
  742. package/dist/search/engines/duckduckgo.js +56 -44
  743. package/dist/search/engines/duckduckgo.js.map +1 -1
  744. package/dist/search/engines/github-code.d.ts +7 -0
  745. package/dist/search/engines/github-code.d.ts.map +1 -0
  746. package/dist/search/engines/github-code.js +55 -0
  747. package/dist/search/engines/github-code.js.map +1 -0
  748. package/dist/search/engines/hn-algolia.d.ts +7 -0
  749. package/dist/search/engines/hn-algolia.d.ts.map +1 -0
  750. package/dist/search/engines/hn-algolia.js +76 -0
  751. package/dist/search/engines/hn-algolia.js.map +1 -0
  752. package/dist/search/engines/lobsters.d.ts +7 -0
  753. package/dist/search/engines/lobsters.d.ts.map +1 -0
  754. package/dist/search/engines/lobsters.js +83 -0
  755. package/dist/search/engines/lobsters.js.map +1 -0
  756. package/dist/search/engines/mdn.d.ts +7 -0
  757. package/dist/search/engines/mdn.d.ts.map +1 -0
  758. package/dist/search/engines/mdn.js +48 -0
  759. package/dist/search/engines/mdn.js.map +1 -0
  760. package/dist/search/engines/semantic-scholar.d.ts +7 -0
  761. package/dist/search/engines/semantic-scholar.d.ts.map +1 -0
  762. package/dist/search/engines/semantic-scholar.js +69 -0
  763. package/dist/search/engines/semantic-scholar.js.map +1 -0
  764. package/dist/search/engines/stackoverflow.d.ts +7 -0
  765. package/dist/search/engines/stackoverflow.d.ts.map +1 -0
  766. package/dist/search/engines/stackoverflow.js +73 -0
  767. package/dist/search/engines/stackoverflow.js.map +1 -0
  768. package/dist/search/engines/startpage.d.ts.map +1 -1
  769. package/dist/search/engines/startpage.js +65 -46
  770. package/dist/search/engines/startpage.js.map +1 -1
  771. package/dist/search/evidence.d.ts +25 -0
  772. package/dist/search/evidence.d.ts.map +1 -0
  773. package/dist/search/evidence.js +220 -0
  774. package/dist/search/evidence.js.map +1 -0
  775. package/dist/search/filters.d.ts.map +1 -1
  776. package/dist/search/filters.js +58 -54
  777. package/dist/search/filters.js.map +1 -1
  778. package/dist/search/find-similar/crawl-rank.d.ts +9 -0
  779. package/dist/search/find-similar/crawl-rank.d.ts.map +1 -0
  780. package/dist/search/find-similar/crawl-rank.js +272 -0
  781. package/dist/search/find-similar/crawl-rank.js.map +1 -0
  782. package/dist/search/find-similar/mode.d.ts +4 -0
  783. package/dist/search/find-similar/mode.d.ts.map +1 -0
  784. package/dist/search/find-similar/mode.js +12 -0
  785. package/dist/search/find-similar/mode.js.map +1 -0
  786. package/dist/search/find-similar.d.ts +5 -0
  787. package/dist/search/find-similar.d.ts.map +1 -0
  788. package/dist/search/find-similar.js +509 -0
  789. package/dist/search/find-similar.js.map +1 -0
  790. package/dist/search/highlights.d.ts +19 -0
  791. package/dist/search/highlights.d.ts.map +1 -0
  792. package/dist/search/highlights.js +167 -0
  793. package/dist/search/highlights.js.map +1 -0
  794. package/dist/search/language-filter.d.ts +29 -0
  795. package/dist/search/language-filter.d.ts.map +1 -0
  796. package/dist/search/language-filter.js +126 -0
  797. package/dist/search/language-filter.js.map +1 -0
  798. package/dist/search/legacy/searxng-orchestrator.d.ts +4 -0
  799. package/dist/search/legacy/searxng-orchestrator.d.ts.map +1 -0
  800. package/dist/search/legacy/searxng-orchestrator.js +501 -0
  801. package/dist/search/legacy/searxng-orchestrator.js.map +1 -0
  802. package/dist/search/legacy/searxng-provider.d.ts +7 -0
  803. package/dist/search/legacy/searxng-provider.d.ts.map +1 -0
  804. package/dist/search/legacy/searxng-provider.js +11 -0
  805. package/dist/search/legacy/searxng-provider.js.map +1 -0
  806. package/dist/search/multi-query.d.ts +25 -0
  807. package/dist/search/multi-query.d.ts.map +1 -0
  808. package/dist/search/multi-query.js +228 -0
  809. package/dist/search/multi-query.js.map +1 -0
  810. package/dist/search/query.js +32 -34
  811. package/dist/search/query.js.map +1 -1
  812. package/dist/search/rerank.d.ts +3 -1
  813. package/dist/search/rerank.d.ts.map +1 -1
  814. package/dist/search/rerank.js +44 -35
  815. package/dist/search/rerank.js.map +1 -1
  816. package/dist/search/reranker/authority-boost.d.ts +3 -0
  817. package/dist/search/reranker/authority-boost.d.ts.map +1 -0
  818. package/dist/search/reranker/authority-boost.js +179 -0
  819. package/dist/search/reranker/authority-boost.js.map +1 -0
  820. package/dist/search/reranker/consensus-boost.d.ts +3 -0
  821. package/dist/search/reranker/consensus-boost.d.ts.map +1 -0
  822. package/dist/search/reranker/consensus-boost.js +27 -0
  823. package/dist/search/reranker/consensus-boost.js.map +1 -0
  824. package/dist/search/reranker/recency-boost.d.ts +3 -0
  825. package/dist/search/reranker/recency-boost.d.ts.map +1 -0
  826. package/dist/search/reranker/recency-boost.js +13 -0
  827. package/dist/search/reranker/recency-boost.js.map +1 -0
  828. package/dist/search/reranker/recency.d.ts +3 -0
  829. package/dist/search/reranker/recency.d.ts.map +1 -0
  830. package/dist/search/reranker/recency.js +23 -0
  831. package/dist/search/reranker/recency.js.map +1 -0
  832. package/dist/search/reranker/transformers-rerank-provider.d.ts +13 -0
  833. package/dist/search/reranker/transformers-rerank-provider.d.ts.map +1 -0
  834. package/dist/search/reranker/transformers-rerank-provider.js +94 -0
  835. package/dist/search/reranker/transformers-rerank-provider.js.map +1 -0
  836. package/dist/search/rrf.d.ts +17 -0
  837. package/dist/search/rrf.d.ts.map +1 -0
  838. package/dist/search/rrf.js +39 -0
  839. package/dist/search/rrf.js.map +1 -0
  840. package/dist/search/sampling.d.ts +25 -0
  841. package/dist/search/sampling.d.ts.map +1 -0
  842. package/dist/search/sampling.js +52 -0
  843. package/dist/search/sampling.js.map +1 -0
  844. package/dist/search/searxng.d.ts.map +1 -1
  845. package/dist/search/searxng.js +69 -79
  846. package/dist/search/searxng.js.map +1 -1
  847. package/dist/search/tokens.d.ts +3 -0
  848. package/dist/search/tokens.d.ts.map +1 -0
  849. package/dist/search/tokens.js +39 -0
  850. package/dist/search/tokens.js.map +1 -0
  851. package/dist/search/truncate.d.ts +6 -0
  852. package/dist/search/truncate.d.ts.map +1 -0
  853. package/dist/search/truncate.js +26 -0
  854. package/dist/search/truncate.js.map +1 -0
  855. package/dist/search/url-unwrap.d.ts +3 -0
  856. package/dist/search/url-unwrap.d.ts.map +1 -0
  857. package/dist/search/url-unwrap.js +43 -0
  858. package/dist/search/url-unwrap.js.map +1 -0
  859. package/dist/search/v1/context-rank.d.ts +13 -0
  860. package/dist/search/v1/context-rank.d.ts.map +1 -0
  861. package/dist/search/v1/context-rank.js +74 -0
  862. package/dist/search/v1/context-rank.js.map +1 -0
  863. package/dist/search/v1/engine-base.d.ts +27 -0
  864. package/dist/search/v1/engine-base.d.ts.map +1 -0
  865. package/dist/search/v1/engine-base.js +110 -0
  866. package/dist/search/v1/engine-base.js.map +1 -0
  867. package/dist/search/v1/intent-router.d.ts +22 -0
  868. package/dist/search/v1/intent-router.d.ts.map +1 -0
  869. package/dist/search/v1/intent-router.js +138 -0
  870. package/dist/search/v1/intent-router.js.map +1 -0
  871. package/dist/search/v1/orchestrator.d.ts +24 -0
  872. package/dist/search/v1/orchestrator.d.ts.map +1 -0
  873. package/dist/search/v1/orchestrator.js +163 -0
  874. package/dist/search/v1/orchestrator.js.map +1 -0
  875. package/dist/search/v1/recency-boost.d.ts +9 -0
  876. package/dist/search/v1/recency-boost.d.ts.map +1 -0
  877. package/dist/search/v1/recency-boost.js +37 -0
  878. package/dist/search/v1/recency-boost.js.map +1 -0
  879. package/dist/search/v1/recent-cache-dedup.d.ts +6 -0
  880. package/dist/search/v1/recent-cache-dedup.d.ts.map +1 -0
  881. package/dist/search/v1/recent-cache-dedup.js +85 -0
  882. package/dist/search/v1/recent-cache-dedup.js.map +1 -0
  883. package/dist/search/v1/rss/feed-config.d.ts +21 -0
  884. package/dist/search/v1/rss/feed-config.d.ts.map +1 -0
  885. package/dist/search/v1/rss/feed-config.js +90 -0
  886. package/dist/search/v1/rss/feed-config.js.map +1 -0
  887. package/dist/search/v1/rss/feed-parser.d.ts +14 -0
  888. package/dist/search/v1/rss/feed-parser.d.ts.map +1 -0
  889. package/dist/search/v1/rss/feed-parser.js +104 -0
  890. package/dist/search/v1/rss/feed-parser.js.map +1 -0
  891. package/dist/search/v1/rss/feed-poller.d.ts +22 -0
  892. package/dist/search/v1/rss/feed-poller.d.ts.map +1 -0
  893. package/dist/search/v1/rss/feed-poller.js +102 -0
  894. package/dist/search/v1/rss/feed-poller.js.map +1 -0
  895. package/dist/search/v1/rss/feed-store.d.ts +30 -0
  896. package/dist/search/v1/rss/feed-store.d.ts.map +1 -0
  897. package/dist/search/v1/rss/feed-store.js +134 -0
  898. package/dist/search/v1/rss/feed-store.js.map +1 -0
  899. package/dist/search/v1/rss/rss-engine.d.ts +6 -0
  900. package/dist/search/v1/rss/rss-engine.d.ts.map +1 -0
  901. package/dist/search/v1/rss/rss-engine.js +28 -0
  902. package/dist/search/v1/rss/rss-engine.js.map +1 -0
  903. package/dist/search/v1/v1-provider.d.ts +7 -0
  904. package/dist/search/v1/v1-provider.d.ts.map +1 -0
  905. package/dist/search/v1/v1-provider.js +68 -0
  906. package/dist/search/v1/v1-provider.js.map +1 -0
  907. package/dist/search/v1/verticals/code.d.ts +4 -0
  908. package/dist/search/v1/verticals/code.d.ts.map +1 -0
  909. package/dist/search/v1/verticals/code.js +20 -0
  910. package/dist/search/v1/verticals/code.js.map +1 -0
  911. package/dist/search/v1/verticals/docs.d.ts +4 -0
  912. package/dist/search/v1/verticals/docs.d.ts.map +1 -0
  913. package/dist/search/v1/verticals/docs.js +20 -0
  914. package/dist/search/v1/verticals/docs.js.map +1 -0
  915. package/dist/search/v1/verticals/general.d.ts +4 -0
  916. package/dist/search/v1/verticals/general.d.ts.map +1 -0
  917. package/dist/search/v1/verticals/general.js +22 -0
  918. package/dist/search/v1/verticals/general.js.map +1 -0
  919. package/dist/search/v1/verticals/news.d.ts +10 -0
  920. package/dist/search/v1/verticals/news.d.ts.map +1 -0
  921. package/dist/search/v1/verticals/news.js +52 -0
  922. package/dist/search/v1/verticals/news.js.map +1 -0
  923. package/dist/search/v1/verticals/papers.d.ts +4 -0
  924. package/dist/search/v1/verticals/papers.d.ts.map +1 -0
  925. package/dist/search/v1/verticals/papers.js +23 -0
  926. package/dist/search/v1/verticals/papers.js.map +1 -0
  927. package/dist/search/validator.js +31 -31
  928. package/dist/search/validator.js.map +1 -1
  929. package/dist/searxng/bootstrap.d.ts +30 -0
  930. package/dist/searxng/bootstrap.d.ts.map +1 -1
  931. package/dist/searxng/bootstrap.js +223 -85
  932. package/dist/searxng/bootstrap.js.map +1 -1
  933. package/dist/searxng/docker.d.ts.map +1 -1
  934. package/dist/searxng/docker.js +69 -60
  935. package/dist/searxng/docker.js.map +1 -1
  936. package/dist/searxng/process.d.ts +13 -1
  937. package/dist/searxng/process.d.ts.map +1 -1
  938. package/dist/searxng/process.js +231 -164
  939. package/dist/searxng/process.js.map +1 -1
  940. package/dist/server/backend-status.d.ts +13 -0
  941. package/dist/server/backend-status.d.ts.map +1 -0
  942. package/dist/server/backend-status.js +40 -0
  943. package/dist/server/backend-status.js.map +1 -0
  944. package/dist/server/tool-schemas.d.ts +549 -0
  945. package/dist/server/tool-schemas.d.ts.map +1 -0
  946. package/dist/server/tool-schemas.js +464 -0
  947. package/dist/server/tool-schemas.js.map +1 -0
  948. package/dist/server/warmup-on-start.d.ts +9 -0
  949. package/dist/server/warmup-on-start.d.ts.map +1 -0
  950. package/dist/server/warmup-on-start.js +55 -0
  951. package/dist/server/warmup-on-start.js.map +1 -0
  952. package/dist/server.d.ts +17 -0
  953. package/dist/server.d.ts.map +1 -1
  954. package/dist/server.js +454 -297
  955. package/dist/server.js.map +1 -1
  956. package/dist/tools/agent.d.ts +5 -0
  957. package/dist/tools/agent.d.ts.map +1 -0
  958. package/dist/tools/agent.js +128 -0
  959. package/dist/tools/agent.js.map +1 -0
  960. package/dist/tools/cache.d.ts +2 -1
  961. package/dist/tools/cache.d.ts.map +1 -1
  962. package/dist/tools/cache.js +177 -44
  963. package/dist/tools/cache.js.map +1 -1
  964. package/dist/tools/crawl.d.ts.map +1 -1
  965. package/dist/tools/crawl.js +171 -88
  966. package/dist/tools/crawl.js.map +1 -1
  967. package/dist/tools/extract.d.ts +2 -2
  968. package/dist/tools/extract.d.ts.map +1 -1
  969. package/dist/tools/extract.js +175 -59
  970. package/dist/tools/extract.js.map +1 -1
  971. package/dist/tools/fetch.d.ts +2 -2
  972. package/dist/tools/fetch.d.ts.map +1 -1
  973. package/dist/tools/fetch.js +174 -68
  974. package/dist/tools/fetch.js.map +1 -1
  975. package/dist/tools/find-similar.d.ts +5 -0
  976. package/dist/tools/find-similar.d.ts.map +1 -0
  977. package/dist/tools/find-similar.js +127 -0
  978. package/dist/tools/find-similar.js.map +1 -0
  979. package/dist/tools/research.d.ts +5 -0
  980. package/dist/tools/research.d.ts.map +1 -0
  981. package/dist/tools/research.js +107 -0
  982. package/dist/tools/research.js.map +1 -0
  983. package/dist/tools/search.d.ts +10 -2
  984. package/dist/tools/search.d.ts.map +1 -1
  985. package/dist/tools/search.js +13 -158
  986. package/dist/tools/search.js.map +1 -1
  987. package/dist/types.d.ts +350 -7
  988. package/dist/types.d.ts.map +1 -1
  989. package/dist/types.js +6 -1
  990. package/dist/types.js.map +1 -1
  991. package/dist/util/mode.d.ts +4 -0
  992. package/dist/util/mode.d.ts.map +1 -0
  993. package/dist/util/mode.js +34 -0
  994. package/dist/util/mode.js.map +1 -0
  995. package/package.json +78 -8
  996. package/dist/extraction/trafilatura.d.ts +0 -6
  997. package/dist/extraction/trafilatura.d.ts.map +0 -1
  998. package/dist/extraction/trafilatura.js +0 -105
  999. package/dist/extraction/trafilatura.js.map +0 -1
  1000. package/dist/search/flashrank.d.ts +0 -12
  1001. package/dist/search/flashrank.d.ts.map +0 -1
  1002. package/dist/search/flashrank.js +0 -63
  1003. package/dist/search/flashrank.js.map +0 -1
@@ -1,228 +1,229 @@
1
- import { matchesPatterns } from './url-utils.js';
2
- import { RateLimiter } from './rate-limiter.js';
3
- import { RobotsParser } from './robots.js';
4
- import { parseSitemap, parseSitemapIndex, extractSitemapUrlFromRobots } from './sitemap.js';
5
- import { getConfig } from '../config.js';
6
- import { createLogger } from '../logger.js';
7
- const log = createLogger('crawl');
8
- export class Crawler {
9
- fetchFn;
10
- rawFetchFn;
11
- rateLimiter = new RateLimiter();
12
- constructor(fetchFn, rawFetchFn) {
13
- this.fetchFn = fetchFn;
14
- this.rawFetchFn = rawFetchFn;
1
+ import { matchesPatterns, canonicalForCrawl } from "./url-utils.js";
2
+ import { RateLimiter } from "./rate-limiter.js";
3
+ import { RobotsParser } from "./robots.js";
4
+ import { parseSitemap, parseSitemapIndex, extractSitemapUrlFromRobots } from "./sitemap.js";
5
+ import { probeSitemap } from "./sitemap-first.js";
6
+ import { isIndexingEnabled, indexCrawlResult } from "./index-to-vec.js";
7
+ import { getConfig } from "../config.js";
8
+ import { createLogger } from "../logger.js";
9
+ const log = createLogger("crawl");
10
+ class Crawler {
11
+ fetchFn;
12
+ rawFetchFn;
13
+ rateLimiter = new RateLimiter();
14
+ constructor(fetchFn, rawFetchFn) {
15
+ this.fetchFn = fetchFn;
16
+ this.rawFetchFn = rawFetchFn;
17
+ }
18
+ async crawl(input) {
19
+ const strategy = input.strategy ?? "bfs";
20
+ const maxDepth = input.max_depth ?? 2;
21
+ const maxPages = input.max_pages ?? 20;
22
+ const seedOrigin = new URL(input.url).origin;
23
+ const config = getConfig();
24
+ let robotsParser = null;
25
+ if (config.respectRobotsTxt) {
26
+ robotsParser = await this.fetchRobots(seedOrigin);
15
27
  }
16
- async crawl(input) {
17
- const strategy = input.strategy ?? 'bfs';
18
- const maxDepth = input.max_depth ?? 2;
19
- const maxPages = input.max_pages ?? 20;
20
- const seedOrigin = new URL(input.url).origin;
21
- // Fetch and parse robots.txt if configured
22
- const config = getConfig();
23
- let robotsParser = null;
24
- if (config.respectRobotsTxt) {
25
- robotsParser = await this.fetchRobots(seedOrigin);
26
- }
27
- if (strategy === 'sitemap') {
28
- return this.crawlSitemap(input, seedOrigin, maxPages, robotsParser);
28
+ if (strategy === "auto") {
29
+ const sitemapUrls = await probeSitemap(seedOrigin, this.rawFetchFn);
30
+ if (sitemapUrls && sitemapUrls.length > 0) {
31
+ log.info("auto strategy: using sitemap", { origin: seedOrigin, urls: sitemapUrls.length });
32
+ return this.crawlFromExplicitUrls(input, sitemapUrls, maxPages, robotsParser);
33
+ }
34
+ log.info("auto strategy: no sitemap found, falling back to BFS", { origin: seedOrigin });
35
+ return this.crawlTraversal(input, seedOrigin, maxDepth, maxPages, "bfs", robotsParser);
36
+ }
37
+ if (strategy === "sitemap") {
38
+ return this.crawlSitemap(input, seedOrigin, maxPages, robotsParser);
39
+ }
40
+ const traversalStrategy = strategy === "map" ? "bfs" : strategy;
41
+ return this.crawlTraversal(input, seedOrigin, maxDepth, maxPages, traversalStrategy, robotsParser);
42
+ }
43
+ robotsTxtContent = null;
44
+ async fetchRobots(origin) {
45
+ try {
46
+ const result = await this.rawFetchFn(`${origin}/robots.txt`);
47
+ if (result.statusCode === 200 && result.html) {
48
+ this.robotsTxtContent = result.html;
49
+ const parser = new RobotsParser(result.html);
50
+ const crawlDelay = parser.getCrawlDelay();
51
+ if (crawlDelay !== null) {
52
+ const domain = new URL(origin).hostname;
53
+ this.rateLimiter.setRobotsCrawlDelay(domain, crawlDelay);
29
54
  }
30
- const traversalStrategy = strategy === 'map' ? 'bfs' : strategy;
31
- return this.crawlTraversal(input, seedOrigin, maxDepth, maxPages, traversalStrategy, robotsParser);
55
+ return parser;
56
+ }
57
+ } catch {
58
+ log.debug("Could not fetch robots.txt", { origin });
32
59
  }
33
- robotsTxtContent = null;
34
- async fetchRobots(origin) {
35
- try {
36
- const result = await this.rawFetchFn(`${origin}/robots.txt`);
37
- if (result.statusCode === 200 && result.html) {
38
- this.robotsTxtContent = result.html;
39
- const parser = new RobotsParser(result.html);
40
- const crawlDelay = parser.getCrawlDelay();
41
- if (crawlDelay !== null) {
42
- const domain = new URL(origin).hostname;
43
- this.rateLimiter.setRobotsCrawlDelay(domain, crawlDelay);
44
- }
45
- return parser;
46
- }
60
+ return null;
61
+ }
62
+ async crawlTraversal(input, seedOrigin, maxDepth, maxPages, strategy, robotsParser) {
63
+ const visited = /* @__PURE__ */ new Set();
64
+ const pages = [];
65
+ const allLinks = [];
66
+ const indexing = isIndexingEnabled();
67
+ const queue = [[input.url, 0]];
68
+ visited.add(canonicalForCrawl(input.url));
69
+ while (queue.length > 0 && pages.length < maxPages) {
70
+ const next = strategy === "dfs" ? queue.pop() : queue.shift();
71
+ const [url, depth] = next;
72
+ if (robotsParser && !robotsParser.isAllowed(new URL(url).pathname)) {
73
+ log.debug("Blocked by robots.txt", { url });
74
+ continue;
75
+ }
76
+ const release = await this.rateLimiter.acquire(url);
77
+ let fetchResult;
78
+ try {
79
+ fetchResult = await this.fetchFn(url);
80
+ } catch (err) {
81
+ log.warn("Fetch failed during crawl", { url, error: String(err) });
82
+ release();
83
+ continue;
84
+ }
85
+ release();
86
+ if (fetchResult.error) {
87
+ log.warn("Fetch returned error", { url, error: fetchResult.error });
88
+ continue;
89
+ }
90
+ const item = {
91
+ url: fetchResult.url,
92
+ title: fetchResult.title,
93
+ markdown: fetchResult.markdown,
94
+ depth
95
+ };
96
+ pages.push(item);
97
+ if (indexing) await indexCrawlResult(item);
98
+ if (depth < maxDepth) {
99
+ const newLinks = this.filterLinks(fetchResult.links, seedOrigin, visited, input.include_patterns, input.exclude_patterns, robotsParser);
100
+ for (const link of newLinks) {
101
+ visited.add(canonicalForCrawl(link));
102
+ queue.push([link, depth + 1]);
47
103
  }
48
- catch {
49
- log.debug('Could not fetch robots.txt', { origin });
104
+ if (input.extract_links) {
105
+ for (const link of fetchResult.links) {
106
+ allLinks.push({ from: url, to: link });
107
+ }
50
108
  }
51
- return null;
109
+ }
52
110
  }
53
- async crawlTraversal(input, seedOrigin, maxDepth, maxPages, strategy, robotsParser) {
54
- const visited = new Set();
55
- const pages = [];
56
- const allLinks = [];
57
- // Queue: [url, depth]
58
- const queue = [[input.url, 0]];
59
- visited.add(input.url);
60
- while (queue.length > 0 && pages.length < maxPages) {
61
- const item = strategy === 'dfs' ? queue.pop() : queue.shift();
62
- const [url, depth] = item;
63
- // Check robots.txt
64
- if (robotsParser && !robotsParser.isAllowed(new URL(url).pathname)) {
65
- log.debug('Blocked by robots.txt', { url });
66
- continue;
67
- }
68
- // Rate limit
69
- const release = await this.rateLimiter.acquire(url);
70
- let fetchResult;
71
- try {
72
- fetchResult = await this.fetchFn(url);
73
- }
74
- catch (err) {
75
- log.warn('Fetch failed during crawl', { url, error: String(err) });
76
- release();
77
- continue;
78
- }
79
- release();
80
- if (fetchResult.error) {
81
- log.warn('Fetch returned error', { url, error: fetchResult.error });
82
- continue;
83
- }
84
- pages.push({
85
- url: fetchResult.url,
86
- title: fetchResult.title,
87
- markdown: fetchResult.markdown,
88
- depth,
89
- });
90
- // Discover links for traversal
91
- if (depth < maxDepth) {
92
- const newLinks = this.filterLinks(fetchResult.links, seedOrigin, visited, input.include_patterns, input.exclude_patterns, robotsParser);
93
- for (const link of newLinks) {
94
- visited.add(link);
95
- queue.push([link, depth + 1]);
96
- }
97
- if (input.extract_links) {
98
- for (const link of fetchResult.links) {
99
- allLinks.push({ from: url, to: link });
100
- }
101
- }
111
+ return {
112
+ pages,
113
+ total_found: visited.size,
114
+ crawled: pages.length,
115
+ ...input.extract_links ? { links: allLinks } : {}
116
+ };
117
+ }
118
+ filterLinks(links, seedOrigin, visited, includePatterns, excludePatterns, robotsParser) {
119
+ const filtered = links.filter((link) => {
120
+ try {
121
+ const parsed = new URL(link);
122
+ if (parsed.origin !== seedOrigin) return false;
123
+ if (visited.has(canonicalForCrawl(link))) return false;
124
+ if (!matchesPatterns(link, includePatterns, excludePatterns)) return false;
125
+ if (robotsParser && !robotsParser.isAllowed(parsed.pathname)) return false;
126
+ return true;
127
+ } catch {
128
+ return false;
129
+ }
130
+ });
131
+ return filtered.sort((a, b) => {
132
+ const aDoc = isDocPage(a) ? 0 : 1;
133
+ const bDoc = isDocPage(b) ? 0 : 1;
134
+ return aDoc - bDoc;
135
+ });
136
+ }
137
+ async crawlSitemap(input, seedOrigin, maxPages, robotsParser) {
138
+ const sitemapUrls = await this.discoverSitemapUrls(seedOrigin, this.robotsTxtContent);
139
+ if (sitemapUrls.length === 0) {
140
+ log.info("No sitemap found, falling back to BFS");
141
+ return this.crawlTraversal(input, seedOrigin, input.max_depth ?? 2, maxPages, "bfs", robotsParser);
142
+ }
143
+ return this.crawlFromExplicitUrls(input, sitemapUrls, maxPages, robotsParser);
144
+ }
145
+ /**
146
+ * Crawl an explicit list of URLs (e.g. from a sitemap probe). Applies
147
+ * include/exclude patterns, robots.txt, max_pages, and rate limits the
148
+ * same way as crawlSitemap.
149
+ */
150
+ async crawlFromExplicitUrls(input, urls, maxPages, robotsParser) {
151
+ const filtered = urls.filter(
152
+ (url) => matchesPatterns(url, input.include_patterns, input.exclude_patterns)
153
+ );
154
+ const totalFound = filtered.length;
155
+ const toFetch = filtered.slice(0, maxPages);
156
+ const pages = [];
157
+ const allLinks = [];
158
+ const indexing = isIndexingEnabled();
159
+ for (const url of toFetch) {
160
+ if (pages.length >= maxPages) break;
161
+ if (robotsParser && !robotsParser.isAllowed(new URL(url).pathname)) continue;
162
+ const release = await this.rateLimiter.acquire(url);
163
+ try {
164
+ const result = await this.fetchFn(url);
165
+ release();
166
+ if (!result.error) {
167
+ const item = { url: result.url, title: result.title, markdown: result.markdown, depth: 0 };
168
+ pages.push(item);
169
+ if (indexing) await indexCrawlResult(item);
170
+ if (input.extract_links) {
171
+ for (const link of result.links) {
172
+ allLinks.push({ from: url, to: link });
102
173
  }
174
+ }
103
175
  }
104
- // total_found = all unique URLs discovered (visited set), including unvisited queue items
105
- return {
106
- pages,
107
- total_found: visited.size,
108
- crawled: pages.length,
109
- ...(input.extract_links ? { links: allLinks } : {}),
110
- };
176
+ } catch (err) {
177
+ release();
178
+ log.warn("Sitemap fetch failed", { url, error: String(err) });
179
+ }
111
180
  }
112
- filterLinks(links, seedOrigin, visited, includePatterns, excludePatterns, robotsParser) {
113
- const filtered = links.filter((link) => {
114
- try {
115
- const parsed = new URL(link);
116
- if (parsed.origin !== seedOrigin)
117
- return false;
118
- if (visited.has(link))
119
- return false;
120
- if (!matchesPatterns(link, includePatterns, excludePatterns))
121
- return false;
122
- if (robotsParser && !robotsParser.isAllowed(parsed.pathname))
123
- return false;
124
- return true;
125
- }
126
- catch {
127
- return false;
128
- }
129
- });
130
- // Prioritize documentation pages over marketing/nav pages
131
- return filtered.sort((a, b) => {
132
- const aDoc = isDocPage(a) ? 0 : 1;
133
- const bDoc = isDocPage(b) ? 0 : 1;
134
- return aDoc - bDoc;
135
- });
181
+ return {
182
+ pages,
183
+ total_found: totalFound,
184
+ crawled: pages.length,
185
+ ...input.extract_links ? { links: allLinks } : {}
186
+ };
187
+ }
188
+ async discoverSitemapUrls(origin, robotsTxt) {
189
+ const sitemapLocations = [];
190
+ if (robotsTxt) {
191
+ sitemapLocations.push(...extractSitemapUrlFromRobots(robotsTxt));
136
192
  }
137
- async crawlSitemap(input, seedOrigin, maxPages, robotsParser) {
138
- // Discover sitemap URLs (pass already-fetched robots.txt content)
139
- let sitemapUrls = await this.discoverSitemapUrls(seedOrigin, this.robotsTxtContent);
140
- if (sitemapUrls.length === 0) {
141
- log.info('No sitemap found, falling back to BFS');
142
- return this.crawlTraversal(input, seedOrigin, input.max_depth ?? 2, maxPages, 'bfs', robotsParser);
143
- }
144
- // Filter by patterns
145
- sitemapUrls = sitemapUrls.filter((url) => matchesPatterns(url, input.include_patterns, input.exclude_patterns));
146
- const totalFound = sitemapUrls.length;
147
- const toFetch = sitemapUrls.slice(0, maxPages);
148
- const pages = [];
149
- const allLinks = [];
150
- for (const url of toFetch) {
151
- if (pages.length >= maxPages)
152
- break;
153
- if (robotsParser && !robotsParser.isAllowed(new URL(url).pathname))
154
- continue;
155
- const release = await this.rateLimiter.acquire(url);
156
- try {
157
- const result = await this.fetchFn(url);
158
- release();
159
- if (!result.error) {
160
- pages.push({ url: result.url, title: result.title, markdown: result.markdown, depth: 0 });
161
- if (input.extract_links) {
162
- for (const link of result.links) {
163
- allLinks.push({ from: url, to: link });
164
- }
165
- }
166
- }
167
- }
168
- catch (err) {
169
- release();
170
- log.warn('Sitemap fetch failed', { url, error: String(err) });
171
- }
172
- }
173
- return {
174
- pages,
175
- total_found: totalFound,
176
- crawled: pages.length,
177
- ...(input.extract_links ? { links: allLinks } : {}),
178
- };
193
+ if (sitemapLocations.length === 0) {
194
+ sitemapLocations.push(`${origin}/sitemap.xml`);
179
195
  }
180
- async discoverSitemapUrls(origin, robotsTxt) {
181
- const sitemapLocations = [];
182
- // Check robots.txt for sitemap references (reuses already-fetched content)
183
- if (robotsTxt) {
184
- sitemapLocations.push(...extractSitemapUrlFromRobots(robotsTxt));
185
- }
186
- // Try default location
187
- if (sitemapLocations.length === 0) {
188
- sitemapLocations.push(`${origin}/sitemap.xml`);
189
- }
190
- const allUrls = [];
191
- for (const sitemapUrl of sitemapLocations) {
196
+ const allUrls = [];
197
+ for (const sitemapUrl of sitemapLocations) {
198
+ try {
199
+ const result = await this.rawFetchFn(sitemapUrl);
200
+ if (result.statusCode !== 200) continue;
201
+ const indexUrls = parseSitemapIndex(result.html);
202
+ if (indexUrls.length > 0) {
203
+ for (const subUrl of indexUrls) {
192
204
  try {
193
- const result = await this.rawFetchFn(sitemapUrl);
194
- if (result.statusCode !== 200)
195
- continue;
196
- // Check if it's a sitemap index
197
- const indexUrls = parseSitemapIndex(result.html);
198
- if (indexUrls.length > 0) {
199
- // Fetch each sub-sitemap
200
- for (const subUrl of indexUrls) {
201
- try {
202
- const subResult = await this.rawFetchFn(subUrl);
203
- if (subResult.statusCode === 200) {
204
- allUrls.push(...parseSitemap(subResult.html));
205
- }
206
- }
207
- catch {
208
- // skip failed sub-sitemaps
209
- }
210
- }
211
- }
212
- else {
213
- allUrls.push(...parseSitemap(result.html));
214
- }
215
- }
216
- catch {
217
- // skip failed sitemap fetches
205
+ const subResult = await this.rawFetchFn(subUrl);
206
+ if (subResult.statusCode === 200) {
207
+ allUrls.push(...parseSitemap(subResult.html));
208
+ }
209
+ } catch {
218
210
  }
211
+ }
212
+ } else {
213
+ allUrls.push(...parseSitemap(result.html));
219
214
  }
220
- return allUrls;
215
+ } catch {
216
+ }
221
217
  }
218
+ return allUrls;
219
+ }
222
220
  }
223
- const DOC_PATH_PATTERNS = ['/docs/', '/guide/', '/api/', '/reference/'];
221
+ const DOC_PATH_PATTERNS = ["/docs/", "/guide/", "/api/", "/reference/"];
224
222
  function isDocPage(url) {
225
- const path = new URL(url).pathname.toLowerCase();
226
- return DOC_PATH_PATTERNS.some(p => path.includes(p));
223
+ const path = new URL(url).pathname.toLowerCase();
224
+ return DOC_PATH_PATTERNS.some((p) => path.includes(p));
227
225
  }
226
+ export {
227
+ Crawler
228
+ };
228
229
  //# sourceMappingURL=crawler.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"crawler.js","sourceRoot":"","sources":["../../src/crawl/crawler.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAC3C,OAAO,EAAE,YAAY,EAAE,iBAAiB,EAAE,2BAA2B,EAAE,MAAM,cAAc,CAAC;AAC5F,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAE5C,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;AAKlC,MAAM,OAAO,OAAO;IACV,OAAO,CAAU;IACjB,UAAU,CAAa;IACvB,WAAW,GAAG,IAAI,WAAW,EAAE,CAAC;IAExC,YAAY,OAAgB,EAAE,UAAsB;QAClD,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;IAC/B,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,KAAiB;QAC3B,MAAM,QAAQ,GAAG,KAAK,CAAC,QAAQ,IAAI,KAAK,CAAC;QACzC,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,IAAI,CAAC,CAAC;QACtC,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,IAAI,EAAE,CAAC;QAEvC,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC;QAE7C,2CAA2C;QAC3C,MAAM,MAAM,GAAG,SAAS,EAAE,CAAC;QAC3B,IAAI,YAAY,GAAwB,IAAI,CAAC;QAC7C,IAAI,MAAM,CAAC,gBAAgB,EAAE,CAAC;YAC5B,YAAY,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC;QACpD,CAAC;QAED,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;YAC3B,OAAO,IAAI,CAAC,YAAY,CAAC,KAAK,EAAE,UAAU,EAAE,QAAQ,EAAE,YAAY,CAAC,CAAC;QACtE,CAAC;QAED,MAAM,iBAAiB,GAAG,QAAQ,KAAK,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,QAAQ,CAAC;QAChE,OAAO,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,UAAU,EAAE,QAAQ,EAAE,QAAQ,EAAE,iBAAiB,EAAE,YAAY,CAAC,CAAC;IACrG,CAAC;IAEO,gBAAgB,GAAkB,IAAI,CAAC;IAEvC,KAAK,CAAC,WAAW,CAAC,MAAc;QACtC,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,GAAG,MAAM,aAAa,CAAC,CAAC;YAC7D,IAAI,MAAM,CAAC,UAAU,KAAK,GAAG,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;gBAC7C,IAAI,CAAC,gBAAgB,GAAG,MAAM,CAAC,IAAI,CAAC;gBACpC,MAAM,MAAM,GAAG,IAAI,YAAY,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;gBAC7C,MAAM,UAAU,GAAG,MAAM,CAAC,aAAa,EAAE,CAAC;gBAC1C,IAAI,UAAU,KAAK,IAAI,EAAE,CAAC;oBACxB,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC;oBACxC,IAAI,CAAC,WAAW,CAAC,mBAAmB,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;gBAC3D,CAAC;gBACD,OAAO,MAAM,CAAC;YAChB,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,GAAG,CA
AC,KAAK,CAAC,4BAA4B,EAAE,EAAE,MAAM,EAAE,CAAC,CAAC;QACtD,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAEO,KAAK,CAAC,cAAc,CAC1B,KAAiB,EACjB,UAAkB,EAClB,QAAgB,EAChB,QAAgB,EAChB,QAAuB,EACvB,YAAiC;QAEjC,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;QAClC,MAAM,KAAK,GAAsB,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAe,EAAE,CAAC;QAEhC,sBAAsB;QACtB,MAAM,KAAK,GAA4B,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QACxD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAEvB,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YACnD,MAAM,IAAI,GAAG,QAAQ,KAAK,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,EAAG,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,EAAG,CAAC;YAChE,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,GAAG,IAAI,CAAC;YAE1B,mBAAmB;YACnB,IAAI,YAAY,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACnE,GAAG,CAAC,KAAK,CAAC,uBAAuB,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;gBAC5C,SAAS;YACX,CAAC;YAED,aAAa;YACb,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YAEpD,IAAI,WAAwB,CAAC;YAC7B,IAAI,CAAC;gBACH,WAAW,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YACxC,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,GAAG,CAAC,IAAI,CAAC,2BAA2B,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;gBACnE,OAAO,EAAE,CAAC;gBACV,SAAS;YACX,CAAC;YAED,OAAO,EAAE,CAAC;YAEV,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;gBACtB,GAAG,CAAC,IAAI,CAAC,sBAAsB,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,WAAW,CAAC,KAAK,EAAE,CAAC,CAAC;gBACpE,SAAS;YACX,CAAC;YAED,KAAK,CAAC,IAAI,CAAC;gBACT,GAAG,EAAE,WAAW,CAAC,GAAG;gBACpB,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,QAAQ,EAAE,WAAW,CAAC,QAAQ;gBAC9B,KAAK;aACN,CAAC,CAAC;YAEH,+BAA+B;YAC/B,IAAI,KAAK,GAAG,QAAQ,EAAE,CAAC;gBACrB,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,WAAW,CAAC,KAAK,EAAE,UAAU,EAAE,OAAO,EAAE,KAAK,CAAC,gBAAgB,EAAE,KAAK,CAAC,gBAAgB,EAAE,YAAY,CAAC,CAAC;gBAExI,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;oBAC5B,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;oBAClB,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC;gBAChC,CAAC;gBAED,IAAI,KAAK,CAAC,aAAa,EAAE,CAAC;oBACxB,KAAK,MAAM,IAAI,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;wBACrC,QAAQ,CAAC,IAAI,CAAC,EAA
E,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;oBACzC,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,0FAA0F;QAC1F,OAAO;YACL,KAAK;YACL,WAAW,EAAE,OAAO,CAAC,IAAI;YACzB,OAAO,EAAE,KAAK,CAAC,MAAM;YACrB,GAAG,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACpD,CAAC;IACJ,CAAC;IAEO,WAAW,CACjB,KAAe,EACf,UAAkB,EAClB,OAAoB,EACpB,eAAqC,EACrC,eAAqC,EACrC,YAAiC;QAEjC,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE;YACrC,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC;gBAC7B,IAAI,MAAM,CAAC,MAAM,KAAK,UAAU;oBAAE,OAAO,KAAK,CAAC;gBAC/C,IAAI,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC;oBAAE,OAAO,KAAK,CAAC;gBACpC,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,eAAe,EAAE,eAAe,CAAC;oBAAE,OAAO,KAAK,CAAC;gBAC3E,IAAI,YAAY,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC;oBAAE,OAAO,KAAK,CAAC;gBAC3E,OAAO,IAAI,CAAC;YACd,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,KAAK,CAAC;YACf,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,0DAA0D;QAC1D,OAAO,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAC5B,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAClC,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAClC,OAAO,IAAI,GAAG,IAAI,CAAC;QACrB,CAAC,CAAC,CAAC;IACL,CAAC;IAED,KAAK,CAAC,YAAY,CAChB,KAAiB,EACjB,UAAkB,EAClB,QAAgB,EAChB,YAAiC;QAEjC,kEAAkE;QAClE,IAAI,WAAW,GAAG,MAAM,IAAI,CAAC,mBAAmB,CAAC,UAAU,EAAE,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAEpF,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7B,GAAG,CAAC,IAAI,CAAC,uCAAuC,CAAC,CAAC;YAClD,OAAO,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,UAAU,EAAE,KAAK,CAAC,SAAS,IAAI,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,YAAY,CAAC,CAAC;QACrG,CAAC;QAED,qBAAqB;QACrB,WAAW,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CACvC,eAAe,CAAC,GAAG,EAAE,KAAK,CAAC,gBAAgB,EAAE,KAAK,CAAC,gBAAgB,CAAC,CACrE,CAAC;QAEF,MAAM,UAAU,GAAG,WAAW,CAAC,MAAM,CAAC;QACtC,MAAM,OAAO,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;QAC/C,MAAM,KAAK,GAAsB,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAe,EAAE,CAAC;QAEhC,KAAK,MAAM,GAAG,IAAI,OAAO,EAAE,CAAC;YAC1B,IAAI,KAAK,CAAC,MAAM,IAAI,QAAQ;gBAAE,MAAM;YAEpC,
IAAI,YAAY,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;gBAAE,SAAS;YAE7E,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;YAEpD,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;gBACvC,OAAO,EAAE,CAAC;gBAEV,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;oBAClB,KAAK,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,MAAM,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,KAAK,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,CAAC;oBAE1F,IAAI,KAAK,CAAC,aAAa,EAAE,CAAC;wBACxB,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;4BAChC,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;wBACzC,CAAC;oBACH,CAAC;gBACH,CAAC;YACH,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO,EAAE,CAAC;gBACV,GAAG,CAAC,IAAI,CAAC,sBAAsB,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAChE,CAAC;QACH,CAAC;QAED,OAAO;YACL,KAAK;YACL,WAAW,EAAE,UAAU;YACvB,OAAO,EAAE,KAAK,CAAC,MAAM;YACrB,GAAG,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACpD,CAAC;IACJ,CAAC;IAEO,KAAK,CAAC,mBAAmB,CAAC,MAAc,EAAE,SAAoC;QACpF,MAAM,gBAAgB,GAAa,EAAE,CAAC;QAEtC,2EAA2E;QAC3E,IAAI,SAAS,EAAE,CAAC;YACd,gBAAgB,CAAC,IAAI,CAAC,GAAG,2BAA2B,CAAC,SAAS,CAAC,CAAC,CAAC;QACnE,CAAC;QAED,uBAAuB;QACvB,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAClC,gBAAgB,CAAC,IAAI,CAAC,GAAG,MAAM,cAAc,CAAC,CAAC;QACjD,CAAC;QAED,MAAM,OAAO,GAAa,EAAE,CAAC;QAE7B,KAAK,MAAM,UAAU,IAAI,gBAAgB,EAAE,CAAC;YAC1C,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC;gBACjD,IAAI,MAAM,CAAC,UAAU,KAAK,GAAG;oBAAE,SAAS;gBAExC,gCAAgC;gBAChC,MAAM,SAAS,GAAG,iBAAiB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;gBACjD,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACzB,yBAAyB;oBACzB,KAAK,MAAM,MAAM,IAAI,SAAS,EAAE,CAAC;wBAC/B,IAAI,CAAC;4BACH,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;4BAChD,IAAI,SAAS,CAAC,UAAU,KAAK,GAAG,EAAE,CAAC;gCACjC,OAAO,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC;4BAChD,CAAC;wBACH,CAAC;wBAAC,MAAM,CAAC;4BACP,2BAA2B;wBAC7B,CAAC;oBACH,CAAC;gBACH,CAAC;qBAAM,C
AAC;oBACN,OAAO,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;gBAC7C,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,8BAA8B;YAChC,CAAC;QACH,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;CACF;AAED,MAAM,iBAAiB,GAAG,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC;AAExE,SAAS,SAAS,CAAC,GAAW;IAC5B,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;IACjD,OAAO,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;AACvD,CAAC"}
1
+ {"version":3,"sources":["../../src/crawl/crawler.ts"],"sourcesContent":["import type { FetchOutput, CrawlInput, CrawlOutput, CrawlResultItem, LinkEdge, RawFetchResult } from '../types.js';\nimport { matchesPatterns, canonicalForCrawl } from './url-utils.js';\nimport { RateLimiter } from './rate-limiter.js';\nimport { RobotsParser } from './robots.js';\nimport { parseSitemap, parseSitemapIndex, extractSitemapUrlFromRobots } from './sitemap.js';\nimport { probeSitemap } from './sitemap-first.js';\nimport { isIndexingEnabled, indexCrawlResult } from './index-to-vec.js';\nimport { getConfig } from '../config.js';\nimport { createLogger } from '../logger.js';\n\nconst log = createLogger('crawl');\n\nexport type FetchFn = (url: string) => Promise<FetchOutput>;\nexport type RawFetchFn = (url: string) => Promise<RawFetchResult>;\n\nexport class Crawler {\n private fetchFn: FetchFn;\n private rawFetchFn: RawFetchFn;\n private rateLimiter = new RateLimiter();\n\n constructor(fetchFn: FetchFn, rawFetchFn: RawFetchFn) {\n this.fetchFn = fetchFn;\n this.rawFetchFn = rawFetchFn;\n }\n\n async crawl(input: CrawlInput): Promise<CrawlOutput> {\n const strategy = input.strategy ?? 'bfs';\n const maxDepth = input.max_depth ?? 2;\n const maxPages = input.max_pages ?? 
20;\n\n const seedOrigin = new URL(input.url).origin;\n\n // Fetch and parse robots.txt if configured\n const config = getConfig();\n let robotsParser: RobotsParser | null = null;\n if (config.respectRobotsTxt) {\n robotsParser = await this.fetchRobots(seedOrigin);\n }\n\n if (strategy === 'auto') {\n const sitemapUrls = await probeSitemap(seedOrigin, this.rawFetchFn);\n if (sitemapUrls && sitemapUrls.length > 0) {\n log.info('auto strategy: using sitemap', { origin: seedOrigin, urls: sitemapUrls.length });\n return this.crawlFromExplicitUrls(input, sitemapUrls, maxPages, robotsParser);\n }\n log.info('auto strategy: no sitemap found, falling back to BFS', { origin: seedOrigin });\n return this.crawlTraversal(input, seedOrigin, maxDepth, maxPages, 'bfs', robotsParser);\n }\n\n if (strategy === 'sitemap') {\n return this.crawlSitemap(input, seedOrigin, maxPages, robotsParser);\n }\n\n const traversalStrategy = strategy === 'map' ? 'bfs' : strategy;\n return this.crawlTraversal(input, seedOrigin, maxDepth, maxPages, traversalStrategy, robotsParser);\n }\n\n private robotsTxtContent: string | null = null;\n\n private async fetchRobots(origin: string): Promise<RobotsParser | null> {\n try {\n const result = await this.rawFetchFn(`${origin}/robots.txt`);\n if (result.statusCode === 200 && result.html) {\n this.robotsTxtContent = result.html;\n const parser = new RobotsParser(result.html);\n const crawlDelay = parser.getCrawlDelay();\n if (crawlDelay !== null) {\n const domain = new URL(origin).hostname;\n this.rateLimiter.setRobotsCrawlDelay(domain, crawlDelay);\n }\n return parser;\n }\n } catch {\n log.debug('Could not fetch robots.txt', { origin });\n }\n return null;\n }\n\n private async crawlTraversal(\n input: CrawlInput,\n seedOrigin: string,\n maxDepth: number,\n maxPages: number,\n strategy: 'bfs' | 'dfs',\n robotsParser: RobotsParser | null,\n ): Promise<CrawlOutput> {\n const visited = new Set<string>();\n const pages: CrawlResultItem[] = [];\n const 
allLinks: LinkEdge[] = [];\n const indexing = isIndexingEnabled();\n\n // Queue: [url, depth]\n const queue: Array<[string, number]> = [[input.url, 0]];\n visited.add(canonicalForCrawl(input.url));\n\n while (queue.length > 0 && pages.length < maxPages) {\n const next = strategy === 'dfs' ? queue.pop()! : queue.shift()!;\n const [url, depth] = next;\n\n // Check robots.txt\n if (robotsParser && !robotsParser.isAllowed(new URL(url).pathname)) {\n log.debug('Blocked by robots.txt', { url });\n continue;\n }\n\n // Rate limit\n const release = await this.rateLimiter.acquire(url);\n\n let fetchResult: FetchOutput;\n try {\n fetchResult = await this.fetchFn(url);\n } catch (err) {\n log.warn('Fetch failed during crawl', { url, error: String(err) });\n release();\n continue;\n }\n\n release();\n\n if (fetchResult.error) {\n log.warn('Fetch returned error', { url, error: fetchResult.error });\n continue;\n }\n\n const item: CrawlResultItem = {\n url: fetchResult.url,\n title: fetchResult.title,\n markdown: fetchResult.markdown,\n depth,\n };\n pages.push(item);\n\n if (indexing) await indexCrawlResult(item);\n\n // Discover links for traversal\n if (depth < maxDepth) {\n const newLinks = this.filterLinks(fetchResult.links, seedOrigin, visited, input.include_patterns, input.exclude_patterns, robotsParser);\n\n for (const link of newLinks) {\n visited.add(canonicalForCrawl(link));\n queue.push([link, depth + 1]);\n }\n\n if (input.extract_links) {\n for (const link of fetchResult.links) {\n allLinks.push({ from: url, to: link });\n }\n }\n }\n }\n\n // total_found = all unique URLs discovered (visited set), including unvisited queue items\n return {\n pages,\n total_found: visited.size,\n crawled: pages.length,\n ...(input.extract_links ? 
{ links: allLinks } : {}),\n };\n }\n\n private filterLinks(\n links: string[],\n seedOrigin: string,\n visited: Set<string>,\n includePatterns: string[] | undefined,\n excludePatterns: string[] | undefined,\n robotsParser: RobotsParser | null,\n ): string[] {\n const filtered = links.filter((link) => {\n try {\n const parsed = new URL(link);\n if (parsed.origin !== seedOrigin) return false;\n if (visited.has(canonicalForCrawl(link))) return false;\n if (!matchesPatterns(link, includePatterns, excludePatterns)) return false;\n if (robotsParser && !robotsParser.isAllowed(parsed.pathname)) return false;\n return true;\n } catch {\n return false;\n }\n });\n\n // Prioritize documentation pages over marketing/nav pages\n return filtered.sort((a, b) => {\n const aDoc = isDocPage(a) ? 0 : 1;\n const bDoc = isDocPage(b) ? 0 : 1;\n return aDoc - bDoc;\n });\n }\n\n async crawlSitemap(\n input: CrawlInput,\n seedOrigin: string,\n maxPages: number,\n robotsParser: RobotsParser | null,\n ): Promise<CrawlOutput> {\n // Discover sitemap URLs (pass already-fetched robots.txt content)\n const sitemapUrls = await this.discoverSitemapUrls(seedOrigin, this.robotsTxtContent);\n\n if (sitemapUrls.length === 0) {\n log.info('No sitemap found, falling back to BFS');\n return this.crawlTraversal(input, seedOrigin, input.max_depth ?? 2, maxPages, 'bfs', robotsParser);\n }\n\n return this.crawlFromExplicitUrls(input, sitemapUrls, maxPages, robotsParser);\n }\n\n /**\n * Crawl an explicit list of URLs (e.g. from a sitemap probe). 
Applies\n * include/exclude patterns, robots.txt, max_pages, and rate limits the\n * same way as crawlSitemap.\n */\n private async crawlFromExplicitUrls(\n input: CrawlInput,\n urls: string[],\n maxPages: number,\n robotsParser: RobotsParser | null,\n ): Promise<CrawlOutput> {\n const filtered = urls.filter((url) =>\n matchesPatterns(url, input.include_patterns, input.exclude_patterns),\n );\n\n const totalFound = filtered.length;\n const toFetch = filtered.slice(0, maxPages);\n const pages: CrawlResultItem[] = [];\n const allLinks: LinkEdge[] = [];\n const indexing = isIndexingEnabled();\n\n for (const url of toFetch) {\n if (pages.length >= maxPages) break;\n\n if (robotsParser && !robotsParser.isAllowed(new URL(url).pathname)) continue;\n\n const release = await this.rateLimiter.acquire(url);\n\n try {\n const result = await this.fetchFn(url);\n release();\n\n if (!result.error) {\n const item: CrawlResultItem = { url: result.url, title: result.title, markdown: result.markdown, depth: 0 };\n pages.push(item);\n\n if (indexing) await indexCrawlResult(item);\n\n if (input.extract_links) {\n for (const link of result.links) {\n allLinks.push({ from: url, to: link });\n }\n }\n }\n } catch (err) {\n release();\n log.warn('Sitemap fetch failed', { url, error: String(err) });\n }\n }\n\n return {\n pages,\n total_found: totalFound,\n crawled: pages.length,\n ...(input.extract_links ? 
{ links: allLinks } : {}),\n };\n }\n\n private async discoverSitemapUrls(origin: string, robotsTxt: string | null | undefined): Promise<string[]> {\n const sitemapLocations: string[] = [];\n\n // Check robots.txt for sitemap references (reuses already-fetched content)\n if (robotsTxt) {\n sitemapLocations.push(...extractSitemapUrlFromRobots(robotsTxt));\n }\n\n // Try default location\n if (sitemapLocations.length === 0) {\n sitemapLocations.push(`${origin}/sitemap.xml`);\n }\n\n const allUrls: string[] = [];\n\n for (const sitemapUrl of sitemapLocations) {\n try {\n const result = await this.rawFetchFn(sitemapUrl);\n if (result.statusCode !== 200) continue;\n\n // Check if it's a sitemap index\n const indexUrls = parseSitemapIndex(result.html);\n if (indexUrls.length > 0) {\n // Fetch each sub-sitemap\n for (const subUrl of indexUrls) {\n try {\n const subResult = await this.rawFetchFn(subUrl);\n if (subResult.statusCode === 200) {\n allUrls.push(...parseSitemap(subResult.html));\n }\n } catch {\n // skip failed sub-sitemaps\n }\n }\n } else {\n allUrls.push(...parseSitemap(result.html));\n }\n } catch {\n // skip failed sitemap fetches\n }\n }\n\n return allUrls;\n }\n}\n\nconst DOC_PATH_PATTERNS = ['/docs/', '/guide/', '/api/', '/reference/'];\n\nfunction isDocPage(url: string): boolean {\n const path = new URL(url).pathname.toLowerCase();\n return DOC_PATH_PATTERNS.some(p => 
path.includes(p));\n}\n"],"mappings":"AACA,SAAS,iBAAiB,yBAAyB;AACnD,SAAS,mBAAmB;AAC5B,SAAS,oBAAoB;AAC7B,SAAS,cAAc,mBAAmB,mCAAmC;AAC7E,SAAS,oBAAoB;AAC7B,SAAS,mBAAmB,wBAAwB;AACpD,SAAS,iBAAiB;AAC1B,SAAS,oBAAoB;AAE7B,MAAM,MAAM,aAAa,OAAO;AAKzB,MAAM,QAAQ;AAAA,EACX;AAAA,EACA;AAAA,EACA,cAAc,IAAI,YAAY;AAAA,EAEtC,YAAY,SAAkB,YAAwB;AACpD,SAAK,UAAU;AACf,SAAK,aAAa;AAAA,EACpB;AAAA,EAEA,MAAM,MAAM,OAAyC;AACnD,UAAM,WAAW,MAAM,YAAY;AACnC,UAAM,WAAW,MAAM,aAAa;AACpC,UAAM,WAAW,MAAM,aAAa;AAEpC,UAAM,aAAa,IAAI,IAAI,MAAM,GAAG,EAAE;AAGtC,UAAM,SAAS,UAAU;AACzB,QAAI,eAAoC;AACxC,QAAI,OAAO,kBAAkB;AAC3B,qBAAe,MAAM,KAAK,YAAY,UAAU;AAAA,IAClD;AAEA,QAAI,aAAa,QAAQ;AACvB,YAAM,cAAc,MAAM,aAAa,YAAY,KAAK,UAAU;AAClE,UAAI,eAAe,YAAY,SAAS,GAAG;AACzC,YAAI,KAAK,gCAAgC,EAAE,QAAQ,YAAY,MAAM,YAAY,OAAO,CAAC;AACzF,eAAO,KAAK,sBAAsB,OAAO,aAAa,UAAU,YAAY;AAAA,MAC9E;AACA,UAAI,KAAK,wDAAwD,EAAE,QAAQ,WAAW,CAAC;AACvF,aAAO,KAAK,eAAe,OAAO,YAAY,UAAU,UAAU,OAAO,YAAY;AAAA,IACvF;AAEA,QAAI,aAAa,WAAW;AAC1B,aAAO,KAAK,aAAa,OAAO,YAAY,UAAU,YAAY;AAAA,IACpE;AAEA,UAAM,oBAAoB,aAAa,QAAQ,QAAQ;AACvD,WAAO,KAAK,eAAe,OAAO,YAAY,UAAU,UAAU,mBAAmB,YAAY;AAAA,EACnG;AAAA,EAEQ,mBAAkC;AAAA,EAE1C,MAAc,YAAY,QAA8C;AACtE,QAAI;AACF,YAAM,SAAS,MAAM,KAAK,WAAW,GAAG,MAAM,aAAa;AAC3D,UAAI,OAAO,eAAe,OAAO,OAAO,MAAM;AAC5C,aAAK,mBAAmB,OAAO;AAC/B,cAAM,SAAS,IAAI,aAAa,OAAO,IAAI;AAC3C,cAAM,aAAa,OAAO,cAAc;AACxC,YAAI,eAAe,MAAM;AACvB,gBAAM,SAAS,IAAI,IAAI,MAAM,EAAE;AAC/B,eAAK,YAAY,oBAAoB,QAAQ,UAAU;AAAA,QACzD;AACA,eAAO;AAAA,MACT;AAAA,IACF,QAAQ;AACN,UAAI,MAAM,8BAA8B,EAAE,OAAO,CAAC;AAAA,IACpD;AACA,WAAO;AAAA,EACT;AAAA,EAEA,MAAc,eACZ,OACA,YACA,UACA,UACA,UACA,cACsB;AACtB,UAAM,UAAU,oBAAI,IAAY;AAChC,UAAM,QAA2B,CAAC;AAClC,UAAM,WAAuB,CAAC;AAC9B,UAAM,WAAW,kBAAkB;AAGnC,UAAM,QAAiC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC;AACtD,YAAQ,IAAI,kBAAkB,MAAM,GAAG,CAAC;AAExC,WAAO,MAAM,SAAS,KAAK,MAAM,SAAS,UAAU;AAClD,YAAM,OAAO,aAAa,QAAQ,MAAM,IAAI,IAAK,MAAM,MAAM;AAC7D,YAAM,CAAC,KAAK,KAAK,IAAI;AAGrB,UAAI,gBAAgB,CAAC,aAAa,UAAU,IAAI,IAAI,GAAG,EAAE,QAAQ,GAAG;AAClE,YAAI,MAAM,yBAAyB,EAAE,IAAI,CAAC;AAC1C;AAAA,MACF;AAGA,YAAM,UAAU,
MAAM,KAAK,YAAY,QAAQ,GAAG;AAElD,UAAI;AACJ,UAAI;AACF,sBAAc,MAAM,KAAK,QAAQ,GAAG;AAAA,MACtC,SAAS,KAAK;AACZ,YAAI,KAAK,6BAA6B,EAAE,KAAK,OAAO,OAAO,GAAG,EAAE,CAAC;AACjE,gBAAQ;AACR;AAAA,MACF;AAEA,cAAQ;AAER,UAAI,YAAY,OAAO;AACrB,YAAI,KAAK,wBAAwB,EAAE,KAAK,OAAO,YAAY,MAAM,CAAC;AAClE;AAAA,MACF;AAEA,YAAM,OAAwB;AAAA,QAC5B,KAAK,YAAY;AAAA,QACjB,OAAO,YAAY;AAAA,QACnB,UAAU,YAAY;AAAA,QACtB;AAAA,MACF;AACA,YAAM,KAAK,IAAI;AAEf,UAAI,SAAU,OAAM,iBAAiB,IAAI;AAGzC,UAAI,QAAQ,UAAU;AACpB,cAAM,WAAW,KAAK,YAAY,YAAY,OAAO,YAAY,SAAS,MAAM,kBAAkB,MAAM,kBAAkB,YAAY;AAEtI,mBAAW,QAAQ,UAAU;AAC3B,kBAAQ,IAAI,kBAAkB,IAAI,CAAC;AACnC,gBAAM,KAAK,CAAC,MAAM,QAAQ,CAAC,CAAC;AAAA,QAC9B;AAEA,YAAI,MAAM,eAAe;AACvB,qBAAW,QAAQ,YAAY,OAAO;AACpC,qBAAS,KAAK,EAAE,MAAM,KAAK,IAAI,KAAK,CAAC;AAAA,UACvC;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAGA,WAAO;AAAA,MACL;AAAA,MACA,aAAa,QAAQ;AAAA,MACrB,SAAS,MAAM;AAAA,MACf,GAAI,MAAM,gBAAgB,EAAE,OAAO,SAAS,IAAI,CAAC;AAAA,IACnD;AAAA,EACF;AAAA,EAEQ,YACN,OACA,YACA,SACA,iBACA,iBACA,cACU;AACV,UAAM,WAAW,MAAM,OAAO,CAAC,SAAS;AACtC,UAAI;AACF,cAAM,SAAS,IAAI,IAAI,IAAI;AAC3B,YAAI,OAAO,WAAW,WAAY,QAAO;AACzC,YAAI,QAAQ,IAAI,kBAAkB,IAAI,CAAC,EAAG,QAAO;AACjD,YAAI,CAAC,gBAAgB,MAAM,iBAAiB,eAAe,EAAG,QAAO;AACrE,YAAI,gBAAgB,CAAC,aAAa,UAAU,OAAO,QAAQ,EAAG,QAAO;AACrE,eAAO;AAAA,MACT,QAAQ;AACN,eAAO;AAAA,MACT;AAAA,IACF,CAAC;AAGD,WAAO,SAAS,KAAK,CAAC,GAAG,MAAM;AAC7B,YAAM,OAAO,UAAU,CAAC,IAAI,IAAI;AAChC,YAAM,OAAO,UAAU,CAAC,IAAI,IAAI;AAChC,aAAO,OAAO;AAAA,IAChB,CAAC;AAAA,EACH;AAAA,EAEA,MAAM,aACJ,OACA,YACA,UACA,cACsB;AAEtB,UAAM,cAAc,MAAM,KAAK,oBAAoB,YAAY,KAAK,gBAAgB;AAEpF,QAAI,YAAY,WAAW,GAAG;AAC5B,UAAI,KAAK,uCAAuC;AAChD,aAAO,KAAK,eAAe,OAAO,YAAY,MAAM,aAAa,GAAG,UAAU,OAAO,YAAY;AAAA,IACnG;AAEA,WAAO,KAAK,sBAAsB,OAAO,aAAa,UAAU,YAAY;AAAA,EAC9E;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAOA,MAAc,sBACZ,OACA,MACA,UACA,cACsB;AACtB,UAAM,WAAW,KAAK;AAAA,MAAO,CAAC,QAC5B,gBAAgB,KAAK,MAAM,kBAAkB,MAAM,gBAAgB;AAAA,IACrE;AAEA,UAAM,aAAa,SAAS;AAC5B,UAAM,UAAU,SAAS,MAAM,GAAG,QAAQ;AAC1C,UAAM,QAA2B,CAAC;AAClC,UAAM,WAAuB,CAAC;AAC9B,UAAM,WAAW,kBAAkB;AAEnC,eAAW,OAAO,SAAS;AACzB,UAAI,MA
AM,UAAU,SAAU;AAE9B,UAAI,gBAAgB,CAAC,aAAa,UAAU,IAAI,IAAI,GAAG,EAAE,QAAQ,EAAG;AAEpE,YAAM,UAAU,MAAM,KAAK,YAAY,QAAQ,GAAG;AAElD,UAAI;AACF,cAAM,SAAS,MAAM,KAAK,QAAQ,GAAG;AACrC,gBAAQ;AAER,YAAI,CAAC,OAAO,OAAO;AACjB,gBAAM,OAAwB,EAAE,KAAK,OAAO,KAAK,OAAO,OAAO,OAAO,UAAU,OAAO,UAAU,OAAO,EAAE;AAC1G,gBAAM,KAAK,IAAI;AAEf,cAAI,SAAU,OAAM,iBAAiB,IAAI;AAEzC,cAAI,MAAM,eAAe;AACvB,uBAAW,QAAQ,OAAO,OAAO;AAC/B,uBAAS,KAAK,EAAE,MAAM,KAAK,IAAI,KAAK,CAAC;AAAA,YACvC;AAAA,UACF;AAAA,QACF;AAAA,MACF,SAAS,KAAK;AACZ,gBAAQ;AACR,YAAI,KAAK,wBAAwB,EAAE,KAAK,OAAO,OAAO,GAAG,EAAE,CAAC;AAAA,MAC9D;AAAA,IACF;AAEA,WAAO;AAAA,MACL;AAAA,MACA,aAAa;AAAA,MACb,SAAS,MAAM;AAAA,MACf,GAAI,MAAM,gBAAgB,EAAE,OAAO,SAAS,IAAI,CAAC;AAAA,IACnD;AAAA,EACF;AAAA,EAEA,MAAc,oBAAoB,QAAgB,WAAyD;AACzG,UAAM,mBAA6B,CAAC;AAGpC,QAAI,WAAW;AACb,uBAAiB,KAAK,GAAG,4BAA4B,SAAS,CAAC;AAAA,IACjE;AAGA,QAAI,iBAAiB,WAAW,GAAG;AACjC,uBAAiB,KAAK,GAAG,MAAM,cAAc;AAAA,IAC/C;AAEA,UAAM,UAAoB,CAAC;AAE3B,eAAW,cAAc,kBAAkB;AACzC,UAAI;AACF,cAAM,SAAS,MAAM,KAAK,WAAW,UAAU;AAC/C,YAAI,OAAO,eAAe,IAAK;AAG/B,cAAM,YAAY,kBAAkB,OAAO,IAAI;AAC/C,YAAI,UAAU,SAAS,GAAG;AAExB,qBAAW,UAAU,WAAW;AAC9B,gBAAI;AACF,oBAAM,YAAY,MAAM,KAAK,WAAW,MAAM;AAC9C,kBAAI,UAAU,eAAe,KAAK;AAChC,wBAAQ,KAAK,GAAG,aAAa,UAAU,IAAI,CAAC;AAAA,cAC9C;AAAA,YACF,QAAQ;AAAA,YAER;AAAA,UACF;AAAA,QACF,OAAO;AACL,kBAAQ,KAAK,GAAG,aAAa,OAAO,IAAI,CAAC;AAAA,QAC3C;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;AAEA,MAAM,oBAAoB,CAAC,UAAU,WAAW,SAAS,aAAa;AAEtE,SAAS,UAAU,KAAsB;AACvC,QAAM,OAAO,IAAI,IAAI,GAAG,EAAE,SAAS,YAAY;AAC/C,SAAO,kBAAkB,KAAK,OAAK,KAAK,SAAS,CAAC,CAAC;AACrD;","names":[]}
@@ -8,6 +8,7 @@ interface PageOutput {
8
8
  url: string;
9
9
  markdown: string;
10
10
  }
11
+ export declare function stripRepeatedNavigationLines(pages: PageInput[]): PageInput[];
11
12
  export declare function deduplicatePages(pages: PageInput[], domain?: string): PageOutput[];
12
13
  export declare function getStoredBoilerplate(domain: string): string[];
13
14
  export declare function storeBoilerplate(domain: string, hashes: string[]): void;
@@ -1 +1 @@
1
- {"version":3,"file":"dedup.d.ts","sourceRoot":"","sources":["../../src/crawl/dedup.ts"],"names":[],"mappings":"AAGA,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,EAAE,CA2B1D;AAED,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEvD;AAMD,UAAU,SAAS;IACjB,GAAG,EAAE,MAAM,CAAC;IACZ,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,UAAU,UAAU;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,SAAS,EAAE,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,UAAU,EAAE,CA+ClF;AAED,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CAI7D;AAED,wBAAgB,gBAAgB,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,IAAI,CAavE"}
1
+ {"version":3,"file":"dedup.d.ts","sourceRoot":"","sources":["../../src/crawl/dedup.ts"],"names":[],"mappings":"AAGA,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,EAAE,CA2B1D;AAED,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEvD;AAMD,UAAU,SAAS;IACjB,GAAG,EAAE,MAAM,CAAC;IACZ,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,UAAU,UAAU;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,QAAQ,EAAE,MAAM,CAAC;CAClB;AAWD,wBAAgB,4BAA4B,CAAC,KAAK,EAAE,SAAS,EAAE,GAAG,SAAS,EAAE,CAqC5E;AAED,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,SAAS,EAAE,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,UAAU,EAAE,CAiDlF;AAED,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CAI7D;AAED,wBAAgB,gBAAgB,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,IAAI,CAavE"}