@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,54 @@
1
+ /**
2
+ * Crawl checkpoint system for resume capability.
3
+ * Saves progress to a JSON file so interrupted crawls can continue.
4
+ */
5
+ export interface CrawlCheckpoint { // In-memory shape of one crawl checkpoint; saveCheckpoint() persists it as `<jobId>.json`
6
+ /** Unique crawl job ID (hash of start URL + options): generateJobId() returns the first 16 hex characters of a SHA-256 digest. Also used as the checkpoint file name. */
7
+ jobId: string;
8
+ /** Starting URL */
9
+ startUrl: string;
10
+ /** URLs already crawled (with their results). Stored on disk as a plain object and rebuilt into a Map by loadCheckpoint(). */
11
+ completed: Map<string, {
12
+ status: number;
13
+ contentLength: number;
14
+ timestamp: number;
15
+ }>;
16
+ /** URLs queued but not yet crawled */
17
+ pending: string[];
18
+ /** URLs discovered but not yet queued */
19
+ discovered: string[];
20
+ /** Crawl options (serialized) */
21
+ options: Record<string, any>;
22
+ /** When crawl started (NOTE(review): presumably epoch milliseconds, like lastCheckpoint; confirm against the crawler) */
23
+ startedAt: number;
24
+ /** Last checkpoint time; the saved file always carries Date.now() (epoch milliseconds) taken at save time */
25
+ lastCheckpoint: number;
26
+ /** Total pages target */
27
+ maxPages: number;
28
+ }
29
+ /**
30
+ * Generate a deterministic job ID from URL + options.
+ *
+ * Only `maxPages`, `maxDepth`, `includes` and `excludes` are read from
+ * `options`; any other key has no influence on the resulting ID.
+ *
+ * @param url - Start URL of the crawl.
+ * @param options - Crawl options; an empty object is used when omitted.
+ * @returns The first 16 hex characters of the SHA-256 digest of the
+ * JSON-encoded URL and options listed above.
31
+ */
32
+ export declare function generateJobId(url: string, options?: Record<string, any>): string;
33
+ /**
34
+ * Save a checkpoint to disk.
+ *
+ * Creates the checkpoint directory when needed and writes the checkpoint as
+ * pretty-printed JSON to `<jobId>.json`, with `completed` converted to a plain
+ * object and `lastCheckpoint` set to the time of the save.
+ *
+ * Best-effort: failures are swallowed and only reported through
+ * `console.debug` when the `DEBUG` environment variable is set.
+ *
+ * @param checkpoint - Crawl state to persist.
35
+ */
36
+ export declare function saveCheckpoint(checkpoint: CrawlCheckpoint): void;
37
+ /**
38
+ * Load a checkpoint from disk.
+ *
+ * @param jobId - ID of the crawl job whose `<jobId>.json` file should be read.
+ * @returns The stored checkpoint with `completed` rebuilt as a Map, or `null`
+ * when no file exists for `jobId` or the file cannot be read or parsed.
39
+ */
40
+ export declare function loadCheckpoint(jobId: string): CrawlCheckpoint | null;
41
+ /**
42
+ * Delete a checkpoint (crawl completed or abandoned).
+ *
+ * Removes the `<jobId>.json` file when it exists. Errors raised while
+ * checking for or removing the file are ignored.
+ *
+ * @param jobId - ID of the crawl job whose checkpoint should be removed.
43
+ */
44
+ export declare function deleteCheckpoint(jobId: string): void;
45
+ /**
46
+ * List all active checkpoints.
+ *
+ * Reads every `.json` file in the checkpoint directory and returns one summary
+ * per file; files that cannot be read or parsed are left out. `completed` and
+ * `pending` are entry counts, not the collections themselves.
+ *
+ * @returns The summaries, or an empty array when the checkpoint directory does
+ * not exist or cannot be listed.
47
+ */
48
+ export declare function listCheckpoints(): Array<{
49
+ jobId: string;
50
+ startUrl: string;
51
+ completed: number;
52
+ pending: number;
53
+ lastCheckpoint: number;
54
+ }>;
@@ -0,0 +1,104 @@
1
+ /**
2
+ * Crawl checkpoint system for resume capability.
3
+ * Saves progress to a JSON file so interrupted crawls can continue.
4
+ */
5
+ import { existsSync, readFileSync, writeFileSync, mkdirSync, unlinkSync, readdirSync, renameSync } from 'fs';
6
+ import { join } from 'path';
7
+ import { createHash } from 'crypto';
8
/**
 * Directory that holds one `<jobId>.json` checkpoint file per crawl.
 *
 * Resolved once, at module load, below the user's home directory. `HOME` is
 * normally not set on Windows, so `USERPROFILE` is consulted before falling
 * back to `/tmp`; without it Windows checkpoints would end up under `/tmp`
 * on the current drive instead of in the user's profile.
 */
const CHECKPOINT_DIR = join(process.env.HOME || process.env.USERPROFILE || '/tmp', '.webpeel', 'checkpoints');
9
+ /**
10
+ * Generate a deterministic job ID from URL + options.
11
+ */
12
+ export function generateJobId(url, options = {}) {
13
+ const key = JSON.stringify({
14
+ url,
15
+ maxPages: options.maxPages,
16
+ maxDepth: options.maxDepth,
17
+ includes: options.includes,
18
+ excludes: options.excludes,
19
+ });
20
+ return createHash('sha256').update(key).digest('hex').slice(0, 16);
21
+ }
22
/**
 * Get the checkpoint file path for a job.
 *
 * The job ID is used as a file name, so path separators are replaced with `_`
 * to keep the result inside CHECKPOINT_DIR even when the ID did not come from
 * generateJobId() (for example an ID passed straight to loadCheckpoint() or
 * deleteCheckpoint()). IDs produced by generateJobId() are plain hex and are
 * left untouched.
 *
 * @param {string} jobId Crawl job ID.
 * @returns {string} Path of `<jobId>.json` inside the checkpoint directory.
 */
function getCheckpointPath(jobId) {
    const fileStem = String(jobId).replace(/[\\/]/g, '_');
    return join(CHECKPOINT_DIR, `${fileStem}.json`);
}
28
/**
 * Save a checkpoint to disk.
 *
 * The checkpoint is serialised as pretty-printed JSON with `completed`
 * converted to a plain object and `lastCheckpoint` set to the current time.
 * The data is written to a temporary file first and then renamed over
 * `<jobId>.json`, so a crawl that is interrupted mid-save leaves the previous
 * checkpoint intact instead of a truncated, unparseable file.
 *
 * Best-effort: failures never propagate; they are reported through
 * `console.debug` only when the `DEBUG` environment variable is set.
 *
 * @param {object} checkpoint Crawl state to persist (see CrawlCheckpoint).
 * @returns {void}
 */
export function saveCheckpoint(checkpoint) {
    // Tracks a temp file that still needs cleaning up if something fails.
    let tmpPath = null;
    try {
        mkdirSync(CHECKPOINT_DIR, { recursive: true });
        const { completed } = checkpoint;
        const isIterable = completed != null && typeof completed[Symbol.iterator] === 'function';
        const data = {
            ...checkpoint,
            // JSON cannot encode a Map. A `completed` that is already a plain
            // object (e.g. taken from parsed JSON) is copied as-is rather than
            // making the whole save fail silently.
            completed: isIterable ? Object.fromEntries(completed) : { ...completed },
            lastCheckpoint: Date.now(),
        };
        const finalPath = getCheckpointPath(checkpoint.jobId);
        // The `.tmp` suffix keeps the file out of listCheckpoints(), which only
        // looks at names ending in `.json`.
        tmpPath = `${finalPath}.${process.pid}.tmp`;
        writeFileSync(tmpPath, JSON.stringify(data, null, 2));
        renameSync(tmpPath, finalPath);
        tmpPath = null;
    }
    catch (e) {
        if (tmpPath !== null) {
            try {
                unlinkSync(tmpPath);
            }
            catch { /* ignore: nothing more can be done about a stray temp file */ }
        }
        if (process.env.DEBUG) {
            console.debug('[webpeel]', 'Failed to save checkpoint:', e instanceof Error ? e.message : e);
        }
    }
}
47
/**
 * Load a checkpoint from disk.
 *
 * @param {string} jobId ID of the crawl job whose checkpoint should be read.
 * @returns {object|null} The stored checkpoint with `completed` rebuilt as a
 *   Map and `pending` / `discovered` guaranteed to be arrays, or `null` when
 *   the file is missing, unreadable, not valid JSON, or not a JSON object.
 */
export function loadCheckpoint(jobId) {
    const path = getCheckpointPath(jobId);
    try {
        // A missing file makes readFileSync throw, which lands in the catch
        // below, so no separate existence check is needed.
        const raw = JSON.parse(readFileSync(path, 'utf-8'));
        // Valid JSON that is not an object (null, a string, an array, ...)
        // cannot be a checkpoint; spreading it would yield a garbage result.
        if (raw === null || typeof raw !== 'object' || Array.isArray(raw)) {
            return null;
        }
        return {
            ...raw,
            completed: new Map(Object.entries(raw.completed || {})),
            // Same tolerance listCheckpoints() applies: a file without these
            // lists is treated as having empty ones.
            pending: Array.isArray(raw.pending) ? raw.pending : [],
            discovered: Array.isArray(raw.discovered) ? raw.discovered : [],
        };
    }
    catch {
        return null;
    }
}
65
/**
 * Remove the checkpoint file of a crawl that has finished or been abandoned.
 *
 * Nothing happens when the file does not exist, and any error raised while
 * checking for or unlinking it is deliberately ignored.
 *
 * @param {string} jobId ID of the crawl job whose checkpoint should be removed.
 * @returns {void}
 */
export function deleteCheckpoint(jobId) {
    const checkpointFile = getCheckpointPath(jobId);
    try {
        if (!existsSync(checkpointFile)) {
            return;
        }
        unlinkSync(checkpointFile);
    }
    catch {
        // Best-effort cleanup: a failed delete is not worth surfacing.
    }
}
77
/**
 * Summarise every checkpoint currently stored on disk.
 *
 * Each `.json` file in the checkpoint directory contributes one summary;
 * files that cannot be read or parsed are skipped.
 *
 * @returns {Array<object>} One `{ jobId, startUrl, completed, pending,
 *   lastCheckpoint }` entry per readable checkpoint, where `completed` and
 *   `pending` are counts. Empty when the directory is missing or unreadable.
 */
export function listCheckpoints() {
    try {
        if (!existsSync(CHECKPOINT_DIR)) {
            return [];
        }
        const summaries = [];
        for (const fileName of readdirSync(CHECKPOINT_DIR)) {
            if (!fileName.endsWith('.json')) {
                continue;
            }
            try {
                const stored = JSON.parse(readFileSync(join(CHECKPOINT_DIR, fileName), 'utf-8'));
                const { jobId, startUrl } = stored;
                const completedCount = Object.keys(stored.completed || {}).length;
                const pendingCount = (stored.pending || []).length;
                summaries.push({
                    jobId,
                    startUrl,
                    completed: completedCount,
                    pending: pendingCount,
                    lastCheckpoint: stored.lastCheckpoint,
                });
            }
            catch {
                // Unreadable or malformed checkpoint file: leave it out.
            }
        }
        return summaries;
    }
    catch {
        return [];
    }
}
@@ -0,0 +1,84 @@
1
+ /**
2
+ * Web crawler functionality
3
+ * Crawls a starting URL and follows links matching specified patterns
4
+ */
5
+ import type { PeelOptions } from '../types.js';
6
/**
 * Options accepted by `crawl`.
 *
 * Every `PeelOptions` field except `format` may also be supplied.
 */
export interface CrawlOptions extends Omit<PeelOptions, 'format'> {
    /** Maximum number of pages to crawl (default: 10, max: tier-dependent) */
    maxPages?: number;
    /** Tier for determining the max pages cap (default: 'free') */
    tier?: string;
    /** Maximum depth to crawl (default: 2, max: 5) */
    maxDepth?: number;
    /** Only crawl URLs from these domains (default: same domain as starting URL) */
    allowedDomains?: string[];
    /** Exclude URLs matching these patterns (regex strings) */
    excludePatterns?: string[];
    /** Respect robots.txt (default: true) */
    respectRobotsTxt?: boolean;
    /**
     * Delay between requests in milliseconds (default: 500ms = 2 req/sec).
     * Values below 100ms are raised to 100ms. When `respectRobotsTxt` is
     * enabled, a Crawl-delay found in robots.txt may override this value.
     */
    rateLimitMs?: number;
    /** Try sitemap.xml first to discover URLs (default: false) */
    sitemapFirst?: boolean;
    /** Crawl strategy: breadth-first or depth-first (default: 'bfs') */
    strategy?: 'bfs' | 'dfs';
    /** Skip duplicate content using fingerprinting (default: true) */
    deduplication?: boolean;
    /** Only crawl URLs matching these regex patterns */
    includePatterns?: string[];
    /** Progress callback called after each page */
    onProgress?: (status: CrawlProgress) => void;
    /** Per-page callback — receives the full result as soon as a page completes */
    onPage?: (result: CrawlResult) => void;
    /** Resume an interrupted crawl from its last checkpoint */
    resume?: boolean;
}
36
/** Snapshot of crawl state handed to `CrawlOptions.onProgress`. */
export interface CrawlProgress {
    /** Number of page results recorded so far (failed pages included). */
    crawled: number;
    /** Number of URLs still waiting in the crawl queue. */
    queued: number;
    /** Number of pages that failed to fetch so far. */
    failed: number;
    /** URL of the page that was just processed. */
    currentUrl: string;
    /** Milliseconds elapsed since the crawl started. */
    elapsed: number;
}
43
/** Outcome of crawling a single page (successful or failed). */
export interface CrawlResult {
    /** Address of the page this result describes */
    url: string;
    /** Title of the page (empty when the fetch failed) */
    title: string;
    /** Page content rendered as Markdown (empty when the fetch failed) */
    markdown: string;
    /** Number of tokens in this page's content */
    tokens: number;
    /** Absolute URLs of every link discovered on the page */
    links: string[];
    /** Link distance from the starting URL (the starting URL itself is 0) */
    depth: number;
    /** URL of the page that linked here; null for the starting URL */
    parent: string | null;
    /** Milliseconds spent fetching this page */
    elapsed: number;
    /** Failure description; present only when the page could not be fetched */
    error?: string;
    /** Hash of the page content, used to detect duplicate pages */
    fingerprint?: string;
}
65
/**
 * Crawl a site: fetch the starting page, then follow the links it exposes
 * within the limits given by `options`.
 *
 * @param startUrl - URL the crawl begins at
 * @param options - Limits, filters and callbacks controlling the crawl
 * @returns A promise resolving to one `CrawlResult` per processed page
 *
 * @example
 * ```typescript
 * import { crawl } from 'webpeel';
 *
 * const results = await crawl('https://example.com', {
 *   maxPages: 20,
 *   maxDepth: 2,
 * });
 *
 * console.log(`Crawled ${results.length} pages`);
 * ```
 */
export declare function crawl(startUrl: string, options?: CrawlOptions): Promise<CrawlResult[]>;
@@ -0,0 +1,349 @@
1
+ /**
2
+ * Web crawler functionality
3
+ * Crawls a starting URL and follows links matching specified patterns
4
+ */
5
+ import { peel } from '../index.js';
6
+ import { fetch as undiciFetch } from 'undici';
7
+ import { createHash } from 'crypto';
8
+ import { discoverSitemap } from './sitemap.js';
9
+ import { generateJobId, loadCheckpoint, saveCheckpoint, deleteCheckpoint, } from './crawl-checkpoint.js';
10
+ import { createLogger } from './logger.js';
11
+ const log = createLogger('crawler');
12
/**
 * Compile a user-supplied regular expression with basic safeguards.
 *
 * Patterns longer than 200 characters are rejected, and a syntactically
 * invalid pattern is reported with a uniform error instead of the engine's
 * own SyntaxError.
 *
 * @param pattern - Regex source string provided by the caller.
 * @returns The compiled RegExp.
 * @throws Error when the pattern is too long or does not compile.
 */
function safeRegex(pattern) {
    const length = pattern.length;
    if (length > 200) {
        throw new Error(`Regex pattern too long (${length} chars, max 200)`);
    }
    let compiled;
    try {
        compiled = new RegExp(pattern);
    }
    catch {
        throw new Error(`Invalid regex pattern: ${pattern}`);
    }
    return compiled;
}
25
/**
 * Upper bound on pages per crawl, keyed by tier name.
 * A requested `maxPages` is clamped to the entry for the caller's tier.
 */
const TIER_MAX_PAGES = {
    free: 10,
    starter: 25,
    pro: 50,
    enterprise: 100,
    max: 100,
    admin: 10000,
};
34
+ /**
35
+ * Parse robots.txt and return disallowed paths for User-agent: *
36
+ */
37
+ async function fetchRobotsTxt(domain) {
38
+ const robotsUrl = `https://${domain}/robots.txt`;
39
+ try {
40
+ const response = await undiciFetch(robotsUrl, {
41
+ headers: {
42
+ 'User-Agent': 'WebPeel/0.3.1 (+https://webpeel.dev)',
43
+ },
44
+ signal: AbortSignal.timeout(5000), // 5 second timeout
45
+ });
46
+ if (!response.ok) {
47
+ // If robots.txt doesn't exist, allow everything
48
+ return { disallowedPaths: [] };
49
+ }
50
+ const text = await response.text();
51
+ const lines = text.split('\n');
52
+ const disallowedPaths = [];
53
+ let crawlDelay;
54
+ let relevantSection = false;
55
+ for (const line of lines) {
56
+ const trimmed = line.trim();
57
+ // Check for User-agent: *
58
+ if (trimmed.toLowerCase().startsWith('user-agent:')) {
59
+ const agent = trimmed.substring('user-agent:'.length).trim();
60
+ relevantSection = agent === '*';
61
+ continue;
62
+ }
63
+ if (!relevantSection)
64
+ continue;
65
+ // Parse Disallow directives
66
+ if (trimmed.toLowerCase().startsWith('disallow:')) {
67
+ const path = trimmed.substring('disallow:'.length).trim();
68
+ if (path) {
69
+ disallowedPaths.push(path);
70
+ }
71
+ }
72
+ // Parse Crawl-delay directive
73
+ if (trimmed.toLowerCase().startsWith('crawl-delay:')) {
74
+ const delay = parseInt(trimmed.substring('crawl-delay:'.length).trim());
75
+ if (!isNaN(delay)) {
76
+ crawlDelay = delay * 1000; // Convert to milliseconds
77
+ }
78
+ }
79
+ }
80
+ return { disallowedPaths, crawlDelay };
81
+ }
82
+ catch {
83
+ // If we can't fetch robots.txt, allow everything
84
+ return { disallowedPaths: [] };
85
+ }
86
+ }
87
/**
 * Test whether `target` (URL path plus query string) is matched by one
 * robots.txt path pattern.
 *
 * `*` matches any run of characters and a trailing `$` anchors the pattern
 * to the end of the target; everything else is compared literally. A pattern
 * without specials is a plain prefix match.
 */
function matchesRobotsPattern(target, pattern) {
    const anchored = pattern.endsWith('$');
    const body = anchored ? pattern.slice(0, -1) : pattern;
    const segments = body.split('*');
    const head = segments[0];
    if (!target.startsWith(head)) {
        return false;
    }
    if (segments.length === 1) {
        return anchored ? target.length === head.length : true;
    }
    // Match the remaining literal segments left to right, always taking the
    // earliest occurrence so later segments keep as much room as possible.
    let cursor = head.length;
    for (let i = 1; i < segments.length; i++) {
        const segment = segments[i];
        const isLast = i === segments.length - 1;
        if (isLast && anchored) {
            return target.length - segment.length >= cursor && target.endsWith(segment);
        }
        const foundAt = target.indexOf(segment, cursor);
        if (foundAt === -1) {
            return false;
        }
        cursor = foundAt + segment.length;
    }
    return true;
}
/**
 * Check if a URL is allowed by robots.txt rules.
 *
 * Each entry of `rules.disallowedPaths` is matched against the URL's path
 * and query string, honouring the `*` wildcard and the `$` end anchor.
 * `Allow` rules are not considered.
 *
 * @param url - Absolute URL to test (an invalid URL makes `new URL` throw).
 * @param rules - Object carrying a `disallowedPaths` array of patterns.
 * @returns `false` when any disallow pattern matches, `true` otherwise.
 */
function isAllowedByRobots(url, rules) {
    const urlObj = new URL(url);
    // Robots rules apply to the path including the query string.
    const target = urlObj.pathname + urlObj.search;
    return !rules.disallowedPaths.some(pattern => matchesRobotsPattern(target, pattern));
}
101
/**
 * Crawl a website starting from a URL
 *
 * Pages are fetched one at a time. Each URL taken from the queue is checked
 * against the visited set, depth limit, allowed domains, include/exclude
 * patterns and (optionally) robots.txt before it is fetched. Failed pages
 * are recorded as results carrying an `error` and count towards `maxPages`.
 * A checkpoint is written every 5 results and removed when the crawl ends.
 *
 * @param startUrl - Starting URL to crawl from
 * @param options - Crawl options
 * @returns Array of crawl results
 * @throws Error when an include/exclude pattern is too long or invalid, and
 *         TypeError when `startUrl` is not a valid URL
 *
 * @example
 * ```typescript
 * import { crawl } from 'webpeel';
 *
 * const results = await crawl('https://example.com', {
 *   maxPages: 20,
 *   maxDepth: 2,
 * });
 *
 * console.log(`Crawled ${results.length} pages`);
 * ```
 */
export async function crawl(startUrl, options = {}) {
    const { maxPages = 10, tier, maxDepth = 2, allowedDomains, excludePatterns = [], respectRobotsTxt = true, rateLimitMs = 500, sitemapFirst = false, strategy = 'bfs', deduplication = true, includePatterns = [], resume = false, onProgress, onPage, ...peelOptions } = options;
    const crawlStartTime = Date.now();
    // Validate limits. Only own keys of the tier table are honoured so that a
    // tier such as "constructor" cannot pick up an inherited, non-numeric value.
    const tierKey = tier || 'free';
    const tierMaxPages = Object.prototype.hasOwnProperty.call(TIER_MAX_PAGES, tierKey)
        ? TIER_MAX_PAGES[tierKey]
        : TIER_MAX_PAGES.free;
    const validatedMaxPages = Math.min(Math.max(maxPages, 1), tierMaxPages);
    const validatedMaxDepth = Math.min(Math.max(maxDepth, 1), 5);
    const validatedRateLimit = Math.max(rateLimitMs, 100); // Min 100ms between requests
    // Parse starting URL
    const startUrlObj = new URL(startUrl);
    const startDomain = startUrlObj.hostname;
    // Default: only crawl same domain as starting URL
    const validatedAllowedDomains = allowedDomains && allowedDomains.length > 0
        ? allowedDomains
        : [startDomain];
    // Compile exclude patterns (length-capped and validated by safeRegex)
    const excludeRegexes = excludePatterns.map(pattern => safeRegex(pattern));
    // Compile include patterns (length-capped and validated by safeRegex)
    const includeRegexes = includePatterns.map(pattern => safeRegex(pattern));
    // Fetch robots.txt if needed
    let robotsRules = { disallowedPaths: [] };
    if (respectRobotsTxt) {
        robotsRules = await fetchRobotsTxt(startDomain);
        // Use crawl-delay from robots.txt if it's larger than our rate limit
        if (robotsRules.crawlDelay && robotsRules.crawlDelay > validatedRateLimit) {
            log.info(`Using Crawl-delay from robots.txt: ${robotsRules.crawlDelay}ms`);
        }
    }
    // The slower of the two wins: a small Crawl-delay must never speed the
    // crawl up beyond the caller's own rate limit.
    const effectiveRateLimit = Math.max(validatedRateLimit, robotsRules.crawlDelay || 0);
    // Checkpoint: generate a deterministic job ID for this crawl
    const crawlOptionsForCheckpoint = {
        maxPages: validatedMaxPages,
        maxDepth: validatedMaxDepth,
        includes: includePatterns,
        excludes: excludePatterns,
    };
    const jobId = generateJobId(startUrl, crawlOptionsForCheckpoint);
    // Load existing checkpoint if resume is requested
    const checkpoint = resume ? loadCheckpoint(jobId) : null;
    if (checkpoint) {
        log.info(`Resuming crawl from checkpoint: ${checkpoint.completed.size} pages already crawled`);
    }
    // State tracking
    const results = [];
    const visited = new Set();
    const contentFingerprints = new Set();
    let failedCount = 0;
    // Entries completed by an earlier run; carried into every checkpoint this
    // run writes so a second interruption does not forget them.
    const previouslyCompleted = checkpoint ? checkpoint.completed : new Map();
    // If resuming, mark already-completed URLs as visited. Their page content
    // is not stored in the checkpoint, so `results` only holds pages fetched
    // during this run.
    for (const [completedUrl] of previouslyCompleted) {
        visited.add(completedUrl);
    }
    const queue = [];
    // If resuming with pending URLs, restore queue; otherwise start from scratch
    if (checkpoint && checkpoint.pending.length > 0) {
        // The checkpoint keeps only URLs, so restored entries restart at depth 1.
        for (const pendingUrl of checkpoint.pending) {
            queue.push({ url: pendingUrl, depth: 1, parent: startUrl });
        }
    }
    else {
        queue.push({ url: startUrl, depth: 0, parent: null });
    }
    // Sitemap-first: Discover URLs from sitemap before crawling
    if (sitemapFirst) {
        try {
            const sitemap = await discoverSitemap(startDomain, { timeout: 10000, maxUrls: validatedMaxPages });
            for (const entry of sitemap.urls) {
                const entryUrl = entry.url;
                try {
                    const entryUrlObj = new URL(entryUrl);
                    if (validatedAllowedDomains.includes(entryUrlObj.hostname)) {
                        queue.push({ url: entryUrl, depth: 1, parent: startUrl });
                    }
                }
                catch { /* skip invalid URLs */ }
            }
        }
        catch { /* skip sitemap errors */ }
    }
    // Becomes true once the first page request has been issued.
    let hasFetched = false;
    while (queue.length > 0 && results.length < validatedMaxPages) {
        // Use DFS (stack) or BFS (queue) strategy
        const item = strategy === 'dfs' ? queue.pop() : queue.shift();
        const { url, depth, parent } = item;
        // Skip if already visited
        if (visited.has(url))
            continue;
        visited.add(url);
        // Skip if depth exceeded
        if (depth > validatedMaxDepth)
            continue;
        // Validate URL
        let urlObj;
        try {
            urlObj = new URL(url);
        }
        catch {
            continue; // Skip invalid URLs
        }
        // Check if domain is allowed
        if (!validatedAllowedDomains.includes(urlObj.hostname)) {
            continue;
        }
        // Check exclude patterns
        if (excludeRegexes.some(regex => regex.test(url))) {
            continue;
        }
        // Check include patterns
        if (includeRegexes.length > 0 && !includeRegexes.some(regex => regex.test(url))) {
            continue;
        }
        // Check robots.txt
        if (respectRobotsTxt && !isAllowedByRobots(url, robotsRules)) {
            log.debug(`Skipping ${url} (disallowed by robots.txt)`);
            continue;
        }
        // Rate limiting: wait before every page request except the first, so
        // the delay also applies after failed fetches and duplicate pages and
        // is never spent once there is nothing left to fetch.
        if (hasFetched) {
            await new Promise(resolve => setTimeout(resolve, effectiveRateLimit));
        }
        hasFetched = true;
        // Fetch the page
        try {
            const result = await peel(url, {
                ...peelOptions,
                format: 'markdown',
            });
            // Deduplication: compute content fingerprint
            let fingerprint;
            if (deduplication) {
                fingerprint = createHash('sha256').update(result.content).digest('hex');
                if (contentFingerprints.has(fingerprint)) {
                    // Skip duplicate content
                    continue;
                }
                contentFingerprints.add(fingerprint);
            }
            // Guard against a result without links so link discovery below
            // cannot throw after the page has already been recorded.
            const pageLinks = result.links ?? [];
            const crawlResult = {
                url: result.url,
                title: result.title,
                markdown: result.content,
                tokens: result.tokens ?? 0,
                links: pageLinks,
                depth,
                parent,
                elapsed: result.elapsed,
            };
            if (fingerprint) {
                crawlResult.fingerprint = fingerprint;
            }
            results.push(crawlResult);
            // Save checkpoint every 5 pages
            if (results.length % 5 === 0) {
                saveCheckpoint({
                    jobId,
                    startUrl,
                    completed: new Map([
                        ...previouslyCompleted,
                        ...results
                            .filter(r => !r.error)
                            .map(r => [r.url, { status: 200, contentLength: r.markdown.length, timestamp: Date.now() }]),
                    ]),
                    pending: queue.map(q => q.url),
                    discovered: [],
                    options: crawlOptionsForCheckpoint,
                    startedAt: crawlStartTime,
                    lastCheckpoint: Date.now(),
                    maxPages: validatedMaxPages,
                });
            }
            // Call per-page callback with full result
            if (onPage) {
                onPage(crawlResult);
            }
            // Call progress callback
            if (onProgress) {
                onProgress({
                    crawled: results.length,
                    queued: queue.length,
                    failed: failedCount,
                    currentUrl: url,
                    elapsed: Date.now() - crawlStartTime,
                });
            }
            // Add discovered links to queue
            if (depth < validatedMaxDepth) {
                for (const link of pageLinks) {
                    if (!visited.has(link)) {
                        queue.push({
                            url: link,
                            depth: depth + 1,
                            parent: url,
                        });
                    }
                }
            }
        }
        catch (error) {
            // Log error and continue
            failedCount++;
            const errorMessage = error instanceof Error ? error.message : 'Unknown error';
            log.error(`Failed to fetch ${url}: ${errorMessage}`);
            const errorResult = {
                url,
                title: '',
                markdown: '',
                tokens: 0,
                links: [],
                depth,
                parent,
                elapsed: 0,
                error: errorMessage,
            };
            results.push(errorResult);
            // Call per-page callback with error result
            if (onPage) {
                onPage(errorResult);
            }
            // Call progress callback even for failed pages
            if (onProgress) {
                onProgress({
                    crawled: results.length,
                    queued: queue.length,
                    failed: failedCount,
                    currentUrl: url,
                    elapsed: Date.now() - crawlStartTime,
                });
            }
        }
    }
    // Crawl complete — clean up checkpoint
    deleteCheckpoint(jobId);
    return results;
}
@@ -0,0 +1,27 @@
1
+ /**
2
+ * Cross-source verification — search multiple engines, compare results,
3
+ * compute consensus/confidence scores.
4
+ */
5
+ import type { WebSearchResult } from './search-provider.js';
6
/**
 * Outcome of a cross-source verification run: per-engine results plus the
 * consensus and confidence figures derived from comparing them.
 */
export interface CrossVerifyResult {
    /** The search query that was verified. */
    query: string;
    /** One entry per search engine consulted. */
    sources: {
        engine: string;
        resultCount: number;
        topResults: WebSearchResult[];
    }[];
    /** Results compared across engines (presumably one entry per distinct URL — confirm against the implementation). */
    consensus: {
        url: string;
        title: string;
        /** Names of the engines whose results contained this URL. */
        appearsIn: string[];
        agreementScore: number;
        averagePosition: number;
    }[];
    /** Overall confidence score (scale not visible from this declaration). */
    confidence: number;
    totalSources: number;
    /** Elapsed time for the run (presumably milliseconds — confirm). */
    elapsed: number;
}
24
/**
 * Run a query through several search engines and compare what they return.
 *
 * @param query - Search query to verify
 * @param options - Optional list of engines to consult and a result count
 * @returns A promise resolving to the cross-verification result
 */
export declare function crossVerifySearch(
    query: string,
    options?: {
        engines?: string[];
        count?: number;
    },
): Promise<CrossVerifyResult>;