@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,468 @@
1
+ /**
2
+ * Challenge / bot-protection page detection.
3
+ *
4
+ * Analyzes raw HTML (and optional HTTP status code) to determine whether the
5
+ * response is a bot-challenge or block page rather than real content.
6
+ *
7
+ * Design goals:
8
+ * - Fast: pure string/regex matching, no DOM parsing required
9
+ * - Low false-positive rate: uses confidence scoring, only flags at >= 0.7
10
+ * - No external dependencies
11
+ */
12
+ /* ---------- helpers ------------------------------------------------------ */
13
/**
 * Case-sensitive substring presence test.
 *
 * No case folding happens here. Callers that want a case-insensitive match
 * pass an already-lowercased haystack together with a lowercase needle;
 * callers that want an exact-case match (e.g. 'Press & Hold') pass raw HTML.
 *
 * @param html - Text to search (raw or pre-lowercased HTML).
 * @param needle - Exact substring to look for.
 * @returns True when `needle` occurs anywhere in `html`.
 */
function has(html, needle) {
    return html.includes(needle);
}
17
/**
 * Count how many of the supplied needles occur in the text.
 *
 * Each needle contributes at most 1 regardless of how often it appears, and
 * matching is case-sensitive.
 *
 * @param html - Text to search.
 * @param needles - Substrings to look for.
 * @returns Number of needles found at least once.
 */
function countMatches(html, needles) {
    return Array.from(needles).filter((candidate) => html.includes(candidate)).length;
}
26
/**
 * Pull the text of the first <title> element out of the HTML.
 *
 * @param html - HTML to inspect.
 * @returns The title text, lowercased and trimmed, or '' when no <title> is found.
 */
function extractTitle(html) {
    const found = /<title[^>]*>([^<]*)<\/title>/i.exec(html);
    if (found === null) {
        return '';
    }
    return found[1].toLowerCase().trim();
}
31
/**
 * Approximate how much human-visible text the HTML contains.
 *
 * Removes <script>, <style> and <noscript> elements (including their bodies),
 * then every remaining tag, collapses whitespace runs to a single space and
 * trims the result.
 *
 * @param html - HTML to measure.
 * @returns Length in characters of the remaining text.
 */
function estimateVisibleTextLength(html) {
    const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, '');
    const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, '');
    const withoutNoscript = withoutStyles.replace(/<noscript[\s\S]*?<\/noscript>/gi, '');
    const textOnly = withoutNoscript.replace(/<[^>]*>/g, '');
    const collapsed = textOnly.replace(/\s+/g, ' ').trim();
    return collapsed.length;
}
42
+ /* ---------- vendor-specific detectors ------------------------------------ */
43
/**
 * Score how closely the HTML resembles a Cloudflare challenge or block page.
 *
 * Signals are additive: Cloudflare-specific markup/script identifiers, a
 * challenge-style <title>, a "Ray ID" label, a `cdn-cgi/` path, and a 403/503
 * status that corroborates any of the former.
 *
 * @param html - HTML to inspect. Identifiers and the `cdn-cgi/` path are
 *   matched case-sensitively; the title and Ray ID checks ignore case.
 * @param statusCode - HTTP status code, if known.
 * @returns Confidence in the range 0..1.
 */
function detectCloudflare(html, statusCode) {
    let score = 0;
    // Strong signals — each adds a lot of weight (capped at three hits).
    const strongSignals = [
        'cf-browser-verification',
        'cf-turnstile',
        'cf-challenge',
        'cf-chl-widget',
        'challenge-running',
        'challenge-form',
        'window._cf_chl_opt',
        '__cf_chl_f_tk',
        'cf_chl_prog',
        'cf-spinner',
        'cf-error-overview',
    ];
    const strongCount = countMatches(html, strongSignals);
    score += Math.min(strongCount * 0.25, 0.75);
    // Title check
    const title = extractTitle(html);
    if (title.includes('just a moment') ||
        title.includes('attention required') ||
        title.includes('checking your browser') ||
        title.includes('one more step')) {
        score += 0.35;
    }
    // Ray ID is a Cloudflare-specific identifier. The word boundaries keep
    // ordinary text such as "array id" or "spray idea" from counting.
    if (/\bray\s+id\b/i.test(html)) {
        score += 0.2;
    }
    // Cloudflare's cdn-cgi path
    if (has(html, 'cdn-cgi/')) {
        score += 0.15;
    }
    // 403/503 only corroborates; it never scores on its own.
    if ((statusCode === 403 || statusCode === 503) && score > 0) {
        score += 0.2;
    }
    return Math.min(score, 1);
}
83
/**
 * Score how closely the HTML resembles a PerimeterX (HUMAN Security)
 * challenge or block page.
 *
 * Vendor identifiers are matched case-sensitively against the raw HTML;
 * human-readable phrases are matched against a lowercased copy.
 *
 * @param html - Raw HTML to inspect.
 * @param statusCode - HTTP status code, if known.
 * @returns Confidence in the range 0..1.
 */
function detectPerimeterX(html, statusCode) {
    const lowered = html.toLowerCase();
    let score = 0;

    // Vendor identifiers (cookies, globals, element ids).
    const vendorMarkers = [
        'perimeterx',
        '_pxhd',
        'px-captcha',
        '_pxCaptcha',
        'window._pxAppId',
        'window._pxUuid',
        'pxCaptcha',
        '_px3',
        '_pxvid',
        'human.security',
        'px-block',
    ];
    score += Math.min(countMatches(html, vendorMarkers) * 0.3, 0.8);

    // Expedia-style interstitial wording, matched case-insensitively.
    const interstitialPhrases = [
        'human or a bot',
        'show us your human side',
        'human-side',
        'bot or not',
    ];
    score += Math.min(countMatches(lowered, interstitialPhrases) * 0.25, 0.6);

    const title = extractTitle(lowered);
    const denialTitles = [
        'access denied',
        'has been denied',
        'access to this page',
        'please verify',
        'bot detection',
        'pardon our interruption',
        'bot or not',
    ];
    if (denialTitles.some((phrase) => title.includes(phrase))) {
        score += 0.15;
    }

    // "Press & Hold" challenge page (used by Zillow, etc.): the pairing of the
    // gesture prompt with a human-check phrase is much stronger than either alone.
    const pressAndHoldSeen = has(html, 'Press & Hold') ||
        has(html, 'Press &amp; Hold') ||
        has(lowered, 'press and hold');
    const humanPromptSeen = [
        'confirm you are human',
        'confirm you area human',
        'not a bot',
        'human or a bot',
        'show us your human side',
        'bot or not',
    ].some((phrase) => has(lowered, phrase));
    if (pressAndHoldSeen && humanPromptSeen) {
        score += 0.5;
    }
    else if (pressAndHoldSeen || humanPromptSeen) {
        score += 0.2;
    }

    // Block pages commonly print a long hex/dash "Reference ID".
    if (/reference\s+id[:\s]+[0-9a-f-]{20,}/i.test(html)) {
        score += 0.2;
    }

    // A 403 only corroborates; it never scores on its own.
    if (statusCode === 403 && score > 0) {
        score += 0.1;
    }
    return Math.min(score, 1);
}
139
/**
 * Score how closely the HTML resembles an Akamai Bot Manager block page.
 *
 * @param html - HTML to inspect; markers are matched case-sensitively.
 * @param statusCode - HTTP status code, if known.
 * @returns Confidence in the range 0..1.
 */
function detectAkamai(html, statusCode) {
    // NOTE(review): '__utmz' is the classic Google Analytics campaign cookie
    // name rather than an Akamai identifier — confirm it belongs in this list.
    const markers = [
        'ak_bmsc',
        '_abck',
        'bm_sz',
        'akamaized.net',
        'akamai',
        'bmak.',
        '__utmz',
        'akam/',
        'BotManagerSettings',
    ];
    let score = Math.min(countMatches(html, markers) * 0.2, 0.6);

    // Akamai often shows a short "Access Denied" page.
    const title = extractTitle(html);
    const denialTitle = ['access denied', 'forbidden'].some((phrase) => title.includes(phrase));
    if (denialTitle) {
        score += 0.2;
    }

    // The remaining signals only corroborate an existing score.
    const smallPage = html.length < 2000;
    if (smallPage && score > 0) {
        score += 0.15;
    }
    const blockedStatus = statusCode === 403 || statusCode === 503;
    if (blockedStatus && score > 0) {
        score += 0.1;
    }
    return Math.min(score, 1);
}
168
/**
 * Score how closely the HTML resembles a DataDome challenge page.
 *
 * All markers are lowercase and matched case-sensitively, so the caller is
 * expected to pass lowercased HTML (as detectChallenge does).
 *
 * @param html - Lowercased HTML to inspect.
 * @param _statusCode - Unused; kept so all detectors share one signature.
 * @returns Confidence in the range 0..1.
 */
function detectDataDome(html, _statusCode) {
    const markers = [
        'datadome',
        'dd.js',
        'datadome.co',
        'window.ddjskey',
        'ddjskey',
        'dd_referrer',
        'dd_cookie_test',
        'datadome/captcha',
        // DataDome's CAPTCHA delivery infrastructure (used by Etsy, FootLocker, etc.)
        'captcha-delivery.com',
        'geo.captcha-delivery.com',
    ];
    let score = Math.min(countMatches(html, markers) * 0.3, 0.9);

    // A short `var dd={...}` config object alongside the captcha-delivery host.
    const hasDdConfig = /\bvar\s+dd\s*=\s*\{/.test(html);
    if (hasDdConfig && html.includes('captcha-delivery')) {
        score += 0.4;
    }
    return Math.min(score, 1);
}
191
/**
 * Score how closely the HTML resembles an Imperva/Incapsula challenge page.
 *
 * The markers are lowercase and matched case-sensitively, so the caller is
 * expected to pass lowercased HTML (as detectChallenge does).
 *
 * @param html - Lowercased HTML to inspect.
 * @param _statusCode - Unused; kept so all detectors share one signature.
 * @returns Confidence in the range 0..1.
 */
function detectIncapsula(html, _statusCode) {
    const markers = [
        'incap_ses_',
        'visid_incap_',
        '_incap_',
        'imperva',
        'incapsula',
        'incapsula.com',
        'incapcookies',
        'reese84',
    ];
    let score = Math.min(countMatches(html, markers) * 0.3, 0.8);

    // "Requires JavaScript" interstitial. The mixed-case literal can only
    // match when the caller passes HTML that has not been lowercased.
    const jsRequiredNotice = has(html, 'This site requires JavaScript') ||
        has(html, 'requires javascript');
    if (jsRequiredNotice) {
        score += 0.15;
    }
    return Math.min(score, 1);
}
212
/**
 * Detect generic block/challenge pages that don't belong to a specific vendor.
 *
 * Uses multiple weak signals and requires several of them to fire before
 * flagging — this avoids false positives from pages that merely mention
 * these terms in article content.
 *
 * All phrase lists are lowercase and matched case-sensitively, so the caller
 * is expected to pass lowercased HTML (as detectChallenge does).
 *
 * @param html - Lowercased HTML to inspect.
 * @param statusCode - HTTP status code, if known.
 * @returns Confidence in the range 0..1.
 */
function detectGenericBlock(html, statusCode) {
    let score = 0;
    // Title signals (strong)
    const title = extractTitle(html);
    const blockTitles = [
        'access denied',
        'has been denied',
        'has been blocked',
        'access to this page',
        '403 forbidden',
        'bot detected',
        'verify you are human',
        'security check',
        'ddos protection',
        'rate limit exceeded',
        'too many requests',
        'captcha required',
        'robot check',
        'unusual traffic',
        'automated access',
        'browser check',
        'human verification',
        'blocked by',
        'pardon our interruption',
        'bot or not',
        'blocked',
        'verification required',
        'are you a robot',
    ];
    // Only count once from the title, however many phrases it contains.
    if (blockTitles.some((phrase) => title.includes(phrase))) {
        score += 0.35;
    }
    // Body signals — but require multiple (to avoid false positives from blog posts)
    const bodySignals = [
        'automated access',
        'suspicious activity',
        'rate limit',
        'bot detected',
        'verify you are human',
        'verify that you are human',
        'confirm you are human',
        'confirm you area human', // known PerimeterX typo in the wild
        'are you a robot',
        'are you human',
        'not a bot',
        'and not a bot',
        'press & hold',
        'press &amp; hold', // HTML-encoded form of the same prompt
        'press and hold',
        'ddos protection by',
        'please complete the security check',
        'this page checks to see if it',
        'prove you are human',
        'security challenge',
        'enable javascript and cookies',
        'javascript and cookies to continue',
        'enable cookies',
        'reference id', // PerimeterX block pages include a Reference ID
        'why have i been blocked',
        'your access has been blocked',
        'detected unusual activity',
        // New patterns for additional challenge pages
        'human or a bot',
        'show us your human side',
        'bot or not',
        'complete a captcha',
        'solve this puzzle',
        'verify your identity',
        'unusual traffic',
        'too many requests',
        'access denied',
        'automated traffic',
        'we need to verify',
        'human verification',
        'browser verification',
        'checking your browser',
        'please wait while we verify',
        'blocked by',
    ];
    const bodyCount = countMatches(html, bodySignals);
    // Require at least 2 body signals to avoid flagging a blog post mentioning one
    if (bodyCount >= 2) {
        score += Math.min((bodyCount - 1) * 0.15, 0.4);
    }
    else if (bodyCount === 1 && title.length === 0) {
        // Single body signal + no title = weak signal only
        score += 0.05;
    }
    // Very short response with an error status
    if (html.length < 1000 && (statusCode === 403 || statusCode === 503 || statusCode === 429)) {
        score += 0.25;
        // Tiny pages (< 500 chars) with a block status are almost certainly block pages
        if (html.length < 500) {
            score += 0.15;
        }
    }
    // Meta refresh to a captcha/challenge URL — this only happens on challenge
    // interstitials. The keyword must appear inside the refresh tag itself:
    // this rule alone exceeds the detection threshold, so an unrelated
    // auto-refresh tag on a page that mentions "challenge" elsewhere must not
    // trigger it.
    const metaRefreshTags = html.match(/<meta\b[^>]*http-equiv\s*=\s*["']?refresh\b[^>]*>/gi) || [];
    if (metaRefreshTags.some((tag) => /captcha|challenge/i.test(tag))) {
        score += 0.75;
    }
    // Page is almost entirely a form with nothing else (login-wall-adjacent).
    // To avoid flagging actual login pages, only trigger when combined with
    // other signals.
    if (score > 0.2) {
        const formOnly = html.length < 3000 &&
            (html.match(/<form/gi) || []).length > 0 &&
            estimateVisibleTextLength(html) < 150;
        if (formOnly) {
            score += 0.15;
        }
    }
    // HTTP 429 on its own is a strong rate-limit signal
    if (statusCode === 429) {
        score += 0.25;
    }
    // A page that is mostly/entirely an iframe to a captcha service
    // (short HTML + iframe with captcha in src/title)
    if (html.length < 2000 &&
        /iframe[^>]*captcha/i.test(html) &&
        (statusCode === 403 || statusCode === 503 || statusCode === 429)) {
        score += 0.5;
    }
    return Math.min(score, 1);
}
346
/**
 * Detect SPA shells — a large HTML payload with almost no visible text, as
 * returned by JS-rendered sites when no JavaScript has been executed.
 *
 * @param html - Raw HTML to inspect.
 * @param _statusCode - Unused; kept so all detectors share one signature.
 * @returns 0 when the page is small (< 2000 chars) or has at least 200 chars
 *   of visible text; otherwise a confidence between 0.65 and 1.
 */
function detectEmptyShell(html, _statusCode) {
    // A small payload is just a small page, not a shell.
    const isSubstantialPayload = html.length >= 2000;
    if (!isSubstantialPayload) {
        return 0;
    }
    // Enough visible text means the page rendered real content.
    if (estimateVisibleTextLength(html) >= 200) {
        return 0;
    }

    // Base confidence for a shell.
    let score = 0.65;

    // Known SPA mount points, empty or merely present.
    const mountPoints = [
        '<div id="root"></div>',
        '<div id="root"> </div>',
        '<div id="app"></div>',
        '<div id="app"> </div>',
        '<div id="__next"></div>',
        '<div id="__next"> </div>',
        '<div id="gatsby-focus-wrapper"></div>',
        '<div id="___gatsby"></div>',
        'id="root"', // weaker — just presence of root
        'id="__next"', // Next.js
    ];
    const mountHits = countMatches(html, mountPoints);
    if (mountHits > 0) {
        score += Math.min(mountHits * 0.1, 0.2);
    }

    // Many script tags on a page with almost no text.
    const scriptTags = html.match(/<script/gi) || [];
    if (scriptTags.length >= 3) {
        score += 0.1;
    }
    return Math.min(score, 1);
}
383
+ /* ---------- false-positive guards --------------------------------------- */
384
/**
 * False-positive guard: decide whether the HTML carries enough visible text
 * to be legitimate content that merely mentions security/captcha terms
 * (e.g. a blog post ABOUT CAPTCHAs).
 *
 * @param html - Raw HTML to inspect.
 * @returns True when visible text exceeds 1500 chars, or exceeds 600 chars
 *   on a page whose HTML is longer than 5000 chars.
 */
function looksLikeRealContent(html) {
    const visibleChars = estimateVisibleTextLength(html);
    const plentyOfText = visibleChars > 1500;
    const decentTextOnLargePage = visibleChars > 600 && html.length > 5000;
    return plentyOfText || decentTextOnLargePage;
}
398
/**
 * False-positive guard: recognise an ordinary 404 page so that its short
 * content is not mistaken for a block page.
 *
 * @param html - Raw HTML to inspect.
 * @param statusCode - HTTP status code, if known.
 * @returns True only when the status is 404 and the <title> reads like a
 *   not-found page.
 */
function looksLike404(html, statusCode) {
    if (statusCode !== 404) {
        return false;
    }
    const title = extractTitle(html);
    const notFoundPhrases = ['not found', '404', 'page not found', 'error 404'];
    return notFoundPhrases.some((phrase) => title.includes(phrase));
}
411
+ /* ---------- main export -------------------------------------------------- */
412
/**
 * Detect whether an HTML response is a bot-challenge or block page.
 *
 * Runs every detector, keeps the highest score (the first detector wins
 * ties), and reports a challenge only when that score reaches 0.7 and no
 * false-positive guard applies.
 *
 * @param html - Raw HTML response body.
 * @param statusCode - HTTP status code (optional but improves accuracy).
 * @returns An object with `isChallenge` and `confidence`, plus `type` when a
 *   challenge is detected and `details` when there is an explanation to give.
 */
export function detectChallenge(html, statusCode) {
    const THRESHOLD = 0.7;

    // Sanity — empty input
    if (!html || html.length === 0) {
        return { isChallenge: false, confidence: 0 };
    }

    // A plain 404 page is never reported as a challenge.
    if (looksLike404(html, statusCode)) {
        return { isChallenge: false, confidence: 0, details: '404 page' };
    }

    // Scoring still runs when this guard fires, so the suppressed result can
    // report a reduced confidence.
    const realContent = looksLikeRealContent(html);

    // Lowercased copy for detectors whose markers are all lowercase.
    const htmlLower = html.toLowerCase();

    const candidates = [
        { type: 'cloudflare', score: detectCloudflare(html, statusCode) },
        { type: 'perimeterx', score: detectPerimeterX(html, statusCode) },
        { type: 'akamai', score: detectAkamai(html, statusCode) },
        { type: 'datadome', score: detectDataDome(htmlLower, statusCode) },
        { type: 'incapsula', score: detectIncapsula(htmlLower, statusCode) },
        { type: 'generic-block', score: detectGenericBlock(htmlLower, statusCode) },
        { type: 'empty-shell', score: detectEmptyShell(html, statusCode) },
    ];

    // Highest scoring detector; a strict comparison keeps the earliest on ties.
    const best = candidates.reduce((leader, entry) => (entry.score > leader.score ? entry : leader));

    // If the real-content guard fired, suppress non-empty-shell challenges
    // (a blog post about Cloudflare can mention cf patterns in quoted code blocks).
    if (realContent && best.type !== 'empty-shell') {
        return {
            isChallenge: false,
            confidence: best.score * 0.4,
            details: 'Suppressed: page has substantial real content',
        };
    }

    if (best.score < THRESHOLD) {
        return { isChallenge: false, confidence: best.score };
    }

    return {
        isChallenge: true,
        type: best.type,
        confidence: best.score,
        details: `Detected as ${best.type} (confidence ${best.score.toFixed(2)})`,
    };
}
@@ -0,0 +1,75 @@
1
+ /**
2
+ * Local-first content change tracking
3
+ * Stores snapshots in ~/.webpeel/snapshots/ and provides diffing
4
+ */
5
/** A stored capture of a URL's content, used as the baseline for change tracking. */
export interface Snapshot {
    /** URL the content was captured from. */
    url: string;
    /** Fingerprint of `content`. */
    fingerprint: string;
    /** The captured content. */
    content: string;
    /** Capture time as a number accepted by `new Date(...)` — presumably epoch milliseconds; confirm against the implementation. */
    timestamp: number;
    /** Optional free-form data stored alongside the snapshot. */
    metadata?: Record<string, any>;
}
12
/** Outcome of comparing current content against the previously stored snapshot. */
export interface ChangeResult {
    /** How the current content relates to the previous snapshot. */
    changeStatus: 'new' | 'same' | 'changed' | 'removed';
    /** When the previous snapshot was taken, as a string, or `null` when there is none. */
    previousScrapeAt: string | null;
    /** Line-level diff; optional, so check for its presence before reading it. */
    diff?: {
        /** The diff rendered as text. */
        text: string;
        /** Number of added lines. */
        additions: number;
        /** Number of deleted lines. */
        deletions: number;
        /** One entry per diff line. */
        changes: Array<{
            type: 'add' | 'del' | 'normal';
            line: number;
            content: string;
        }>;
    };
}
26
+ /**
27
+ * Get a snapshot for a URL
28
+ *
29
+ * @param url - URL to get snapshot for
30
+ * @returns Snapshot if exists, null otherwise
31
+ *
32
+ * @example
33
+ * ```typescript
34
+ * const snapshot = await getSnapshot('https://example.com');
35
+ * if (snapshot) {
36
+ * console.log('Last scraped:', new Date(snapshot.timestamp));
37
+ * }
38
+ * ```
39
+ */
40
+ export declare function getSnapshot(url: string): Promise<Snapshot | null>;
41
+ /**
42
+ * Track content changes for a URL
43
+ * Compares with previous snapshot and saves new one
44
+ *
45
+ * @param url - URL being tracked
46
+ * @param content - Current content
47
+ * @param fingerprint - Content fingerprint (SHA256 hash)
48
+ * @returns Change detection result
49
+ *
50
+ * @example
51
+ * ```typescript
52
+ * const result = await trackChange('https://example.com', content, fingerprint);
53
+ * if (result.changeStatus === 'changed') {
54
+ * console.log('Content changed!');
55
+ * console.log(`+${result.diff.additions} -${result.diff.deletions}`);
56
+ * }
57
+ * ```
58
+ */
59
+ export declare function trackChange(url: string, content: string, fingerprint: string): Promise<ChangeResult>;
60
+ /**
61
+ * Clear snapshots matching a URL pattern
62
+ *
63
+ * @param urlPattern - Optional regex pattern to match URLs (if not provided, clears all)
64
+ * @returns Number of snapshots cleared
65
+ *
66
+ * @example
67
+ * ```typescript
68
+ * // Clear all snapshots
69
+ * const count = await clearSnapshots();
70
+ *
71
+ * // Clear specific domain
72
+ * const count = await clearSnapshots('example\\.com');
73
+ * ```
74
+ */
75
+ export declare function clearSnapshots(urlPattern?: string): Promise<number>;