@iflow-mcp/jakeliume-webpeel 0.22.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (547):
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,809 @@
1
+ /**
2
+ * HTML to Markdown conversion with smart cleanup
3
+ */
4
+ import TurndownService from 'turndown';
5
+ import { gfm } from 'turndown-plugin-gfm';
6
+ import * as cheerio from 'cheerio';
7
+ import { pruneContent } from './content-pruner.js';
8
/**
 * CSS selectors for page chrome that cleanHTML() removes before conversion.
 * Selectors are grouped by purpose and flattened into a single ordered list.
 */
const JUNK_SELECTORS = [
    // Scripts, styles, metadata
    ['script', 'style', 'noscript', 'iframe', 'link[rel="stylesheet"]'],
    // Navigation
    [
        'nav', '[role="navigation"]', '[role="search"]',
        '.sidebar', '.topbar', '.top-bar', '.site-nav', '.main-nav',
        '.breadcrumb', '.breadcrumbs', '[class*="breadcrumb"]',
        '.pagination', '[class*="pagination"]',
    ],
    // Ads & tracking
    // NOTE(review): '[class*="ad-"]' / '[id*="ad-"]' are substring matches, so they
    // also hit values such as "head-..." or "thread-..." — confirm this is intended.
    [
        '.advertisement', '.ad', '[class*="ad-"]', '[id*="ad-"]',
        '[class*="advert"]', '[class*="sponsor"]', '[class*="promo"]',
    ],
    // Cookie & consent
    [
        '.cookie-banner', '.cookie-notice', '.cookie-consent',
        '[class*="cookie"]', '[id*="cookie"]',
        '[class*="consent"]', '[class*="gdpr"]',
    ],
    // Popups, modals (precise selectors — no broad banner/overlay)
    ['[class*="popup"]', '[class*="modal"]', '[class*="notification-bar"]'],
    // Banners — only known ad/promo banners
    ['.ad-banner', '.promo-banner'],
    // Social & sharing — only sharing widgets
    ['.social-share', '.share-buttons', '.share-widget'],
    // Newsletter & CTA — only forms/widgets
    [
        '.newsletter-signup', '[class*="newsletter"]',
        '.subscribe-form', '.subscribe-widget',
        '.signup-form', '.signup-widget', '.signup-cta',
        '[class*="call-to-action"]',
    ],
    // Related content — only explicit widgets
    ['.related-posts', '[class*="you-may-also"]', '[class*="more-stories"]'],
    // Comments — only sections/forms, not comment text
    ['.comments-section', '.comment-form', '#comments'],
    // Job site CTAs — resume upload prompts, apply nudges, sign-in gates
    [
        '[class*="resume-upload"]', '[class*="resumeUpload"]',
        '[class*="job-alert"]', '[class*="jobAlert"]',
        '[class*="sign-in-gate"]', '[class*="signin-prompt"]',
    ],
    // Login/auth gates (specific patterns to avoid matching "navigate", "aggregate", etc.)
    [
        '[class*="login-wall"]', '[class*="paywall"]', '[class*="signin-gate"]',
        '[class*="login-gate"]', '[class*="access-gate"]', '[class*="content-gate"]',
        '[class*="registration-wall"]', '.login-prompt', '.auth-wall',
    ],
    // Chat widgets
    [
        '[class*="chat-widget"]', '[class*="chatbot"]', '[class*="intercom"]',
        '[class*="drift-"]', '[class*="zendesk"]', '[class*="crisp"]',
        '[class*="hubspot"]', '#hubspot-messages-iframe-container',
    ],
    // Skip links
    ['.skip-to-content', '.skip-link', '.skip-nav'],
].flat();
54
/**
 * Selectors bucketed by reporting category, used by countRemovedElements()
 * to tally how many elements of each kind a page contains.
 */
const STATS_SELECTORS = {
    scripts: [
        'script',
        'noscript',
    ],
    styles: [
        'style',
        'link[rel="stylesheet"]',
    ],
    ads: [
        '.advertisement',
        '.ad',
        '[class*="ad-"]',
        '[id*="ad-"]',
        '[class*="advert"]',
        '[class*="sponsor"]',
        '[class*="promo"]',
        '.ad-banner',
        '.promo-banner',
    ],
    tracking: [
        'iframe',
        '.cookie-banner',
        '.cookie-notice',
        '.cookie-consent',
        '[class*="cookie"]',
        '[id*="cookie"]',
        '[class*="consent"]',
        '[class*="gdpr"]',
    ],
    navigation: [
        'nav',
        '[role="navigation"]',
        '[role="search"]',
        '.sidebar',
        '.topbar',
        '.top-bar',
        '.site-nav',
        '.main-nav',
        '.breadcrumb',
        '.breadcrumbs',
        '[class*="breadcrumb"]',
        '.pagination',
        '[class*="pagination"]',
    ],
    socialWidgets: [
        '.social-share',
        '.share-buttons',
        '.share-widget',
        '.newsletter-signup',
        '[class*="newsletter"]',
        '.subscribe-form',
        '.subscribe-widget',
        '.signup-form',
        '.signup-widget',
        '.signup-cta',
        '[class*="chat-widget"]',
        '[class*="chatbot"]',
        '[class*="intercom"]',
        '[class*="drift-"]',
        '[class*="zendesk"]',
        '[class*="crisp"]',
        '[class*="hubspot"]',
    ],
    popups: [
        '[class*="popup"]',
        '[class*="modal"]',
        '[class*="notification-bar"]',
    ],
};
87
/**
 * Tally, per STATS_SELECTORS category, how many distinct elements of the given
 * HTML match that category's selectors. Intended to be called on the raw HTML
 * before cleaning so the numbers describe what is about to be stripped.
 *
 * An element matched by several selectors is counted once, in the first
 * category (in the order below) that reaches it.
 *
 * @param html - Raw HTML string to inspect
 * @returns Per-category counts, their sum as `totalRemoved`, the UTF-8 byte
 *   size of the input, and `cleanedSizeBytes` / `reductionPercent` placeholders
 *   (both 0) that the caller fills in after cleaning
 */
export function countRemovedElements(html) {
    const $ = cheerio.load(html);
    // Nodes already attributed to a category; prevents double-counting
    const alreadyCounted = new Set();
    const tally = (selectors) => {
        let matches = 0;
        for (const selector of selectors) {
            try {
                for (const node of $(selector).toArray()) {
                    if (alreadyCounted.has(node)) {
                        continue;
                    }
                    alreadyCounted.add(node);
                    matches += 1;
                }
            }
            catch {
                // Ignore invalid selectors
            }
        }
        return matches;
    };
    // Order matters: earlier categories claim shared elements first
    const categories = ['scripts', 'styles', 'ads', 'tracking', 'navigation', 'socialWidgets', 'popups'];
    const counts = {};
    let totalRemoved = 0;
    for (const category of categories) {
        counts[category] = tally(STATS_SELECTORS[category]);
        totalRemoved += counts[category];
    }
    return {
        ...counts,
        totalRemoved,
        originalSizeBytes: Buffer.byteLength(html, 'utf8'),
        cleanedSizeBytes: 0, // set by caller after cleaning
        reductionPercent: 0, // set by caller after cleaning
    };
}
134
/**
 * Narrow an HTML document to (or away from) specific selectors before it is
 * converted to markdown.
 *
 * Exclusions run first, so an included element never contains excluded
 * descendants. When `includeTags` is non-empty the result is the outer HTML of
 * every match, in selector order, joined by newlines — or an empty string when
 * nothing matches. An element matched by more than one include selector is
 * emitted once per match.
 *
 * @param html - HTML to filter
 * @param includeTags - Selectors to keep (e.g. ['article', 'main', '.content'])
 * @param excludeTags - Selectors to drop (e.g. ['nav', 'footer', '.sidebar'])
 * @returns Filtered HTML
 */
export function filterByTags(html, includeTags, excludeTags) {
    const $ = cheerio.load(html);
    // Drop unwanted elements up front
    if (excludeTags?.length) {
        excludeTags.forEach((selector) => {
            $(selector).remove();
        });
    }
    // No include list: hand back the whole (possibly trimmed) document
    if (!includeTags?.length) {
        return $.html();
    }
    // Gather every element matched by any include selector
    const kept = [];
    includeTags.forEach((selector) => {
        $(selector).each((_, node) => {
            kept.push($(node));
        });
    });
    // Include list given but nothing matched → empty result
    if (kept.length === 0) {
        return '';
    }
    return kept.map((wrapped) => $.html(wrapped)).join('\n');
}
173
/**
 * Extract the elements matching a CSS selector.
 *
 * @param html - HTML to search
 * @param selector - CSS selector whose matches are returned
 * @param exclude - Optional selectors removed before `selector` is applied
 * @returns Outer HTML of all matches joined by newlines; if the selector
 *   matches nothing, the original `html` argument is returned unchanged
 *   (exclusions are NOT applied in that fallback)
 */
export function selectContent(html, selector, exclude) {
    const $ = cheerio.load(html);
    // Strip excluded elements before selecting
    if (exclude?.length) {
        exclude.forEach((unwanted) => $(unwanted).remove());
    }
    const matched = $(selector);
    if (matched.length > 0) {
        return matched
            .toArray()
            .map((node) => $.html(node))
            .join('\n');
    }
    // Nothing matched: fall back to the untouched input
    return html;
}
192
/**
 * Escape a string for re-insertion into generated HTML, either as text content
 * or inside a double-quoted attribute value.
 */
function escapeHtml(value) {
    return String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');
}
/**
 * Read a table cell's colspan. Missing, non-numeric, zero or negative values
 * count as 1 so a malformed attribute never drops the cell.
 */
function readColspan($cell) {
    const span = Number.parseInt($cell.attr('colspan') || '1', 10);
    return Number.isFinite(span) && span >= 1 ? span : 1;
}
/**
 * Clean HTML before conversion: remove navigation, ads, cookie banners and
 * other junk, then rewrite tables into shapes Turndown's GFM plugin can handle.
 *
 * Text pulled out of the DOM with `.text()` / `.attr()` is entity-decoded, so
 * it is escaped again before being spliced into replacement markup; otherwise
 * cell text such as "<b>" would be re-parsed as a tag.
 *
 * @param html - Raw HTML string
 * @returns Cleaned HTML (serialized full document)
 * @throws Error when `html.length` exceeds 10 * 1024 * 1024
 */
function cleanHTML(html) {
    // SECURITY: Limit HTML size to prevent DoS
    if (html.length > 10 * 1024 * 1024) { // 10MB
        throw new Error('HTML too large to process (max 10MB)');
    }
    const $ = cheerio.load(html);
    // Remove junk elements
    JUNK_SELECTORS.forEach((selector) => {
        $(selector).remove();
    });
    // Conditionally remove header/footer — keep if they have substantial content (>=200 chars)
    $('header, [role="banner"]').each((_, el) => {
        const text = $(el).text().trim();
        if (text.length < 200)
            $(el).remove();
    });
    $('footer, [role="contentinfo"]').each((_, el) => {
        const text = $(el).text().trim();
        if (text.length < 200)
            $(el).remove();
    });
    // Only remove sidebar-like asides, not all aside elements
    $('aside.sidebar, aside[role="complementary"], aside[class*="sidebar"]').remove();
    // Convert layout tables to clean divs before Turndown runs.
    // Layout tables (HN, old Reddit, email HTML etc.) use <table> for positioning,
    // not data — GFM's table plugin fails on them and emits raw HTML.
    // Detection: has presentation attributes OR contains nested <table> OR no <th>,
    // AND is small (fewer than 3 rows and fewer than 3 columns).
    $('table').each((_, tableEl) => {
        const $table = $(tableEl);
        const hasBorder = $table.attr('border') !== undefined;
        const hasCellpadding = $table.attr('cellpadding') !== undefined;
        const hasBgcolor = $table.attr('bgcolor') !== undefined;
        const hasRolePresentation = $table.attr('role') === 'presentation';
        const hasNestedTable = $table.find('table').length > 0;
        const hasTh = $table.find('th').length > 0;
        // Count rows and columns to distinguish data tables from layout tables.
        const rowCount = $table.find('tr').length;
        const maxCols = Math.max(0, ...$table.find('tr').toArray().map(r => $(r).children('td, th').length));
        // Keep data tables: those with 3+ rows OR 3+ columns are likely real data
        // even if they lack <th>. Only strip tables that are clearly decorative.
        const isDataBySize = rowCount >= 3 || maxCols >= 3;
        const isLayoutTable = (hasBorder || hasCellpadding || hasBgcolor || hasRolePresentation || hasNestedTable || !hasTh) && !isDataBySize;
        if (!isLayoutTable)
            return;
        // Extract: links (as anchors) + non-empty text from each <td>
        const lines = [];
        $table.find('td').each((_i, td) => {
            const $td = $(td);
            // Preserve links found in this cell
            $td.find('a').each((_j, a) => {
                const $a = $(a);
                const href = $a.attr('href');
                const label = $a.text().trim();
                if (label && href)
                    lines.push(`<a href="${escapeHtml(href)}">${escapeHtml(label)}</a>`);
            });
            // Add the cell's text only when it is substantial and the cell has no links
            const nonLinkText = $td.clone().find('a').remove().end().text().trim();
            if (nonLinkText.length > 10 && !$td.find('a').length) {
                lines.push(`<p>${escapeHtml(nonLinkText)}</p>`);
            }
        });
        const replacement = `<div>${lines.join('\n')}</div>`;
        $table.replaceWith(replacement);
    });
    // Convert complex data tables to clean markdown-ready format.
    // Turndown's GFM plugin fails on tables with colspan/rowspan, missing <thead>,
    // or too many columns. Detect these and convert to readable text pre-Turndown.
    const ROW_CAP = 50;
    $('table').each((_, tableEl) => {
        const $table = $(tableEl);
        // Detect complexity: colspan, rowspan, no <thead>, or >8 columns
        const hasColspan = $table.find('[colspan]').length > 0;
        const hasRowspan = $table.find('[rowspan]').length > 0;
        const hasThead = $table.find('thead').length > 0;
        const firstRow = $table.find('tr').first();
        const colCount = firstRow.children('th, td').length;
        const isComplex = hasColspan || hasRowspan || !hasThead || colCount > 8;
        if (!isComplex) {
            // Simple table: just strip attributes so GFM plugin handles it
            const tableTags = ['table', 'thead', 'tbody', 'tfoot', 'tr', 'th', 'td', 'caption'];
            tableTags.forEach(tag => {
                $table.find(tag).addBack(tag).each((_i, el) => {
                    const attrs = el.attribs || {};
                    for (const attr of Object.keys(attrs)) {
                        $(el).removeAttr(attr);
                    }
                });
            });
            return;
        }
        // Complex table: convert to structured text that reads well in markdown/chat
        // Extract headers from the <th> cells of the first row
        const headers = [];
        firstRow.children('th').each((_i, th) => {
            headers.push($(th).text().trim());
        });
        // If first row had <th>, treat it as header row; otherwise no headers
        const dataRows = $table.find('tr').toArray();
        const startIdx = headers.length > 0 ? 1 : 0;
        const totalDataRows = dataRows.length - startIdx;
        // Flatten one row into cell texts, repeating a cell across its colspan
        // (capped at maxSpan copies so one huge colspan cannot flood the row)
        const expandCells = (row, maxSpan) => {
            const cells = [];
            $(row).children('td, th').each((_j, cell) => {
                const $cell = $(cell);
                const text = $cell.text().trim();
                const copies = Math.min(readColspan($cell), maxSpan);
                for (let s = 0; s < copies; s++)
                    cells.push(text);
            });
            return cells;
        };
        // For tables with 2-6 headers, rebuild as a clean bare HTML table
        // so Turndown's GFM plugin can convert it to a proper pipe table
        if (headers.length >= 2 && headers.length <= 6) {
            const theadRow = `<tr>${headers.map(h => `<th>${escapeHtml(h)}</th>`).join('')}</tr>`;
            const tbodyRows = [];
            for (let r = startIdx; r < dataRows.length && (r - startIdx) < ROW_CAP; r++) {
                const cells = expandCells(dataRows[r], 6);
                // Pad or trim to match header count
                while (cells.length < headers.length)
                    cells.push('');
                tbodyRows.push(`<tr>${cells.slice(0, headers.length).map(c => `<td>${escapeHtml(c)}</td>`).join('')}</tr>`);
            }
            if (totalDataRows > ROW_CAP) {
                tbodyRows.push(`<tr><td colspan="${headers.length}">... (${ROW_CAP} of ${totalDataRows} rows shown)</td></tr>`);
            }
            $table.replaceWith(`<table><thead>${theadRow}</thead><tbody>${tbodyRows.join('')}</tbody></table>`);
            return;
        }
        // Wide tables or no headers: convert to HTML list so Turndown handles it properly
        // (never put pre-formatted markdown inside a div — Turndown will escape it)
        const liItems = [];
        for (let r = startIdx; r < dataRows.length && (r - startIdx) < ROW_CAP; r++) {
            const cells = expandCells(dataRows[r], 3);
            if (!cells.some(c => c))
                continue;
            if (headers.length > 0) {
                const parts = cells
                    .map((c, j) => (headers[j] && c) ? `<strong>${escapeHtml(headers[j])}:</strong> ${escapeHtml(c)}` : escapeHtml(c))
                    .filter(Boolean)
                    .join(' &middot; ');
                liItems.push(`<li>${parts}</li>`);
            }
            else {
                liItems.push(`<li>${cells.filter(Boolean).map(escapeHtml).join(' &middot; ')}</li>`);
            }
        }
        if (totalDataRows > ROW_CAP) {
            liItems.push(`<li><em>... (${ROW_CAP} of ${totalDataRows} rows shown)</em></li>`);
        }
        if (liItems.length > 0) {
            $table.replaceWith(`<ul>${liItems.join('')}</ul>`);
        }
    });
    // Remove empty paragraphs and divs
    $('p:empty, div:empty').remove();
    // Remove leaf elements with no text (note: this also drops childless
    // elements such as <img> and <br>)
    $('*').each((_, elem) => {
        const $elem = $(elem);
        const text = $elem.text().trim();
        if (!text && $elem.children().length === 0) {
            $elem.remove();
        }
    });
    return $.html();
}
366
/**
 * Candidate selectors for the main article body, most specific first.
 * detectMainContent() walks this list in order and uses the first selector
 * whose first match carries enough text.
 */
const MAIN_CONTENT_SELECTORS = [
    // Semantic article / main landmarks
    'article[role="main"]',
    'main article',
    '[role="main"] article',
    'article',
    '[role="main"]',
    'main',
    // Common CMS body classes
    '.post-content',
    '.article-content',
    '.article-body',
    '.entry-content',
    '.post-body',
    '.story-body',
    '.page-content',
    // Well-known ids
    '#content',
    '#main-content',
    '#article',
    '#post',
    // Generic content wrappers
    '.content',
    '.main-content',
    // Typography / framework-specific wrappers
    '.prose',
    '.markdown-body',
    '.post-text',
    '.article__body',
    '.story-content',
    '.entry-text',
    '.post-entry',
    // Structured-data hooks
    '[itemprop="articleBody"]',
    '[data-article-body]',
    // Blog templates
    '.blog-post-content',
    '.blog-content',
];
386
/**
 * Try to detect the main content area of a page.
 *
 * Strategy:
 *  1. Walk MAIN_CONTENT_SELECTORS in order; the first selector whose first
 *     match has at least 100 characters of text is the candidate.
 *  2. Otherwise fall back to the div/section with the most text (at least 200
 *     characters to be considered, more than 300 to be accepted) that is not a
 *     direct child of <body> or <html>.
 *  3. In both cases, reject the candidate if it holds less than 50% of the
 *     page's visible text — the detection was too narrow.
 *
 * @param html - HTML document to inspect
 * @returns `{ html, detected }`: when `detected` is true, `html` is the outer
 *   HTML of the detected element; otherwise `html` is the input unchanged
 */
export function detectMainContent(html) {
    const $ = cheerio.load(html);
    // Visible text length: script/style/noscript contents are not page text
    const visibleTextLength = (root) => {
        const clone = root.clone();
        clone.find('script, style, noscript').remove();
        return clone.text().trim().length;
    };
    const totalTextLen = visibleTextLength($.root());
    // True when the candidate holds under half of the page's visible text
    const isTooNarrow = (candidate) => totalTextLen > 0 && visibleTextLength(candidate) / totalTextLen < 0.5;
    for (const selector of MAIN_CONTENT_SELECTORS) {
        const candidate = $(selector).first();
        if (candidate.length === 0)
            continue;
        // Needs meaningful content (at least 100 chars of text)
        if (candidate.text().trim().length < 100)
            continue;
        if (isTooNarrow(candidate)) {
            return { html, detected: false };
        }
        return { html: $.html(candidate), detected: true };
    }
    // Fallback: find the largest text block (div or section with most text).
    // Look the page wrappers up once instead of once per candidate.
    const bodyNode = $('body')[0];
    const htmlNode = $('html')[0];
    let bestEl = null;
    let bestLen = 0;
    $('div, section').each((_, elem) => {
        const $elem = $(elem);
        const textLen = $elem.text().trim().length;
        if (textLen <= bestLen || textLen < 200)
            return;
        // Skip direct children of <body>/<html>: those are likely page wrappers
        const parent = $elem.parent();
        if (parent.length && parent[0] !== bodyNode && parent[0] !== htmlNode) {
            bestEl = $elem;
            bestLen = textLen;
        }
    });
    if (bestEl && bestLen > 300) {
        // Same coverage check as the selector path, measured on visible text so
        // inline script/style content cannot inflate the candidate's share
        if (isTooNarrow(bestEl)) {
            return { html, detected: false };
        }
        return { html: $.html(bestEl), detected: true };
    }
    return { html, detected: false };
}
440
/**
 * Score extracted content on a 0-1 scale (rounded to two decimals).
 *
 * The score is a weighted blend of four factors:
 *  - compression (30%): content length relative to the original HTML length
 *  - text density (30%): share of characters that are not markdown punctuation
 *  - structure (20%): presence of ATX headings and of multiple paragraphs
 *  - length (20%): penalizes very short and very long content
 *
 * @param content - Extracted markdown/text
 * @param originalHtml - HTML the content was extracted from
 * @returns Quality score; 0 when content is empty or shorter than 10 characters
 */
export function calculateQuality(content, originalHtml) {
    if (!content || content.length < 10) {
        return 0;
    }
    const contentLen = content.length;
    // Factor 1: how much of the original survived (capped at 1)
    const ratio = Math.min(contentLen / Math.max(originalHtml.length, 1), 1);
    let compressionScore;
    if (ratio < 0.01) {
        compressionScore = 0.3;
    }
    else if (ratio < 0.05) {
        compressionScore = 0.7;
    }
    else if (ratio < 0.40) {
        compressionScore = 1.0; // sweet spot: 5%-40% of the original
    }
    else if (ratio < 0.60) {
        compressionScore = 0.8;
    }
    else {
        compressionScore = 0.5;
    }
    // Factor 2: text density — strip markdown punctuation and compare lengths
    const withoutMarkup = content.replace(/[#*_\[\]\(\)\-`|>]/g, '');
    const density = withoutMarkup.trim().length / Math.max(contentLen, 1);
    const densityScore = Math.min(density / 0.7, 1);
    // Factor 3: structure — headings and paragraph breaks
    const headingFactor = /^#{1,6}\s/m.test(content) ? 1 : 0.7;
    const paragraphFactor = content.split('\n\n').length > 2 ? 1 : 0.8;
    // Factor 4: overall length
    let lengthScore;
    if (contentLen < 50) {
        lengthScore = 0.3;
    }
    else if (contentLen < 200) {
        lengthScore = 0.6;
    }
    else if (contentLen < 50000) {
        lengthScore = 1.0;
    }
    else {
        lengthScore = 0.8;
    }
    // Weighted average, rounded to two decimals
    const weighted = (compressionScore * 0.3 +
        densityScore * 0.3 +
        (headingFactor * paragraphFactor) * 0.2 +
        lengthScore * 0.2);
    return Math.round(weighted * 100) / 100;
}
474
// Shared TurndownService instance, configured once at module load and reused
// by every conversion in this file.
const turndownSingleton = (() => {
    const service = new TurndownService({
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        bulletListMarker: '-',
        emDelimiter: '_',
        strongDelimiter: '**',
    });
    // GFM extensions (tables, strikethrough, task lists)
    service.use(gfm);
    // Images: keep only those with alt text; alt-less images are dropped
    service.addRule('images', {
        filter: 'img',
        replacement(_content, node) {
            const { alt, src } = node;
            return alt ? `![${alt}](${src})` : '';
        },
    });
    // <pre><code>: emit a fenced block, taking the language from a
    // "language-xxx" class on the <code> element when present
    service.addRule('codeBlocks', {
        filter: (node) => node.nodeName === 'PRE' && node.firstChild?.nodeName === 'CODE',
        replacement(_content, node) {
            const code = node.firstChild;
            const classAttr = code.getAttribute('class') || '';
            const language = classAttr.match(/language-(\w+)/)?.[1] || '';
            return `\n\n\`\`\`${language}\n${code.textContent}\n\`\`\`\n\n`;
        },
    });
    return service;
})();
511
// Lines that, once trimmed, lower-cased and stripped of a heading prefix,
// equal one of these phrases are dropped as UI noise.
const NOISE_LINE_PHRASES = new Set([
    // Job site CTA noise
    'upload resume',
    'upload your resume',
    'apply now',
    'apply on employer site',
    'apply on employer siteapply now',
    'easy apply',
    'save job',
    'easy apply onlyremote only',
    // Job site filter sidebar labels (standalone)
    'company rating',
    'date posted',
    'salary range',
    // Indeed profile insights noise
    'yesno',
    'yes no',
    'profile insights',
    'find out how your skills align',
    // Common UI artifacts (icons, loading, inline labels)
    'save-icon',
    'loading',
    'report job',
    'whatwherefind jobs',
]);
// Lines matching any of these (same normalization as above) are dropped too.
const NOISE_LINE_PATTERNS = [
    // Job site CTA noise
    /^(is your resume a good match|are you open to new opportunities)\??$/,
    /^upload your resume to increase your chances/i,
    /^use ai to find out how well/i,
    // Indeed profile insights noise
    /^do you have (experience in|a )/i,
    /^find out how your skills align/i,
    // Common UI artifacts
    /^show more(chevron down)?$/i,
    // Q&A site chrome (Stack Overflow, StackExchange, forums)
    /^\[?(share|follow|flag|report)\]?(\(.*\))?$/i,
    /^\[?improve this (question|answer)\]?/i,
    /^(sorted by|highest score|trending|date modified|date created)/i,
    /^\[?(edited|answered|asked)\s+\w+\s+\d/i,
    /^community wiki$/i,
    /^\d+\s*(answers?|votes?|views?)\s*\d*$/i,
    /^\[?reset to default\]?/i,
    // Generic interactive chrome
    /^\[?(bookmark|save|pin|mute|hide|block)\]?(\(.*\))?$/i,
    /^\[?(reply|retweet|repost|quote)\]?(\(.*\))?$/i,
    /^\[?copy\s*(link|url)?\]?(\(.*\))?$/i,
    /^(sign up|log in|create account|join now)\s*(to|for)?/i,
];
// Headings that open trailing recommendation sections (common on job sites).
const TRAILING_SECTION_PATTERNS = [
    /^#{1,3}\s*(explore other jobs|discover opportunities beyond)/im,
    /^#{1,3}\s*(jobs with similar titles)/im,
    /^#{1,3}\s*(similar job categories)/im,
    /^#{1,3}\s*(career guide articles)/im,
    /^#{1,3}\s*(similar jobs nearby)/im,
    /^#{1,3}\s*(company and salary information)/im,
];
/**
 * Convert HTML to clean, readable Markdown.
 *
 * Pipeline: cleanHTML → optional density pruning → Turndown (plain-text
 * fallback if Turndown throws) → 1MB cap → blank-line collapsing → noise-line
 * filtering → trailing-section truncation → trim.
 *
 * @param html - HTML to convert
 * @param options.prune - Apply content density pruning (default: true; pass
 *   `false` to skip). This is the only option read by this function.
 * @returns Markdown string
 * @throws Error propagated from cleanHTML when the input exceeds its size limit
 */
export function htmlToMarkdown(html, options) {
    let cleanedHTML = cleanHTML(html);
    // Density pruning runs after junk removal and before Turndown; on by default
    const shouldPrune = options?.prune !== false;
    if (shouldPrune) {
        cleanedHTML = pruneContent(cleanedHTML, { dynamic: true }).html;
    }
    let markdown;
    try {
        markdown = turndownSingleton.turndown(cleanedHTML);
    }
    catch {
        // Turndown GFM plugin crashes on malformed tables (e.g. <tr> without <table> parent)
        // Fall back to basic text extraction
        const $ = cheerio.load(cleanedHTML);
        $('script, style, noscript, svg, iframe').remove();
        markdown = $.text().replace(/\s+/g, ' ').trim();
    }
    // SECURITY: bound the input to the regex passes below (1MB of markdown)
    const MAX_MARKDOWN_CHARS = 1024 * 1024;
    if (markdown.length > MAX_MARKDOWN_CHARS) {
        markdown = markdown.slice(0, MAX_MARKDOWN_CHARS);
    }
    // Collapse runs of blank lines to a single blank line without regex backtracking:
    // a blank line is dropped when the line before it was blank as well
    const rawLines = markdown.split('\n');
    const collapsed = [];
    rawLines.forEach((line, i) => {
        const repeatsBlank = i > 0 && rawLines[i - 1].trim() === '' && line.trim() === '';
        if (!repeatsBlank) {
            collapsed.push(line);
        }
    });
    markdown = collapsed.join('\n');
    // Remove common CTA / noise lines (job sites, sign-up prompts, etc.).
    // The heading prefix is stripped first so "## Apply now" is caught as well.
    const isNoise = (line) => {
        const normalized = line.trim().toLowerCase().replace(/^#{1,6}\s*/, '');
        return NOISE_LINE_PHRASES.has(normalized)
            || NOISE_LINE_PATTERNS.some((pattern) => pattern.test(normalized));
    };
    markdown = markdown.split('\n').filter((line) => !isNoise(line)).join('\n');
    // Truncate trailing recommendation/related-jobs sections. Patterns are tried
    // in list order; the first one found in the bottom 40% of the text wins.
    for (const pattern of TRAILING_SECTION_PATTERNS) {
        const hit = pattern.exec(markdown);
        if (hit && hit.index !== undefined && hit.index > markdown.length * 0.6) {
            markdown = markdown.slice(0, hit.index).trim();
            break;
        }
    }
    return markdown.trim();
}
636
/**
 * Convert HTML to markdown with Turndown only, bypassing the cleanHTML pipeline.
 * Meant for callers that have already cleaned the HTML and want to keep
 * elements (such as images) that cleanHTML's empty-element pass would remove.
 *
 * Pre-processing is limited to removing script, style and noscript elements.
 *
 * @param html - HTML to convert
 * @returns Markdown with consecutive blank lines collapsed, trimmed
 */
export function rawHtmlToMarkdown(html) {
    const $ = cheerio.load(html);
    // Scripts and styles are always removed
    $('script, style, noscript').remove();
    let markdown;
    try {
        markdown = turndownSingleton.turndown($.html());
    }
    catch {
        // Turndown GFM plugin crashes on malformed tables — fall back to text
        markdown = $.text().replace(/\s+/g, ' ').trim();
    }
    // Collapse runs of blank lines: skip a blank line that follows a blank line
    const rawLines = markdown.split('\n');
    const collapsed = [];
    rawLines.forEach((line, i) => {
        const repeatsBlank = i > 0 && rawLines[i - 1].trim() === '' && line.trim() === '';
        if (!repeatsBlank) {
            collapsed.push(line);
        }
    });
    return collapsed.join('\n').trim();
}
668
/**
 * Convert HTML to plain text.
 *
 * The HTML is cleaned first, then the text of every heading, paragraph and
 * list item is emitted as its own block separated by blank lines. If none of
 * those elements carry text, the whole <body> text is used instead.
 *
 * @param html - HTML to convert
 * @returns Plain text with whitespace runs normalized, trimmed
 */
export function htmlToText(html) {
    const $ = cheerio.load(cleanHTML(html));
    // One block per structural element, skipping empty ones
    const blocks = $('h1, h2, h3, h4, h5, h6, p, li')
        .toArray()
        .map((node) => $(node).text().trim())
        .filter(Boolean);
    let text = blocks.map((block) => block + '\n\n').join('');
    // Fallback: no structured content found, take all body text
    if (!text.trim()) {
        text = $('body').text();
    }
    // Normalize whitespace: at most one blank line, single spaces
    return text
        .replace(/\n{3,}/g, '\n\n')
        .replace(/[ \t]+/g, ' ')
        .trim();
}
691
/**
 * Rough token estimate for a piece of text.
 * Uses the rule of thumb that one token is about four characters of English,
 * rounding up.
 *
 * @param text - Text to measure
 * @returns Estimated token count
 */
export function estimateTokens(text) {
    const CHARS_PER_TOKEN = 4;
    const { length } = text;
    return Math.ceil(length / CHARS_PER_TOKEN);
}
698
/**
 * Truncate content so its estimated token count fits a budget.
 *
 * Content already within budget is returned untouched. Otherwise lines are
 * kept from the top until the next line would exceed the budget; the first
 * heading reached is always kept even if it overshoots. A blank line and a
 * truncation notice are appended to the result.
 *
 * @param content - Markdown/text to truncate
 * @param maxTokens - Token budget (per estimateTokens)
 * @returns Original content, or the truncated content plus a notice
 */
export function truncateToTokenBudget(content, maxTokens) {
    // Under budget: nothing to do
    if (estimateTokens(content) <= maxTokens) {
        return content;
    }
    const kept = [];
    let usedTokens = 0;
    let headingPending = true;
    for (const line of content.split('\n')) {
        const cost = estimateTokens(line);
        // The first heading is exempt from the budget check
        const forceKeep = headingPending && /^#{1,6}\s/.test(line);
        if (!forceKeep && usedTokens + cost > maxTokens) {
            break;
        }
        if (forceKeep) {
            headingPending = false;
        }
        kept.push(line);
        usedTokens += cost;
    }
    // Truncation notice, separated by a blank line
    kept.push('', `[Content truncated to ~${maxTokens} tokens]`);
    return kept.join('\n');
}
738
/**
 * Strip markdown link/image syntax for clean AI-readable text.
 * Headings, lists, bold, italic and code fences are left as they are.
 *
 * Rewrites, in order:
 *  - linked images `[![alt](src)](href)` → `[Image: alt]` (dropped when alt is empty)
 *  - images `![alt](src)` → `[Image: alt]` (dropped when alt is empty)
 *  - links `[text](url)` → `text`
 *  - reference definitions `[id]: url`, standalone URL lines, HTML comments,
 *    empty `[]` fragments and numeric citations like `[1]` → removed
 *  - 3+ consecutive newlines → one blank line; trailing spaces/tabs removed
 *
 * @param markdown - Markdown text
 * @returns Cleaned text, trimmed
 */
export function cleanForAI(markdown) {
    const describeImage = (alt) => (alt ? `[Image: ${alt}]` : '');
    return markdown
        // Linked images must go first: once the inner image is rewritten, the
        // outer link no longer parses and its URL would be left in the output
        .replace(/\[!\[([^\]]*)\]\([^)]+\)\]\([^)]+\)/g, (_, alt) => describeImage(alt))
        // Convert images to descriptive text: ![alt](url) → [Image: alt]
        .replace(/!\[([^\]]*)\]\([^)]+\)/g, (_, alt) => describeImage(alt))
        // Convert links to just text: [text](url) → text
        .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
        // Remove reference-style link definitions: [id]: url
        .replace(/^\[[\w-]+\]:\s+\S+.*$/gm, '')
        // Remove lines that consist solely of a URL (no code-block awareness)
        .replace(/^https?:\/\/\S+$/gm, '')
        // Remove HTML comments
        .replace(/<!--[\s\S]*?-->/g, '')
        // Remove empty link fragments like []
        .replace(/\[\s*\]/g, '')
        // Clean up citation references like [1], [2] etc (common in scraped content)
        .replace(/\[(\d+)\]/g, '')
        // Collapse multiple blank lines
        .replace(/\n{3,}/g, '\n\n')
        // Trim trailing whitespace on each line
        .replace(/[ \t]+$/gm, '')
        .trim();
}
765
/**
 * Clean up common markdown noise produced during HTML-to-markdown conversion.
 *
 * Steps:
 *  1. Remove image-only links with no alt text: `[![](img)](link)`. This runs
 *     before the empty-link rule, which would otherwise consume the inner
 *     `[](img)` and leave `[!](link)` behind.
 *  2. Remove empty links `[](url)` and empty-alt images `![](url)` as a whole,
 *     so no stray `!` remains.
 *  3. Collapse 3+ newlines to 2 and strip trailing spaces/tabs per line.
 *  4. For any link text used 5 or more times (compared case-insensitively),
 *     keep the first 2 occurrences and remove the rest. Images (`![alt](src)`)
 *     are not counted or touched by this step.
 *  5. Remove lines consisting of the literal placeholder "Button Text".
 *
 * @param content - Markdown text
 * @returns Cleaned markdown, trimmed
 */
export function cleanMarkdownNoise(content) {
    let result = content
        // Remove image-only links that are just UI elements: [![](img)](link)
        .replace(/\[!\[\]\([^)]+\)\]\([^)]+\)/g, '')
        // Remove empty links / empty-alt images: [](url), [ ](url), ![](url)
        .replace(/!?\[\s*\]\([^)]+\)/g, '')
        // Collapse 3+ newlines to 2
        .replace(/\n{3,}/g, '\n\n')
        // Remove trailing whitespace on lines
        .replace(/[ \t]+$/gm, '')
        .trim();
    // Count how often each link text occurs (navigation spam such as
    // "Try Claude" repeated 20+ times). The lookbehind skips images.
    const linkTextCounts = new Map();
    for (const [, rawText] of result.matchAll(/(?<!!)\[([^\]]+)\]\([^)]+\)/g)) {
        const text = rawText.trim().toLowerCase();
        linkTextCounts.set(text, (linkTextCounts.get(text) || 0) + 1);
    }
    // Remove repeated CTA links that appear 5+ times (keep first 2 occurrences)
    for (const [text, count] of linkTextCounts) {
        if (count < 5) {
            continue;
        }
        // Escape special regex characters in the link text for matching
        const escaped = text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        // Match the full markdown link with this text (case-insensitive), not images
        const spamPattern = new RegExp(`(?<!!)\\[${escaped}\\]\\([^)]+\\)`, 'gi');
        let seen = 0;
        result = result.replace(spamPattern, (match) => {
            seen += 1;
            return seen <= 2 ? match : '';
        });
    }
    // Remove "Button Text" placeholders (literal text from button elements)
    result = result.replace(/^Button Text\s*$/gm, '');
    // Clean up any new excess newlines from removals
    return result.replace(/\n{3,}/g, '\n\n').trim();
}