@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,235 @@
1
+ import { simpleFetch } from '../../core/fetcher.js';
2
+ import { stripHtml, fetchJson } from './shared.js';
3
+ // ---------------------------------------------------------------------------
4
+ // 5. Wikipedia extractor
5
+ // ---------------------------------------------------------------------------
6
/**
 * Strip Wikipedia editorial markers from extracted article text.
 *
 * @param {string} content - Text extracted from a Wikipedia page.
 * @returns {string} The text without edit links, numeric citation markers and
 *   maintenance tags, with runs of blank lines collapsed and the ends trimmed.
 */
function cleanWikipediaContent(content) {
    // Applied in order; each entry is [pattern, replacement].
    const substitutions = [
        // "[edit]" section links
        [/\[edit\]/gi, ''],
        // Numeric citation markers: [1], [2], ...
        [/\[\d+\]/g, ''],
        // Inline maintenance tags: [citation needed], [verification], ...
        [/\[(citation needed|verification|improve this article|adding citations[^\]]*|when\?|where\?|who\?|clarification needed|dubious[^\]]*|failed verification[^\]]*|unreliable source[^\]]*)\]/gi, ''],
        // Banner help link
        [/\[Learn how and when to remove this message\]/gi, ''],
        // Three or more newlines become a single blank line
        [/\n{3,}/g, '\n\n'],
    ];
    let cleaned = content;
    for (const [pattern, replacement] of substitutions) {
        cleaned = cleaned.replace(pattern, replacement);
    }
    return cleaned.trim();
}
21
/** Max rows to extract from a single HTML table to prevent token explosion. */
const MAX_TABLE_ROWS = 50;
/**
 * Convert an HTML <table> string to a markdown pipe table.
 *
 * The header is the first row containing a <th> among the first three rows,
 * or the first row when none of them has one. Rows before the header are
 * dropped. `colspan` is flattened into empty cells; `rowspan` is not expanded.
 * Rows with no cells, or whose cells are all empty, are skipped and do not
 * count towards MAX_TABLE_ROWS.
 *
 * @param {string} tableHtml - Markup of one table, including its <tr> rows.
 * @returns {string|null} Markdown table (followed by a truncation note when
 *   non-empty rows were left out), or null when the table has fewer than two
 *   rows, fewer than two header columns, an entirely empty header, or no
 *   non-empty data row.
 */
function htmlTableToMarkdown(tableHtml) {
    // Cap on colspan so a bogus attribute cannot flood a row with empty cells.
    const MAX_COLSPAN = 6;
    // `(?:\s[^>]*)?` keeps <track ...> from being mistaken for a row.
    const rowMatches = tableHtml.match(/<tr(?:\s[^>]*)?>([\s\S]*?)<\/tr>/gi);
    if (!rowMatches || rowMatches.length < 2)
        return null;
    // Parse a row into cell texts.
    const parseRow = (rowHtml) => {
        const cells = [];
        // Group 1 = attributes of the opening tag, group 2 = cell content.
        // Requiring whitespace before the attributes stops <thead ...> from
        // matching as a cell.
        const cellRegex = /<t[hd](\s[^>]*)?>([\s\S]*?)<\/t[hd]>/gi;
        let m;
        while ((m = cellRegex.exec(rowHtml)) !== null) {
            // Read colspan from the opening tag only, so cell text that happens
            // to contain "colspan=3" is not treated as an attribute.
            const colspanMatch = (m[1] || '').match(/colspan=["']?(\d+)/i);
            const span = colspanMatch ? Math.min(parseInt(colspanMatch[1], 10), MAX_COLSPAN) : 1;
            const text = stripHtml(m[2]).replace(/\|/g, '\\|').replace(/\n/g, ' ').trim();
            cells.push(text);
            // Fill colspan with empty cells
            for (let s = 1; s < span; s++)
                cells.push('');
        }
        return cells;
    };
    // Detect header row: first row (of the first three) with <th> elements
    let headerRowIdx = -1;
    for (let i = 0; i < Math.min(rowMatches.length, 3); i++) {
        if (/<th[\s>]/i.test(rowMatches[i])) {
            headerRowIdx = i;
            break;
        }
    }
    // No header row found: use the first row as header
    const headerIdx = headerRowIdx >= 0 ? headerRowIdx : 0;
    const headers = parseRow(rowMatches[headerIdx]);
    const dataStartIdx = headerIdx + 1;
    // Skip tables that look like layout (single column or no real content)
    if (headers.length < 2)
        return null;
    if (headers.every(h => !h))
        return null;
    const colCount = headers.length;
    const mdLines = [];
    // Header row
    mdLines.push('| ' + headers.map(h => h || ' ').join(' | ') + ' |');
    // Separator row
    mdLines.push('| ' + headers.map(() => '---').join(' | ') + ' |');
    // Data rows (capped at MAX_TABLE_ROWS)
    let rowCount = 0;
    let truncated = false;
    for (let r = dataStartIdx; r < rowMatches.length; r++) {
        const cells = parseRow(rowMatches[r]);
        if (cells.length === 0)
            continue;
        // Pad or trim to match column count
        while (cells.length < colCount)
            cells.push('');
        const row = cells.slice(0, colCount);
        // Skip completely empty rows
        if (row.every(c => !c))
            continue;
        // Only report truncation when a row with content is actually left out;
        // skipped empty rows must not trigger the note.
        if (rowCount === MAX_TABLE_ROWS) {
            truncated = true;
            break;
        }
        mdLines.push('| ' + row.map(c => c || ' ').join(' | ') + ' |');
        rowCount++;
    }
    if (rowCount === 0)
        return null;
    const truncNote = truncated
        ? `\n\n*Table truncated to ${MAX_TABLE_ROWS} rows.*`
        : '';
    return mdLines.join('\n') + truncNote;
}
102
/**
 * Extract wikitables from raw Wikipedia HTML.
 *
 * Only tables whose opening tag carries a double-quoted class attribute
 * containing "wikitable" are considered. Tables whose opening tag also
 * mentions navbox, sidebar or metadata are skipped. A <caption>, when present,
 * becomes a bold line above the table.
 *
 * @param {string} html - Page HTML to scan.
 * @returns {string[]} One markdown string per table that htmlTableToMarkdown
 *   could convert; empty when there are none.
 */
function extractWikitables(html) {
    const tables = [];
    if (!html)
        return tables;
    const tableRegex = /<table[^>]*class="[^"]*wikitable[^"]*"[^>]*>([\s\S]*?)<\/table>/gi;
    let match;
    while ((match = tableRegex.exec(html)) !== null) {
        const fullTable = match[0];
        // Inspect the opening <table ...> tag only. Scanning into the body would
        // drop data tables whose caption or header text merely contains a word
        // such as "Metadata".
        const openingTag = fullTable.slice(0, fullTable.indexOf('>') + 1);
        if (/navbox|sidebar|metadata/i.test(openingTag))
            continue;
        // Try to extract a caption
        const captionMatch = fullTable.match(/<caption[^>]*>([\s\S]*?)<\/caption>/i);
        const caption = captionMatch ? stripHtml(captionMatch[1]).trim() : '';
        const md = htmlTableToMarkdown(fullTable);
        if (md) {
            const prefix = caption ? `**${caption}**\n\n` : '';
            tables.push(prefix + md);
        }
    }
    return tables;
}
127
/**
 * Domain extractor for Wikipedia article URLs of the form /wiki/<Title>.
 *
 * Builds a markdown document from the REST `page/summary` endpoint. When
 * `options.budget` is above 5000 the summary text is replaced by the paragraphs
 * of the `page/mobile-html` endpoint (falling back to the summary if that
 * request fails). Wikitables found in `_html`, or in the mobile HTML when
 * `_html` yields none, are appended as markdown tables.
 *
 * @param {string} _html - Already-fetched page HTML, if any; only scanned for wikitables.
 * @param {string} url - URL of the page.
 * @param {{ budget?: number }} [options] - `budget` > 5000 requests the full article text.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string, rawHtmlSize: (number|undefined)}|null>}
 *   The extraction result, or null when the URL is malformed, is not an article
 *   page, names a namespaced or list/comparison page, or the summary request
 *   fails or reports "not found".
 */
export async function wikipediaExtractor(_html, url, options) {
    let urlObj;
    let articleTitle;
    try {
        urlObj = new URL(url);
        const pathParts = urlObj.pathname.split('/').filter(Boolean);
        // Only handle article pages: /wiki/Article_Title
        if (pathParts[0] !== 'wiki' || pathParts.length < 2)
            return null;
        // Titles may contain slashes (e.g. "AC/DC"), so keep every segment after /wiki/.
        articleTitle = decodeURIComponent(pathParts.slice(1).join('/'));
    }
    catch (e) {
        // Invalid URL or malformed percent-encoding: treat like any other unsupported page.
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Wikipedia URL parse failed:', e instanceof Error ? e.message : e);
        return null;
    }
    // Skip special pages (contain a colon, e.g. Special:Random, Talk:Article)
    if (articleTitle.includes(':'))
        return null;
    // For list/comparison/data-heavy articles, skip the summary API entirely.
    // The summary API only returns paragraph text, no tables. Return null so the
    // normal HTML→markdown pipeline fetches the full page and preserves tables.
    // Spaces and underscores are interchangeable in titles, so normalise first.
    const isListArticle = /^(?:List_|Lists_of|Comparison_of)/i.test(articleTitle.replace(/ /g, '_'));
    if (isListArticle)
        return null;
    const lang = urlObj.hostname.split('.')[0] || 'en';
    const encodedTitle = encodeURIComponent(articleTitle);
    const summaryUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/summary/${encodedTitle}`;
    // Wikipedia REST API requires a descriptive User-Agent (https://meta.wikimedia.org/wiki/User-Agent_policy)
    const wikiHeaders = { 'User-Agent': 'WebPeel/0.17.1 (https://webpeel.dev; jake@jakeliu.me) Node.js', 'Api-User-Agent': 'WebPeel/0.17.1 (https://webpeel.dev; jake@jakeliu.me)' };
    // Tables are only worth extracting when the raw HTML actually contains wikitables.
    const hasTableData = Boolean(_html) && /class="[^"]*wikitable/i.test(_html);
    try {
        const data = await fetchJson(summaryUrl, wikiHeaders);
        if (!data || data.type === 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found')
            return null;
        const structured = {
            title: data.title || articleTitle.replace(/_/g, ' '),
            description: data.description || '',
            extract: data.extract || '',
            extractHtml: data.extract_html || '',
            thumbnail: data.thumbnail?.source || null,
            url: data.content_urls?.desktop?.page || url,
            lastModified: data.timestamp || null,
            coordinates: data.coordinates || null,
        };
        // Default: use the summary extract. Only fetch the full article if budget > 5000.
        const budget = options?.budget ?? 0;
        const useFull = budget > 5000;
        let bodyContent = structured.extract;
        let mobileHtmlSize;
        // Extract tables from the raw HTML we already have (no extra API call needed)
        let tableSections = hasTableData ? extractWikitables(_html) : [];
        if (useFull) {
            try {
                const fullUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/mobile-html/${encodedTitle}`;
                const fullResult = await simpleFetch(fullUrl, undefined, 15000, {
                    ...wikiHeaders,
                    'Accept': 'text/html',
                });
                if (fullResult?.html) {
                    mobileHtmlSize = fullResult.html.length;
                    let fullContent = '';
                    const sectionMatches = fullResult.html.match(/<section[^>]*>([\s\S]*?)<\/section>/gi) || [];
                    for (const section of sectionMatches) {
                        const headingMatch = section.match(/<h[2-6][^>]*id="([^"]*)"[^>]*class="[^"]*pcs-edit-section-title[^"]*"[^>]*>([\s\S]*?)<\/h[2-6]>/i);
                        const heading = headingMatch ? stripHtml(headingMatch[2]).trim() : '';
                        const paragraphs = section.match(/<p[^>]*>([\s\S]*?)<\/p>/gi) || [];
                        const sectionText = paragraphs.map((p) => stripHtml(p).trim()).filter((t) => t.length > 0).join('\n\n');
                        if (sectionText) {
                            const prefix = heading ? `## ${heading}\n\n` : '';
                            fullContent += `\n\n${prefix}${sectionText}`;
                        }
                    }
                    // An empty result (no paragraphs found) falls back to the summary extract.
                    bodyContent = cleanWikipediaContent(fullContent) || structured.extract;
                    // Also extract tables from mobile-html if we didn't get them from raw HTML
                    if (tableSections.length === 0) {
                        tableSections = extractWikitables(fullResult.html);
                    }
                }
            }
            catch (e) {
                // Best-effort: keep the summary text when the full article cannot be fetched.
                if (process.env.DEBUG)
                    console.debug('[webpeel]', 'Wikipedia mobile-html failed, using summary:', e instanceof Error ? e.message : e);
            }
        }
        const articleUrl = structured.url;
        const lines = [
            `# ${structured.title}`,
            '',
        ];
        if (structured.description)
            lines.push(`*${structured.description}*`, '');
        lines.push(bodyContent);
        // Append extracted tables
        if (tableSections.length > 0) {
            lines.push('', '---', '');
            for (const table of tableSections) {
                lines.push(table, '');
            }
        }
        if (structured.coordinates) {
            lines.push('', `📍 Coordinates: ${structured.coordinates.lat}, ${structured.coordinates.lon}`);
        }
        lines.push('', `📖 [Read full article on Wikipedia](${articleUrl})`);
        const cleanContent = lines.join('\n');
        return { domain: 'wikipedia.org', type: 'article', structured, cleanContent, rawHtmlSize: mobileHtmlSize };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Wikipedia API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
@@ -0,0 +1,2 @@
1
+ import type { DomainExtractResult } from './types.js';
2
+ export declare function yelpExtractor(_html: string, url: string): Promise<DomainExtractResult | null>;
@@ -0,0 +1,216 @@
1
// ---------------------------------------------------------------------------
// Yelp extractor — Yelp Fusion API (needs YELP_API_KEY); without a key it
// returns a minimal result that only links back to Yelp
// ---------------------------------------------------------------------------
/**
 * Domain extractor for Yelp business pages (/biz/<alias>) and search URLs
 * (?find_desc=…, ?cflt=…, ?find_loc=…).
 *
 * With YELP_API_KEY set, data comes from the Yelp Fusion API and is rendered
 * as markdown. Without a key, a placeholder result pointing at `url` is
 * returned. The page HTML is not used.
 *
 * @param {string} _html - Unused.
 * @param {string} url - Yelp page URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}|null>}
 *   The result, or null when the URL is malformed or not a recognised
 *   pattern, or the API request fails.
 */
export async function yelpExtractor(_html, url) {
    const YELP_API_KEY = process.env.YELP_API_KEY;
    // Abort API calls that hang so one slow request cannot stall the extractor.
    const REQUEST_TIMEOUT_MS = 10000;
    // Helper to call Yelp Fusion API; throws on non-2xx responses and timeouts.
    async function yelpFetch(path, params) {
        const base = 'https://api.yelp.com/v3';
        const qs = params ? '?' + new URLSearchParams(params).toString() : '';
        const res = await fetch(`${base}${path}${qs}`, {
            headers: { 'Authorization': `Bearer ${YELP_API_KEY}` },
            signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
        });
        if (!res.ok) {
            throw new Error(`Yelp API ${res.status}: ${res.statusText}`);
        }
        return res.json();
    }
    // "HHMM" (24h) → "h:MM AM/PM"
    const formatTime = (t) => {
        const h = parseInt(t.slice(0, 2), 10);
        const m = t.slice(2);
        const period = h >= 12 ? 'PM' : 'AM';
        const h12 = h % 12 || 12;
        return `${h12}:${m} ${period}`;
    };
    try {
        const parsed = new URL(url);
        const pathname = parsed.pathname;
        const searchParams = parsed.searchParams;
        // ----------------------------------------------------------------
        // No API key: minimal result pointing the user to Yelp
        // ----------------------------------------------------------------
        if (!YELP_API_KEY) {
            const term = searchParams.get('find_desc') || searchParams.get('cflt') || 'businesses';
            const loc = searchParams.get('find_loc') || '';
            const isBiz = pathname.startsWith('/biz/');
            const cleanContent = isBiz
                ? `# Yelp Business\n\n*No YELP_API_KEY configured — visit [Yelp](${url}) for details.*`
                : `# 🔍 Yelp Search: ${term}${loc ? ` in ${loc}` : ''}\n\n*No YELP_API_KEY configured — [View on Yelp](${url})*`;
            return {
                domain: 'yelp.com',
                type: isBiz ? 'business' : 'search',
                structured: { url },
                cleanContent,
            };
        }
        // ----------------------------------------------------------------
        // Business page: /biz/<alias>
        // ----------------------------------------------------------------
        if (pathname.startsWith('/biz/')) {
            // Keep only the alias segment: `pathname` never contains ?/#, but it
            // may carry a trailing slash or further segments.
            const alias = pathname.slice('/biz/'.length).split('/')[0];
            if (!alias)
                return null;
            let biz;
            try {
                biz = await yelpFetch(`/businesses/${alias}`);
            }
            catch (e) {
                if (process.env.DEBUG)
                    console.debug('[webpeel] Yelp biz fetch failed:', e instanceof Error ? e.message : e);
                return null;
            }
            // Fetch reviews (best-effort)
            let reviews = [];
            try {
                const revData = await yelpFetch(`/businesses/${alias}/reviews`, { limit: '3' });
                reviews = revData.reviews || [];
            }
            catch { /* reviews are optional */ }
            const name = biz.name || alias;
            const rating = biz.rating != null ? biz.rating.toFixed(1) : '?';
            const reviewCount = biz.review_count ?? 0;
            const addr = biz.location;
            const address = addr
                ? [addr.address1, addr.city, addr.state, addr.zip_code].filter(Boolean).join(', ')
                : '';
            const phone = biz.display_phone || biz.phone || '';
            const price = biz.price || '';
            const categories = (biz.categories || []).map((c) => c.title).join(' | ');
            const yelpUrl = biz.url || url;
            // Hours: group the open slots of the first hours entry by day
            let hoursStr = '';
            if (biz.hours && biz.hours.length > 0) {
                // Indexed by the slot's `day` value (0 → Mon … 6 → Sun)
                const dayNames = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'];
                const dayMap = {};
                for (const slot of biz.hours[0].open || []) {
                    const day = slot.day;
                    if (!dayMap[day])
                        dayMap[day] = [];
                    dayMap[day].push(`${formatTime(slot.start)}–${formatTime(slot.end)}`);
                }
                hoursStr = Object.entries(dayMap)
                    .map(([d, times]) => `${dayNames[parseInt(d, 10)]}: ${times.join(', ')}`)
                    .join(' | ');
            }
            const lines = [
                `# ${name} ⭐ ${rating} (${reviewCount.toLocaleString()} reviews)`,
                '',
            ];
            if (address)
                lines.push(`📍 ${address}`);
            if (categories)
                lines.push(`🏷️ ${categories}${price ? ` | 💰 ${price}` : ''}`);
            else if (price)
                lines.push(`💰 ${price}`);
            if (phone)
                lines.push(`📞 ${phone}`);
            if (hoursStr)
                lines.push(`🕐 ${hoursStr}`);
            if (biz.is_closed === true)
                lines.push(`⚠️ *Permanently closed*`);
            lines.push('');
            for (const rev of reviews) {
                const stars = '⭐'.repeat(Math.round(rev.rating || 0));
                // Decide on the ellipsis from the normalised text, the same string that gets cut.
                const fullText = (rev.text || '').replace(/\n+/g, ' ').trim();
                const text = fullText.slice(0, 200);
                lines.push(`> ${stars} — ${text}${fullText.length > 200 ? '…' : ''}`);
                lines.push('');
            }
            lines.push(`[View on Yelp](${yelpUrl})`);
            return {
                domain: 'yelp.com',
                type: 'business',
                // rating is null (not NaN) when the API response has no rating
                structured: { name, rating: rating === '?' ? null : parseFloat(rating), reviewCount, address, phone, price, categories, url: yelpUrl },
                cleanContent: lines.join('\n'),
            };
        }
        // ----------------------------------------------------------------
        // Search / Category URL: /search?find_desc=...&find_loc=...
        //                        /search?cflt=restaurants&find_loc=...
        // ----------------------------------------------------------------
        const findDesc = searchParams.get('find_desc') || '';
        const cflt = searchParams.get('cflt') || '';
        const findLoc = searchParams.get('find_loc') || '';
        if (!findLoc && !findDesc && !cflt) {
            // Not a recognized pattern
            return null;
        }
        const apiParams = { limit: '10' };
        if (findLoc)
            apiParams.location = findLoc;
        if (findDesc)
            apiParams.term = findDesc;
        // A free-text term takes precedence over the category filter
        if (cflt && !findDesc)
            apiParams.categories = cflt;
        let data;
        try {
            data = await yelpFetch('/businesses/search', apiParams);
        }
        catch (e) {
            if (process.env.DEBUG)
                console.debug('[webpeel] Yelp search failed:', e instanceof Error ? e.message : e);
            return null;
        }
        const businesses = data.businesses || [];
        const total = data.total ?? businesses.length;
        // Build header
        const searchLabel = findDesc || cflt || 'Businesses';
        const locationLabel = findLoc || '';
        const desc = findDesc.toLowerCase();
        let emoji = '🔍';
        if (cflt === 'restaurants' || desc.includes('restaurant'))
            emoji = '🍽️';
        else if (desc.includes('pizza'))
            emoji = '🍕';
        else if (desc.includes('coffee') || desc.includes('cafe'))
            emoji = '☕';
        // Whole word only, so "barber" or "barbecue" do not get a beer mug
        else if (/\bbars?\b/.test(desc))
            emoji = '🍺';
        const titleParts = [searchLabel.charAt(0).toUpperCase() + searchLabel.slice(1)];
        if (locationLabel)
            titleParts.push(`in ${locationLabel}`);
        const lines = [
            `# ${emoji} Yelp — ${titleParts.join(' ')}`,
            '',
            `*${businesses.length} of ${total.toLocaleString()} results via Yelp Fusion API*`,
            '',
        ];
        for (let i = 0; i < businesses.length; i++) {
            const b = businesses[i];
            const bName = b.name || 'Unknown';
            const bRating = b.rating != null ? b.rating.toFixed(1) : '?';
            const bReviews = b.review_count ?? 0;
            const bAddr = b.location;
            const bAddress = bAddr
                ? [bAddr.address1, bAddr.city, bAddr.state, bAddr.zip_code].filter(Boolean).join(', ')
                : '';
            const bPhone = b.display_phone || '';
            const bPrice = b.price || '';
            const bCategories = (b.categories || []).map((c) => c.title).join(' | ');
            const bUrl = b.url || '';
            const bSnippet = b.snippet_text || '';
            lines.push(`## ${i + 1}. ${bName} ⭐ ${bRating} (${bReviews.toLocaleString()} reviews)`);
            if (bAddress)
                lines.push(`📍 ${bAddress}`);
            const tagLine = [bCategories && `🏷️ ${bCategories}`, bPrice && `💰 ${bPrice}`].filter(Boolean).join(' | ');
            if (tagLine)
                lines.push(tagLine);
            if (bPhone)
                lines.push(`📞 ${bPhone}`);
            if (bSnippet)
                lines.push(`> ${bSnippet.replace(/\n+/g, ' ').trim().slice(0, 150)}`);
            if (bUrl)
                lines.push(`[View on Yelp](${bUrl})`);
            lines.push('');
        }
        if (businesses.length === 0) {
            lines.push(`*No results found for "${searchLabel}"${locationLabel ? ` in ${locationLabel}` : ''}.*`);
        }
        return {
            domain: 'yelp.com',
            type: 'search',
            structured: { query: searchLabel, location: locationLabel, total, count: businesses.length, businesses },
            cleanContent: lines.join('\n'),
        };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Yelp extractor error:', e instanceof Error ? e.message : e);
        return null;
    }
}
@@ -0,0 +1,2 @@
1
+ import type { DomainExtractResult } from './types.js';
2
+ export declare function youtubeExtractor(_html: string, url: string): Promise<DomainExtractResult | null>;
@@ -0,0 +1,189 @@
1
+ import { getYouTubeTranscript } from '../../core/youtube.js';
2
+ import { fetchJson } from './shared.js';
3
// ---------------------------------------------------------------------------
// 6. YouTube extractor (oEmbed API-first)
// ---------------------------------------------------------------------------
/**
 * Domain extractor for YouTube video URLs.
 *
 * Fetches the transcript, YouTube oEmbed data, noembed data and the channel's
 * subscriber count concurrently. When the transcript lookup succeeds, the
 * result is a markdown document with header line, summary/description, key
 * points, chapters and the full transcript. Otherwise a short oEmbed-based
 * result is returned. The page HTML is not used.
 *
 * @param {string} _html - Unused.
 * @param {string} url - Video URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}|null>}
 *   The result, or null when neither the transcript nor oEmbed data is available.
 */
export async function youtubeExtractor(_html, url) {
    // Helper: wrap a promise with a timeout. The timer is cleared once the race
    // settles; a pending timer would otherwise keep the process alive for `ms`.
    function withTimeout(promise, ms) {
        let timer;
        const timeout = new Promise((_, reject) => {
            timer = setTimeout(() => reject(new Error(`Timeout after ${ms}ms`)), ms);
        });
        return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
    }
    // Run transcript fetch and oEmbed fetch in parallel
    // Proxy-based extraction takes 2-5s, but retry logic may need more time
    const transcriptPromise = withTimeout(getYouTubeTranscript(url), 30000);
    const oembedPromise = fetchJson(`https://www.youtube.com/oembed?url=${encodeURIComponent(url)}&format=json`);
    const noembedPromise = fetchJson(`https://noembed.com/embed?url=${encodeURIComponent(url)}`).catch(() => null);
    // Fetch subscriber count from channel page (lightweight, parallel)
    const subscriberPromise = (async () => {
        try {
            // Wait for oEmbed to get channel URL, then fetch subscriber count from channel page
            const oembed = await oembedPromise;
            const channelUrl = oembed?.author_url;
            if (!channelUrl)
                return '';
            const resp = await fetch(channelUrl, {
                headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' },
                signal: AbortSignal.timeout(5000),
            });
            const html = await resp.text();
            // Look for subscriber count in page metadata (e.g. "4.12M subscribers")
            const subMatch = html.match(/(\d+(?:\.\d+)?[KMBkmb]?)\s*subscribers/i);
            return subMatch ? subMatch[1] + ' subscribers' : '';
        }
        catch {
            // Subscriber count is optional
            return '';
        }
    })();
    const [transcriptResult, oembedResult, noembedResult, subscriberResult] = await Promise.allSettled([
        transcriptPromise,
        oembedPromise,
        noembedPromise,
        subscriberPromise,
    ]);
    const transcript = transcriptResult.status === 'fulfilled' ? transcriptResult.value : null;
    const oembedData = oembedResult.status === 'fulfilled' ? oembedResult.value : null;
    const noembedData = noembedResult.status === 'fulfilled' ? noembedResult.value : null;
    const subscriberCount = subscriberResult.status === 'fulfilled' ? subscriberResult.value : '';
    if (process.env.DEBUG) {
        if (transcriptResult.status === 'rejected') {
            console.debug('[webpeel]', 'YouTube transcript failed:', transcriptResult.reason instanceof Error ? transcriptResult.reason.message : transcriptResult.reason);
        }
        if (oembedResult.status === 'rejected') {
            console.debug('[webpeel]', 'YouTube oEmbed failed:', oembedResult.reason instanceof Error ? oembedResult.reason.message : oembedResult.reason);
        }
    }
    // If transcript succeeded, build rich content
    if (transcript) {
        const title = transcript.title || oembedData?.title || '';
        const channel = transcript.channel || oembedData?.author_name || '';
        // Without oEmbed, guess the URL from the channel name; with no name there is nothing to link to.
        const channelUrl = oembedData?.author_url || (channel ? `https://www.youtube.com/@${channel}` : '');
        const description = transcript.description || noembedData?.description || oembedData?.description || '';
        const thumbnailUrl = oembedData?.thumbnail_url || '';
        const publishDate = transcript.publishDate || '';
        const hasTranscript = transcript.segments.length > 0;
        const structured = {
            title,
            channel,
            channelUrl,
            subscriberCount: subscriberCount || undefined,
            duration: transcript.duration,
            publishDate,
            language: transcript.language,
            availableLanguages: transcript.availableLanguages,
            transcriptSegments: transcript.segments.length,
            wordCount: transcript.wordCount ?? 0,
            viewCount: transcript.viewCount ?? '',
            likeCount: transcript.likeCount ?? '',
            description,
            thumbnailUrl,
            chapters: transcript.chapters ?? [],
            keyPoints: transcript.keyPoints ?? [],
            source: 'transcript',
        };
        // Format the publish date nicely if it parses as a date. `new Date()`
        // does not throw on bad input, so check for an invalid date explicitly
        // and fall back to the raw value.
        let publishStr = '';
        if (publishDate) {
            const d = new Date(publishDate);
            publishStr = Number.isNaN(d.getTime())
                ? publishDate
                : d.toLocaleDateString('en-US', { month: 'short', year: 'numeric', day: 'numeric' });
        }
        // Format view count (e.g. "1,234,567" → "1.2M views"). Grouping
        // separators are removed first: parseInt stops at the first comma.
        let viewStr = '';
        if (transcript.viewCount) {
            const v = parseInt(String(transcript.viewCount).replace(/[,\s]/g, ''), 10);
            if (!Number.isNaN(v)) {
                if (v >= 1_000_000)
                    viewStr = `${(v / 1_000_000).toFixed(1).replace(/\.0$/, '')}M views`;
                else if (v >= 1_000)
                    viewStr = `${(v / 1_000).toFixed(1).replace(/\.0$/, '')}K views`;
                else
                    viewStr = `${v.toLocaleString()} views`;
            }
        }
        // Build header line
        const channelPart = subscriberCount ? `${channel} (${subscriberCount})` : channel;
        const headerParts = [`**Channel:** ${channelPart}`];
        if (transcript.duration && transcript.duration !== '0:00')
            headerParts.push(`**Duration:** ${transcript.duration}`);
        if (viewStr)
            headerParts.push(`**${viewStr}**`);
        if (publishStr)
            headerParts.push(`**Published:** ${publishStr}`);
        const headerLine = headerParts.join(' | ');
        const parts = [];
        parts.push(`# ${title}`);
        parts.push(headerLine);
        /**
         * Strip music note symbols from transcript/caption text.
         * Patterns cleaned:
         *   [♪♪♪]  → (removed)
         *   ♪ text ♪ → text
         *   standalone ♪ / 🎵 → (removed)
         *
         * The `u` flag is required: 🎵 is a surrogate pair, and without it the
         * character class would match its two halves separately and corrupt
         * other emoji that share a half (e.g. 🎉).
         */
        const cleanMusicNotes = (text) => text
            // Remove bracketed music cues: [♪], [♪♪♪], [🎵🎵🎵], etc.
            .replace(/\[[♪🎵]+\]/gu, '')
            // Unwrap ♪ text ♪ → text (keep the words between notes)
            .replace(/♪\s*([^♪]*?)\s*♪/gu, (_, inner) => inner.trim())
            // Remove any remaining standalone ♪ or 🎵
            .replace(/[♪🎵]+/gu, '')
            // Collapse extra whitespace introduced by removals
            .replace(/\s{2,}/g, ' ')
            .trim();
        // Summary section
        if (transcript.summary && hasTranscript) {
            let summaryText = cleanMusicNotes(transcript.summary);
            summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
            parts.push(`## Summary\n\n${summaryText}`);
        }
        else if (!hasTranscript && transcript.fullText) {
            parts.push(`## Description\n\n${transcript.fullText}`);
        }
        // Key Points section
        if (transcript.keyPoints && transcript.keyPoints.length > 0) {
            const kpLines = transcript.keyPoints.map(kp => `- ${kp}`).join('\n');
            parts.push(`## Key Points\n\n${kpLines}`);
        }
        // Chapters section
        if (transcript.chapters && transcript.chapters.length > 0) {
            const chLines = transcript.chapters.map(ch => `- ${ch.time} — ${ch.title}`).join('\n');
            parts.push(`## Chapters\n\n${chLines}`);
        }
        // Full Transcript section (only if we have real transcript segments)
        if (hasTranscript) {
            let readableText = cleanMusicNotes(transcript.fullText);
            // Break into paragraphs: after sentence-ending punctuation followed by a capital letter
            readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
            // Collapse any triple+ newlines
            readableText = readableText.replace(/\n{3,}/g, '\n\n');
            parts.push(`## Full Transcript\n\n${readableText}`);
        }
        const cleanContent = parts.join('\n\n');
        return { domain: 'youtube.com', type: 'video', structured, cleanContent };
    }
    // Fall back to oEmbed if transcript failed
    if (oembedData && oembedData.title) {
        const structured = {
            title: oembedData.title,
            channel: oembedData.author_name || '',
            channelUrl: oembedData.author_url || '',
            thumbnailUrl: oembedData.thumbnail_url || '',
            description: noembedData?.description || '',
            type: oembedData.type || 'video',
            source: 'oembed',
        };
        const descSection = structured.description ? `\n\n${structured.description}` : '\n\nYouTube video';
        const cleanContent = `## 🎬 ${structured.title}\n\n**Channel:** [${structured.channel}](${structured.channelUrl})${descSection}`;
        return { domain: 'youtube.com', type: 'video', structured, cleanContent };
    }
    return null;
}