@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547)
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,461 @@
1
+ /**
2
+ * Auto-extract repeated listing patterns from HTML pages.
3
+ *
4
+ * Given raw HTML (e.g. an eBay search results page), this module detects the
5
+ * largest group of sibling elements with a consistent internal structure and
6
+ * extracts structured fields (title, price, image, link, description, rating)
7
+ * from each item.
8
+ *
9
+ * @module extract-listings
10
+ */
11
+ import { load } from 'cheerio';
12
+ /* ------------------------------------------------------------------ */
13
+ /* Internal helpers */
14
+ /* ------------------------------------------------------------------ */
15
/**
 * Lowercase tag names a repeated listing item is allowed to use.
 * Direct children of a candidate container are only grouped when their tag
 * appears in this set.
 */
const CONTAINER_CHILD_TAGS = new Set([
    'li',
    'div',
    'article',
    'section',
    'tr',
    'a',
]);
17
/**
 * Build a canonical "child-tag signature" for an element.
 *
 * The signature lists every distinct direct-child tag name together with how
 * many times it occurs, sorted by tag name and joined with commas
 * (for example `"a:1,td:3"`). Counts are part of the signature on purpose:
 * a row with three `<td>` cells and a row with two must not look alike,
 * which matters for table layouts such as Hacker News.
 *
 * @param $ - Cheerio instance bound to the document.
 * @param el - Element whose direct children are summarised.
 * @returns The signature string, or `''` when there are no tagged children.
 */
function childSignature($, el) {
    const kids = $(el).children();
    if (kids.length === 0) {
        return '';
    }
    const counts = new Map();
    kids.each((_, kid) => {
        const name = kid.tagName?.toLowerCase();
        if (!name) {
            return;
        }
        const seen = counts.get(name) || 0;
        counts.set(name, seen + 1);
    });
    const parts = [];
    const orderedNames = [...counts.keys()].sort((x, y) => x.localeCompare(y));
    for (const name of orderedNames) {
        parts.push(`${name}:${counts.get(name)}`);
    }
    return parts.join(',');
}
39
/**
 * Decide whether two child-tag signatures describe the same kind of listing
 * item.
 *
 * Signatures are compared as sets of whole `tag:count` pairs, so `"td:3"` and
 * `"td:2"` are different pairs. Two signatures are similar when:
 *   - they are identical, or
 *   - every pair of one also appears in the other (subset), or
 *   - their Jaccard index (shared pairs / all distinct pairs) is at least 0.5.
 * An empty signature is only similar to another empty signature.
 *
 * @param a - First signature, as produced by `childSignature`.
 * @param b - Second signature.
 * @returns `true` when the signatures should be clustered together.
 */
function signaturesAreSimilar(a, b) {
    if (a === b) {
        return true;
    }
    if (!a || !b) {
        return false;
    }
    const left = new Set(a.split(','));
    const right = new Set(b.split(','));
    let shared = 0;
    for (const pair of left) {
        if (right.has(pair)) {
            shared += 1;
        }
    }
    // Subset in either direction counts as similar.
    if (shared === left.size || shared === right.size) {
        return true;
    }
    // |A ∪ B| = |A| + |B| - |A ∩ B|
    const combined = left.size + right.size - shared;
    return shared / combined >= 0.5;
}
62
/**
 * Scan every element of the document and pick the best "listing container":
 * the parent whose direct children contain the strongest group of
 * structurally similar items.
 *
 * Phase 1 (structure): for each element with at least three children, the
 * children whose tag is in `CONTAINER_CHILD_TAGS` are bucketed by tag. Inside
 * a bucket, children are clustered by child-tag signature; every cluster with
 * three or more members becomes a candidate scored as
 * `size * (size / bucketSize)`. Buckets made only of childless elements become
 * a single candidate of those with more than three characters of text.
 *
 * Phase 2 (content): the eight best structural candidates are re-ranked by
 * `averageTitleLength * titleHitRate * structuralScore`, measured on the first
 * five children of each, so rows with real titles beat boilerplate rows
 * (e.g. Hacker News story rows vs. subtext rows).
 *
 * @param $ - Cheerio instance bound to the document.
 * @returns The winning candidate `{ parent, tag, children, score }`, or
 *          `null` when no element has a qualifying group.
 */
function findListingContainer($) {
    const MIN_ITEMS = 3;
    const hasMeaningfulText = (node) => $(node).text().trim().length > 3;
    const found = [];
    $('*').each((_, parent) => {
        const kids = $(parent).children();
        if (kids.length < MIN_ITEMS) {
            return; // too few children to be a repeating list
        }
        // Bucket the direct children by (allowed) tag name.
        const buckets = new Map();
        kids.each((_i, kid) => {
            const tag = kid.tagName?.toLowerCase();
            if (!tag || !CONTAINER_CHILD_TAGS.has(tag)) {
                return;
            }
            if (!buckets.has(tag)) {
                buckets.set(tag, []);
            }
            buckets.get(tag).push(kid);
        });
        buckets.forEach((siblings, tag) => {
            if (siblings.length < MIN_ITEMS) {
                return;
            }
            // Split siblings into those with element children and those without.
            const structured = [];
            const flat = [];
            for (const node of siblings) {
                const sig = childSignature($, node);
                if (sig.length > 0) {
                    structured.push({ node, sig });
                }
                else {
                    flat.push({ node, sig });
                }
            }
            if (structured.length === 0) {
                // Only text/empty siblings: accept them if enough carry text.
                const texty = siblings.filter((node) => hasMeaningfulText(node));
                if (texty.length >= MIN_ITEMS) {
                    found.push({ parent, tag, children: texty, score: texty.length });
                }
                return;
            }
            // Cluster by signature; each incompatible signature starts its own
            // cluster so that, e.g., td:3 rows and td:2 rows in one tbody are
            // both offered to the content-based ranking below.
            const clusters = [];
            for (const { node, sig } of structured) {
                const home = clusters.find((cluster) => signaturesAreSimilar(sig, cluster.repr));
                if (home) {
                    home.children.push(node);
                }
                else {
                    clusters.push({ repr: sig, children: [node] });
                }
            }
            // Earliest cluster wins ties because the comparison is strict.
            let biggest = null;
            for (const cluster of clusters) {
                if (!biggest || cluster.children.length > biggest.children.length) {
                    biggest = cluster;
                }
            }
            // Childless siblings with real text join the biggest cluster.
            for (const { node } of flat) {
                if (biggest && hasMeaningfulText(node)) {
                    biggest.children.push(node);
                }
            }
            for (const cluster of clusters) {
                const size = cluster.children.length;
                if (size < MIN_ITEMS) {
                    continue;
                }
                const consistency = size / siblings.length;
                found.push({ parent, tag, children: cluster.children, score: size * consistency });
            }
        });
    });
    if (found.length === 0) {
        return null;
    }
    // Structural ranking: score first, then raw member count.
    found.sort((x, y) => (y.score - x.score) || (y.children.length - x.children.length));
    // Content re-ranking over the top eight candidates.
    let winner = null;
    let winnerQuality = -1;
    for (const cand of found.slice(0, 8)) {
        const sample = cand.children.slice(0, 5);
        const titleLengths = sample
            .map((node) => extractItem($, node).title)
            .filter((title) => title && title.length >= 3)
            .map((title) => title.length);
        const lengthTotal = titleLengths.reduce((sum, len) => sum + len, 0);
        const averageLength = titleLengths.length > 0 ? lengthTotal / titleLengths.length : 0;
        const hitRate = titleLengths.length / sample.length;
        const quality = averageLength * hitRate * cand.score;
        if (quality > winnerQuality) {
            winnerQuality = quality;
            winner = cand;
        }
    }
    return winner;
}
178
/**
 * Price-matching regex: a currency symbol followed by an amount ("$12.34",
 * "£99", "€5,00") or an amount followed by a currency code ("1,000 USD").
 *
 * The amount must start with a digit and commas are only accepted between
 * digit groups, so stray punctuation is never captured: "$5, free shipping"
 * yields "$5" (not "$5,") and "$ ," does not match at all.
 */
const PRICE_RE = /[$£€¥₹]\s*\d+(?:,\d+)*(?:\.\d{1,2})?|\d+(?:,\d+)*(?:\.\d{1,2})?\s*(?:USD|EUR|GBP|JPY|INR)/i;
180
/**
 * Marketplace-injected labels that can appear at the START of a title
 * (e.g. eBay's "New Listing"). Every pattern is anchored with `^` and also
 * consumes the whitespace that follows the label.
 */
const TITLE_STRIP_PREFIXES = [
    /^New\s+Listing\s*/i,
    /^Sponsored\s*/i,
    /^Opens\s+in\s+(?:a\s+)?new\s+(?:window|tab)(?:\s+or\s+(?:window|tab))?\s*/i,
    /^Advertisement\s*/i,
    /^Ad\s*[-–—:·]\s*/i,
    /^Promoted\s*[-–—:·]?\s*/i,
];
/**
 * Lowercase single words that are section headers or labels rather than
 * listing titles (e.g. Amazon's "Results" header, eBay's "Sponsored" label).
 */
const HEADER_WORDS = new Set([
    'results', 'sponsored', 'related', 'advertisement', 'shop', 'browse',
    'featured', 'popular', 'trending', 'new', 'sale', 'deals', 'more',
    'filters', 'sort', 'categories', 'departments', 'navigation',
]);
/**
 * Tell whether a title looks like a section header or junk instead of a real
 * listing title.
 *
 * A title is rejected when it is empty, has three characters or fewer, is a
 * bare number / rank such as "10." or "100", or is a single word listed in
 * `HEADER_WORDS` (case-insensitive).
 *
 * @param title - Candidate title text.
 * @returns `true` when the title should be discarded.
 */
function isHeaderOrJunk(title) {
    return (!title ||
        title.length <= 3 ||
        /^\d+\.?$/.test(title) ||
        (!/\s/.test(title) && HEADER_WORDS.has(title.toLowerCase())));
}
/**
 * Labels that can appear at the END of a title, mostly accessibility text
 * embedded in links (e.g. eBay's "Opens in a new window or tab"). Every
 * pattern is anchored with `$` and also consumes the preceding whitespace.
 */
const TITLE_STRIP_SUFFIXES = [
    // "Opens in new window" / "Opens in a new tab" / "Opens in new window or tab" etc.
    /\s*Opens\s+in\s+(?:a\s+)?new\s+(?:window|tab)(?:\s+or\s+(?:window|tab))?$/i,
    // Parenthesized variant: "(opens in a new window)"
    /\s*\(opens\s+(?:in\s+)?(?:a\s+)?new\s+(?:window|tab)\)$/i,
    // Dash variant: "- New window"
    /\s*[-–—]\s*New\s+window$/i,
    /\s*Sponsored$/i,
];
/**
 * Remove trailing artifacts that travel/hotel aggregators concatenate into a
 * title (e.g. Google Travel appending price, source and rating).
 *
 * Applied in order, trimming after each step:
 *   1. a price and everything after it ("$149 Booking.com...", "$179Hyatt...");
 *   2. an "N/5 (" rating and everything after it ("4.2/5 (1.4K)"). The
 *      opening parenthesis is required; "4.5 out of 5" is NOT matched;
 *   3. a star class introduced by "·", "-" or "–" ("· 3-star hotel");
 *   4. a known booking-site domain and everything after it ("Expedia.com").
 *
 * @param title - Title text to clean.
 * @returns The cleaned title, or the original `title` when cleaning would
 *          leave nothing.
 */
function cleanConcatenatedTitle(title) {
    const trailingArtifacts = [
        /[\$£€]\d[\d,.]*(?:\s+[A-Z].*|\S+.*)?$/i,
        /\d+\.?\d*\/5\s*\(.*$/i,
        /\s*[·\-–]\s*\d+-?star\s.*$/i,
        /(?:Booking|Expedia|Hotels|Kayak|Trivago|Priceline|Agoda)\.com.*$/i,
    ];
    const cleaned = trailingArtifacts.reduce((text, pattern) => text.replace(pattern, '').trim(), title);
    return cleaned || title; // fall back to the original if over-stripped
}
/**
 * Strip known marketplace prefixes and suffixes from a title, then clean up
 * concatenated aggregator artifacts.
 *
 * Labels can be stacked in any order ("Sponsored New Listing ...",
 * "... Opens in a new window or tab Sponsored"), so the prefix and suffix
 * passes are repeated until the text stops changing. A single ordered pass
 * would leave the inner label behind. Every successful replacement shortens
 * the string, so the loop always terminates.
 *
 * @param title - Raw (already trimmed) title text.
 * @returns The cleaned title; may be `''` when the text was only labels.
 */
function stripTitlePrefixes(title) {
    let current = title;
    let previous;
    do {
        previous = current;
        for (const pattern of TITLE_STRIP_PREFIXES) {
            current = current.replace(pattern, '');
        }
        for (const pattern of TITLE_STRIP_SUFFIXES) {
            current = current.replace(pattern, '');
        }
    } while (current !== previous);
    return cleanConcatenatedTitle(current.trim());
}
262
/**
 * Resolve a possibly relative URL against a base URL.
 *
 * `data:` and `javascript:` URLs are rejected. The scheme check mirrors what
 * a URL parser does before reading the scheme: leading whitespace/control
 * characters are ignored, embedded tab/newline characters are removed, and
 * the comparison is case-insensitive, so `"JavaScript:..."` and
 * `"  javascript:..."` are rejected too.
 *
 * @param href - Raw attribute value (may be empty or undefined).
 * @param baseUrl - Optional absolute URL used to resolve relative values.
 * @returns `undefined` when `href` is empty or uses a rejected scheme; `href`
 *          unchanged when there is no `baseUrl` or resolution throws;
 *          otherwise the absolute URL string.
 */
function resolveUrl(href, baseUrl) {
    if (!href) {
        return undefined;
    }
    const schemeProbe = href.replace(/^[\u0000-\u0020]+/, '').replace(/[\t\n\r]/g, '');
    if (/^(?:data|javascript):/i.test(schemeProbe)) {
        return undefined;
    }
    if (!baseUrl) {
        return href;
    }
    try {
        return new URL(href, baseUrl).href;
    }
    catch {
        // Unparseable href or base: hand back the raw value.
        return href;
    }
}
280
/**
 * Extract a single listing item from a DOM element.
 *
 * Fields are filled in this order:
 *   - `title`: first heading with 3+ characters of text; otherwise a
 *     title/name-classed element (those containing a link are preferred, using
 *     the link text); otherwise the first `<a>` with 3+ characters of text.
 *     Titles are passed through `stripTitlePrefixes`.
 *   - `price`: text of the first price-classed / `[data-price]` element
 *     (narrowed to the `PRICE_RE` match when there is one), else the first
 *     `PRICE_RE` match in the element's whole text.
 *   - `image`: first `<img>`, taking the first of `src`, `data-src`,
 *     `data-lazy-src` that resolves to a usable URL.
 *   - `link`: link belonging to the title element when available, else the
 *     first `a[href]`, else the element's own `href` when it is an `<a>`.
 *   - `rating`: `aria-label` or text of the first rating/star element.
 *   - `description`: up to two distinct text snippets (longer than 5
 *     characters) that are not the title, price or rating.
 *
 * @param $ - Cheerio instance bound to the document.
 * @param el - Element representing one listing item.
 * @param baseUrl - Optional base URL for resolving relative links and images.
 * @returns Object holding whichever of the fields above could be found.
 */
function extractItem($, el, baseUrl) {
    const $el = $(el);
    const item = {};
    // --- Title + title-source element (used later for the preferred link) ---
    let titleSourceEl = null;
    const heading = $el.find('h1, h2, h3, h4, h5, h6').first();
    if (heading.length && heading.text().trim().length >= 3) {
        item.title = stripTitlePrefixes(heading.text().trim());
        titleSourceEl = heading;
    }
    else {
        // Look at ALL title/name class matches, not just the first: some sites
        // put "title" on several cells (Hacker News has a rank cell and a
        // title cell, both class="title").
        const titleCandidates = $el.find('[class*="title"], [class*="name"], [class*="Title"], [class*="Name"]');
        // Pass 1: candidates containing a link. This skips rank numbers like
        // "10." that precede the real title cell.
        titleCandidates.each((_, tc) => {
            if (item.title)
                return;
            const $tc = $(tc);
            const innerLink = $tc.find('a').first();
            if (!innerLink.length)
                return; // no link: left for pass 2
            const candidateText = innerLink.text().trim();
            if (candidateText.length >= 3) {
                item.title = stripTitlePrefixes(candidateText);
                titleSourceEl = $tc;
            }
        });
        // Pass 2: linkless candidates, with a higher length threshold so short
        // rank/label text is not mistaken for a title.
        if (!item.title) {
            titleCandidates.each((_, tc) => {
                if (item.title)
                    return;
                const $tc = $(tc);
                const innerLink = $tc.find('a').first();
                if (innerLink.length)
                    return; // has a link: already considered in pass 1
                const candidateText = $tc.text().trim();
                if (candidateText.length >= 8) {
                    item.title = stripTitlePrefixes(candidateText);
                    titleSourceEl = $tc;
                }
            });
        }
        // Last resort: the first <a> with meaningful text.
        if (!item.title) {
            $el.find('a').each((_, a) => {
                if (item.title)
                    return;
                const text = $(a).text().trim();
                if (text.length >= 3) {
                    item.title = stripTitlePrefixes(text);
                    titleSourceEl = $(a);
                }
            });
        }
    }
    // --- Price ---
    const priceEl = $el.find('[class*="price"], [class*="Price"], [data-price]').first();
    if (priceEl.length) {
        const priceText = priceEl.text().trim();
        const match = priceText.match(PRICE_RE);
        item.price = match ? match[0] : priceText;
    }
    else {
        // No dedicated price element: scan the whole item text.
        const match = $el.text().match(PRICE_RE);
        if (match) {
            item.price = match[0];
        }
    }
    // --- Image ---
    const img = $el.find('img').first();
    if (img.length) {
        // Lazy-loading pages often put a `data:` placeholder in `src` and the
        // real URL in `data-src` / `data-lazy-src`. `resolveUrl` rejects the
        // placeholder, so keep trying attributes until one resolves.
        let image;
        for (const attrName of ['src', 'data-src', 'data-lazy-src']) {
            image = resolveUrl(img.attr(attrName), baseUrl);
            if (image)
                break;
        }
        item.image = image;
    }
    // --- Link ---
    // Prefer the link tied to the title element so that vote/action links that
    // come earlier in the DOM (e.g. Hacker News vote arrows) are not picked.
    let primaryLink = null;
    if (titleSourceEl) {
        const titleElTag = titleSourceEl.prop('tagName')?.toLowerCase();
        if (titleElTag === 'a') {
            // The title source IS the link itself.
            primaryLink = titleSourceEl;
        }
        else {
            const innerLink = titleSourceEl.find('a[href]').first();
            if (innerLink.length)
                primaryLink = innerLink;
        }
    }
    if (!primaryLink || !primaryLink.length) {
        primaryLink = $el.find('a[href]').first();
    }
    if (primaryLink && primaryLink.length) {
        item.link = resolveUrl(primaryLink.attr('href'), baseUrl);
    }
    // If the listing element itself is an <a>, fall back to its own href.
    if (!item.link && $el.prop('tagName')?.toLowerCase() === 'a') {
        item.link = resolveUrl($el.attr('href'), baseUrl);
    }
    // --- Rating ---
    const ratingEl = $el.find('[class*="rating"], [class*="Rating"], [class*="star"], [class*="Star"], [aria-label*="star"], [aria-label*="rating"]').first();
    if (ratingEl.length) {
        const ariaLabel = ratingEl.attr('aria-label');
        item.rating = ariaLabel || ratingEl.text().trim() || undefined;
    }
    // --- Description ---
    // Collect remaining text that is not already used as title/price/rating.
    const usedTexts = new Set();
    if (item.title)
        usedTexts.add(item.title);
    if (item.price)
        usedTexts.add(item.price);
    if (item.rating)
        usedTexts.add(item.rating);
    const descParts = [];
    $el.find('p, span, [class*="desc"], [class*="Desc"], [class*="subtitle"], [class*="snippet"]').each((_, descEl) => {
        const text = $(descEl).text().trim();
        if (text.length > 5 && !usedTexts.has(text) && text !== item.title) {
            descParts.push(text);
            usedTexts.add(text);
        }
    });
    if (descParts.length > 0) {
        item.description = descParts.slice(0, 2).join(' ');
    }
    return item;
}
423
+ /* ------------------------------------------------------------------ */
424
+ /* Public API */
425
+ /* ------------------------------------------------------------------ */
426
/**
 * Detect the dominant repeated listing pattern in raw HTML and return one
 * structured item per listing.
 *
 * Items without a usable title (missing, shorter than three characters, or
 * recognised as a header/junk word) are dropped.
 *
 * @param html - Raw HTML string to parse.
 * @param url - Optional base URL for resolving relative links and images.
 * @returns Array of extracted listing items (empty when the input is blank or
 *          no listing container is found).
 *
 * @example
 * ```typescript
 * import { extractListings } from 'webpeel';
 *
 * const items = extractListings(ebayHtml, 'https://ebay.com/sch?q=card');
 * console.log(items[0].title); // "Charizard VMAX 020/189"
 * console.log(items[0].price); // "$24.99"
 * ```
 */
export function extractListings(html, url) {
    const isBlank = !html || html.trim().length === 0;
    if (isBlank) {
        return [];
    }
    const $ = load(html);
    const container = findListingContainer($);
    if (!container) {
        return [];
    }
    const hasUsableTitle = (item) => Boolean(item.title) && item.title.length >= 3 && !isHeaderOrJunk(item.title);
    return container.children
        .map((node) => extractItem($, node, url))
        .filter(hasUsableTitle);
}
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Structured data extraction using CSS selectors and heuristics
3
+ */
4
+ import type { ExtractOptions } from '../types.js';
5
/**
 * Extract structured data from page content by asking an LLM through an
 * OpenAI-compatible chat-completions endpoint.
 *
 * @param content - Page content sent to the model.
 * @param options - Extraction options; the implementation requires
 *                  `llmApiKey` and at least one of `prompt` or `schema`.
 * @returns Promise resolving to the JSON object parsed from the model reply.
 */
export declare function extractWithLLM(content: string, options: ExtractOptions): Promise<Record<string, any>>;
9
/**
 * Extract structured data from HTML without an LLM, using explicit CSS
 * selectors (`options.selectors`) and/or field-name heuristics driven by
 * `options.schema`.
 *
 * @param html - Raw HTML string to parse.
 * @param options - Extraction options carrying `selectors` and/or `schema`.
 * @returns Object keyed by field name; fields that could not be found are `null`.
 */
export declare function extractStructured(html: string, options: ExtractOptions): Record<string, any>;
@@ -0,0 +1,139 @@
1
+ /**
2
+ * Structured data extraction using CSS selectors and heuristics
3
+ */
4
+ import { load } from 'cheerio';
5
/**
 * Extract structured data using an LLM (OpenAI-compatible API).
 *
 * Sends the (possibly truncated) content to `POST {llmBaseUrl}/chat/completions`
 * with a system prompt built from `schema` (preferred) or `prompt`, requests a
 * JSON-object response, and returns the parsed JSON.
 *
 * @param content - Page content to extract from; only the first 16000
 *                  characters are sent.
 * @param options - `llmApiKey` (required), `prompt` and/or `schema` (at least
 *                  one required), `llmModel` (default `'gpt-4o-mini'`),
 *                  `llmBaseUrl` (default `'https://api.openai.com/v1'`).
 * @returns The JSON value parsed from the model's reply.
 * @throws Error when `llmApiKey` is missing, when neither `prompt` nor
 *         `schema` is given, when the API answers with a non-OK status, when
 *         the reply has no content, or when the reply is not valid JSON.
 */
export async function extractWithLLM(content, options) {
    const { prompt, schema, llmApiKey, llmModel = 'gpt-4o-mini', llmBaseUrl = 'https://api.openai.com/v1' } = options;
    if (!llmApiKey)
        throw new Error('LLM extraction requires llmApiKey');
    if (!prompt && !schema)
        throw new Error('LLM extraction requires prompt or schema');
    // Truncate content to ~4000 tokens to keep costs low
    const maxChars = 16000;
    const truncatedContent = content.length > maxChars
        ? content.slice(0, maxChars) + '\n\n[Content truncated]'
        : content;
    const systemPrompt = schema
        ? `Extract structured data from the following web page content. Return a JSON object matching this schema:\n${JSON.stringify(schema, null, 2)}\n\nReturn ONLY valid JSON, no explanation.`
        : `Extract structured data from the following web page content based on this instruction: ${prompt}\n\nReturn ONLY valid JSON, no explanation.`;
    // Drop trailing slashes so a base URL such as "https://host/v1/" does not
    // produce "https://host/v1//chat/completions".
    const endpoint = `${String(llmBaseUrl).replace(/\/+$/, '')}/chat/completions`;
    // Loaded lazily so undici is only required when LLM extraction is used.
    const { fetch: undiciFetch } = await import('undici');
    const response = await undiciFetch(endpoint, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
            'Authorization': `Bearer ${llmApiKey}`,
        },
        body: JSON.stringify({
            model: llmModel,
            messages: [
                { role: 'system', content: systemPrompt },
                { role: 'user', content: truncatedContent },
            ],
            temperature: 0,
            response_format: { type: 'json_object' },
        }),
    });
    if (!response.ok) {
        const errorText = await response.text();
        throw new Error(`LLM API error ${response.status}: ${errorText}`);
    }
    const result = await response.json();
    const responseContent = result.choices?.[0]?.message?.content;
    if (!responseContent) {
        throw new Error('LLM returned empty response');
    }
    try {
        return JSON.parse(responseContent);
    }
    catch {
        // Include only the start of the reply to keep the message readable.
        throw new Error(`LLM returned invalid JSON: ${responseContent.slice(0, 200)}`);
    }
}
55
/**
 * Extract structured data from HTML using CSS selectors and heuristics.
 *
 * Two sources are applied in order:
 *   1. `options.selectors`: a map of field name to CSS selector. A trailing
 *      `@attr` (e.g. `"a@href"`) extracts that attribute instead of text. One
 *      match yields a single value, several matches yield an array, and no
 *      match yields `null`.
 *   2. `options.schema`: for every property not already filled, a list of
 *      common selectors derived from the lowercased field name is tried
 *      (`itemprop`, `data-*`, class, id, class substring, `<meta>` name and
 *      Open Graph property). The first usable value is coerced according to
 *      the property's `type` (`number`, `boolean`, `array`, otherwise
 *      string). Unfilled properties become `null`.
 *
 * Field names are interpolated into the heuristic selectors unescaped, so a
 * name containing selector-special characters (e.g. `"og:title"`) can build
 * an invalid selector. Such candidates are skipped rather than allowed to
 * abort the whole extraction.
 *
 * @param html - Raw HTML string to parse.
 * @param options - Extraction options carrying `selectors` and/or `schema`.
 * @returns Object keyed by field name.
 */
export function extractStructured(html, options) {
    const $ = load(html);
    const result = {};
    if (options.selectors) {
        // Direct CSS selector extraction
        for (const [field, selectorRaw] of Object.entries(options.selectors)) {
            // Support @attr syntax: "a@href" extracts the href attribute from <a> elements
            const attrMatch = selectorRaw.match(/^(.+?)@([a-zA-Z-]+)$/);
            const cssSelector = attrMatch ? attrMatch[1] : selectorRaw;
            const attrName = attrMatch ? attrMatch[2] : null;
            const elements = $(cssSelector);
            if (elements.length === 0) {
                result[field] = null;
            }
            else if (attrName) {
                // Extract attribute value(s)
                if (elements.length === 1) {
                    result[field] = elements.first().attr(attrName) ?? null;
                }
                else {
                    result[field] = elements.map((_, el) => $(el).attr(attrName) ?? null).get();
                }
            }
            else if (elements.length === 1) {
                result[field] = elements.first().text().trim();
            }
            else {
                result[field] = elements.map((_, el) => $(el).text().trim()).get();
            }
        }
    }
    if (options.schema) {
        // Schema-based extraction using heuristics
        const properties = options.schema.properties || options.schema;
        for (const [field, spec] of Object.entries(properties)) {
            if (result[field] !== undefined)
                continue; // Already extracted by selector
            // Try common CSS patterns based on field name
            const fieldLower = field.toLowerCase();
            const candidates = [
                `[itemprop="${fieldLower}"]`,
                `[data-${fieldLower}]`,
                `.${fieldLower}`,
                `#${fieldLower}`,
                `[class*="${fieldLower}"]`,
                `meta[name="${fieldLower}"]`,
                `meta[property="og:${fieldLower}"]`,
            ];
            for (const sel of candidates) {
                let matches;
                try {
                    matches = $(sel);
                }
                catch {
                    // Field name produced an invalid selector: try the next pattern.
                    continue;
                }
                const el = matches.first();
                if (el.length === 0)
                    continue;
                const value = el.attr('content') || el.text().trim();
                if (!value)
                    continue;
                // Type coercion based on schema
                if (spec?.type === 'number') {
                    const num = parseFloat(value.replace(/[^0-9.-]/g, ''));
                    if (!isNaN(num)) {
                        result[field] = num;
                        break;
                    }
                    // Not numeric: fall through to the next candidate selector.
                }
                else if (spec?.type === 'boolean') {
                    result[field] = ['true', 'yes', '1'].includes(value.toLowerCase());
                    break;
                }
                else if (spec?.type === 'array') {
                    // For arrays, collect the text of every match
                    result[field] = matches.map((_, e) => $(e).text().trim()).get();
                    break;
                }
                else {
                    result[field] = value;
                    break;
                }
            }
            if (result[field] === undefined) {
                result[field] = null;
            }
        }
    }
    return result;
}
@@ -0,0 +1,57 @@
1
+ /**
2
+ * In-memory LRU fetch cache for WebPeel
3
+ *
4
+ * Caches pipeline results to avoid redundant fetches for identical requests.
5
+ * Supports TTL-based expiry and LRU eviction when maxEntries is exceeded.
6
+ * Exported as a singleton: import { fetchCache } from './fetch-cache.js'
7
+ */
8
/**
 * One cached pipeline result.
 * NOTE(review): field meanings below are inferred from names only; confirm
 * against the code that writes cache entries.
 */
export interface FetchCacheEntry {
    /** Cached page content. */
    content: string;
    /** Page title. */
    title: string;
    /** Arbitrary metadata stored with the result (shape not declared here). */
    metadata: any;
    /** Presumably the fetch method that produced the result; TODO confirm. */
    method: string;
    /** Presumably a token count for `content`; TODO confirm. */
    tokens: number;
    /** Optional links stored with the result (element shape not declared here). */
    links?: any[];
    /** Presumably when the entry was stored; units not visible here, TODO confirm. */
    timestamp: number;
}
17
/** Snapshot of cache counters, as returned by `FetchCache.stats()`. */
export interface FetchCacheStats {
    /** Number of entries; presumably those currently held, TODO confirm. */
    size: number;
    /** Number of hits counted; presumably lookups that returned an entry, TODO confirm. */
    hits: number;
    /** Number of misses counted; presumably lookups that returned nothing, TODO confirm. */
    misses: number;
    /** Hit ratio in the range [0, 1]. */
    hitRate: number;
}
23
/**
 * In-memory cache with TTL-based expiry and LRU eviction.
 * (Declaration only; behaviour notes below restate the documented contract.)
 */
export declare class FetchCache {
    private cache;
    private maxEntries;
    private defaultTTL;
    private hits;
    private misses;
    /**
     * @param maxEntries - Capacity above which the least recently used entry
     *                     is evicted (default not visible in this declaration).
     * @param defaultTTLSeconds - Entry lifetime in seconds (default not
     *                            visible in this declaration).
     */
    constructor(maxEntries?: number, defaultTTLSeconds?: number);
    /**
     * Build a stable cache key from the URL plus the fetch options that
     * affect the result, so different option combinations are cached
     * separately.
     */
    getKey(url: string, options?: {
        render?: boolean;
        stealth?: boolean;
        budget?: number;
    }): string;
    /**
     * Look up an entry. Returns `null` when the key is missing or the entry
     * has expired. A hit moves the entry to the end of the Map, marking it as
     * most recently used.
     */
    get(key: string): FetchCacheEntry | null;
    /**
     * Store an entry. When the cache is at capacity, the least recently used
     * entry is evicted.
     */
    set(key: string, entry: FetchCacheEntry): void;
    /** Remove every entry and reset the hit/miss statistics. */
    clear(): void;
    /** Return cache statistics; `hitRate` is in [0, 1]. */
    stats(): FetchCacheStats;
}
54
/** Shared fetch-result cache used by all requests (5 min TTL, 500 entries). */
export declare const fetchCache: FetchCache;
/** Shared search-result cache; uses a shorter 60 s TTL because results change faster. */
export declare const searchCache: FetchCache;