@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,66 @@
1
+ /**
2
+ * Schema-based extraction using CSS selectors.
3
+ *
4
+ * Each schema defines how to extract listings from a specific domain,
5
+ * inspired by Crawl4AI's JsonCssExtractionStrategy. Unlike generic
6
+ * auto-detection, schemas provide exact selectors for each site's DOM.
7
+ *
8
+ * @module schema-extraction
9
+ */
10
+ export interface SchemaField {
11
+ /** Field name in output (e.g., "title", "price", "rating") */
12
+ name: string;
13
+ /** CSS selector relative to baseSelector. Empty string selects the base element itself. */
14
+ selector: string;
15
+ /** What to extract */
16
+ type: 'text' | 'attribute' | 'html' | 'exists';
17
+ /** For type='attribute', which attribute to read */
18
+ attribute?: string;
19
+ /** Extract all matches (returns array instead of first match) */
20
+ multiple?: boolean;
21
+ /** Optional transform to apply after extraction */
22
+ transform?: 'trim' | 'number' | 'stripCurrency';
23
+ }
24
+ export interface ExtractionSchema {
25
+ /** Human-readable schema name (e.g., "Booking.com Hotel Search") */
26
+ name: string;
27
+ /** Schema version string */
28
+ version: string;
29
+ /** Matching domains (e.g., ["booking.com", "www.booking.com"]) */
30
+ domains: string[];
31
+ /** Optional URL path patterns (regex strings) for more specific matching */
32
+ urlPatterns?: string[];
33
+ /** CSS selector for each listing item */
34
+ baseSelector: string;
35
+ /** Fields to extract from each item */
36
+ fields: SchemaField[];
37
+ /** Optional pagination config */
38
+ pagination?: {
39
+ nextSelector?: string;
40
+ pageParam?: string;
41
+ };
42
+ }
43
+ /** A single extracted item — field names map to extracted values */
44
+ export interface ExtractedItem {
45
+ [key: string]: string | string[] | boolean | number | undefined;
46
+ }
47
+ /**
48
+ * Load all bundled schemas.
49
+ */
50
+ export declare function loadBundledSchemas(): ExtractionSchema[];
51
+ /**
52
+ * Find a matching schema for a given URL.
53
+ *
54
+ * Matches by domain first, then optionally by URL patterns (regex).
55
+ * Returns the first matching schema or null.
56
+ */
57
+ export declare function findSchemaForUrl(url: string): ExtractionSchema | null;
58
+ /**
59
+ * Extract listings from HTML using a schema's CSS selectors.
60
+ *
61
+ * @param html - Raw HTML string to parse
62
+ * @param schema - Extraction schema to use
63
+ * @param baseUrl - Optional base URL for resolving relative links
64
+ * @returns Array of extracted items (may be empty)
65
+ */
66
+ export declare function extractWithSchema(html: string, schema: ExtractionSchema, baseUrl?: string): ExtractedItem[];
@@ -0,0 +1,352 @@
1
+ /**
2
+ * Schema-based extraction using CSS selectors.
3
+ *
4
+ * Each schema defines how to extract listings from a specific domain,
5
+ * inspired by Crawl4AI's JsonCssExtractionStrategy. Unlike generic
6
+ * auto-detection, schemas provide exact selectors for each site's DOM.
7
+ *
8
+ * @module schema-extraction
9
+ */
10
+ import { load } from 'cheerio';
11
+ /* ------------------------------------------------------------------ */
12
+ /* Bundled schemas (hardcoded to avoid JSON import complications) */
13
+ /* ------------------------------------------------------------------ */
14
/** Build a `type: 'text'` field; the `transform` key is only present when one is given. */
function textField(name, selector, transform) {
    return transform === undefined
        ? { name, selector, type: 'text' }
        : { name, selector, type: 'text', transform };
}
/** Build a `type: 'attribute'` field that reads `attribute` from the matched element. */
function attributeField(name, selector, attribute) {
    return { name, selector, type: 'attribute', attribute };
}
const BOOKING_COM_SCHEMA = {
    name: 'Booking.com Hotel Search',
    version: '1.0',
    domains: ['booking.com', 'www.booking.com'],
    urlPatterns: ['searchresults'],
    baseSelector: "[data-testid='property-card']",
    fields: [
        textField('title', "[data-testid='title'], .sr-hotel__name, h3 a"),
        textField('price', "[data-testid='price-and-discounted-price'], .bui-price-display__value, [data-testid='price-for-x-nights']", 'trim'),
        textField('rating', "[data-testid='review-score'] div:first-child, .bui-review-score__badge"),
        textField('reviewCount', "[data-testid='review-score'] div:nth-child(2) div:nth-child(2), .bui-review-score__text"),
        textField('location', "[data-testid='address'], .sr_card_address_line"),
        attributeField('link', "a[data-testid='title-link'], h3 a, a.hotel_name_link", 'href'),
        attributeField('image', "img[data-testid='image'], img.hotel_image", 'src'),
        textField('stars', "[data-testid='rating-stars'] span, .bui-star-rating .bui-star-rating__star"),
    ],
};
const AMAZON_COM_SCHEMA = {
    name: 'Amazon Product Search',
    version: '1.0',
    domains: ['amazon.com', 'www.amazon.com', 'amazon.co.uk', 'amazon.de', 'amazon.fr', 'amazon.ca'],
    urlPatterns: ['/s\\?', '/s/'],
    baseSelector: "[data-component-type='s-search-result']",
    fields: [
        textField('title', 'h2 a span, h2 span a span'),
        textField('price', '.a-price .a-offscreen'),
        textField('originalPrice', '.a-price.a-text-price .a-offscreen'),
        textField('rating', '.a-icon-star-small .a-icon-alt, .a-icon-star-mini .a-icon-alt'),
        textField('reviewCount', "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span, .a-size-base.s-underline-text"),
        attributeField('link', 'h2 a', 'href'),
        attributeField('image', '.s-image', 'src'),
        { name: 'sponsored', selector: '.puis-sponsored-label-text', type: 'exists' },
        // Empty selector: the attribute is read from the listing element itself.
        attributeField('asin', '', 'data-asin'),
    ],
};
const EBAY_COM_SCHEMA = {
    name: 'eBay Search Results',
    version: '1.0',
    domains: ['ebay.com', 'www.ebay.com'],
    urlPatterns: ['/sch/'],
    baseSelector: '.s-item, [data-viewport]',
    fields: [
        textField('title', '.s-item__title span, .s-item__title'),
        textField('price', '.s-item__price'),
        attributeField('link', '.s-item__link, a.s-item__link', 'href'),
        attributeField('image', '.s-item__image-wrapper img, .s-item__image img', 'src'),
        textField('condition', '.SECONDARY_INFO'),
        textField('shipping', '.s-item__shipping, .s-item__freeXDays'),
        textField('seller', '.s-item__seller-info-text'),
    ],
};
const YELP_COM_SCHEMA = {
    name: 'Yelp Business Search',
    version: '1.0',
    domains: ['yelp.com', 'www.yelp.com'],
    urlPatterns: ['/search'],
    baseSelector: "[data-testid='serp-ia-card'], li.border-color--default",
    fields: [
        textField('title', "a[href*='/biz/'] span, h3 a span"),
        attributeField('rating', "[aria-label*='star rating'], .i-stars", 'aria-label'),
        textField('reviewCount', ".reviewCount, span[class*='css-']"),
        textField('price', '.priceRange, span.priceRange'),
        textField('category', ".priceCategory span, p[class*='css-'] a"),
        attributeField('link', "a[href*='/biz/']", 'href'),
        textField('address', "address, span[class*='css-']"),
    ],
};
const WALMART_COM_SCHEMA = {
    name: 'Walmart Product Search',
    version: '1.0',
    domains: ['walmart.com', 'www.walmart.com'],
    urlPatterns: ['/search'],
    baseSelector: "[data-testid='list-view'] > div, [data-item-id]",
    fields: [
        textField('title', "a[link-identifier] span, [data-automation-id='product-title']"),
        textField('price', "[data-automation-id='product-price'] .f2, [itemprop='price']"),
        textField('rating', "[data-testid='product-ratings'] .w_iUH7, .stars-reviews-count"),
        attributeField('link', "a[link-identifier], a[href*='/ip/']", 'href'),
        attributeField('image', "img[data-testid='productTileImage'], img[loading]", 'src'),
        textField('seller', "[data-automation-id='fulfillment-badge']"),
    ],
};
const HACKERNEWS_SCHEMA = {
    name: 'Hacker News',
    version: '1.0',
    domains: ['news.ycombinator.com'],
    baseSelector: 'tr.athing',
    fields: [
        textField('title', '.titleline a'),
        attributeField('link', '.titleline a', 'href'),
        textField('rank', '.rank'),
        textField('site', '.sitestr'),
    ],
};
const EXPEDIA_COM_SCHEMA = {
    name: 'Expedia Hotel Search',
    version: '1.0',
    domains: ['expedia.com', 'www.expedia.com'],
    urlPatterns: ['Hotel-Search', 'hotel-search'],
    baseSelector: "[data-stid='property-listing'], li.uitk-spacing[class*='uitk-spacing'], [data-stid='lodging-card-responsive']",
    fields: [
        textField('title', "[data-stid='content-hotel-title'], .uitk-heading-5, .uitk-heading-6, h3[class*='uitk-heading']"),
        textField('price', "[data-stid='price-summary'] .uitk-type-500, [data-stid='price-summary-message-total'], .uitk-type-500", 'trim'),
        textField('rating', "[data-stid='star-rating-msg'], .uitk-badge-base, [aria-label*='out of']"),
        textField('reviewCount', "[data-stid='review-info-text'], .uitk-type-200"),
        textField('location', "[data-stid='location-info'], [data-stid='neighborhood-name']"),
        attributeField('link', "a[data-stid='open-hotel-information'], a[href*='/h/'], a.uitk-card-link", 'href'),
        attributeField('image', "img[data-stid='image'], .uitk-image-media img", 'src'),
    ],
};
/** Every bundled schema, listed in priority order. */
const BUNDLED_SCHEMAS = [
    BOOKING_COM_SCHEMA,
    AMAZON_COM_SCHEMA,
    EBAY_COM_SCHEMA,
    YELP_COM_SCHEMA,
    WALMART_COM_SCHEMA,
    HACKERNEWS_SCHEMA,
    EXPEDIA_COM_SCHEMA,
];
134
+ /* ------------------------------------------------------------------ */
135
+ /* Helpers */
136
+ /* ------------------------------------------------------------------ */
137
/**
 * Apply an optional post-extraction transform to an extracted string.
 *
 * - `'trim'` strips surrounding whitespace.
 * - `'number'` parses the FIRST numeric token in the string (commas, and
 *   whitespace before a three-digit group, are treated as thousands
 *   separators). Signs are ignored. If no number can be parsed, the
 *   original string is returned unchanged.
 * - `'stripCurrency'` removes every character except digits, `.` and `,`.
 *
 * @param value - Extracted string value
 * @param transform - Transform name; a falsy or unrecognised value is a no-op
 * @returns The transformed value (a number only for a successful `'number'`)
 */
function applyTransform(value, transform) {
    switch (transform) {
        case 'trim':
            return value.trim();
        case 'number': {
            // Stripping every non-digit from the whole string would glue
            // unrelated numbers together ("4.5 out of 5" -> 4.55), so only
            // the first numeric token is parsed.
            const token = /\d[\d,]*(?:\s\d{3}(?!\d))*(?:\.\d+)?|\.\d+/.exec(value);
            if (token === null)
                return value;
            const num = parseFloat(token[0].replace(/[,\s]/g, ''));
            return Number.isNaN(num) ? value : num;
        }
        case 'stripCurrency':
            return value.replace(/[^\d.,]/g, '').trim();
        default:
            return value;
    }
}
156
/**
 * Resolve a potentially relative URL against a base URL.
 *
 * `data:` and `javascript:` URLs are rejected. The scheme check ignores
 * case, leading whitespace/control characters and embedded tab/newline
 * characters, because URL parsers strip those before reading the scheme
 * (so "JavaScript:..." and " javascript:..." are rejected too).
 *
 * @param href - Raw attribute value; may be undefined or empty
 * @param baseUrl - Optional base URL for resolving relative references
 * @returns The absolute URL when `baseUrl` is given and parsing succeeds;
 *          the original `href` when there is no base or parsing fails;
 *          `undefined` for empty input and rejected schemes
 */
function resolveUrl(href, baseUrl) {
    if (!href)
        return undefined;
    // Normalise only for the scheme check; the value returned is untouched.
    const probe = href
        .replace(/[\t\n\r]/g, '')
        .replace(/^[\x00-\x20]+/, '')
        .toLowerCase();
    if (probe.startsWith('data:') || probe.startsWith('javascript:'))
        return undefined;
    if (!baseUrl)
        return href;
    try {
        return new URL(href, baseUrl).href;
    }
    catch {
        // Unparseable reference: hand back the raw value rather than dropping it.
        return href;
    }
}
173
/**
 * Read one schema field from a listing element.
 *
 * An empty (or whitespace-only) selector means "use the listing element
 * itself". With `field.multiple` and a non-empty selector, every match is
 * collected into an array (no transform is applied in that mode);
 * otherwise only the first match is read.
 *
 * @param $ - Cheerio root function used to wrap matched nodes
 * @param $el - Cheerio selection for the listing element
 * @param field - Field definition (selector, type, attribute, multiple, transform)
 * @param baseUrl - Optional base URL used to resolve `href` / `src` values
 * @returns The extracted value, or `undefined` when nothing usable was found
 */
function extractFieldValue($, $el, field, baseUrl) {
    const readsBaseElement = field.selector.trim() === '';
    const isUrlAttribute = field.attribute === 'href' || field.attribute === 'src';
    if (field.multiple && !readsBaseElement) {
        const collected = [];
        $el.find(field.selector).each((_, node) => {
            const $node = $(node);
            let value;
            if (field.type === 'text') {
                value = $node.text().trim();
            }
            else if (field.type === 'attribute') {
                value = field.attribute ? ($node.attr(field.attribute) ?? undefined) : undefined;
                if (isUrlAttribute) {
                    value = resolveUrl(value, baseUrl);
                }
            }
            else if (field.type === 'html') {
                value = $node.html() ?? undefined;
            }
            // 'exists' contributes nothing in multiple mode.
            if (value !== undefined && value !== '') {
                collected.push(value);
            }
        });
        return collected.length > 0 ? collected : undefined;
    }
    // First-match mode.
    const $target = readsBaseElement ? $el : $el.find(field.selector).first();
    if (field.type === 'exists') {
        return readsBaseElement ? true : $el.find(field.selector).length > 0;
    }
    if (field.type === 'text') {
        if (!readsBaseElement && $target.length === 0)
            return undefined;
        const text = $target.text().trim();
        return text === '' ? undefined : applyTransform(text, field.transform);
    }
    if (field.type === 'attribute') {
        if (!field.attribute)
            return undefined;
        const raw = $target.attr(field.attribute) ?? undefined;
        if (raw === undefined)
            return undefined;
        if (!isUrlAttribute)
            return applyTransform(raw, field.transform);
        const resolved = resolveUrl(raw, baseUrl);
        return resolved ? applyTransform(resolved, field.transform) : undefined;
    }
    if (field.type === 'html') {
        if (!readsBaseElement && $target.length === 0)
            return undefined;
        return $target.html() ?? undefined;
    }
    return undefined;
}
244
+ /* ------------------------------------------------------------------ */
245
+ /* Public API */
246
+ /* ------------------------------------------------------------------ */
247
/**
 * Return the bundled schemas as a fresh array (a shallow copy, so the
 * schema objects themselves are shared with the internal list).
 */
export function loadBundledSchemas() {
    return BUNDLED_SCHEMAS.slice();
}
253
/**
 * Find a matching schema for a given URL.
 *
 * A schema's domain matches when the URL's hostname equals it, is a
 * subdomain of it, or is the bare form of a listed `www.` domain. A parent
 * domain does NOT match a listed subdomain (e.g. `ycombinator.com` does not
 * select a schema registered for `news.ycombinator.com`).
 *
 * When a schema has `urlPatterns`, at least one pattern (a regex source
 * string, tested against the full URL as given) must also match; invalid
 * patterns are treated as non-matching.
 *
 * @param url - Absolute URL to look up
 * @returns The first matching bundled schema, or null when the URL cannot
 *          be parsed or no schema matches
 */
export function findSchemaForUrl(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname.toLowerCase();
    }
    catch {
        return null;
    }
    const matchesDomain = (domain) => {
        const d = domain.toLowerCase();
        // The reverse direction is limited to the "www." alias; a generic
        // suffix test would let "com" or "co.uk" match every listed domain.
        return hostname === d || hostname.endsWith('.' + d) || d === 'www.' + hostname;
    };
    const matchesPattern = (pattern) => {
        try {
            return new RegExp(pattern).test(url);
        }
        catch {
            return false;
        }
    };
    for (const schema of BUNDLED_SCHEMAS) {
        if (!schema.domains.some(d => matchesDomain(d)))
            continue;
        // Without urlPatterns, the domain match alone is sufficient.
        if (!schema.urlPatterns || schema.urlPatterns.length === 0)
            return schema;
        if (schema.urlPatterns.some(p => matchesPattern(p)))
            return schema;
    }
    return null;
}
295
/**
 * Extract listings from HTML using a schema's CSS selectors.
 *
 * Each element matched by `schema.baseSelector` becomes one item whose keys
 * are the schema's field names (fields with no value are omitted). When the
 * schema has a `title` or `name` field, that value is cleaned of common
 * junk ("Opens in new window", "New Listing" / "Sponsored" / "Ad" prefixes),
 * items without it are skipped, and items sharing the same title + price
 * are de-duplicated.
 *
 * @param html - Raw HTML string to parse
 * @param schema - Extraction schema to use
 * @param baseUrl - Optional base URL for resolving relative links
 * @returns Array of extracted items (may be empty)
 */
export function extractWithSchema(html, schema, baseUrl) {
    if (!html || html.trim().length === 0)
        return [];
    const $ = load(html);
    const items = [];
    // The title/name field (if any) is used to filter out empty items.
    const titleFieldName = schema.fields.find(f => f.name === 'title' || f.name === 'name')?.name;
    $(schema.baseSelector).each((_, el) => {
        const $el = $(el);
        const item = {};
        for (const field of schema.fields) {
            const value = extractFieldValue($, $el, field, baseUrl);
            if (value !== undefined) {
                item[field.name] = value;
            }
        }
        if (titleFieldName !== undefined && typeof item[titleFieldName] === 'string') {
            let title = item[titleFieldName];
            // Strip "Opens in (a) new window/tab" variants.
            title = title.replace(/\s*Opens?\s+in\s+(?:a\s+)?new\s+(?:window|tab)(?:\s+or\s+(?:window|tab))?/gi, '');
            // Strip "New Listing", "Sponsored", "Ad" prefixes. "Ad" must not be
            // followed by a letter or digit, otherwise ordinary words such as
            // "Adidas" or "Adapter" would lose their first two characters.
            title = title.replace(/^(?:New\s+Listing|Sponsored|Ad(?![a-z0-9]))\s*[-–—:·]?\s*/i, '');
            item[titleFieldName] = title.trim();
        }
        // Skip items with no title/name (likely empty/phantom elements).
        if (titleFieldName !== undefined) {
            const titleVal = item[titleFieldName];
            if (!titleVal || (typeof titleVal === 'string' && titleVal.trim() === '')) {
                return;
            }
        }
        // Skip completely empty items.
        if (Object.keys(item).length === 0)
            return;
        items.push(item);
    });
    // De-duplicate on title + price (nested selectors often match twice).
    if (titleFieldName) {
        const seen = new Set();
        return items.filter(item => {
            const key = `${String(item[titleFieldName] ?? '')}|${String(item.price ?? '')}`;
            if (seen.has(key))
                return false;
            seen.add(key);
            return true;
        });
    }
    return items;
}
@@ -0,0 +1,32 @@
1
+ /**
2
+ * Post-process BM25 quickAnswer passages to extract specific values.
3
+ *
4
+ * BM25 finds relevant passages but can't extract values. This module
5
+ * applies field-type-aware regex extraction to pull the actual value
6
+ * from the passage.
7
+ */
8
/**
 * Pull the actual value for a named field out of a BM25 passage.
 *
 * @param passage - Passage text returned by BM25 quickAnswer
 * @param fieldName - Name of the field whose value is wanted
 * @returns The extracted value as a string
 */
export declare function extractValueFromPassage(passage: string, fieldName: string): string;
12
/**
 * Schema extraction that tries structural signals first and only then falls
 * back to BM25.
 *
 * - title/name: the page title or the first heading
 * - author: "by X" patterns within the first 1000 characters
 * - date: date patterns within the first 1000 characters
 * - price/email/phone/url: regex scan over the full content
 * - anything else: BM25 quickAnswer followed by post-processing
 */
export declare function smartExtractSchemaFields(
    content: string,
    templateFields: Record<string, string>,
    quickAnswerFn: (opts: { content: string; question: string; url?: string }) => { answer: string; confidence: number },
    options?: { pageTitle?: string; pageUrl?: string; metadata?: Record<string, any> },
): Record<string, string>;