@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547)
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,469 @@
1
+ /**
2
+ * Post-process BM25 quickAnswer passages to extract specific values.
3
+ *
4
+ * BM25 finds relevant passages but can't extract values. This module
5
+ * applies field-type-aware regex extraction to pull the actual value
6
+ * from the passage.
7
+ */
/**
 * Build a fallback that returns the text before the first period or newline,
 * trimmed and capped at `maxLength` characters.
 */
const firstSentenceSnippet = (maxLength) => (p) => p.split(/[.\n]/)[0].trim().slice(0, maxLength);
/**
 * Keep only the leading run of capitalized words of a captured group.
 * Needed because the `i` flag makes `[A-Z]` match lowercase letters too, so a
 * capture can run on into ordinary lowercase text. Returns the input unchanged
 * when it does not start with a capitalized word.
 */
const keepLeadingCapitalizedWords = (s) => {
    const kept = [];
    for (const word of s.split(/\s+/)) {
        if (!/^[A-Z]/.test(word))
            break;
        kept.push(word);
    }
    return kept.join(' ') || s;
};
/**
 * Return the first non-blank line that does not look like a date or a
 * "N min read" marker (and, when `skipBylines` is set, not like a byline) and
 * is longer than `minLength`; leading markdown `#` markers are stripped.
 * Falls back to the first line of the passage. Results are capped at `maxLength`.
 */
const firstMeaningfulLine = (p, { minLength, maxLength, skipBylines }) => {
    for (const line of p.split('\n')) {
        if (!line.trim())
            continue;
        const clean = line.replace(/^#+\s*/, '').trim();
        if (/^\d{4}-\d{2}-\d{2}/.test(clean))
            continue;
        if (/^\d+\s*min\s*read/i.test(clean))
            continue;
        if (skipBylines && /^(by|author|posted|published|updated)/i.test(clean))
            continue;
        if (clean.length > minLength)
            return clean.slice(0, maxLength);
    }
    return p.split('\n')[0].trim().slice(0, maxLength);
};
/**
 * Field-type-aware extractors, keyed by lowercase field name.
 * Each entry has:
 *   - patterns:  regexes tried in order; the first capture group (or the whole
 *                match when there is none) is the extracted value
 *   - trimMatch: optional post-processing of the matched text
 *   - fallback:  used when no pattern matches
 */
const FIELD_EXTRACTORS = {
    // Price: find currency patterns
    price: {
        patterns: [
            /\$[\d,]+(?:\.\d{2})?/, // $999.99 or $1,299
            /USD\s*[\d,]+(?:\.\d{2})?/, // USD 999.99
            /€[\d,]+(?:\.\d{2})?/, // €999.99
            /£[\d,]+(?:\.\d{2})?/, // £999.99
            /¥[\d,]+/, // ¥9999
            /[\d,]+(?:\.\d{2})?\s*(?:USD|EUR|GBP|JPY)/, // 999.99 USD
            /(?:price|costs?)\s*(?:is|:|\s)\s*\$?[\d,]+(?:\.\d{2})?/i, // "price is $999"
            /(?:starting\s+(?:at|from)|from)\s+\$?[\d,]+(?:\.\d{2})?/i, // "starting at $99"
        ],
        fallback: firstSentenceSnippet(60),
    },
    // Date: find date patterns
    date: {
        patterns: [
            /\d{4}-\d{2}-\d{2}/, // 2023-11-21
            /(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}/i, // November 21, 2023
            /\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}/i, // 21 November 2023
            /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4}/i, // Nov 21, 2023
            /\d{1,2}\/\d{1,2}\/\d{2,4}/, // 11/21/2023
            /\d{1,2}\.\d{1,2}\.\d{2,4}/, // 21.11.2023
        ],
        fallback: firstSentenceSnippet(40),
    },
    // Author: find author patterns
    author: {
        patterns: [
            // \b keeps "by" from matching inside words such as "Ruby" or "nearby"
            /\b(?:by|author|written by|posted by)\s+([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+){0,3})/i, // "by John Smith"
            /([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+){1,3})\s+(?:wrote|writes|reports|published)/i, // "John Smith wrote"
        ],
        trimMatch: keepLeadingCapitalizedWords,
        fallback: (p) => {
            // Try to find a capitalized name
            const nameMatch = p.match(/([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+){1,2})/);
            return nameMatch?.[1] || p.split(/[.\n]/)[0].trim().slice(0, 50);
        },
    },
    // Title: extract from headings or first meaningful text
    title: {
        patterns: [
            /^#\s+(.+)$/m, // # Heading
            /^##\s+(.+)$/m, // ## Heading
        ],
        fallback: (p) => firstMeaningfulLine(p, { minLength: 10, maxLength: 120, skipBylines: true }),
    },
    // Name (product, event, recipe): similar to title, but bylines are not skipped
    name: {
        patterns: [
            /^#\s+(.+)$/m,
            /^##\s+(.+)$/m,
        ],
        fallback: (p) => firstMeaningfulLine(p, { minLength: 5, maxLength: 100, skipBylines: false }),
    },
    // Brand: extract proper nouns / company names
    brand: {
        patterns: [
            // Word boundaries on both sides so "branding" or "standby" do not trigger a match
            /\b(?:brand|manufacturer|made by|by)\b\s*:?\s*([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+){0,2})/i,
        ],
        trimMatch: keepLeadingCapitalizedWords,
        fallback: (p) => {
            // Find the first capitalized word that looks like a brand
            const brandMatch = p.match(/([A-Z][a-zA-Z]{2,})/);
            return brandMatch?.[1] || p.split(/[.\n]/)[0].trim().slice(0, 40);
        },
    },
    // Rating: extract numeric ratings
    rating: {
        patterns: [
            /(\d+(?:\.\d+)?)\s*(?:\/\s*\d+|out of \d+|stars?)/i, // 4.5/5, 4.5 out of 5, 4.5 stars
            /(?:rating|rated|score)\s*:?\s*(\d+(?:\.\d+)?)/i, // rating: 4.5
            /(\d+(?:\.\d+)?)\s*%/, // 95%
        ],
        fallback: firstSentenceSnippet(50),
    },
    // Email
    email: {
        patterns: [/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/],
        fallback: firstSentenceSnippet(80),
    },
    // Phone
    phone: {
        patterns: [
            /(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/, // +1 (555) 123-4567
            // International: 7-15 digits/spaces/dashes that must start and end on a
            // digit, so a run of plain whitespace or dashes is never a "number"
            /(?:\+\d{1,3}[-.\s]?)?\d[\d\s-]{5,13}\d/,
        ],
        fallback: firstSentenceSnippet(40),
    },
    // URL / image / website
    url: {
        patterns: [/https?:\/\/[^\s"'<>]+/],
        fallback: firstSentenceSnippet(120),
    },
    image: {
        patterns: [
            /https?:\/\/[^\s"'<>]+\.(?:jpg|jpeg|png|gif|webp|svg|avif)[^\s"'<>]*/i, // prefer image-looking URLs
            /https?:\/\/[^\s"'<>]+/,
        ],
        fallback: firstSentenceSnippet(120),
    },
    website: {
        patterns: [/https?:\/\/[^\s"'<>]+/],
        fallback: firstSentenceSnippet(120),
    },
};
// Extractor used for field names without a dedicated entry: no patterns,
// only a "first non-trivial sentence" fallback.
const DEFAULT_EXTRACTOR = {
    patterns: [],
    fallback: (passage) => {
        // A sentence ends at ., ! or ? followed by whitespace; take the first
        // one whose trimmed text is longer than 10 characters.
        const firstSentence = passage
            .split(/(?<=[.!?])\s+/)
            .find((sentence) => sentence.trim().length > 10);
        const snippet = firstSentence === undefined ? '' : firstSentence.trim().slice(0, 150);
        // No usable sentence: return the start of the whole passage instead.
        return snippet || passage.trim().slice(0, 150);
    },
};
/**
 * Post-process a BM25 passage to extract the actual value for a given field name.
 *
 * The field name is lowercased and trimmed, then looked up in FIELD_EXTRACTORS;
 * unknown fields use DEFAULT_EXTRACTOR. Patterns are tried in order, and the
 * first one that yields non-empty text wins (first capture group if present,
 * otherwise the whole match). If none does, the extractor's fallback is used.
 *
 * @param {string} passage - Passage text to extract from.
 * @param {string} fieldName - Schema field name (case-insensitive).
 * @returns {string} The extracted value, or '' for an empty or non-string passage.
 */
export function extractValueFromPassage(passage, fieldName) {
    if (typeof passage !== 'string' || !passage.trim())
        return '';
    const normalizedField = String(fieldName ?? '').toLowerCase().trim();
    // Own-property lookup only: a field called "constructor" or "__proto__" must
    // not resolve to something inherited from Object.prototype.
    const extractor = Object.prototype.hasOwnProperty.call(FIELD_EXTRACTORS, normalizedField)
        ? FIELD_EXTRACTORS[normalizedField]
        : DEFAULT_EXTRACTOR;
    // Try each pattern
    for (const pattern of extractor.patterns) {
        const match = passage.match(pattern);
        if (!match)
            continue;
        // If there's a capture group, use it; otherwise use the full match
        const raw = (match[1] || match[0]).trim();
        // A whitespace-only match carries no value; keep looking
        if (!raw)
            continue;
        return extractor.trimMatch ? extractor.trimMatch(raw) : raw;
    }
    // No pattern matched — use fallback
    if (extractor.fallback) {
        return extractor.fallback(passage);
    }
    // Last resort
    return passage.split(/[.\n]/)[0].trim().slice(0, 100);
}
/**
 * Return the leading run of capitalized words in `text`, joined by single
 * spaces. Stops at the first word that is not capitalized or that matches the
 * optional `stopWords` regex. Returns '' when the first word does not qualify.
 */
function takeCapitalizedPrefix(text, stopWords) {
    const kept = [];
    for (const word of text.split(/\s+/)) {
        if (!/^[A-Z]/.test(word) || (stopWords && stopWords.test(word)))
            break;
        kept.push(word);
    }
    return kept.join(' ');
}
/**
 * Derive a display name for a site from its URL, e.g.
 * "https://blog.cloudflare.com/x" -> "Blog Cloudflare". Returns '' for a
 * malformed URL or an empty hostname.
 */
function siteNameFromUrl(pageUrl) {
    let host;
    try {
        host = new URL(pageUrl).hostname.replace(/^www\./, '');
    }
    catch {
        // Best-effort: a malformed URL simply gives no source name
        return '';
    }
    const labels = host.split('.');
    // Two-letter country TLD behind a generic second-level label ("co.uk",
    // "com.au"): the registrable name sits one label further left.
    const hasCompoundSuffix = labels.length >= 3
        && labels[labels.length - 1].length === 2
        && ['co', 'com', 'org', 'net', 'gov', 'edu', 'ac'].includes(labels[labels.length - 2]);
    const suffixLength = hasCompoundSuffix ? 2 : 1;
    const siteName = labels.length > suffixLength ? labels[labels.length - suffixLength - 1] : labels[0];
    const capitalize = (s) => s.charAt(0).toUpperCase() + s.slice(1);
    // Handle subdomains like blog.cloudflare.com
    const subdomain = labels[0];
    if (subdomain && !['www', 'en', 'm', 'mobile', 'api', 'app'].includes(subdomain) && subdomain !== siteName) {
        return `${capitalize(subdomain)} ${capitalize(siteName)}`;
    }
    return capitalize(siteName);
}
/**
 * Smart schema extraction that uses structural signals before falling back to BM25.
 *
 * For title/name: uses the page title (site suffix stripped) or the first heading
 * For author/date: scans the first 1500 chars, then metadata
 * For price/email/phone/url/website/image/rating: regex scan of full content
 * For brand: "by X" phrases in the first 1500 chars, then metadata, then page title
 * For source: page-title suffix, then the URL's host name
 * For summary/description/body/tags: taken from the content structure
 * For everything else, or when the above finds nothing: quickAnswerFn + post-processing
 *
 * @param {string} content - Page content (markdown-style text).
 * @param {Object<string, string>} templateFields - Field name -> question asked of quickAnswerFn.
 * @param {Function} quickAnswerFn - Called with `{ content, question, url }`; its `answer` is post-processed.
 * @param {{pageTitle?: string, pageUrl?: string, metadata?: Object}} [options] - Optional page context.
 * @returns {Object<string, string>} Extracted value per field ('' when nothing was found).
 */
export function smartExtractSchemaFields(content, templateFields, quickAnswerFn, options) {
    const { pageTitle, pageUrl, metadata } = options || {};
    // Tolerate missing/non-string inputs instead of throwing on .slice()/.match()
    const text = typeof content === 'string' ? content : String(content ?? '');
    const title = typeof pageTitle === 'string' ? pageTitle : '';
    const extracted = {};
    const topContent = text.slice(0, 1500); // First 1500 chars for structural extraction
    for (const [field, question] of Object.entries(templateFields ?? {})) {
        const normalizedField = field.toLowerCase().trim();
        let value = '';
        // === STRUCTURAL EXTRACTION (try first) ===
        if (normalizedField === 'title' || normalizedField === 'name') {
            // 1. Use page title if available
            if (title.length > 3) {
                // Strip " - Site Name" / " | Site Name". A hyphen only counts as a
                // separator when surrounded by whitespace, so "State-of-the-art"
                // or "Spider-Man" stay intact.
                value = title.replace(/(?:\s+-\s+|\s*[|–—]\s*).+$/, '').trim();
            }
            // 2. Try first heading in content
            if (!value) {
                const headingMatch = text.match(/^#\s+(.+)$/m);
                if (headingMatch)
                    value = headingMatch[1].trim();
            }
            // 3. Try ## heading
            if (!value) {
                const h2Match = text.match(/^##\s+(.+)$/m);
                if (h2Match)
                    value = h2Match[1].trim();
            }
        }
        else if (normalizedField === 'author') {
            // Scan top of page for author patterns
            const authorPatterns = [
                /(?:^|\n)\s*(?:by|author|written by|posted by)[:\s]+([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+){0,3})/im,
                /(?:^|\n)\s*([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+){1,2})\s*[|·•]\s*(?:\d|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/im,
            ];
            for (const pat of authorPatterns) {
                const match = topContent.match(pat);
                if (!match?.[1])
                    continue;
                // The i flag lets [A-Z] match lowercase; keep only the capitalized words
                const authorName = takeCapitalizedPrefix(match[1]);
                if (authorName) {
                    value = authorName;
                    break;
                }
            }
            // Also check metadata
            if (!value && metadata?.author) {
                value = String(metadata.author);
            }
        }
        else if (normalizedField === 'date') {
            // Scan top of page for date patterns
            const datePatterns = [
                /\d{4}-\d{2}-\d{2}/,
                /(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}/i,
                /\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}/i,
                /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4}/i,
            ];
            for (const pat of datePatterns) {
                const match = topContent.match(pat);
                if (match) {
                    value = match[0].trim();
                    break;
                }
            }
            // Also check metadata
            if (!value && metadata?.date) {
                value = String(metadata.date);
            }
            if (!value && metadata?.publishedTime) {
                value = String(metadata.publishedTime).split('T')[0];
            }
        }
        else if (normalizedField === 'price') {
            // Scan full content for currency patterns
            const pricePatterns = [
                /\$[\d,]+(?:\.\d{2})?/,
                /€[\d,]+(?:\.\d{2})?/,
                /£[\d,]+(?:\.\d{2})?/,
                // NOTE(review): anything this matches also contains a "$" amount, so
                // the first pattern always wins before this one is reached.
                /(?:price|cost|starting at|from)\s*:?\s*\$[\d,]+(?:\.\d{2})?/i,
            ];
            for (const pat of pricePatterns) {
                const match = text.match(pat);
                if (match) {
                    // Extract just the currency amount from the match
                    const currMatch = match[0].match(/[$€£¥][\d,]+(?:\.\d{2})?/);
                    value = currMatch ? currMatch[0] : match[0];
                    break;
                }
            }
        }
        else if (normalizedField === 'email') {
            const emailMatch = text.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
            if (emailMatch)
                value = emailMatch[0];
        }
        else if (normalizedField === 'phone') {
            const phoneMatch = text.match(/(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/);
            if (phoneMatch)
                value = phoneMatch[0];
        }
        else if (normalizedField === 'url' || normalizedField === 'website' || normalizedField === 'image') {
            if (normalizedField === 'image') {
                // Prefer a URL with an image file extension
                const imgMatch = text.match(/https?:\/\/[^\s"'<>]+\.(?:jpg|jpeg|png|gif|webp|svg|avif)[^\s"'<>]*/i);
                if (imgMatch)
                    value = imgMatch[0];
            }
            if (!value) {
                const urlMatch = text.match(/https?:\/\/[^\s"'<>]+/);
                if (urlMatch)
                    value = urlMatch[0];
            }
        }
        else if (normalizedField === 'rating') {
            const ratingPatterns = [
                /(\d+(?:\.\d+)?)\s*(?:\/\s*\d+|out of \d+|stars?)/i,
                /(?:rating|rated|score)\s*:?\s*(\d+(?:\.\d+)?)/i,
            ];
            for (const pat of ratingPatterns) {
                const match = text.match(pat);
                if (match) {
                    value = match[1] || match[0];
                    break;
                }
            }
        }
        else if (normalizedField === 'brand') {
            // 1. Look for "by Brand" or "developed by Brand" etc. in content (highest priority).
            //    \b keeps "by"/"from" from matching inside words such as "standby".
            const brandMatch = topContent.match(/\b(?:by|from|developed by|manufactured by|made by|produced by|created by)\s+([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]*)?)/);
            if (brandMatch?.[1]) {
                // Trim to just the brand name (first 1-2 capitalized words, no filler words)
                value = takeCapitalizedPrefix(brandMatch[1], /^(The|This|That|And|For|With|From)$/);
            }
            // 2. Check metadata
            if (!value && metadata?.brand) {
                value = String(metadata.brand);
            }
            // 3. Fallback: first word of page title (lower priority than content patterns)
            if (!value && title) {
                const titleWord = title.match(/^([A-Z][a-zA-Z]+)/);
                if (titleWord)
                    value = titleWord[1];
            }
            // BM25 fallback will handle the rest
        }
        else if (normalizedField === 'source') {
            // 1. Try title suffix first "Article Title - Site Name" or "Article Title | Site Name"
            //    (more human-readable, more specific than domain). Same separator rule
            //    as for titles: a hyphen inside a word is not a separator.
            if (title) {
                const suffixMatch = title.match(/(?:\s+-\s+|\s*[|–—]\s*)(.+)$/);
                if (suffixMatch?.[1] && suffixMatch[1].length < 40) {
                    value = suffixMatch[1].trim();
                }
            }
            // 2. Extract from URL domain
            if (!value && pageUrl) {
                value = siteNameFromUrl(pageUrl);
            }
        }
        else if (normalizedField === 'summary' || normalizedField === 'description') {
            // Find the first substantive paragraph (skip headings, dates, metadata)
            const summaryParts = [];
            let charCount = 0;
            for (const line of text.split('\n')) {
                const trimmed = line.trim();
                if (!trimmed)
                    continue;
                if (trimmed.startsWith('#'))
                    continue; // skip headings
                if (/^\d{4}-\d{2}-\d{2}/.test(trimmed))
                    continue; // skip dates
                if (/^\d+\s*min\s*read/i.test(trimmed))
                    continue; // skip "5 min read"
                if (/^(by|author|posted|published|updated|written)/i.test(trimmed))
                    continue;
                if (/^\*[^*]+\*$/.test(trimmed))
                    continue; // skip italic-only lines
                if (trimmed.length > 30) { // substantive line
                    summaryParts.push(trimmed);
                    charCount += trimmed.length;
                    if (charCount > 300)
                        break; // ~2-3 sentences
                }
            }
            if (summaryParts.length > 0) {
                value = summaryParts.join(' ').slice(0, 400);
            }
        }
        else if (normalizedField === 'body') {
            // Body IS the content — return it directly (truncated for JSON output)
            value = text.slice(0, 2000);
        }
        else if (normalizedField === 'tags') {
            // Extract topic keywords from headings (skip the first one which is the title)
            const headings = text.match(/^#{1,3}\s+(.+)$/gm) || [];
            const topics = [];
            for (const h of headings.slice(1, 6)) { // skip title, take up to 5
                const clean = h.replace(/^#+\s*/, '').replace(/[*\[\](){}]/g, '').trim();
                if (clean.length > 3 && clean.length < 60) {
                    topics.push(clean);
                }
            }
            if (topics.length >= 2) {
                value = topics.join(', ');
            }
            // If fewer than 2 headings, fall back to BM25
        }
        // === BM25 FALLBACK (only for fields without structural signal) ===
        if (!value) {
            try {
                const qa = quickAnswerFn({
                    content: text,
                    question: typeof question === 'string' ? question : field,
                    url: pageUrl || '',
                });
                value = qa.answer ? extractValueFromPassage(qa.answer, field) : '';
            }
            catch {
                // Deliberately best-effort: a failing answer function leaves the field empty
                value = '';
            }
        }
        extracted[field] = value;
    }
    return extracted;
}
@@ -0,0 +1,19 @@
/**
 * Pre-built extraction schema templates for common use cases.
 * Used with quickAnswer BM25 extraction (no LLM needed).
 */
export interface SchemaTemplate {
    /** Display name of the template, e.g. "Product". */
    name: string;
    /** One-line description of what the template extracts. */
    description: string;
    /** Maps each output field name to the question or phrase used to look it up. */
    fields: Record<string, string>;
}
/** All built-in templates, keyed by lowercase template name (e.g. "product", "article"). */
export declare const SCHEMA_TEMPLATES: Record<string, SchemaTemplate>;
/**
 * Get a schema template by name, or return null if it's not a known template.
 * The name is matched case-insensitively, ignoring surrounding whitespace.
 * If the input looks like JSON (starts with `{` or `[`), return null so the
 * caller can parse it as custom JSON.
 */
export declare function getSchemaTemplate(nameOrJson: string): SchemaTemplate | null;
/**
 * List all available schema template names (the keys of SCHEMA_TEMPLATES).
 */
export declare function listSchemaTemplates(): string[];
@@ -0,0 +1,143 @@
/**
 * Built-in extraction schema templates for common page types.
 * Each template's `fields` maps an output field name to the question or phrase
 * used for quickAnswer BM25 extraction (no LLM needed).
 */
// Assemble one template record; keeps the table below compact.
const defineTemplate = (name, description, fields) => ({ name, description, fields });
export const SCHEMA_TEMPLATES = {
    product: defineTemplate('Product', 'Extract product information from e-commerce pages', {
        name: 'What is the product name?',
        price: 'What is the price in dollars, euros, or other currency?',
        description: 'What are the main features and specifications of this product?',
        brand: 'What brand or company makes this product?',
        rating: 'What is the customer rating or review score?',
        availability: 'Is this product in stock or available for purchase?',
        image: 'What is the URL of the product image?',
        sku: 'What is the SKU, model number, or product identifier?',
    }),
    article: defineTemplate('Article', 'Extract article/blog post information', {
        title: 'What is the title or headline of this article?',
        author: 'Who is the author or writer of this article?',
        date: 'When was this article published?',
        summary: 'What is the main point or summary of this article in one paragraph?',
        body: 'What is the full text of the article body?',
        tags: 'What topics, tags, or categories does this article cover?',
        source: 'What publication, website, or news source published this article?',
    }),
    listing: defineTemplate('Listing', 'Extract listing/directory items', {
        items: 'list of items with name, price, and description',
        totalCount: 'total number of items or results',
        category: 'listing category or type',
        sortOrder: 'how items are sorted',
    }),
    contact: defineTemplate('Contact', 'Extract contact information', {
        name: 'person or company name',
        email: 'email address',
        phone: 'phone number',
        address: 'physical address',
        website: 'website URL',
        company: 'company or organization name',
        social: 'social media links or handles',
    }),
    event: defineTemplate('Event', 'Extract event information', {
        name: 'What is the name of this event?',
        date: 'When does this event take place?',
        time: 'What time does this event start?',
        location: 'Where is this event held?',
        price: 'How much does this event cost?',
        description: 'What is this event about?',
        organizer: 'Who is organizing this event?',
    }),
    recipe: defineTemplate('Recipe', 'Extract recipe information from cooking sites', {
        name: 'What is the name of this recipe?',
        ingredients: 'What ingredients are needed? List all.',
        steps: 'What are the cooking steps or instructions?',
        prepTime: 'How long does preparation take?',
        cookTime: 'How long does cooking take?',
        servings: 'How many servings does this recipe make?',
        calories: 'How many calories per serving?',
        rating: 'What is the recipe rating?',
    }),
    job: defineTemplate('Job', 'Extract job posting information', {
        title: 'What is the job title?',
        company: 'What company is hiring?',
        location: 'Where is the job located?',
        salary: 'What is the salary or compensation range?',
        type: 'Is this full-time, part-time, contract, or remote?',
        requirements: 'What are the key requirements or qualifications?',
        description: 'What is the job description?',
        applyUrl: 'What is the URL or method to apply?',
    }),
    business: defineTemplate('Business', 'Extract business/company information', {
        name: 'What is the business name?',
        address: 'What is the full address?',
        phone: 'What is the phone number?',
        hours: 'What are the business hours?',
        rating: 'What is the business rating?',
        reviewCount: 'How many reviews does this business have?',
        website: 'What is the business website URL?',
        categories: 'What type of business is this?',
    }),
    review: defineTemplate('Review', 'Extract review information', {
        title: 'review title',
        rating: 'rating or score',
        author: 'reviewer name',
        date: 'review date',
        body: 'review text or content',
        pros: 'positive points',
        cons: 'negative points',
        product: 'product or service being reviewed',
    }),
};
/**
 * Get a schema template by name, or return null if it's not a known template.
 * If the input looks like JSON, return null (let caller parse it as custom JSON).
 *
 * @param {string} nameOrJson - Template name (case-insensitive, surrounding
 *   whitespace ignored) or a custom JSON schema string.
 * @returns {object|null} The matching entry of SCHEMA_TEMPLATES, or null.
 */
export function getSchemaTemplate(nameOrJson) {
    if (typeof nameOrJson !== 'string') {
        return null;
    }
    const trimmed = nameOrJson.trim();
    // If it starts with { or [, it's custom JSON, not a template name
    if (trimmed.startsWith('{') || trimmed.startsWith('[')) {
        return null;
    }
    const key = trimmed.toLowerCase();
    // Own-property check: names like "constructor" or "__proto__" must not
    // resolve to members inherited from Object.prototype.
    return Object.prototype.hasOwnProperty.call(SCHEMA_TEMPLATES, key) ? SCHEMA_TEMPLATES[key] : null;
}
/**
 * Return the names of every built-in schema template, i.e. the keys of
 * SCHEMA_TEMPLATES in definition order.
 */
export function listSchemaTemplates() {
    const templateNames = Object.entries(SCHEMA_TEMPLATES).map(([templateName]) => templateName);
    return templateNames;
}