@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547)
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,550 @@
1
+ /**
2
+ * Structured JSON Extraction Engine
3
+ *
4
+ * Extracts structured data from markdown/text content using either:
5
+ * 1. LLM (via callLLM from llm-provider.ts) when an LLM config is provided
6
+ * 2. Heuristic regex/BM25-style extraction as a zero-key fallback
7
+ *
8
+ * Firecrawl-compatible: accepts a JSON schema, returns typed structured data.
9
+ */
10
+ import { callLLM } from './llm-provider.js';
11
+ // ---------------------------------------------------------------------------
12
+ // System prompt
13
+ // ---------------------------------------------------------------------------
14
// Sent as the `system` message of every LLM extraction request (see extractStructured).
// It tells the model to answer with schema-shaped JSON and to use null for absent fields.
const SYSTEM_PROMPT = 'Extract the following fields from the content. Return valid JSON matching the schema. Only use information present in the content. If a field is not found in the content, set it to null.';
15
+ // ---------------------------------------------------------------------------
16
+ // Schema validation / type coercion
17
+ // ---------------------------------------------------------------------------
18
/**
 * Coerce a raw extracted value to the JSON-schema type declared for its field.
 *
 * @param value        Raw value (typically straight out of the LLM's JSON).
 * @param expectedType Schema type name: 'string' | 'boolean' | 'number' |
 *                     'integer' | 'array' | 'object'. Any other name returns
 *                     the value unchanged.
 * @returns The coerced value, or null when the input is null/undefined or
 *          cannot be represented as the expected type.
 */
function coerceValue(value, expectedType) {
    if (value === null || value === undefined)
        return null;
    // Shared numeric parsing for 'number' and 'integer': accepts numbers and
    // strings with thousands separators; rejects NaN and +/-Infinity, which
    // cannot survive JSON serialisation.
    const toFiniteNumber = (input) => {
        const parsed = typeof input === 'number'
            ? input
            : parseFloat(String(input).replace(/,/g, ''));
        return Number.isFinite(parsed) ? parsed : null;
    };
    switch (expectedType) {
        case 'string': {
            if (typeof value === 'string')
                return value;
            const isPrimitiveList = Array.isArray(value) &&
                value.every((item) => item === null || typeof item !== 'object');
            if (typeof value === 'object' && !isPrimitiveList) {
                // String(obj) would yield the useless "[object Object]";
                // serialise structured values as JSON instead.
                try {
                    return JSON.stringify(value) ?? String(value);
                }
                catch {
                    return String(value); // e.g. circular structure
                }
            }
            return String(value);
        }
        case 'boolean': {
            if (typeof value === 'boolean')
                return value;
            const str = String(value).toLowerCase().trim();
            if (['true', 'yes', '1', 'open', 'enabled'].includes(str))
                return true;
            if (['false', 'no', '0', 'closed', 'disabled'].includes(str))
                return false;
            return null;
        }
        case 'number':
            return toFiniteNumber(value);
        case 'integer': {
            // 'integer' is an accepted schema type (see isTypeSchema); parse it
            // like a number and round to the nearest whole value.
            const num = toFiniteNumber(value);
            return num === null ? null : Math.round(num);
        }
        case 'array':
            return Array.isArray(value) ? value : [value];
        case 'object':
            return typeof value === 'object' ? value : null;
        default:
            return value;
    }
}
48
/**
 * Project a raw parsed object onto the schema: every declared property is
 * coerced to its declared type, and required properties that end up empty
 * are reported.
 *
 * @param raw    Parsed JSON object (e.g. from the LLM response).
 * @param schema Extraction schema with `properties` and optional `required`.
 * @returns `{ data, missingRequired }` where `data` has one key per schema
 *          property and `missingRequired` lists required fields that are null.
 */
function validateAndCoerce(raw, schema) {
    const requiredFields = schema.required;
    const data = {};
    const missingRequired = [];
    Object.entries(schema.properties).forEach(([name, definition]) => {
        const value = coerceValue(raw[name], definition.type);
        data[name] = value;
        const isEmpty = value === null || value === undefined;
        if (isEmpty && requiredFields?.includes(name)) {
            missingRequired.push(name);
        }
    });
    return { data, missingRequired };
}
60
+ // ---------------------------------------------------------------------------
61
+ // Parse JSON out of LLM text (handles code fences + raw JSON)
62
+ // ---------------------------------------------------------------------------
63
/**
 * Parse a JSON value out of free-form LLM output.
 *
 * Candidates are tried in order and the first one that parses wins:
 *   1. the body of the first ``` / ```json code fence,
 *   2. the whole (trimmed) response,
 *   3. the span from the first `{` to the last `}`.
 *
 * A fenced block that is not valid JSON no longer aborts parsing; the
 * remaining candidates are still attempted.
 *
 * @param text Raw text returned by the LLM.
 * @returns The parsed JSON value.
 * @throws Error when no candidate is valid JSON.
 */
function parseLLMJson(text) {
    const stripped = text.trim();
    const candidates = [];
    // Extract from ```json ... ``` or ``` ... ``` code fences (tag is case-insensitive)
    const fenceMatch = stripped.match(/```(?:json)?\s*\n?([\s\S]+?)\n?```/i);
    if (fenceMatch?.[1]) {
        candidates.push(fenceMatch[1].trim());
    }
    candidates.push(stripped);
    // Outermost {...} span, for responses that wrap the JSON in prose
    const objMatch = stripped.match(/\{[\s\S]*\}/);
    if (objMatch) {
        candidates.push(objMatch[0]);
    }
    for (const candidate of candidates) {
        try {
            return JSON.parse(candidate);
        }
        catch {
            // Not valid JSON — fall through to the next candidate.
        }
    }
    throw new Error(`Could not parse JSON from LLM response: ${stripped.slice(0, 200)}`);
}
83
+ // ---------------------------------------------------------------------------
84
+ // Heuristic extraction helpers (no LLM key needed)
85
+ // ---------------------------------------------------------------------------
86
+ /**
87
+ * For string fields: search for field name in content, extract surrounding text.
88
+ */
89
/**
 * Return the text of the first level-1 markdown heading (`# ...`) with
 * emphasis/code markers (`*`, `_`, backtick) removed, or null if there is none.
 */
function extractPageTitle(content) {
    const [, headingText] = content.match(/^#\s+(.+)$/m) ?? [];
    return headingText ? headingText.replace(/[*_`]/g, '').trim() : null;
}
96
/**
 * Pick a description: the first non-blank line after the first heading that
 * is not itself a heading, is not an italic/bold byline (`*...*`), and is
 * longer than 30 characters. Markdown emphasis markers are stripped and the
 * result is capped at 300 characters. Returns null when no such line exists.
 */
function extractDescription(content) {
    const nonBlank = content.split('\n').filter((text) => text.trim());
    const firstHeading = nonBlank.findIndex((text) => text.startsWith('#'));
    if (firstHeading === -1) {
        return null;
    }
    const isByline = (text) => text.startsWith('*') && text.endsWith('*');
    const paragraph = nonBlank
        .slice(firstHeading + 1)
        .find((text) => !text.startsWith('#') && !isByline(text) && text.length > 30);
    if (paragraph === undefined) {
        return null;
    }
    return paragraph.replace(/[*_`]/g, '').trim().slice(0, 300);
}
113
/**
 * Extract the company/brand name from a page title: the part before the
 * first title separator (" - ", "|", "·", "–", "—").
 *
 * An ASCII hyphen only counts as a separator when surrounded by whitespace,
 * so hyphenated names ("E-commerce", "Coca-Cola") are kept intact.
 *
 * @param title Page title text.
 * @returns The leading segment, trimmed; or, when no separator is found,
 *          the trimmed title capped at 60 characters.
 */
function extractCompanyFromTitle(title) {
    const trimmed = title.trim();
    const sep = trimmed.match(/^(.+?)(?:\s+-\s+|\s*[|·–—]\s*)/);
    const name = sep?.[1]?.trim();
    if (name)
        return name;
    return trimmed.slice(0, 60);
}
120
/**
 * Field-name-aware heuristic extractor for string-typed schema fields.
 *
 * The field name is matched against a series of known concepts (company,
 * title, description, URL, creator, director, version, author, date, email,
 * price, language, stars, license). The first concept whose pattern finds
 * something in `content` wins. If no concept yields a value, generic
 * "<field name>: value" patterns are tried.
 *
 * @param fieldName Schema property name (snake_case is treated as words).
 * @param content   Markdown/text content to search.
 * @param pageUrl   Optional page URL, returned for url/website/link fields.
 * @returns The extracted string, or null when nothing matched.
 */
function heuristicExtractString(fieldName, content, pageUrl) {
    // Escape regex metacharacters so field names such as "c++" or "price($)"
    // are matched literally instead of breaking or distorting the patterns
    // built with `new RegExp` below.
    const escapeRe = (raw) => raw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const lf = fieldName.toLowerCase();
    const humanName = fieldName.replace(/_/g, ' ');
    const title = extractPageTitle(content);
    // --- Concept-aware extraction ---
    // Company/brand/organization name
    if (/company|brand|organization|org_name/.test(lf)) {
        if (title)
            return extractCompanyFromTitle(title);
        // Fallback: extract from first heading of any level
        const anyHeading = content.match(/^#{1,3}\s+(.+)$/m);
        if (anyHeading?.[1])
            return anyHeading[1].replace(/[*_`[\]]/g, '').trim().slice(0, 60);
    }
    // Title/name/product → first H1 or any heading, stripped of markdown
    if (/^(title|name|product_name|product|heading)$/.test(lf)) {
        const rawTitle = title ?? content.match(/^#{1,3}\s+(.+)$/m)?.[1];
        if (rawTitle) {
            // Strip markdown links [text](url) → text, badges ![...](url) → '', etc.
            return rawTitle
                .replace(/!\[[^\]]*\]\([^)]*\)/g, '') // remove images
                .replace(/\[([^\]]+)\]\([^)]*\)/g, '$1') // [text](url) → text
                .replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
                .replace(/[*_`[\]]/g, '')
                .replace(/&[a-z]+;/g, '') // HTML entities
                // Strip leading emoji (📦🎬🎵🎮 etc.) that domain extractors add as decoration
                .replace(/^[\p{Emoji_Presentation}\p{Extended_Pictographic}\uFE0F]+\s*/u, '')
                .replace(/\s+/g, ' ')
                .trim().slice(0, 150);
        }
    }
    // Description/summary/about → first paragraph (never falls through to generic patterns)
    if (/description|summary|about|overview/.test(lf)) {
        return extractDescription(content) ?? null;
    }
    // URL/website/link → use the URL if we have it
    if (/^(url|website|link|homepage|site)$/.test(lf)) {
        if (pageUrl)
            return pageUrl;
    }
    // Creator / designer / founder / inventor
    if (/creator|designer|founder|inventor|invented_by|created_by/.test(lf)) {
        const m = content.match(/(?:created?|designed?|founded?|invented?)\s+by\s+([A-Z][^\n,·|–—]+?)(?:\s*[,·|–—]|\s+in\s+\d{4}|\.)/i)
            ?? content.match(/(?:creator|designer|founder|inventor)[:\s]+([A-Z][^\n,·|]+?)(?:\s*[,·|–—]|\.)/i);
        if (m?.[1])
            return m[1].replace(/[*_`[\]]/g, '').trim().slice(0, 80);
    }
    // Director (for movies/films)
    if (/director/.test(lf)) {
        const m = content.match(/Director[:\s*]+([^\n|,]+)/i) ?? content.match(/Directed by[:\s]+([^\n|,]+)/i);
        if (m?.[1])
            return m[1].replace(/[*_`]/g, '').trim().slice(0, 100);
    }
    // Version (semver: x.y.z or x.y.z.w)
    if (/^version$/.test(lf)) {
        const m = content.match(/\*\*Version:\*\*\s*([\d]+\.[\d]+[\.\d]*)/i)
            ?? content.match(/version[:\s]+v?([\d]+\.[\d]+[\.\d]*)/i)
            ?? content.match(/v?([\d]+\.[\d]+\.[\d]+)/);
        if (m?.[1])
            return m[1];
    }
    // Author/writer/byline. "by" must be its own snake_case token (written_by,
    // posted_by) so unrelated names that merely contain the letters "by"
    // (hobby, nearby, bytes) are not treated as author fields.
    if (/author|writer|byline|(?:^|_)by(?:_|$)/.test(lf)) {
        const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
        if (m?.[1])
            return m[1].trim().slice(0, 100);
    }
    // Date/published/updated
    if (/date|published|updated|modified/.test(lf)) {
        const m = content.match(/(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b)/i)
            ?? content.match(/(\d{4}-\d{2}-\d{2})/);
        if (m?.[1])
            return m[1];
    }
    // Email
    if (/email|contact/.test(lf)) {
        const m = content.match(/[\w.+-]+@[\w-]+\.[a-z]{2,}/i);
        if (m?.[0])
            return m[0];
    }
    // Price/cost/pricing → extract value near $
    if (/price|cost|pricing|fee/.test(lf)) {
        const m = content.match(/\$\s*[\d,]+(?:\.\d{2})?(?:\s*\/\s*\w+)?/)
            ?? content.match(/(free|no cost|no charge)/i);
        if (m?.[0])
            return m[0].trim();
    }
    // Language (for GitHub repos)
    if (/language|lang|tech/.test(lf)) {
        const m = content.match(/💻\s*(\w[\w#+.-]+)/) ?? content.match(/Language[:\s]+(\w[\w#+.-]+)/i);
        if (m?.[1])
            return m[1];
    }
    // Stars (for GitHub)
    if (/stars?/.test(lf)) {
        const m = content.match(/⭐\s*([\d,]+)\s*stars?/i) ?? content.match(/([\d,]+)\s*stars?/i);
        if (m?.[1])
            return m[1].replace(/,/g, '');
    }
    // License
    if (/license/.test(lf)) {
        const m = content.match(/📜\s*(\w+)/) ?? content.match(/License[:\s]+(MIT|Apache|GPL|BSD|ISC|AGPL|MPL)[^\s]*/i);
        if (m?.[1])
            return m[1];
    }
    // --- Generic patterns (exact-ish match) ---
    const namePattern = escapeRe(humanName);
    const keyPattern = escapeRe(fieldName);
    const patterns = [
        new RegExp(`(?:^|\\n)[ \\t]*${namePattern}[:\\s]+([^\\n]{5,200})`, 'i'), // "Field name: value" at line start
        new RegExp(`"${keyPattern}"\\s*:\\s*"([^"]{1,300})"`, 'i'), // embedded JSON: "field_name": "value"
        new RegExp(`\\*{1,2}${namePattern}\\*{0,2}[:\\s]+([^\\n]{5,200})`, 'i'), // **Field name**: value
        new RegExp(`#+\\s*${namePattern}\\s*\\n+([^\\n]{5,300})`, 'i'), // "# Field name" heading followed by text
    ];
    for (const pattern of patterns) {
        const match = content.match(pattern);
        if (match?.[1])
            return match[1].trim().replace(/[|*_`]/g, '').slice(0, 300);
    }
    return null;
}
240
/**
 * Heuristic extractor for boolean-typed schema fields.
 *
 * 1. Known concepts (free tier, open source, API availability, auth) are
 *    decided by searching the ENTIRE lower-cased content for indicators.
 * 2. Otherwise the field name (snake_case or space-separated) is located in
 *    the content and the nearest yes/no-style indicator decides:
 *      - first the text following the name (window ends 200 chars after the
 *        start of the name), earliest indicator wins;
 *      - failing that, the 80 chars preceding the name, closest indicator wins.
 *    Indicators are matched as whole words and longer phrases are preferred,
 *    so "not supported" / "unavailable" are negative (not mistaken for
 *    "supported" / "available") and "know" does not count as "no".
 *
 * @param fieldName Schema property name.
 * @param content   Markdown/text content to search.
 * @returns true / false when an indicator is found, otherwise null.
 */
function heuristicExtractBoolean(fieldName, content) {
    const lf = fieldName.toLowerCase();
    const ctx = content.toLowerCase();
    // Concept-aware boolean extraction — search entire content, not just near field name
    // Free tier / free plan
    if (/free_tier|has_free|is_free/.test(lf)) {
        if (/free tier|free plan|\$0|no cost|no charge|free forever/.test(ctx))
            return true;
        if (/no free|paid only|subscription required/.test(ctx))
            return false;
    }
    // Open source
    if (/open_source|is_open|oss/.test(lf)) {
        if (/open[- ]source|mit license|apache license|gpl|bsd license|📜\s*mit|📜\s*apache/.test(ctx))
            return true;
        if (/closed[- ]source|proprietary|commercial license/.test(ctx))
            return false;
    }
    // API availability
    if (/has_api|api_available|has_rest/.test(lf)) {
        if (/rest api|graphql api|api endpoint|api key|\/v1\/|\/api\//.test(ctx))
            return true;
    }
    // Authentication
    if (/requires_auth|has_auth|is_authenticated/.test(lf)) {
        if (/login|sign in|authentication|api key|bearer token/.test(ctx))
            return true;
    }
    // General approach: look for an indicator near the field name
    const humanName = fieldName.replace(/_/g, ' ').toLowerCase();
    let fieldIdx = ctx.indexOf(lf);
    if (fieldIdx === -1)
        fieldIdx = ctx.indexOf(humanName);
    if (fieldIdx === -1)
        return null;
    const negative = new Set(['no', 'false', 'closed', 'proprietary', 'unavailable', 'disabled', 'not supported', 'excluded']);
    // Every alternative not listed in `negative` is a positive indicator.
    // "not supported" precedes "supported" so the longer phrase is preferred.
    const indicator = /\b(?:not supported|open[ -]source|unavailable|proprietary|available|supported|disabled|excluded|included|enabled|closed|false|true|free|yes|no)\b/g;
    const verdictOf = (word) => !negative.has(word);
    // 1) Text after the field name (the name itself is skipped so that a name
    //    like "is free" cannot answer its own question).
    const nameEnd = fieldIdx + lf.length; // lf and humanName have equal length
    const windowEnd = fieldIdx + 200;
    indicator.lastIndex = nameEnd;
    const following = indicator.exec(ctx);
    if (following && following.index + following[0].length <= windowEnd) {
        return verdictOf(following[0]);
    }
    // 2) Text just before the field name: keep the match closest to the name.
    let nearest = null;
    indicator.lastIndex = Math.max(0, fieldIdx - 80);
    for (let m = indicator.exec(ctx); m && m.index + m[0].length <= fieldIdx; m = indicator.exec(ctx)) {
        nearest = m[0];
    }
    return nearest === null ? null : verdictOf(nearest);
}
291
/**
 * Heuristic extractor for number-typed schema fields.
 *
 * Known concepts (stars, forks, rating/score, year, downloads, population)
 * use dedicated patterns. Anything else falls back to "first number that
 * follows the field name". The field name is regex-escaped before being
 * embedded in that pattern, so names containing metacharacters ("c++_files",
 * "size(mb)") neither throw nor mis-match.
 *
 * @param fieldName Schema property name (underscores match spaces, `_` or `-`).
 * @param content   Markdown/text content to search.
 * @returns The extracted number, or null when nothing usable was found.
 */
function heuristicExtractNumber(fieldName, content) {
    const lf = fieldName.toLowerCase();
    // "12,345.6" → 12345.6; null when the text is not numeric
    const toNumber = (digits) => {
        const n = parseFloat(digits.replace(/,/g, ''));
        return isNaN(n) ? null : n;
    };
    // 4-digit year text → integer (explicit radix)
    const toYear = (digits) => {
        const n = parseInt(digits, 10);
        return isNaN(n) ? null : n;
    };
    const escapeRe = (raw) => raw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // Stars (GitHub)
    if (/stars?/.test(lf)) {
        const m = content.match(/⭐\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*stars?/i);
        if (m?.[1])
            return toNumber(m[1]);
    }
    // Forks
    if (/forks?/.test(lf)) {
        const m = content.match(/🍴\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*forks?/i);
        if (m?.[1])
            return toNumber(m[1]);
    }
    // Rating/score
    if (/rating|score/.test(lf)) {
        const m = content.match(/⭐\s*([\d.]+)\//) ?? content.match(/([\d.]+)\s*\/\s*10/) ?? content.match(/([\d.]+)\s*\/\s*5/);
        if (m?.[1])
            return toNumber(m[1]);
    }
    // Year
    if (/year/.test(lf)) {
        // Explicit "Year: YYYY" label first
        const explicit = content.match(/\bYear[:\s]+(\d{4})\b/i);
        if (explicit?.[1])
            return toYear(explicit[1]);
        // For "created_year" / "founded_year" / "released_year" — look for context
        if (/creat|found|release|launch|start|born|inception/.test(lf)) {
            const ctxMatch = content.match(/(?:created?|founded?|released?|launched?|started?|born|inception)[^\d]*(\b(?:19|20)\d{2}\b)/i)
                ?? content.match(/\b(?:in|year)\s+(\b(?:19|20)\d{2}\b)/i)
                ?? content.match(/(\b(?:19|20)\d{2}\b)/);
            if (ctxMatch?.[1])
                return toYear(ctxMatch[1]);
        }
        // Fallback: first year found
        const m = content.match(/\b((?:19|20)\d{2})\b/);
        if (m?.[1])
            return toYear(m[1]);
    }
    // Downloads / weekly_downloads (npm, pypi)
    if (/downloads?/.test(lf)) {
        const m = content.match(/weekly\s+downloads[^\d]*([\d,]+)/i)
            ?? content.match(/downloads?[^\d]*([\d,]+)/i);
        if (m?.[1])
            return toNumber(m[1]);
    }
    // Population (Wikipedia infoboxes)
    if (/population/.test(lf)) {
        const m = content.match(/population[^\d]*([\d,]+)/i);
        if (m?.[1])
            return toNumber(m[1]);
    }
    // Generic: find number near field name — use [^\d]* to skip non-digit separators.
    // Each underscore-separated part is escaped; the parts are joined by a
    // flexible separator so "open_issues" also matches "Open issues" / "open-issues".
    const humanName = fieldName.split('_').map(escapeRe).join('[\\s_-]*');
    const pattern = new RegExp(`${humanName}[^\\d]*(\\d[\\d,]*\\.?\\d*)`, 'i');
    const match = content.match(pattern);
    if (match?.[1])
        return toNumber(match[1]);
    return null;
}
372
/**
 * Zero-key fallback: fill every schema property using the per-type heuristic
 * extractors and derive a confidence score from the fill rate.
 *
 * - string  → heuristicExtractString
 * - boolean → heuristicExtractBoolean
 * - number  → heuristicExtractNumber
 * - integer → heuristicExtractNumber, rounded to the nearest whole number
 * - array / object / anything else → null (not attempted heuristically)
 *
 * @param content Markdown/text content to extract from.
 * @param schema  Extraction schema (`properties` map of `{ type }`).
 * @returns `{ data, confidence, tokensUsed: 0 }`.
 */
async function heuristicExtract(content, schema) {
    const data = {};
    let fieldsFound = 0;
    const totalFields = Object.keys(schema.properties).length;
    for (const [field, fieldDef] of Object.entries(schema.properties)) {
        let value = null;
        switch (fieldDef.type) {
            case 'string':
                value = heuristicExtractString(field, content);
                break;
            case 'boolean':
                value = heuristicExtractBoolean(field, content);
                break;
            case 'number':
                value = heuristicExtractNumber(field, content);
                break;
            case 'integer': {
                // 'integer' is an accepted schema type (see isTypeSchema);
                // previously such fields were always left null.
                const n = heuristicExtractNumber(field, content);
                value = n === null || n === undefined ? null : Math.round(n);
                break;
            }
            default:
                // For array/object types, heuristic returns null (not enough context)
                break;
        }
        if (value !== null && value !== undefined)
            fieldsFound++;
        data[field] = value;
    }
    // Confidence based on fill rate:
    // - ALL fields null      → 0.1 (extraction found nothing useful)
    // - Some fields null     → 0.3–0.5 based on fill ratio
    // - ALL fields populated → 0.7 (heuristic max — values may still be imprecise)
    const fillRate = totalFields > 0 ? fieldsFound / totalFields : 0;
    let confidence;
    if (fieldsFound === 0) {
        confidence = 0.1; // All null — heuristic found nothing
    }
    else if (fieldsFound === totalFields) {
        confidence = 0.65 + fillRate * 0.05; // 0.7 for fully populated heuristic
    }
    else {
        confidence = 0.3 + fillRate * 0.2; // 0.3–0.5 based on fill ratio
    }
    return {
        data,
        confidence: parseFloat(confidence.toFixed(2)),
        tokensUsed: 0,
    };
}
414
+ // ---------------------------------------------------------------------------
415
+ // Main extraction function
416
+ // ---------------------------------------------------------------------------
417
/**
 * Extract structured data from markdown content using an LLM or heuristics.
 *
 * With `llmConfig`, the schema, optional instructions and the first 12,000
 * characters of content are sent to the LLM. The JSON reply is parsed and
 * coerced to the schema. If the reply is not parseable, or the call fails
 * for a reason other than auth/quota, the heuristic extractor is used
 * instead. Without `llmConfig`, the heuristic extractor runs and
 * `domainHints` (if any) are overlaid on its result.
 *
 * @param content     Markdown/text content to extract from
 * @param schema      JSON schema describing what to extract
 * @param llmConfig   Optional LLM config (if omitted, uses heuristic fallback)
 * @param prompt      Optional user guidance added to the LLM prompt
 * @param domainHints Optional map of pre-extracted field values; applied only
 *                    on the heuristic path, where they override heuristic values
 * @returns `{ data, confidence, tokensUsed }`; empty content yields
 *          `{ data: {}, confidence: 0, tokensUsed: 0 }`
 * @throws Error on an invalid schema, and re-throws LLM errors whose message
 *         mentions free_tier_limit, "API key", Unauthorized, 401 or 403
 */
export async function extractStructured(content, schema, llmConfig, prompt, domainHints) {
    // Guard: empty content
    if (!content || content.trim().length === 0) {
        return { data: {}, confidence: 0, tokensUsed: 0 };
    }
    // Guard: invalid schema (typeof null is 'object', so null is rejected explicitly)
    if (!schema || schema.type !== 'object' || typeof schema.properties !== 'object' || schema.properties === null) {
        throw new Error('Invalid schema: must be { type: "object", properties: { ... } }');
    }
    // ── LLM extraction ──────────────────────────────────────────────────────
    if (llmConfig) {
        const schemaStr = JSON.stringify(schema, null, 2);
        const userContent = [
            `Schema:\n${schemaStr}`,
            prompt ? `\nInstructions: ${prompt}` : '',
            `\nContent:\n${content.slice(0, 12000)}`,
        ]
            .filter(Boolean)
            .join('');
        const messages = [
            { role: 'system', content: SYSTEM_PROMPT },
            { role: 'user', content: userContent },
        ];
        try {
            const llmResult = await callLLM(llmConfig, { messages, maxTokens: 2048, temperature: 0.1 });
            // A provider that omits usage data must not void an otherwise good result.
            const tokensUsed = (llmResult.usage?.input ?? 0) + (llmResult.usage?.output ?? 0);
            let parsed;
            try {
                parsed = parseLLMJson(llmResult.text);
            }
            catch {
                // Malformed LLM response — fall back to heuristic, but still
                // report the tokens the failed call consumed.
                const heuristic = await heuristicExtract(content, schema);
                return { ...heuristic, tokensUsed };
            }
            const { data, missingRequired } = validateAndCoerce(parsed, schema);
            // Confidence for LLM extraction:
            // - ALL fields null → 0.1 (LLM couldn't extract anything)
            // - Otherwise       → 0.85 + up to 0.08 fill bonus − 0.05 per missing
            //                     required field, capped at 0.98
            const filledCount = Object.values(data).filter((v) => v !== null && v !== undefined).length;
            const totalCount = Object.keys(schema.properties).length;
            const fillRate = totalCount > 0 ? filledCount / totalCount : 0;
            const penalty = missingRequired.length * 0.05;
            let confidence;
            if (filledCount === 0) {
                confidence = 0.1; // LLM returned all nulls — extraction failed
            }
            else {
                const fillBonus = fillRate * 0.08; // Up to +0.08 for fully populated
                confidence = Math.min(0.98, 0.85 + fillBonus - penalty);
            }
            return {
                data,
                confidence: parseFloat(confidence.toFixed(2)),
                tokensUsed,
            };
        }
        catch (err) {
            // Re-throw auth/quota errors; fall back on everything else
            const msg = String(err instanceof Error ? err.message : err);
            if (msg.includes('free_tier_limit') ||
                msg.includes('API key') ||
                msg.includes('Unauthorized') ||
                msg.includes('401') ||
                msg.includes('403')) {
                throw err;
            }
            // Network / parse failure → heuristic fallback
            return heuristicExtract(content, schema);
        }
    }
    // ── Heuristic extraction ─────────────────────────────────────────────────
    const heuristic = await heuristicExtract(content, schema);
    // ── Domain hints overlay ─────────────────────────────────────────────────
    // If domain-api pre-extracted fields (e.g. GitHub stars/language), merge them
    // into the result. Domain-api data is authoritative — prefer over heuristic.
    if (domainHints && Object.keys(domainHints).length > 0) {
        const props = schema.properties;
        let hintMerged = 0;
        for (const [field, hintValue] of Object.entries(domainHints)) {
            if (!(field in props) || hintValue === null || hintValue === undefined)
                continue;
            const expected = props[field].type;
            // typeof reports arrays as 'object'; name them so 'array' fields can match.
            const actual = Array.isArray(hintValue) ? 'array' : typeof hintValue;
            if (actual === expected) {
                heuristic.data[field] = hintValue;
            }
            else if (expected === 'number' && actual === 'string' &&
                hintValue.trim() !== '' && !isNaN(Number(hintValue))) {
                // Numeric string → number ('' is excluded: Number('') is 0)
                heuristic.data[field] = Number(hintValue);
            }
            else if (expected === 'string' && actual !== 'object' && actual !== 'array') {
                // Primitive hint for a string field: store it as a string so
                // the output matches the schema type.
                heuristic.data[field] = String(hintValue);
            }
            else {
                continue; // incompatible type — keep the heuristic value
            }
            hintMerged++;
        }
        if (hintMerged > 0) {
            // Boost confidence since we have authoritative domain-api data
            const filled = Object.values(heuristic.data).filter(v => v !== null && v !== undefined).length;
            const total = Object.keys(props).length;
            heuristic.confidence = parseFloat(Math.min(0.90, 0.65 + (filled / total) * 0.25).toFixed(2));
        }
    }
    return heuristic;
}
528
+ // ---------------------------------------------------------------------------
529
+ // Helper: convert simple { field: "type" } map → ExtractionSchema
530
+ // ---------------------------------------------------------------------------
531
/**
 * Expand a shorthand type map such as `{ title: "string", active: "boolean" }`
 * into a full ExtractionSchema (`{ type: 'object', properties: { ... } }`),
 * wrapping each type name as `{ type }`. Handy for the CLI --extract flag.
 */
export function simpleToExtractionSchema(simple) {
    const properties = Object.entries(simple).reduce((acc, [name, typeName]) => {
        acc[name] = { type: typeName };
        return acc;
    }, {});
    return { type: 'object', properties };
}
542
/**
 * Tell whether an object is a shorthand type map — non-empty, with every
 * value one of the type names 'string', 'boolean', 'number', 'array',
 * 'object' or 'integer' — as opposed to, e.g., a map of CSS selectors.
 */
export function isTypeSchema(obj) {
    const knownTypes = ['string', 'boolean', 'number', 'array', 'object', 'integer'];
    const declared = Object.values(obj);
    if (declared.length === 0) {
        return false;
    }
    const isUnknown = (entry) => typeof entry !== 'string' || !knownTypes.includes(entry);
    return !declared.some(isUnknown);
}
@@ -0,0 +1,17 @@
1
+ /**
2
+ * AI-powered content summarization using OpenAI-compatible APIs
3
+ */
4
/** Options accepted by `summarizeContent`. */
export interface SummarizeOptions {
    /** OpenAI-compatible API base URL (default: https://api.openai.com/v1) */
    apiBase?: string;
    /** API key for the LLM; must be non-empty */
    apiKey: string;
    /** Model to use (default: gpt-4o-mini) */
    model?: string;
    /** Max length of summary in words (default: 150) */
    maxWords?: number;
}
14
/**
 * Summarize content using an OpenAI-compatible LLM API
 * (POST `{apiBase}/chat/completions`).
 *
 * @param content Text to summarize; must be non-empty.
 * @param options API endpoint, credentials, model and summary length.
 * @returns The trimmed summary text returned by the model.
 * @throws Error when `apiKey` or `content` is empty, or (prefixed with
 *         "Summarization failed:") when the request fails, the API reports
 *         an error, or the response contains no summary.
 */
export declare function summarizeContent(content: string, options: SummarizeOptions): Promise<string>;
@@ -0,0 +1,78 @@
1
+ /**
2
+ * AI-powered content summarization using OpenAI-compatible APIs
3
+ */
4
/**
 * Cap content at 16,000 characters (roughly 4,000 tokens) so that the system
 * prompt and the model's reply still fit. Longer input is cut at the limit
 * and a truncation notice is appended; shorter input is returned unchanged.
 */
function truncateContent(content) {
    const limit = 16000; // ~4000 tokens
    return content.length > limit
        ? content.slice(0, limit) + '\n\n[Content truncated for summarization...]'
        : content;
}
16
/**
 * Summarize content using an OpenAI-compatible LLM API.
 *
 * Sends a single user message to `{apiBase}/chat/completions` asking for a
 * summary of at most `maxWords` words and returns the trimmed reply.
 *
 * Input validation errors (empty API key / content) are thrown as-is; any
 * failure during the request or response handling is re-thrown with a
 * "Summarization failed: " prefix.
 */
export async function summarizeContent(content, options) {
    const apiBase = options.apiBase === undefined ? 'https://api.openai.com/v1' : options.apiBase;
    const model = options.model === undefined ? 'gpt-4o-mini' : options.model;
    const maxWords = options.maxWords === undefined ? 150 : options.maxWords;
    const apiKey = options.apiKey;
    // Reject unusable input before doing any work
    if (!apiKey || apiKey.trim().length === 0) {
        throw new Error('API key is required for summarization');
    }
    if (!content || content.trim().length === 0) {
        throw new Error('Content is required for summarization');
    }
    // Keep the request within token limits
    const body = truncateContent(content);
    const prompt = `Summarize the following web page content concisely in ${maxWords} words or fewer. Focus on the key information.

Content:
${body}`;
    // Endpoint: base URL without a trailing slash + chat completions path
    const endpoint = `${apiBase.replace(/\/$/, '')}/chat/completions`;
    const payload = {
        model,
        messages: [
            {
                role: 'user',
                content: prompt,
            },
        ],
        temperature: 0.3, // low temperature keeps the summary focused
        max_tokens: maxWords * 2, // rough estimate: 1 word ≈ 1.5-2 tokens
    };
    try {
        const response = await fetch(endpoint, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'Authorization': `Bearer ${apiKey}`,
            },
            body: JSON.stringify(payload),
        });
        if (!response.ok) {
            const errorText = await response.text();
            throw new Error(`LLM API error: HTTP ${response.status} - ${errorText}`);
        }
        const result = await response.json();
        // Some providers report errors inside a 200 response
        if (result.error) {
            throw new Error(`LLM API error: ${result.error.message}`);
        }
        const summary = result.choices?.[0]?.message?.content?.trim();
        if (summary) {
            return summary;
        }
        throw new Error('LLM API returned empty response');
    }
    catch (error) {
        const detail = error instanceof Error ? error.message : 'Unknown error';
        throw new Error(`Summarization failed: ${detail}`);
    }
}
@@ -0,0 +1,42 @@
1
+ /**
2
+ * Synonym expansion for query broadening.
3
+ *
4
+ * Provides stemmed synonym groups and a function to expand a set of stemmed
5
+ * query tokens with related synonyms (at a lower weight).
6
+ *
7
+ * Usage:
8
+ * const queryTerms = tokenizeQuestion(question); // already stemmed
9
+ * const expanded = expandWithSynonyms(queryTerms);
10
+ * // expanded includes originals (weight=1.0) + synonyms (weight=0.5)
11
+ */
12
+ /**
13
+ * Raw synonym groups. Each group is a set of words with equivalent or near-
14
+ * equivalent meaning in the context of software/web documentation queries.
15
+ *
16
+ * These are stored in unstemmed form for readability; the build process stems
17
+ * them into STEMMED_SYNONYM_GROUPS and builds an index.
18
+ */
19
+ export declare const SYNONYM_GROUPS: string[][];
20
+ /**
21
+ * Stemmed synonym groups.
22
+ * Each word in each group has been run through the Porter stemmer.
23
+ * Duplicate stems within a group are deduplicated.
24
+ */
25
+ export declare const STEMMED_SYNONYM_GROUPS: string[][];
26
+ export interface ExpandedTerm {
27
+ /** The stemmed term */
28
+ term: string;
29
+ /** 1.0 for original query terms, 0.5 for synonym expansions */
30
+ weight: number;
31
+ /** True if this term came from the original query */
32
+ isOriginal: boolean;
33
+ }
34
+ /**
35
+ * Expand a list of stemmed query tokens with their synonyms.
36
+ *
37
+ * @param terms - Already-stemmed tokens from the query
38
+ * @returns Array of ExpandedTerm objects. Original terms have weight=1.0,
39
+ * synonym expansions have weight=0.5.
40
+ * The returned array preserves originals first, then synonyms.
41
+ */
42
+ export declare function expandWithSynonyms(terms: string[]): ExpandedTerm[];