@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547)
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,270 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Kayak Car Rental extractor
3
+ // ---------------------------------------------------------------------------
4
+ export async function kayakCarRentalExtractor(_html, url) {
5
+ if (!url.includes('/cars/'))
6
+ return null;
7
+ // Rental company homepage URLs
8
+ const rentalCompanyUrls = {
9
+ 'Hertz': 'https://www.hertz.com',
10
+ 'Budget': 'https://www.budget.com',
11
+ 'Avis': 'https://www.avis.com',
12
+ 'Enterprise': 'https://www.enterprise.com',
13
+ 'National': 'https://www.nationalcar.com',
14
+ 'Alamo': 'https://www.alamo.com',
15
+ 'Dollar': 'https://www.dollar.com',
16
+ 'Thrifty': 'https://www.thrifty.com',
17
+ 'Sixt': 'https://www.sixt.com',
18
+ 'Fox': 'https://www.foxrentacar.com',
19
+ 'Payless': 'https://www.paylesscar.com',
20
+ 'Turn': 'https://www.turn.com',
21
+ 'EconomyBookings': 'https://www.economybookings.com',
22
+ 'Priceline': 'https://www.priceline.com',
23
+ 'Expedia': 'https://www.expedia.com',
24
+ 'Turo': 'https://www.turo.com',
25
+ 'KAYAK': 'https://www.kayak.com',
26
+ 'Booking.com': 'https://www.booking.com',
27
+ 'DiscoverCars': 'https://www.discovercars.com',
28
+ 'RentalCars': 'https://www.rentalcars.com',
29
+ 'Car Rental 8': 'https://www.carrental8.com',
30
+ 'Hotwire': 'https://www.hotwire.com',
31
+ };
32
+ function getCompanyUrl(company) {
33
+ return rentalCompanyUrls[company] || `https://www.kayak.com`;
34
+ }
35
+ // Parse dates from URL: /cars/Location/YYYY-MM-DD/YYYY-MM-DD
36
+ let numDays = 1;
37
+ let pickupDate = '';
38
+ let dropoffDate = '';
39
+ let locationName = '';
40
+ const dateMatch = url.match(/\/cars\/([^/]+)\/(\d{4}-\d{2}-\d{2})\/(\d{4}-\d{2}-\d{2})/);
41
+ if (dateMatch) {
42
+ locationName = decodeURIComponent(dateMatch[1]);
43
+ pickupDate = dateMatch[2];
44
+ dropoffDate = dateMatch[3];
45
+ const pickup = new Date(pickupDate);
46
+ const dropoff = new Date(dropoffDate);
47
+ numDays = Math.max(1, Math.round((dropoff.getTime() - pickup.getTime()) / (1000 * 60 * 60 * 24)));
48
+ }
49
+ // Format date range for display (e.g. "Apr 1–3")
50
+ function formatDateRange(from, to) {
51
+ if (!from || !to)
52
+ return '';
53
+ const fromDate = new Date(from + 'T12:00:00');
54
+ const toDate = new Date(to + 'T12:00:00');
55
+ const months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
56
+ const fromMonth = months[fromDate.getUTCMonth()];
57
+ const toMonth = months[toDate.getUTCMonth()];
58
+ const fromDay = fromDate.getUTCDate();
59
+ const toDay = toDate.getUTCDate();
60
+ if (fromMonth === toMonth)
61
+ return `${fromMonth} ${fromDay}–${toDay}`;
62
+ return `${fromMonth} ${fromDay}–${toMonth} ${toDay}`;
63
+ }
64
+ // Process content: strip HTML if needed
65
+ let text = _html;
66
+ if (text.includes('<!DOCTYPE') || text.includes('<html')) {
67
+ text = text
68
+ .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
69
+ .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
70
+ .replace(/<[^>]+>/g, '\n')
71
+ .replace(/&amp;/g, '&')
72
+ .replace(/&lt;/g, '<')
73
+ .replace(/&gt;/g, '>')
74
+ .replace(/&#\d+;/g, '')
75
+ .replace(/\n{2,}/g, '\n');
76
+ }
77
+ const lines = text.split('\n').map(l => l.trim()).filter(Boolean);
78
+ const KNOWN_COMPANIES = ['Hertz', 'Budget', 'Avis', 'Enterprise', 'National', 'Alamo', 'Dollar', 'Thrifty', 'Sixt', 'Fox', 'Payless', 'Turn', 'EconomyBookings', 'Priceline', 'Expedia', 'Turo', 'KAYAK', 'Booking.com', 'DiscoverCars', 'RentalCars', 'Car Rental 8', 'Hotwire'];
79
+ const listings = [];
80
+ for (let i = 0; i < lines.length; i++) {
81
+ const line = lines[i];
82
+ // Detect: "or similar {Class}" — this signals a standard car rental listing
83
+ // The car name is the line BEFORE "or similar"
84
+ const orSimilarMatch = line.match(/^or similar\s+(.+)$/);
85
+ if (orSimilarMatch) {
86
+ const carClass = orSimilarMatch[1].trim();
87
+ const carName = i > 0 ? lines[i - 1] : '';
88
+ if (!carName || carName.length > 60)
89
+ continue;
90
+ // Look ahead for: pickup location, rating, company, price
91
+ let location = '';
92
+ let distanceFromCenter = '';
93
+ let rating = null;
94
+ let company = '';
95
+ let totalPrice = 0;
96
+ let cancellation = '';
97
+ for (let j = i + 1; j < Math.min(i + 30, lines.length); j++) {
98
+ const l = lines[j];
99
+ // Pickup location
100
+ if (!location && l.startsWith('Pick-up')) {
101
+ const locMatch = l.match(/Pick-up (?:city|airport):\s*(.+)/);
102
+ if (locMatch)
103
+ location = locMatch[1].trim();
104
+ continue;
105
+ }
106
+ // Distance from center
107
+ if (!distanceFromCenter) {
108
+ const distM = l.match(/^([\d.]+)\s+mi\s+from\s+city\s+center/);
109
+ if (distM) {
110
+ distanceFromCenter = `${distM[1]} mi from city center`;
111
+ continue;
112
+ }
113
+ }
114
+ // Rating (number like "9.2", "8.5", "7.2")
115
+ if (rating === null) {
116
+ const ratingM = l.match(/^(\d+\.\d+)$/);
117
+ if (ratingM) {
118
+ rating = parseFloat(ratingM[1]);
119
+ continue;
120
+ }
121
+ }
122
+ // Company from "X offer from {Company}" or "{Company}" line
123
+ if (!company) {
124
+ const offerMatch = l.match(/offer from (.+)$/);
125
+ if (offerMatch) {
126
+ company = offerMatch[1].trim();
127
+ continue;
128
+ }
129
+ // Also detect company name standalone
130
+ for (const c of KNOWN_COMPANIES) {
131
+ if (l === c) {
132
+ company = c;
133
+ break;
134
+ }
135
+ }
136
+ if (company)
137
+ continue;
138
+ }
139
+ // Cancellation policy
140
+ if (!cancellation && (l.includes('Free cancellation') || l.includes('No free cancellation'))) {
141
+ cancellation = l;
142
+ continue;
143
+ }
144
+ // Price — "$NNN" followed by "Total"
145
+ const priceM = l.match(/^\$(\d[\d,]*)$/);
146
+ if (priceM) {
147
+ const nextLine = lines[j + 1] || '';
148
+ if (nextLine === 'Total' || nextLine.includes('Total')) {
149
+ totalPrice = parseInt(priceM[1].replace(',', ''));
150
+ break;
151
+ }
152
+ }
153
+ // Also catch price on same line
154
+ const inlinePriceM = l.match(/\$(\d[\d,]*)\s*Total/);
155
+ if (inlinePriceM) {
156
+ totalPrice = parseInt(inlinePriceM[1].replace(',', ''));
157
+ break;
158
+ }
159
+ // Stop if we hit another car listing marker
160
+ if (l.match(/^or similar\s/) || l === 'Show more results')
161
+ break;
162
+ }
163
+ if (carName && totalPrice > 0) {
164
+ const normalizedClass = carClass.replace('Full size', 'Full-size');
165
+ listings.push({
166
+ name: carName,
167
+ carClass: normalizedClass,
168
+ totalPrice,
169
+ perDayPrice: Math.round(totalPrice / numDays),
170
+ company: company || 'Unknown',
171
+ location: location || 'See booking',
172
+ distanceFromCenter,
173
+ rating,
174
+ cancellation,
175
+ isTuro: false,
176
+ });
177
+ }
178
+ }
179
+ }
180
+ // Deduplicate: first prefer listings with real company info over "Unknown"
181
+ // Key by name+price; keep the one with best data
182
+ const byKey = new Map();
183
+ for (const c of listings) {
184
+ const key = `${c.name.toLowerCase()}-${c.totalPrice}`;
185
+ const existing = byKey.get(key);
186
+ if (!existing) {
187
+ byKey.set(key, c);
188
+ }
189
+ else {
190
+ // Prefer non-Unknown company, or same company with more info
191
+ if (existing.company === 'Unknown' && c.company !== 'Unknown') {
192
+ byKey.set(key, c);
193
+ }
194
+ }
195
+ }
196
+ const unique = Array.from(byKey.values());
197
+ if (unique.length === 0)
198
+ return null;
199
+ // Filter out Unknown company entries if the total found from page suggests more results exist
200
+ // Also filter them only if they have no location info (these are likely ad/promo extractions)
201
+ const knownCompanyListings = unique.filter(c => c.company !== 'Unknown');
202
+ const finalListings = knownCompanyListings.length > 0 ? knownCompanyListings : unique;
203
+ // Sort by price
204
+ finalListings.sort((a, b) => a.totalPrice - b.totalPrice);
205
+ // Get total count from page if mentioned
206
+ let totalFound = unique.length;
207
+ for (const l of lines) {
208
+ const m = l.match(/^(\d+)\s+results?$/);
209
+ if (m) {
210
+ totalFound = parseInt(m[1]);
211
+ break;
212
+ }
213
+ const m2 = l.match(/(\d+)\s+cars?\s+found/);
214
+ if (m2) {
215
+ totalFound = parseInt(m2[1]);
216
+ break;
217
+ }
218
+ }
219
+ // Format location name nicely (e.g. "Punta-Gorda,FL-c34451" → "Punta Gorda, FL")
220
+ function formatLocation(loc) {
221
+ return loc
222
+ .replace(/-c\d+$/, '') // remove trailing "-c12345"
223
+ .replace(/-/g, ' ') // hyphens to spaces
224
+ .replace(/,(\S)/g, ', $1'); // ensure space after comma
225
+ }
226
+ const dateRange = formatDateRange(pickupDate, dropoffDate);
227
+ const displayLocation = formatLocation(locationName);
228
+ const daysLabel = numDays === 1 ? '1 day' : `${numDays} days`;
229
+ const md = [
230
+ `# 🚗 Car Rentals — ${displayLocation} · ${dateRange} (${daysLabel})`,
231
+ '',
232
+ `*${totalFound} cars found · Source: [Kayak](${url})*`,
233
+ `*Free cancellation available on most rentals*`,
234
+ '',
235
+ ];
236
+ for (let idx = 0; idx < finalListings.length; idx++) {
237
+ const c = finalListings[idx];
238
+ md.push(`## ${idx + 1}. ${c.name} (${c.carClass}) — $${c.totalPrice} total · $${c.perDayPrice}/day`);
239
+ if (c.distanceFromCenter) {
240
+ md.push(`📍 ${c.distanceFromCenter}`);
241
+ }
242
+ else if (c.location && c.location !== 'See booking') {
243
+ md.push(`📍 ${c.location}`);
244
+ }
245
+ const ratingStr = c.rating !== null ? ` · Rating: ${c.rating}` : '';
246
+ md.push(`🏪 via ${c.company}${ratingStr}`);
247
+ if (c.cancellation)
248
+ md.push(`✅ ${c.cancellation}`);
249
+ md.push(`🔍 [See price on Kayak](${url})`);
250
+ md.push(`🛒 [Book on ${c.company}](${getCompanyUrl(c.company)})`);
251
+ md.push('');
252
+ }
253
+ md.push('---');
254
+ md.push(`📌 *Prices verified via [Kayak](${url}). Click "See price" to confirm current rate, then book with the rental company.*`);
255
+ return {
256
+ domain: 'kayak.com/cars',
257
+ type: 'car-rental',
258
+ structured: {
259
+ cars: finalListings,
260
+ location: displayLocation,
261
+ pickupDate,
262
+ dropoffDate,
263
+ numDays,
264
+ totalFound,
265
+ source: 'Kayak',
266
+ sourceUrl: url,
267
+ },
268
+ cleanContent: md.join('\n'),
269
+ };
270
+ }
@@ -0,0 +1,2 @@
1
+ import type { DomainExtractResult } from './types.js';
2
+ export declare function linkedinExtractor(html: string, url: string): Promise<DomainExtractResult | null>;
@@ -0,0 +1,113 @@
1
+ import { tryParseJson } from './shared.js';
2
+ // ---------------------------------------------------------------------------
3
+ // 17. LinkedIn extractor
4
+ // ---------------------------------------------------------------------------
5
/**
 * Extracts basic public information (name, headline, description, location)
 * from a LinkedIn page.
 *
 * Data is read from JSON-LD (`Person` / `Organization`) and Open Graph meta
 * tags in `html`. When the supplied HTML looks like the authwall or yields no
 * usable name, one best-effort HTTPS GET is made to `www.linkedin.com` for the
 * same path and its og: tags are used instead; any failure of that request is
 * ignored.
 *
 * @param html Page HTML as fetched by the caller.
 * @param url  LinkedIn URL; the first path segment selects the page type
 *             (`company`, `in` → profile, `jobs` → job, otherwise `page`).
 * @returns `{ domain, type, structured, cleanContent }`, or `null` when no
 *          name could be determined or anything throws.
 */
export async function linkedinExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        // Detect page type from URL first
        const urlObj = new URL(url);
        const pathParts = urlObj.pathname.split('/').filter(Boolean);
        const pageType = pathParts[0] === 'company' ? 'company'
            : pathParts[0] === 'in' ? 'profile'
                : pathParts[0] === 'jobs' ? 'job'
                    : 'page';
        // Detect if we're on the authwall (LinkedIn redirects unauthenticated requests).
        // Parenthesised to make the existing precedence explicit: "Sign in" only
        // counts when the page carries no profile link.
        const isAuthwall = html.includes('authwall')
            || html.includes('Join LinkedIn')
            || (html.includes('Sign in') && !html.includes('linkedin.com/in/'));
        // --- Try parsing meta tags / JSON-LD from the HTML ---
        let jsonLd = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (jsonLd)
                return;
            const raw = $(el).html() || '';
            const parsed = tryParseJson(raw);
            if (parsed?.['@type'] === 'Person' || parsed?.['@type'] === 'Organization')
                jsonLd = parsed;
        });
        const ogTitle = $('meta[property="og:title"]').attr('content') || '';
        const ogDescription = $('meta[property="og:description"]').attr('content') || '';
        const ogImage = $('meta[property="og:image"]').attr('content') || '';
        const metaDescription = $('meta[name="description"]').attr('content') || '';
        // Strip the sign-up title BEFORE the generic " | LinkedIn" suffix;
        // in the opposite order the sign-up pattern can never match and the
        // placeholder "Sign Up" would survive as the name.
        let name = jsonLd?.name
            || ogTitle.replace(/Sign Up \| LinkedIn$/, '').replace(/ \| LinkedIn$/, '').trim()
            || '';
        // When on authwall, discard authwall-specific meta data
        let headline = isAuthwall
            ? (jsonLd?.jobTitle || '')
            : (jsonLd?.jobTitle || metaDescription.split('|')[0].trim() || ogDescription || '');
        let description = isAuthwall ? (jsonLd?.description || '') : (jsonLd?.description || ogDescription || '');
        let location = $('[class*="location"]').first().text().trim() || jsonLd?.address?.addressLocality || '';
        // --- If authwall or no useful data, try direct HTTPS fetch with minimal headers ---
        // LinkedIn returns rich og: meta tags when fetched with a plain browser UA (no Sec-Fetch-* noise)
        if (!name || isAuthwall || name.toLowerCase().includes('sign up') || name.toLowerCase().includes('linkedin')) {
            try {
                const { default: httpsLI } = await import('https');
                const { gunzip, inflate } = await import('zlib');
                const linkedInHtml = await new Promise((resolve, reject) => {
                    let timer;
                    // Settle once and always release the timeout timer, so it
                    // does not stay pending after the request has finished.
                    const finish = (settle, value) => {
                        clearTimeout(timer);
                        settle(value);
                    };
                    const req = httpsLI.request({
                        hostname: 'www.linkedin.com',
                        path: urlObj.pathname,
                        method: 'GET',
                        headers: {
                            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                            'Accept': 'text/html,application/xhtml+xml',
                            'Accept-Language': 'en-US,en;q=0.9',
                            'Accept-Encoding': 'gzip, deflate',
                        },
                    }, (res) => {
                        if (res.statusCode && res.statusCode >= 400) {
                            res.resume();
                            finish(reject, new Error(`HTTP ${res.statusCode}`));
                            return;
                        }
                        const chunks = [];
                        res.on('data', (chunk) => chunks.push(chunk));
                        // Without a listener, a stream error on the response
                        // (e.g. after the timeout destroys the request mid-body)
                        // would be an unhandled 'error' event.
                        res.on('error', (err) => finish(reject, err));
                        res.on('end', () => {
                            const buf = Buffer.concat(chunks);
                            const enc = String(res.headers['content-encoding'] || '').toLowerCase();
                            const onDecoded = (err, decoded) => err
                                ? finish(reject, err)
                                : finish(resolve, decoded.toString('utf8'));
                            if (enc === 'gzip') {
                                gunzip(buf, onDecoded);
                            }
                            else if (enc === 'deflate') {
                                // The request advertises deflate, so decode it too.
                                inflate(buf, onDecoded);
                            }
                            else {
                                finish(resolve, buf.toString('utf8'));
                            }
                        });
                    });
                    req.on('error', (err) => finish(reject, err));
                    // Overall 10 s wall-clock limit for the whole exchange.
                    timer = setTimeout(() => req.destroy(new Error('timeout')), 10000);
                    req.end();
                });
                if (linkedInHtml) {
                    const $li = load(linkedInHtml);
                    const liOgTitle = $li('meta[property="og:title"]').attr('content') || '';
                    const liOgDesc = $li('meta[property="og:description"]').attr('content') || '';
                    // Only use if it has real profile data (not authwall)
                    if (liOgTitle && !liOgTitle.toLowerCase().includes('sign up') && !liOgTitle.toLowerCase().includes('join linkedin')) {
                        // "Name - Headline | LinkedIn" or "Name | LinkedIn".
                        // The dash must be surrounded by whitespace so that
                        // hyphenated names ("Jean-Pierre") stay in one piece.
                        const titleParts = liOgTitle.replace(/ \| LinkedIn$/, '').split(/\s+[-–]\s+/);
                        if (titleParts[0])
                            name = titleParts[0].trim();
                        if (titleParts[1])
                            headline = titleParts[1].trim();
                        if (liOgDesc)
                            description = liOgDesc;
                    }
                }
            }
            catch { /* direct fetch optional */ }
        }
        if (!name)
            return null;
        const structured = {
            name, headline, description, location, pageType,
            image: ogImage, url,
        };
        const typeLine = pageType === 'company' ? '🏢' : pageType === 'profile' ? '👤' : '🔗';
        const locationLine = location ? `\n📍 ${location}` : '';
        const headlineLine = headline && headline !== name ? `\n*${headline}*` : '';
        const descriptionLine = description ? `\n\n${description}` : '';
        const authNote = '\n\n⚠️ Full LinkedIn profiles require authentication. Use /v1/session to log in first.';
        const cleanContent = `# ${typeLine} ${name} — LinkedIn${headlineLine}${locationLine}${descriptionLine}${authNote}`;
        return { domain: 'linkedin.com', type: pageType, structured, cleanContent };
    }
    catch {
        // Any parsing failure (including a missing cheerio module) yields no result.
        return null;
    }
}
@@ -0,0 +1,2 @@
1
+ import type { DomainExtractResult } from './types.js';
2
+ export declare function mediumExtractor(html: string, url: string): Promise<DomainExtractResult | null>;
@@ -0,0 +1,130 @@
1
+ import { tryParseJson } from './shared.js';
2
+ // ---------------------------------------------------------------------------
3
+ // 13. Medium Articles extractor
4
+ // ---------------------------------------------------------------------------
5
/**
 * Extracts a Medium article: title, author, dates, publication, clap count
 * and the article body rendered as light markdown.
 *
 * Metadata is taken from JSON-LD (`NewsArticle` / `Article`) first, then from
 * meta tags and `data-testid` elements. The body is collected from the first
 * `<article>` element; if none is found the description is used instead.
 *
 * @param html Page HTML.
 * @param url  Article URL; a `*.medium.com` subdomain is used as the
 *             publication name.
 * @returns `{ domain, type: 'article', structured, cleanContent }`, or `null`
 *          when no title is found or anything throws.
 */
export async function mediumExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        // JSON-LD
        let jsonLdData = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (jsonLdData)
                return;
            const raw = $(el).html() || '';
            const parsed = tryParseJson(raw);
            if (parsed?.['@type'] === 'NewsArticle' || parsed?.['@type'] === 'Article')
                jsonLdData = parsed;
        });
        const title = jsonLdData?.headline ||
            $('meta[property="og:title"]').attr('content') ||
            $('h1').first().text().trim() || '';
        if (!title)
            return null;
        // JSON-LD allows `author` to be a single object or an array of them.
        const ldAuthor = Array.isArray(jsonLdData?.author) ? jsonLdData.author[0] : jsonLdData?.author;
        // `.first()` throughout: `.text()` on a multi-element match concatenates
        // every match, which would repeat values shown more than once on the page.
        const author = ldAuthor?.name ||
            $('meta[name="author"]').attr('content') ||
            $('[data-testid="authorName"]').first().text().trim() ||
            $('a[rel="author"]').first().text().trim() || '';
        const publishDate = jsonLdData?.datePublished ||
            $('meta[property="article:published_time"]').attr('content') || '';
        const readingTime = $('[data-testid="storyReadTime"]').first().text().trim() ||
            $('span').filter((_, el) => $(el).text().includes('min read')).first().text().trim() || '';
        const description = jsonLdData?.description ||
            $('meta[property="og:description"]').attr('content') || '';
        // Publication name — subdomain (towardsdatascience.medium.com), meta tags, or breadcrumb
        let publication = '';
        try {
            const hostname = new URL(url).hostname;
            if (hostname !== 'medium.com' && hostname !== 'www.medium.com' && hostname.endsWith('.medium.com')) {
                publication = hostname.replace('.medium.com', '').replace(/-/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
            }
        }
        catch { /* ignore: an unparsable URL just means no subdomain hint */ }
        if (!publication) {
            publication = $('[data-testid="publicationName"]').first().text().trim() ||
                $('meta[property="article:section"]').attr('content') ||
                $('a[href*="/tag/"]').first().text().trim() || '';
        }
        // Author bio — usually shown in an author card or bio section
        const authorBio = $('[data-testid="authorBio"]').first().text().trim() ||
            $('p[class*="bio"]').first().text().trim() ||
            $('[aria-label="authorBio"]').first().text().trim() || '';
        // Clap count — Medium shows clap button with count
        let clapCount = '';
        $('button[data-testid="storyClaps"], button[aria-label*="clap"]').each((_, el) => {
            const txt = $(el).text().trim();
            if (txt && /\d/.test(txt)) {
                clapCount = txt;
                return false; // stop at the first button that carries a number
            }
        });
        if (!clapCount) {
            // Fallback: find spans that look like clap counts (e.g., "2.4K")
            $('span').filter((_, el) => {
                const label = $(el).closest('[aria-label]').attr('aria-label') || '';
                return label.toLowerCase().includes('clap');
            }).each((_, el) => {
                const txt = $(el).text().trim();
                if (txt && /\d/.test(txt)) {
                    clapCount = txt;
                    return false;
                }
            });
        }
        // Extract article body — Medium puts content in <article> or section
        let articleBody = '';
        const articleEl = $('article').first();
        if (articleEl.length) {
            // Remove nav, aside, buttons, author-card, footer sections
            articleEl.find('nav, aside, button, [data-testid="navbar"], footer, [data-testid="authorCard"]').remove();
            // Get paragraphs and headings
            const parts = [];
            articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li, figure figcaption').each((_, el) => {
                const tag = el.name;
                const text = $(el).text().trim();
                if (!text || text.length < 5)
                    return;
                // An element nested inside another captured container (e.g. the
                // <p> inside a <blockquote> or <li>) is already part of that
                // container's text; emitting it again would duplicate content.
                if ($(el).parents('blockquote, li, pre, figcaption').length > 0)
                    return;
                // The title is already the document heading; don't repeat it.
                if (tag === 'h1' && text === title)
                    return;
                if (tag === 'h1' || tag === 'h2')
                    parts.push(`## ${text}`);
                else if (tag === 'h3' || tag === 'h4')
                    parts.push(`### ${text}`);
                else if (tag === 'blockquote')
                    parts.push(`> ${text}`);
                else if (tag === 'pre')
                    parts.push('```\n' + text + '\n```');
                else if (tag === 'figcaption')
                    parts.push(`*${text}*`);
                else
                    parts.push(text);
            });
            articleBody = parts.join('\n\n');
        }
        // Fallback to og:description if no body
        const contentBody = articleBody || description;
        const structured = {
            title,
            author,
            authorBio,
            publishDate,
            readingTime,
            description,
            publication,
            clapCount,
            url,
        };
        const authorLine = author ? `\n**Author:** ${author}` : '';
        const bioLine = authorBio ? `\n**Author Bio:** ${authorBio}` : '';
        // String(): JSON-LD values are not guaranteed to be strings, and a
        // failing .split() would discard the whole article.
        const dateLine = publishDate ? `\n**Published:** ${String(publishDate).split('T')[0]}` : '';
        const timeLine = readingTime ? `\n**Reading time:** ${readingTime}` : '';
        const pubLine = publication ? `\n**Publication:** ${publication}` : '';
        const clapsLine = clapCount ? `\n**Claps:** ${clapCount}` : '';
        // No hard character cap — let the pipeline's budget/maxTokens handle truncation
        const cleanContent = `# ${title}${authorLine}${bioLine}${dateLine}${timeLine}${pubLine}${clapsLine}\n\n${contentBody}`;
        return { domain: 'medium.com', type: 'article', structured, cleanContent };
    }
    catch {
        // Any parsing failure (including a missing cheerio module) yields no result.
        return null;
    }
}
@@ -0,0 +1,4 @@
1
+ import type { DomainExtractResult } from './types.js';
2
+ export declare function nytimesExtractor(html: string, url: string): Promise<DomainExtractResult | null>;
3
+ export declare function bbcExtractor(html: string, url: string): Promise<DomainExtractResult | null>;
4
+ export declare function cnnExtractor(html: string, url: string): Promise<DomainExtractResult | null>;