@staticn0va/wigolo 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1003)
  1. package/LICENSE +1 -1
  2. package/README.md +146 -227
  3. package/SKILL.md +382 -0
  4. package/assets/blocks/claude-code/CLAUDE.md.block +20 -0
  5. package/assets/blocks/claude-code/wigolo-command.md +40 -0
  6. package/assets/blocks/cursor/wigolo.mdc +46 -0
  7. package/assets/blocks/gemini-cli/GEMINI.md.block +18 -0
  8. package/assets/blocks/vscode/copilot-instructions.md.block +18 -0
  9. package/assets/skills/wigolo/SKILL.md +50 -0
  10. package/assets/skills/wigolo/rules/cache-first.md +30 -0
  11. package/assets/skills/wigolo/rules/synthesis.md +43 -0
  12. package/assets/skills/wigolo-agent/SKILL.md +73 -0
  13. package/assets/skills/wigolo-crawl/SKILL.md +60 -0
  14. package/assets/skills/wigolo-extract/SKILL.md +59 -0
  15. package/assets/skills/wigolo-fetch/SKILL.md +65 -0
  16. package/assets/skills/wigolo-find-similar/SKILL.md +72 -0
  17. package/assets/skills/wigolo-research/SKILL.md +77 -0
  18. package/assets/skills/wigolo-search/SKILL.md +78 -0
  19. package/dist/agent/executor.d.ts +33 -0
  20. package/dist/agent/executor.d.ts.map +1 -0
  21. package/dist/agent/executor.js +233 -0
  22. package/dist/agent/executor.js.map +1 -0
  23. package/dist/agent/pipeline.d.ts +5 -0
  24. package/dist/agent/pipeline.d.ts.map +1 -0
  25. package/dist/agent/pipeline.js +238 -0
  26. package/dist/agent/pipeline.js.map +1 -0
  27. package/dist/agent/planner.d.ts +13 -0
  28. package/dist/agent/planner.d.ts.map +1 -0
  29. package/dist/agent/planner.js +271 -0
  30. package/dist/agent/planner.js.map +1 -0
  31. package/dist/agent/relevance.d.ts +15 -0
  32. package/dist/agent/relevance.d.ts.map +1 -0
  33. package/dist/agent/relevance.js +60 -0
  34. package/dist/agent/relevance.js.map +1 -0
  35. package/dist/cache/backfill-embeddings.d.ts +23 -0
  36. package/dist/cache/backfill-embeddings.d.ts.map +1 -0
  37. package/dist/cache/backfill-embeddings.js +105 -0
  38. package/dist/cache/backfill-embeddings.js.map +1 -0
  39. package/dist/cache/change-detector.d.ts +7 -0
  40. package/dist/cache/change-detector.d.ts.map +1 -0
  41. package/dist/cache/change-detector.js +43 -0
  42. package/dist/cache/change-detector.js.map +1 -0
  43. package/dist/cache/db.d.ts +1 -0
  44. package/dist/cache/db.d.ts.map +1 -1
  45. package/dist/cache/db.js +94 -22
  46. package/dist/cache/db.js.map +1 -1
  47. package/dist/cache/diff-summary.d.ts +2 -0
  48. package/dist/cache/diff-summary.d.ts.map +1 -0
  49. package/dist/cache/diff-summary.js +82 -0
  50. package/dist/cache/diff-summary.js.map +1 -0
  51. package/dist/cache/migrations/runner.d.ts +29 -0
  52. package/dist/cache/migrations/runner.d.ts.map +1 -0
  53. package/dist/cache/migrations/runner.js +147 -0
  54. package/dist/cache/migrations/runner.js.map +1 -0
  55. package/dist/cache/sqlite-vec-store.d.ts +42 -0
  56. package/dist/cache/sqlite-vec-store.d.ts.map +1 -0
  57. package/dist/cache/sqlite-vec-store.js +176 -0
  58. package/dist/cache/sqlite-vec-store.js.map +1 -0
  59. package/dist/cache/store.d.ts +47 -1
  60. package/dist/cache/store.d.ts.map +1 -1
  61. package/dist/cache/store.js +364 -168
  62. package/dist/cache/store.js.map +1 -1
  63. package/dist/cli/agents/antigravity.d.ts +20 -0
  64. package/dist/cli/agents/antigravity.d.ts.map +1 -0
  65. package/dist/cli/agents/antigravity.js +49 -0
  66. package/dist/cli/agents/antigravity.js.map +1 -0
  67. package/dist/cli/agents/claude-code.d.ts +25 -0
  68. package/dist/cli/agents/claude-code.d.ts.map +1 -0
  69. package/dist/cli/agents/claude-code.js +111 -0
  70. package/dist/cli/agents/claude-code.js.map +1 -0
  71. package/dist/cli/agents/cursor.d.ts +21 -0
  72. package/dist/cli/agents/cursor.d.ts.map +1 -0
  73. package/dist/cli/agents/cursor.js +58 -0
  74. package/dist/cli/agents/cursor.js.map +1 -0
  75. package/dist/cli/agents/gemini-cli.d.ts +21 -0
  76. package/dist/cli/agents/gemini-cli.d.ts.map +1 -0
  77. package/dist/cli/agents/gemini-cli.js +55 -0
  78. package/dist/cli/agents/gemini-cli.js.map +1 -0
  79. package/dist/cli/agents/registry.d.ts +21 -0
  80. package/dist/cli/agents/registry.d.ts.map +1 -0
  81. package/dist/cli/agents/registry.js +27 -0
  82. package/dist/cli/agents/registry.js.map +1 -0
  83. package/dist/cli/agents/utils.d.ts +26 -0
  84. package/dist/cli/agents/utils.d.ts.map +1 -0
  85. package/dist/cli/agents/utils.js +136 -0
  86. package/dist/cli/agents/utils.js.map +1 -0
  87. package/dist/cli/agents/vscode.d.ts +21 -0
  88. package/dist/cli/agents/vscode.d.ts.map +1 -0
  89. package/dist/cli/agents/vscode.js +62 -0
  90. package/dist/cli/agents/vscode.js.map +1 -0
  91. package/dist/cli/auth.d.ts +2 -0
  92. package/dist/cli/auth.d.ts.map +1 -0
  93. package/dist/cli/auth.js +94 -0
  94. package/dist/cli/auth.js.map +1 -0
  95. package/dist/cli/backfill.d.ts +2 -0
  96. package/dist/cli/backfill.d.ts.map +1 -0
  97. package/dist/cli/backfill.js +58 -0
  98. package/dist/cli/backfill.js.map +1 -0
  99. package/dist/cli/daemon.d.ts +6 -1
  100. package/dist/cli/daemon.d.ts.map +1 -1
  101. package/dist/cli/daemon.js +61 -3
  102. package/dist/cli/daemon.js.map +1 -1
  103. package/dist/cli/doctor.d.ts +8 -0
  104. package/dist/cli/doctor.d.ts.map +1 -0
  105. package/dist/cli/doctor.js +344 -0
  106. package/dist/cli/doctor.js.map +1 -0
  107. package/dist/cli/health.d.ts +1 -1
  108. package/dist/cli/health.d.ts.map +1 -1
  109. package/dist/cli/health.js +42 -3
  110. package/dist/cli/health.js.map +1 -1
  111. package/dist/cli/help.d.ts +6 -0
  112. package/dist/cli/help.d.ts.map +1 -0
  113. package/dist/cli/help.js +63 -0
  114. package/dist/cli/help.js.map +1 -0
  115. package/dist/cli/index.d.ts +1 -1
  116. package/dist/cli/index.d.ts.map +1 -1
  117. package/dist/cli/index.js +35 -7
  118. package/dist/cli/index.js.map +1 -1
  119. package/dist/cli/init.d.ts +2 -0
  120. package/dist/cli/init.d.ts.map +1 -0
  121. package/dist/cli/init.js +201 -0
  122. package/dist/cli/init.js.map +1 -0
  123. package/dist/cli/plugin.d.ts +5 -0
  124. package/dist/cli/plugin.d.ts.map +1 -0
  125. package/dist/cli/plugin.js +185 -0
  126. package/dist/cli/plugin.js.map +1 -0
  127. package/dist/cli/setup-mcp.d.ts +2 -0
  128. package/dist/cli/setup-mcp.d.ts.map +1 -0
  129. package/dist/cli/setup-mcp.js +114 -0
  130. package/dist/cli/setup-mcp.js.map +1 -0
  131. package/dist/cli/shell.d.ts +2 -0
  132. package/dist/cli/shell.d.ts.map +1 -0
  133. package/dist/cli/shell.js +86 -0
  134. package/dist/cli/shell.js.map +1 -0
  135. package/dist/cli/shutdown.d.ts +2 -0
  136. package/dist/cli/shutdown.d.ts.map +1 -0
  137. package/dist/cli/shutdown.js +26 -0
  138. package/dist/cli/shutdown.js.map +1 -0
  139. package/dist/cli/status.d.ts +2 -0
  140. package/dist/cli/status.d.ts.map +1 -0
  141. package/dist/cli/status.js +31 -0
  142. package/dist/cli/status.js.map +1 -0
  143. package/dist/cli/telemetry.d.ts +10 -0
  144. package/dist/cli/telemetry.d.ts.map +1 -0
  145. package/dist/cli/telemetry.js +56 -0
  146. package/dist/cli/telemetry.js.map +1 -0
  147. package/dist/cli/tui/agents-types.d.ts +28 -0
  148. package/dist/cli/tui/agents-types.d.ts.map +1 -0
  149. package/dist/cli/tui/agents-types.js +1 -0
  150. package/dist/cli/tui/agents-types.js.map +1 -0
  151. package/dist/cli/tui/agents.d.ts +11 -0
  152. package/dist/cli/tui/agents.d.ts.map +1 -0
  153. package/dist/cli/tui/agents.js +93 -0
  154. package/dist/cli/tui/agents.js.map +1 -0
  155. package/dist/cli/tui/banner.d.ts +3 -0
  156. package/dist/cli/tui/banner.d.ts.map +1 -0
  157. package/dist/cli/tui/banner.js +30 -0
  158. package/dist/cli/tui/banner.js.map +1 -0
  159. package/dist/cli/tui/components/AgentSelect.d.ts +13 -0
  160. package/dist/cli/tui/components/AgentSelect.d.ts.map +1 -0
  161. package/dist/cli/tui/components/AgentSelect.js +116 -0
  162. package/dist/cli/tui/components/AgentSelect.js.map +1 -0
  163. package/dist/cli/tui/components/Banner.d.ts +6 -0
  164. package/dist/cli/tui/components/Banner.d.ts.map +1 -0
  165. package/dist/cli/tui/components/Banner.js +25 -0
  166. package/dist/cli/tui/components/Banner.js.map +1 -0
  167. package/dist/cli/tui/components/BrowserSelect.d.ts +7 -0
  168. package/dist/cli/tui/components/BrowserSelect.d.ts.map +1 -0
  169. package/dist/cli/tui/components/BrowserSelect.js +19 -0
  170. package/dist/cli/tui/components/BrowserSelect.js.map +1 -0
  171. package/dist/cli/tui/components/InstallProgress.d.ts +9 -0
  172. package/dist/cli/tui/components/InstallProgress.d.ts.map +1 -0
  173. package/dist/cli/tui/components/InstallProgress.js +67 -0
  174. package/dist/cli/tui/components/InstallProgress.js.map +1 -0
  175. package/dist/cli/tui/components/SkillInstall.d.ts +14 -0
  176. package/dist/cli/tui/components/SkillInstall.d.ts.map +1 -0
  177. package/dist/cli/tui/components/SkillInstall.js +94 -0
  178. package/dist/cli/tui/components/SkillInstall.js.map +1 -0
  179. package/dist/cli/tui/components/Summary.d.ts +22 -0
  180. package/dist/cli/tui/components/Summary.d.ts.map +1 -0
  181. package/dist/cli/tui/components/Summary.js +135 -0
  182. package/dist/cli/tui/components/Summary.js.map +1 -0
  183. package/dist/cli/tui/components/SystemCheck.d.ts +8 -0
  184. package/dist/cli/tui/components/SystemCheck.d.ts.map +1 -0
  185. package/dist/cli/tui/components/SystemCheck.js +71 -0
  186. package/dist/cli/tui/components/SystemCheck.js.map +1 -0
  187. package/dist/cli/tui/components/Verification.d.ts +8 -0
  188. package/dist/cli/tui/components/Verification.d.ts.map +1 -0
  189. package/dist/cli/tui/components/Verification.js +63 -0
  190. package/dist/cli/tui/components/Verification.js.map +1 -0
  191. package/dist/cli/tui/config-writer-cli.d.ts +12 -0
  192. package/dist/cli/tui/config-writer-cli.d.ts.map +1 -0
  193. package/dist/cli/tui/config-writer-cli.js +39 -0
  194. package/dist/cli/tui/config-writer-cli.js.map +1 -0
  195. package/dist/cli/tui/config-writer-json.d.ts +16 -0
  196. package/dist/cli/tui/config-writer-json.d.ts.map +1 -0
  197. package/dist/cli/tui/config-writer-json.js +86 -0
  198. package/dist/cli/tui/config-writer-json.js.map +1 -0
  199. package/dist/cli/tui/config-writer-toml.d.ts +16 -0
  200. package/dist/cli/tui/config-writer-toml.d.ts.map +1 -0
  201. package/dist/cli/tui/config-writer-toml.js +83 -0
  202. package/dist/cli/tui/config-writer-toml.js.map +1 -0
  203. package/dist/cli/tui/config-writer.d.ts +25 -0
  204. package/dist/cli/tui/config-writer.d.ts.map +1 -0
  205. package/dist/cli/tui/config-writer.js +101 -0
  206. package/dist/cli/tui/config-writer.js.map +1 -0
  207. package/dist/cli/tui/detect-helpers.d.ts +6 -0
  208. package/dist/cli/tui/detect-helpers.d.ts.map +1 -0
  209. package/dist/cli/tui/detect-helpers.js +45 -0
  210. package/dist/cli/tui/detect-helpers.js.map +1 -0
  211. package/dist/cli/tui/extras-prompt.d.ts +7 -0
  212. package/dist/cli/tui/extras-prompt.d.ts.map +1 -0
  213. package/dist/cli/tui/extras-prompt.js +42 -0
  214. package/dist/cli/tui/extras-prompt.js.map +1 -0
  215. package/dist/cli/tui/flags-types.d.ts +19 -0
  216. package/dist/cli/tui/flags-types.d.ts.map +1 -0
  217. package/dist/cli/tui/flags-types.js +23 -0
  218. package/dist/cli/tui/flags-types.js.map +1 -0
  219. package/dist/cli/tui/flags.d.ts +5 -0
  220. package/dist/cli/tui/flags.d.ts.map +1 -0
  221. package/dist/cli/tui/flags.js +132 -0
  222. package/dist/cli/tui/flags.js.map +1 -0
  223. package/dist/cli/tui/format.d.ts +14 -0
  224. package/dist/cli/tui/format.d.ts.map +1 -0
  225. package/dist/cli/tui/format.js +37 -0
  226. package/dist/cli/tui/format.js.map +1 -0
  227. package/dist/cli/tui/hooks/useAgentDetect.d.ts +6 -0
  228. package/dist/cli/tui/hooks/useAgentDetect.d.ts.map +1 -0
  229. package/dist/cli/tui/hooks/useAgentDetect.js +19 -0
  230. package/dist/cli/tui/hooks/useAgentDetect.js.map +1 -0
  231. package/dist/cli/tui/hooks/useInstall.d.ts +14 -0
  232. package/dist/cli/tui/hooks/useInstall.d.ts.map +1 -0
  233. package/dist/cli/tui/hooks/useInstall.js +90 -0
  234. package/dist/cli/tui/hooks/useInstall.js.map +1 -0
  235. package/dist/cli/tui/hooks/useSystemCheck.d.ts +13 -0
  236. package/dist/cli/tui/hooks/useSystemCheck.d.ts.map +1 -0
  237. package/dist/cli/tui/hooks/useSystemCheck.js +95 -0
  238. package/dist/cli/tui/hooks/useSystemCheck.js.map +1 -0
  239. package/dist/cli/tui/hooks/useVerify.d.ts +14 -0
  240. package/dist/cli/tui/hooks/useVerify.d.ts.map +1 -0
  241. package/dist/cli/tui/hooks/useVerify.js +71 -0
  242. package/dist/cli/tui/hooks/useVerify.js.map +1 -0
  243. package/dist/cli/tui/ink-init.d.ts +2 -0
  244. package/dist/cli/tui/ink-init.d.ts.map +1 -0
  245. package/dist/cli/tui/ink-init.js +198 -0
  246. package/dist/cli/tui/ink-init.js.map +1 -0
  247. package/dist/cli/tui/reporter-auto.d.ts +7 -0
  248. package/dist/cli/tui/reporter-auto.d.ts.map +1 -0
  249. package/dist/cli/tui/reporter-auto.js +15 -0
  250. package/dist/cli/tui/reporter-auto.js.map +1 -0
  251. package/dist/cli/tui/reporter.d.ts +26 -0
  252. package/dist/cli/tui/reporter.d.ts.map +1 -0
  253. package/dist/cli/tui/reporter.js +32 -0
  254. package/dist/cli/tui/reporter.js.map +1 -0
  255. package/dist/cli/tui/run-command.d.ts +14 -0
  256. package/dist/cli/tui/run-command.d.ts.map +1 -0
  257. package/dist/cli/tui/run-command.js +72 -0
  258. package/dist/cli/tui/run-command.js.map +1 -0
  259. package/dist/cli/tui/select-agents.d.ts +6 -0
  260. package/dist/cli/tui/select-agents.d.ts.map +1 -0
  261. package/dist/cli/tui/select-agents.js +32 -0
  262. package/dist/cli/tui/select-agents.js.map +1 -0
  263. package/dist/cli/tui/status-agents.d.ts +11 -0
  264. package/dist/cli/tui/status-agents.d.ts.map +1 -0
  265. package/dist/cli/tui/status-agents.js +53 -0
  266. package/dist/cli/tui/status-agents.js.map +1 -0
  267. package/dist/cli/tui/status-cache.d.ts +6 -0
  268. package/dist/cli/tui/status-cache.d.ts.map +1 -0
  269. package/dist/cli/tui/status-cache.js +39 -0
  270. package/dist/cli/tui/status-cache.js.map +1 -0
  271. package/dist/cli/tui/status-format.d.ts +14 -0
  272. package/dist/cli/tui/status-format.d.ts.map +1 -0
  273. package/dist/cli/tui/status-format.js +41 -0
  274. package/dist/cli/tui/status-format.js.map +1 -0
  275. package/dist/cli/tui/status-python.d.ts +6 -0
  276. package/dist/cli/tui/status-python.d.ts.map +1 -0
  277. package/dist/cli/tui/status-python.js +30 -0
  278. package/dist/cli/tui/status-python.js.map +1 -0
  279. package/dist/cli/tui/system-check.d.ts +24 -0
  280. package/dist/cli/tui/system-check.d.ts.map +1 -0
  281. package/dist/cli/tui/system-check.js +103 -0
  282. package/dist/cli/tui/system-check.js.map +1 -0
  283. package/dist/cli/tui/tui-reporter.d.ts +19 -0
  284. package/dist/cli/tui/tui-reporter.d.ts.map +1 -0
  285. package/dist/cli/tui/tui-reporter.js +95 -0
  286. package/dist/cli/tui/tui-reporter.js.map +1 -0
  287. package/dist/cli/tui/utils/config-writer.d.ts +3 -0
  288. package/dist/cli/tui/utils/config-writer.d.ts.map +1 -0
  289. package/dist/cli/tui/utils/config-writer.js +22 -0
  290. package/dist/cli/tui/utils/config-writer.js.map +1 -0
  291. package/dist/cli/tui/utils/suppress-logs.d.ts +3 -0
  292. package/dist/cli/tui/utils/suppress-logs.d.ts.map +1 -0
  293. package/dist/cli/tui/utils/suppress-logs.js +11 -0
  294. package/dist/cli/tui/utils/suppress-logs.js.map +1 -0
  295. package/dist/cli/tui/verify-suggestions.d.ts +5 -0
  296. package/dist/cli/tui/verify-suggestions.d.ts.map +1 -0
  297. package/dist/cli/tui/verify-suggestions.js +20 -0
  298. package/dist/cli/tui/verify-suggestions.js.map +1 -0
  299. package/dist/cli/tui/verify.d.ts +14 -0
  300. package/dist/cli/tui/verify.d.ts.map +1 -0
  301. package/dist/cli/tui/verify.js +101 -0
  302. package/dist/cli/tui/verify.js.map +1 -0
  303. package/dist/cli/tui/version.d.ts +2 -0
  304. package/dist/cli/tui/version.d.ts.map +1 -0
  305. package/dist/cli/tui/version.js +14 -0
  306. package/dist/cli/tui/version.js.map +1 -0
  307. package/dist/cli/uninstall.d.ts +2 -0
  308. package/dist/cli/uninstall.d.ts.map +1 -0
  309. package/dist/cli/uninstall.js +57 -0
  310. package/dist/cli/uninstall.js.map +1 -0
  311. package/dist/cli/warmup.d.ts +10 -2
  312. package/dist/cli/warmup.d.ts.map +1 -1
  313. package/dist/cli/warmup.js +226 -93
  314. package/dist/cli/warmup.js.map +1 -1
  315. package/dist/config.d.ts +28 -2
  316. package/dist/config.d.ts.map +1 -1
  317. package/dist/config.js +106 -56
  318. package/dist/config.js.map +1 -1
  319. package/dist/crawl/crawler.d.ts +6 -0
  320. package/dist/crawl/crawler.d.ts.map +1 -1
  321. package/dist/crawl/crawler.js +210 -209
  322. package/dist/crawl/crawler.js.map +1 -1
  323. package/dist/crawl/dedup.d.ts +1 -0
  324. package/dist/crawl/dedup.d.ts.map +1 -1
  325. package/dist/crawl/dedup.js +124 -81
  326. package/dist/crawl/dedup.js.map +1 -1
  327. package/dist/crawl/etag-incremental.d.ts +43 -0
  328. package/dist/crawl/etag-incremental.d.ts.map +1 -0
  329. package/dist/crawl/etag-incremental.js +94 -0
  330. package/dist/crawl/etag-incremental.js.map +1 -0
  331. package/dist/crawl/index-to-vec.d.ts +10 -0
  332. package/dist/crawl/index-to-vec.d.ts.map +1 -0
  333. package/dist/crawl/index-to-vec.js +44 -0
  334. package/dist/crawl/index-to-vec.js.map +1 -0
  335. package/dist/crawl/mapper.js +136 -164
  336. package/dist/crawl/mapper.js.map +1 -1
  337. package/dist/crawl/rate-limiter.js +63 -66
  338. package/dist/crawl/rate-limiter.js.map +1 -1
  339. package/dist/crawl/robots.js +58 -57
  340. package/dist/crawl/robots.js.map +1 -1
  341. package/dist/crawl/sitemap-first.d.ts +12 -0
  342. package/dist/crawl/sitemap-first.d.ts.map +1 -0
  343. package/dist/crawl/sitemap-first.js +47 -0
  344. package/dist/crawl/sitemap-first.js.map +1 -0
  345. package/dist/crawl/sitemap.js +33 -32
  346. package/dist/crawl/sitemap.js.map +1 -1
  347. package/dist/crawl/url-utils.d.ts +1 -0
  348. package/dist/crawl/url-utils.d.ts.map +1 -1
  349. package/dist/crawl/url-utils.js +49 -37
  350. package/dist/crawl/url-utils.js.map +1 -1
  351. package/dist/daemon/health-check.d.ts +16 -0
  352. package/dist/daemon/health-check.d.ts.map +1 -0
  353. package/dist/daemon/health-check.js +33 -0
  354. package/dist/daemon/health-check.js.map +1 -0
  355. package/dist/daemon/http-server.d.ts +26 -0
  356. package/dist/daemon/http-server.d.ts.map +1 -0
  357. package/dist/daemon/http-server.js +275 -0
  358. package/dist/daemon/http-server.js.map +1 -0
  359. package/dist/daemon/proxy.d.ts +10 -0
  360. package/dist/daemon/proxy.d.ts.map +1 -0
  361. package/dist/daemon/proxy.js +93 -0
  362. package/dist/daemon/proxy.js.map +1 -0
  363. package/dist/embedding/embed.d.ts +59 -0
  364. package/dist/embedding/embed.d.ts.map +1 -0
  365. package/dist/embedding/embed.js +233 -0
  366. package/dist/embedding/embed.js.map +1 -0
  367. package/dist/embedding/fastembed-provider.d.ts +19 -0
  368. package/dist/embedding/fastembed-provider.d.ts.map +1 -0
  369. package/dist/embedding/fastembed-provider.js +51 -0
  370. package/dist/embedding/fastembed-provider.js.map +1 -0
  371. package/dist/embedding/key-terms.d.ts +12 -0
  372. package/dist/embedding/key-terms.d.ts.map +1 -0
  373. package/dist/embedding/key-terms.js +234 -0
  374. package/dist/embedding/key-terms.js.map +1 -0
  375. package/dist/extraction/boilerplate.d.ts +15 -0
  376. package/dist/extraction/boilerplate.d.ts.map +1 -0
  377. package/dist/extraction/boilerplate.js +52 -0
  378. package/dist/extraction/boilerplate.js.map +1 -0
  379. package/dist/extraction/defuddle.d.ts.map +1 -1
  380. package/dist/extraction/defuddle.js +27 -23
  381. package/dist/extraction/defuddle.js.map +1 -1
  382. package/dist/extraction/extract.d.ts.map +1 -1
  383. package/dist/extraction/extract.js +76 -76
  384. package/dist/extraction/extract.js.map +1 -1
  385. package/dist/extraction/jsonld.js +50 -54
  386. package/dist/extraction/jsonld.js.map +1 -1
  387. package/dist/extraction/lang-hints.d.ts +2 -0
  388. package/dist/extraction/lang-hints.d.ts.map +1 -0
  389. package/dist/extraction/lang-hints.js +30 -0
  390. package/dist/extraction/lang-hints.js.map +1 -0
  391. package/dist/extraction/llm-fallback.d.ts +17 -0
  392. package/dist/extraction/llm-fallback.d.ts.map +1 -0
  393. package/dist/extraction/llm-fallback.js +130 -0
  394. package/dist/extraction/llm-fallback.js.map +1 -0
  395. package/dist/extraction/markdown-sanitize.d.ts +2 -0
  396. package/dist/extraction/markdown-sanitize.d.ts.map +1 -0
  397. package/dist/extraction/markdown-sanitize.js +151 -0
  398. package/dist/extraction/markdown-sanitize.js.map +1 -0
  399. package/dist/extraction/markdown.d.ts +11 -0
  400. package/dist/extraction/markdown.d.ts.map +1 -1
  401. package/dist/extraction/markdown.js +195 -91
  402. package/dist/extraction/markdown.js.map +1 -1
  403. package/dist/extraction/pipeline.d.ts +8 -0
  404. package/dist/extraction/pipeline.d.ts.map +1 -1
  405. package/dist/extraction/pipeline.js +57 -91
  406. package/dist/extraction/pipeline.js.map +1 -1
  407. package/dist/extraction/readability.d.ts +1 -1
  408. package/dist/extraction/readability.d.ts.map +1 -1
  409. package/dist/extraction/readability.js +28 -29
  410. package/dist/extraction/readability.js.map +1 -1
  411. package/dist/extraction/schema.d.ts +12 -0
  412. package/dist/extraction/schema.d.ts.map +1 -1
  413. package/dist/extraction/schema.js +135 -72
  414. package/dist/extraction/schema.js.map +1 -1
  415. package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -1
  416. package/dist/extraction/site-extractors/docs-generic.js +81 -91
  417. package/dist/extraction/site-extractors/docs-generic.js.map +1 -1
  418. package/dist/extraction/site-extractors/github.d.ts.map +1 -1
  419. package/dist/extraction/site-extractors/github.js +87 -95
  420. package/dist/extraction/site-extractors/github.js.map +1 -1
  421. package/dist/extraction/site-extractors/mdn.d.ts.map +1 -1
  422. package/dist/extraction/site-extractors/mdn.js +46 -54
  423. package/dist/extraction/site-extractors/mdn.js.map +1 -1
  424. package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -1
  425. package/dist/extraction/site-extractors/stackoverflow.js +71 -80
  426. package/dist/extraction/site-extractors/stackoverflow.js.map +1 -1
  427. package/dist/extraction/structured-data.d.ts +4 -0
  428. package/dist/extraction/structured-data.d.ts.map +1 -0
  429. package/dist/extraction/structured-data.js +173 -0
  430. package/dist/extraction/structured-data.js.map +1 -0
  431. package/dist/extraction/structured.d.ts +4 -0
  432. package/dist/extraction/structured.d.ts.map +1 -0
  433. package/dist/extraction/structured.js +163 -0
  434. package/dist/extraction/structured.js.map +1 -0
  435. package/dist/extraction/v1/classifier.d.ts +3 -0
  436. package/dist/extraction/v1/classifier.d.ts.map +1 -0
  437. package/dist/extraction/v1/classifier.js +110 -0
  438. package/dist/extraction/v1/classifier.js.map +1 -0
  439. package/dist/extraction/v1/extract-provider.d.ts +16 -0
  440. package/dist/extraction/v1/extract-provider.d.ts.map +1 -0
  441. package/dist/extraction/v1/extract-provider.js +43 -0
  442. package/dist/extraction/v1/extract-provider.js.map +1 -0
  443. package/dist/extraction/v1/local-llm.d.ts +8 -0
  444. package/dist/extraction/v1/local-llm.d.ts.map +1 -0
  445. package/dist/extraction/v1/local-llm.js +34 -0
  446. package/dist/extraction/v1/local-llm.js.map +1 -0
  447. package/dist/extraction/v1/news.d.ts +3 -0
  448. package/dist/extraction/v1/news.d.ts.map +1 -0
  449. package/dist/extraction/v1/news.js +61 -0
  450. package/dist/extraction/v1/news.js.map +1 -0
  451. package/dist/extraction/v1/product.d.ts +3 -0
  452. package/dist/extraction/v1/product.d.ts.map +1 -0
  453. package/dist/extraction/v1/product.js +166 -0
  454. package/dist/extraction/v1/product.js.map +1 -0
  455. package/dist/extraction/v1/recipe.d.ts +3 -0
  456. package/dist/extraction/v1/recipe.d.ts.map +1 -0
  457. package/dist/extraction/v1/recipe.js +136 -0
  458. package/dist/extraction/v1/recipe.js.map +1 -0
  459. package/dist/extraction/v1/routed.d.ts +17 -0
  460. package/dist/extraction/v1/routed.d.ts.map +1 -0
  461. package/dist/extraction/v1/routed.js +68 -0
  462. package/dist/extraction/v1/routed.js.map +1 -0
  463. package/dist/extraction/v1/schemas/Article.d.ts +11 -0
  464. package/dist/extraction/v1/schemas/Article.d.ts.map +1 -0
  465. package/dist/extraction/v1/schemas/Article.js +23 -0
  466. package/dist/extraction/v1/schemas/Article.js.map +1 -0
  467. package/dist/extraction/v1/schemas/CodeSnippet.d.ts +9 -0
  468. package/dist/extraction/v1/schemas/CodeSnippet.d.ts.map +1 -0
  469. package/dist/extraction/v1/schemas/CodeSnippet.js +90 -0
  470. package/dist/extraction/v1/schemas/CodeSnippet.js.map +1 -0
  471. package/dist/extraction/v1/schemas/EventListing.d.ts +10 -0
  472. package/dist/extraction/v1/schemas/EventListing.d.ts.map +1 -0
  473. package/dist/extraction/v1/schemas/EventListing.js +122 -0
  474. package/dist/extraction/v1/schemas/EventListing.js.map +1 -0
  475. package/dist/extraction/v1/schemas/Paper.d.ts +10 -0
  476. package/dist/extraction/v1/schemas/Paper.d.ts.map +1 -0
  477. package/dist/extraction/v1/schemas/Paper.js +156 -0
  478. package/dist/extraction/v1/schemas/Paper.js.map +1 -0
  479. package/dist/extraction/v1/schemas/Product.d.ts +17 -0
  480. package/dist/extraction/v1/schemas/Product.d.ts.map +1 -0
  481. package/dist/extraction/v1/schemas/Product.js +149 -0
  482. package/dist/extraction/v1/schemas/Product.js.map +1 -0
  483. package/dist/extraction/v1/schemas/Recipe.d.ts +14 -0
  484. package/dist/extraction/v1/schemas/Recipe.d.ts.map +1 -0
  485. package/dist/extraction/v1/schemas/Recipe.js +160 -0
  486. package/dist/extraction/v1/schemas/Recipe.js.map +1 -0
  487. package/dist/extraction/v1/schemas/index.d.ts +13 -0
  488. package/dist/extraction/v1/schemas/index.d.ts.map +1 -0
  489. package/dist/extraction/v1/schemas/index.js +44 -0
  490. package/dist/extraction/v1/schemas/index.js.map +1 -0
  491. package/dist/extraction/v1/site-extractors.d.ts +5 -0
  492. package/dist/extraction/v1/site-extractors.d.ts.map +1 -0
  493. package/dist/extraction/v1/site-extractors.js +31 -0
  494. package/dist/extraction/v1/site-extractors.js.map +1 -0
  495. package/dist/fetch/action-executor.d.ts +28 -0
  496. package/dist/fetch/action-executor.d.ts.map +1 -0
  497. package/dist/fetch/action-executor.js +88 -0
  498. package/dist/fetch/action-executor.js.map +1 -0
  499. package/dist/fetch/auth.d.ts +2 -1
  500. package/dist/fetch/auth.d.ts.map +1 -1
  501. package/dist/fetch/auth.js +56 -26
  502. package/dist/fetch/auth.js.map +1 -1
  503. package/dist/fetch/browser-pool.d.ts +30 -11
  504. package/dist/fetch/browser-pool.d.ts.map +1 -1
  505. package/dist/fetch/browser-pool.js +303 -127
  506. package/dist/fetch/browser-pool.js.map +1 -1
  507. package/dist/fetch/browser-selector.d.ts +17 -0
  508. package/dist/fetch/browser-selector.d.ts.map +1 -0
  509. package/dist/fetch/browser-selector.js +72 -0
  510. package/dist/fetch/browser-selector.js.map +1 -0
  511. package/dist/fetch/browser-types.d.ts +3 -0
  512. package/dist/fetch/browser-types.d.ts.map +1 -0
  513. package/dist/fetch/browser-types.js +45 -0
  514. package/dist/fetch/browser-types.js.map +1 -0
  515. package/dist/fetch/cdp-client.d.ts +9 -0
  516. package/dist/fetch/cdp-client.d.ts.map +1 -0
  517. package/dist/fetch/cdp-client.js +89 -0
  518. package/dist/fetch/cdp-client.js.map +1 -0
  519. package/dist/fetch/content-check.js +39 -46
  520. package/dist/fetch/content-check.js.map +1 -1
  521. package/dist/fetch/error-describe.d.ts +7 -0
  522. package/dist/fetch/error-describe.d.ts.map +1 -0
  523. package/dist/fetch/error-describe.js +37 -0
  524. package/dist/fetch/error-describe.js.map +1 -0
  525. package/dist/fetch/http-client.d.ts +4 -0
  526. package/dist/fetch/http-client.d.ts.map +1 -1
  527. package/dist/fetch/http-client.js +147 -128
  528. package/dist/fetch/http-client.js.map +1 -1
  529. package/dist/fetch/lightpanda.d.ts +28 -0
  530. package/dist/fetch/lightpanda.d.ts.map +1 -0
  531. package/dist/fetch/lightpanda.js +174 -0
  532. package/dist/fetch/lightpanda.js.map +1 -0
  533. package/dist/fetch/playwright-tier.d.ts +19 -0
  534. package/dist/fetch/playwright-tier.d.ts.map +1 -0
  535. package/dist/fetch/playwright-tier.js +76 -0
  536. package/dist/fetch/playwright-tier.js.map +1 -0
  537. package/dist/fetch/router.d.ts +49 -3
  538. package/dist/fetch/router.d.ts.map +1 -1
  539. package/dist/fetch/router.js +187 -81
  540. package/dist/fetch/router.js.map +1 -1
  541. package/dist/index.js +102 -17
  542. package/dist/index.js.map +1 -1
  543. package/dist/instructions.d.ts +31 -0
  544. package/dist/instructions.d.ts.map +1 -0
  545. package/dist/instructions.js +245 -0
  546. package/dist/instructions.js.map +1 -0
  547. package/dist/integrations/cloud/llm/anthropic.d.ts +3 -0
  548. package/dist/integrations/cloud/llm/anthropic.d.ts.map +1 -0
  549. package/dist/integrations/cloud/llm/anthropic.js +41 -0
  550. package/dist/integrations/cloud/llm/anthropic.js.map +1 -0
  551. package/dist/integrations/cloud/llm/cache.d.ts +5 -0
  552. package/dist/integrations/cloud/llm/cache.d.ts.map +1 -0
  553. package/dist/integrations/cloud/llm/cache.js +49 -0
  554. package/dist/integrations/cloud/llm/cache.js.map +1 -0
  555. package/dist/integrations/cloud/llm/gemini.d.ts +3 -0
  556. package/dist/integrations/cloud/llm/gemini.d.ts.map +1 -0
  557. package/dist/integrations/cloud/llm/gemini.js +37 -0
  558. package/dist/integrations/cloud/llm/gemini.js.map +1 -0
  559. package/dist/integrations/cloud/llm/groq.d.ts +3 -0
  560. package/dist/integrations/cloud/llm/groq.d.ts.map +1 -0
  561. package/dist/integrations/cloud/llm/groq.js +74 -0
  562. package/dist/integrations/cloud/llm/groq.js.map +1 -0
  563. package/dist/integrations/cloud/llm/hash.d.ts +3 -0
  564. package/dist/integrations/cloud/llm/hash.d.ts.map +1 -0
  565. package/dist/integrations/cloud/llm/hash.js +26 -0
  566. package/dist/integrations/cloud/llm/hash.js.map +1 -0
  567. package/dist/integrations/cloud/llm/model-select.d.ts +5 -0
  568. package/dist/integrations/cloud/llm/model-select.d.ts.map +1 -0
  569. package/dist/integrations/cloud/llm/model-select.js +32 -0
  570. package/dist/integrations/cloud/llm/model-select.js.map +1 -0
  571. package/dist/integrations/cloud/llm/openai.d.ts +3 -0
  572. package/dist/integrations/cloud/llm/openai.d.ts.map +1 -0
  573. package/dist/integrations/cloud/llm/openai.js +43 -0
  574. package/dist/integrations/cloud/llm/openai.js.map +1 -0
  575. package/dist/integrations/cloud/llm/run.d.ts +27 -0
  576. package/dist/integrations/cloud/llm/run.d.ts.map +1 -0
  577. package/dist/integrations/cloud/llm/run.js +99 -0
  578. package/dist/integrations/cloud/llm/run.js.map +1 -0
  579. package/dist/integrations/cloud/llm/select.d.ts +5 -0
  580. package/dist/integrations/cloud/llm/select.d.ts.map +1 -0
  581. package/dist/integrations/cloud/llm/select.js +30 -0
  582. package/dist/integrations/cloud/llm/select.js.map +1 -0
  583. package/dist/integrations/cloud/llm/text-adapters.d.ts +19 -0
  584. package/dist/integrations/cloud/llm/text-adapters.d.ts.map +1 -0
  585. package/dist/integrations/cloud/llm/text-adapters.js +103 -0
  586. package/dist/integrations/cloud/llm/text-adapters.js.map +1 -0
  587. package/dist/integrations/cloud/llm/types.d.ts +24 -0
  588. package/dist/integrations/cloud/llm/types.d.ts.map +1 -0
  589. package/dist/integrations/cloud/llm/types.js +1 -0
  590. package/dist/integrations/cloud/llm/types.js.map +1 -0
  591. package/dist/integrations/cloud/llm/validate.d.ts +6 -0
  592. package/dist/integrations/cloud/llm/validate.d.ts.map +1 -0
  593. package/dist/integrations/cloud/llm/validate.js +63 -0
  594. package/dist/integrations/cloud/llm/validate.js.map +1 -0
  595. package/dist/logger.d.ts +4 -1
  596. package/dist/logger.d.ts.map +1 -1
  597. package/dist/logger.js +71 -30
  598. package/dist/logger.js.map +1 -1
  599. package/dist/pdf-parse.d.js +1 -0
  600. package/dist/pdf-parse.d.js.map +1 -0
  601. package/dist/plugins/loader.d.ts +20 -0
  602. package/dist/plugins/loader.d.ts.map +1 -0
  603. package/dist/plugins/loader.js +157 -0
  604. package/dist/plugins/loader.js.map +1 -0
  605. package/dist/plugins/registry.d.ts +26 -0
  606. package/dist/plugins/registry.d.ts.map +1 -0
  607. package/dist/plugins/registry.js +71 -0
  608. package/dist/plugins/registry.js.map +1 -0
  609. package/dist/plugins/validate.d.ts +9 -0
  610. package/dist/plugins/validate.d.ts.map +1 -0
  611. package/dist/plugins/validate.js +79 -0
  612. package/dist/plugins/validate.js.map +1 -0
  613. package/dist/providers/embed-provider.d.ts +11 -0
  614. package/dist/providers/embed-provider.d.ts.map +1 -0
  615. package/dist/providers/embed-provider.js +24 -0
  616. package/dist/providers/embed-provider.js.map +1 -0
  617. package/dist/providers/extract-provider.d.ts +23 -0
  618. package/dist/providers/extract-provider.d.ts.map +1 -0
  619. package/dist/providers/extract-provider.js +25 -0
  620. package/dist/providers/extract-provider.js.map +1 -0
  621. package/dist/providers/rerank-provider.d.ts +17 -0
  622. package/dist/providers/rerank-provider.d.ts.map +1 -0
  623. package/dist/providers/rerank-provider.js +41 -0
  624. package/dist/providers/rerank-provider.js.map +1 -0
  625. package/dist/providers/search-provider.d.ts +25 -0
  626. package/dist/providers/search-provider.d.ts.map +1 -0
  627. package/dist/providers/search-provider.js +44 -0
  628. package/dist/providers/search-provider.js.map +1 -0
  629. package/dist/providers/vector-store.d.ts +27 -0
  630. package/dist/providers/vector-store.d.ts.map +1 -0
  631. package/dist/providers/vector-store.js +27 -0
  632. package/dist/providers/vector-store.js.map +1 -0
  633. package/dist/python-env.d.ts +9 -0
  634. package/dist/python-env.d.ts.map +1 -0
  635. package/dist/python-env.js +13 -0
  636. package/dist/python-env.js.map +1 -0
  637. package/dist/repl/commands/agent.d.ts +5 -0
  638. package/dist/repl/commands/agent.d.ts.map +1 -0
  639. package/dist/repl/commands/agent.js +62 -0
  640. package/dist/repl/commands/agent.js.map +1 -0
  641. package/dist/repl/commands/cache.d.ts +4 -0
  642. package/dist/repl/commands/cache.d.ts.map +1 -0
  643. package/dist/repl/commands/cache.js +43 -0
  644. package/dist/repl/commands/cache.js.map +1 -0
  645. package/dist/repl/commands/crawl.d.ts +7 -0
  646. package/dist/repl/commands/crawl.d.ts.map +1 -0
  647. package/dist/repl/commands/crawl.js +44 -0
  648. package/dist/repl/commands/crawl.js.map +1 -0
  649. package/dist/repl/commands/extract.d.ts +5 -0
  650. package/dist/repl/commands/extract.d.ts.map +1 -0
  651. package/dist/repl/commands/extract.js +47 -0
  652. package/dist/repl/commands/extract.js.map +1 -0
  653. package/dist/repl/commands/fetch.d.ts +5 -0
  654. package/dist/repl/commands/fetch.d.ts.map +1 -0
  655. package/dist/repl/commands/fetch.js +67 -0
  656. package/dist/repl/commands/fetch.js.map +1 -0
  657. package/dist/repl/commands/find-similar.d.ts +5 -0
  658. package/dist/repl/commands/find-similar.d.ts.map +1 -0
  659. package/dist/repl/commands/find-similar.js +74 -0
  660. package/dist/repl/commands/find-similar.js.map +1 -0
  661. package/dist/repl/commands/research.d.ts +5 -0
  662. package/dist/repl/commands/research.d.ts.map +1 -0
  663. package/dist/repl/commands/research.js +65 -0
  664. package/dist/repl/commands/research.js.map +1 -0
  665. package/dist/repl/commands/search.d.ts +5 -0
  666. package/dist/repl/commands/search.d.ts.map +1 -0
  667. package/dist/repl/commands/search.js +74 -0
  668. package/dist/repl/commands/search.js.map +1 -0
  669. package/dist/repl/commands/types.d.ts +9 -0
  670. package/dist/repl/commands/types.d.ts.map +1 -0
  671. package/dist/repl/commands/types.js +1 -0
  672. package/dist/repl/commands/types.js.map +1 -0
  673. package/dist/repl/formatters.d.ts +13 -0
  674. package/dist/repl/formatters.d.ts.map +1 -0
  675. package/dist/repl/formatters.js +283 -0
  676. package/dist/repl/formatters.js.map +1 -0
  677. package/dist/repl/parser.d.ts +9 -0
  678. package/dist/repl/parser.d.ts.map +1 -0
  679. package/dist/repl/parser.js +86 -0
  680. package/dist/repl/parser.js.map +1 -0
  681. package/dist/repl/shell.d.ts +8 -0
  682. package/dist/repl/shell.d.ts.map +1 -0
  683. package/dist/repl/shell.js +184 -0
  684. package/dist/repl/shell.js.map +1 -0
  685. package/dist/research/branch-exploration.d.ts +14 -0
  686. package/dist/research/branch-exploration.d.ts.map +1 -0
  687. package/dist/research/branch-exploration.js +100 -0
  688. package/dist/research/branch-exploration.js.map +1 -0
  689. package/dist/research/brief.d.ts +6 -0
  690. package/dist/research/brief.d.ts.map +1 -0
  691. package/dist/research/brief.js +246 -0
  692. package/dist/research/brief.js.map +1 -0
  693. package/dist/research/citation-graph.d.ts +9 -0
  694. package/dist/research/citation-graph.d.ts.map +1 -0
  695. package/dist/research/citation-graph.js +114 -0
  696. package/dist/research/citation-graph.js.map +1 -0
  697. package/dist/research/decompose.d.ts +14 -0
  698. package/dist/research/decompose.d.ts.map +1 -0
  699. package/dist/research/decompose.js +439 -0
  700. package/dist/research/decompose.js.map +1 -0
  701. package/dist/research/pipeline.d.ts +5 -0
  702. package/dist/research/pipeline.d.ts.map +1 -0
  703. package/dist/research/pipeline.js +269 -0
  704. package/dist/research/pipeline.js.map +1 -0
  705. package/dist/research/synthesis-local.d.ts +19 -0
  706. package/dist/research/synthesis-local.d.ts.map +1 -0
  707. package/dist/research/synthesis-local.js +62 -0
  708. package/dist/research/synthesis-local.js.map +1 -0
  709. package/dist/research/synthesize.d.ts +10 -0
  710. package/dist/research/synthesize.d.ts.map +1 -0
  711. package/dist/research/synthesize.js +137 -0
  712. package/dist/research/synthesize.js.map +1 -0
  713. package/dist/search/answer-synthesis.d.ts +33 -0
  714. package/dist/search/answer-synthesis.d.ts.map +1 -0
  715. package/dist/search/answer-synthesis.js +244 -0
  716. package/dist/search/answer-synthesis.js.map +1 -0
  717. package/dist/search/context-formatter.d.ts +3 -0
  718. package/dist/search/context-formatter.d.ts.map +1 -0
  719. package/dist/search/context-formatter.js +56 -0
  720. package/dist/search/context-formatter.js.map +1 -0
  721. package/dist/search/dedup.d.ts +1 -0
  722. package/dist/search/dedup.d.ts.map +1 -1
  723. package/dist/search/dedup.js +40 -32
  724. package/dist/search/dedup.js.map +1 -1
  725. package/dist/search/engines/arxiv.d.ts +7 -0
  726. package/dist/search/engines/arxiv.d.ts.map +1 -0
  727. package/dist/search/engines/arxiv.js +70 -0
  728. package/dist/search/engines/arxiv.js.map +1 -0
  729. package/dist/search/engines/bing-news.d.ts +7 -0
  730. package/dist/search/engines/bing-news.d.ts.map +1 -0
  731. package/dist/search/engines/bing-news.js +97 -0
  732. package/dist/search/engines/bing-news.js.map +1 -0
  733. package/dist/search/engines/bing.d.ts +1 -0
  734. package/dist/search/engines/bing.d.ts.map +1 -1
  735. package/dist/search/engines/bing.js +100 -44
  736. package/dist/search/engines/bing.js.map +1 -1
  737. package/dist/search/engines/devdocs.d.ts +6 -0
  738. package/dist/search/engines/devdocs.d.ts.map +1 -0
  739. package/dist/search/engines/devdocs.js +56 -0
  740. package/dist/search/engines/devdocs.js.map +1 -0
  741. package/dist/search/engines/duckduckgo.d.ts.map +1 -1
  742. package/dist/search/engines/duckduckgo.js +56 -44
  743. package/dist/search/engines/duckduckgo.js.map +1 -1
  744. package/dist/search/engines/github-code.d.ts +7 -0
  745. package/dist/search/engines/github-code.d.ts.map +1 -0
  746. package/dist/search/engines/github-code.js +55 -0
  747. package/dist/search/engines/github-code.js.map +1 -0
  748. package/dist/search/engines/hn-algolia.d.ts +7 -0
  749. package/dist/search/engines/hn-algolia.d.ts.map +1 -0
  750. package/dist/search/engines/hn-algolia.js +76 -0
  751. package/dist/search/engines/hn-algolia.js.map +1 -0
  752. package/dist/search/engines/lobsters.d.ts +7 -0
  753. package/dist/search/engines/lobsters.d.ts.map +1 -0
  754. package/dist/search/engines/lobsters.js +83 -0
  755. package/dist/search/engines/lobsters.js.map +1 -0
  756. package/dist/search/engines/mdn.d.ts +7 -0
  757. package/dist/search/engines/mdn.d.ts.map +1 -0
  758. package/dist/search/engines/mdn.js +48 -0
  759. package/dist/search/engines/mdn.js.map +1 -0
  760. package/dist/search/engines/semantic-scholar.d.ts +7 -0
  761. package/dist/search/engines/semantic-scholar.d.ts.map +1 -0
  762. package/dist/search/engines/semantic-scholar.js +69 -0
  763. package/dist/search/engines/semantic-scholar.js.map +1 -0
  764. package/dist/search/engines/stackoverflow.d.ts +7 -0
  765. package/dist/search/engines/stackoverflow.d.ts.map +1 -0
  766. package/dist/search/engines/stackoverflow.js +73 -0
  767. package/dist/search/engines/stackoverflow.js.map +1 -0
  768. package/dist/search/engines/startpage.d.ts.map +1 -1
  769. package/dist/search/engines/startpage.js +65 -46
  770. package/dist/search/engines/startpage.js.map +1 -1
  771. package/dist/search/evidence.d.ts +25 -0
  772. package/dist/search/evidence.d.ts.map +1 -0
  773. package/dist/search/evidence.js +220 -0
  774. package/dist/search/evidence.js.map +1 -0
  775. package/dist/search/filters.d.ts.map +1 -1
  776. package/dist/search/filters.js +58 -54
  777. package/dist/search/filters.js.map +1 -1
  778. package/dist/search/find-similar/crawl-rank.d.ts +9 -0
  779. package/dist/search/find-similar/crawl-rank.d.ts.map +1 -0
  780. package/dist/search/find-similar/crawl-rank.js +272 -0
  781. package/dist/search/find-similar/crawl-rank.js.map +1 -0
  782. package/dist/search/find-similar/mode.d.ts +4 -0
  783. package/dist/search/find-similar/mode.d.ts.map +1 -0
  784. package/dist/search/find-similar/mode.js +12 -0
  785. package/dist/search/find-similar/mode.js.map +1 -0
  786. package/dist/search/find-similar.d.ts +5 -0
  787. package/dist/search/find-similar.d.ts.map +1 -0
  788. package/dist/search/find-similar.js +509 -0
  789. package/dist/search/find-similar.js.map +1 -0
  790. package/dist/search/highlights.d.ts +19 -0
  791. package/dist/search/highlights.d.ts.map +1 -0
  792. package/dist/search/highlights.js +167 -0
  793. package/dist/search/highlights.js.map +1 -0
  794. package/dist/search/language-filter.d.ts +29 -0
  795. package/dist/search/language-filter.d.ts.map +1 -0
  796. package/dist/search/language-filter.js +126 -0
  797. package/dist/search/language-filter.js.map +1 -0
  798. package/dist/search/legacy/searxng-orchestrator.d.ts +4 -0
  799. package/dist/search/legacy/searxng-orchestrator.d.ts.map +1 -0
  800. package/dist/search/legacy/searxng-orchestrator.js +501 -0
  801. package/dist/search/legacy/searxng-orchestrator.js.map +1 -0
  802. package/dist/search/legacy/searxng-provider.d.ts +7 -0
  803. package/dist/search/legacy/searxng-provider.d.ts.map +1 -0
  804. package/dist/search/legacy/searxng-provider.js +11 -0
  805. package/dist/search/legacy/searxng-provider.js.map +1 -0
  806. package/dist/search/multi-query.d.ts +25 -0
  807. package/dist/search/multi-query.d.ts.map +1 -0
  808. package/dist/search/multi-query.js +228 -0
  809. package/dist/search/multi-query.js.map +1 -0
  810. package/dist/search/query.js +32 -34
  811. package/dist/search/query.js.map +1 -1
  812. package/dist/search/rerank.d.ts +3 -1
  813. package/dist/search/rerank.d.ts.map +1 -1
  814. package/dist/search/rerank.js +44 -35
  815. package/dist/search/rerank.js.map +1 -1
  816. package/dist/search/reranker/authority-boost.d.ts +3 -0
  817. package/dist/search/reranker/authority-boost.d.ts.map +1 -0
  818. package/dist/search/reranker/authority-boost.js +179 -0
  819. package/dist/search/reranker/authority-boost.js.map +1 -0
  820. package/dist/search/reranker/consensus-boost.d.ts +3 -0
  821. package/dist/search/reranker/consensus-boost.d.ts.map +1 -0
  822. package/dist/search/reranker/consensus-boost.js +27 -0
  823. package/dist/search/reranker/consensus-boost.js.map +1 -0
  824. package/dist/search/reranker/recency-boost.d.ts +3 -0
  825. package/dist/search/reranker/recency-boost.d.ts.map +1 -0
  826. package/dist/search/reranker/recency-boost.js +13 -0
  827. package/dist/search/reranker/recency-boost.js.map +1 -0
  828. package/dist/search/reranker/recency.d.ts +3 -0
  829. package/dist/search/reranker/recency.d.ts.map +1 -0
  830. package/dist/search/reranker/recency.js +23 -0
  831. package/dist/search/reranker/recency.js.map +1 -0
  832. package/dist/search/reranker/transformers-rerank-provider.d.ts +13 -0
  833. package/dist/search/reranker/transformers-rerank-provider.d.ts.map +1 -0
  834. package/dist/search/reranker/transformers-rerank-provider.js +94 -0
  835. package/dist/search/reranker/transformers-rerank-provider.js.map +1 -0
  836. package/dist/search/rrf.d.ts +17 -0
  837. package/dist/search/rrf.d.ts.map +1 -0
  838. package/dist/search/rrf.js +39 -0
  839. package/dist/search/rrf.js.map +1 -0
  840. package/dist/search/sampling.d.ts +25 -0
  841. package/dist/search/sampling.d.ts.map +1 -0
  842. package/dist/search/sampling.js +52 -0
  843. package/dist/search/sampling.js.map +1 -0
  844. package/dist/search/searxng.d.ts.map +1 -1
  845. package/dist/search/searxng.js +69 -79
  846. package/dist/search/searxng.js.map +1 -1
  847. package/dist/search/tokens.d.ts +3 -0
  848. package/dist/search/tokens.d.ts.map +1 -0
  849. package/dist/search/tokens.js +39 -0
  850. package/dist/search/tokens.js.map +1 -0
  851. package/dist/search/truncate.d.ts +6 -0
  852. package/dist/search/truncate.d.ts.map +1 -0
  853. package/dist/search/truncate.js +26 -0
  854. package/dist/search/truncate.js.map +1 -0
  855. package/dist/search/url-unwrap.d.ts +3 -0
  856. package/dist/search/url-unwrap.d.ts.map +1 -0
  857. package/dist/search/url-unwrap.js +43 -0
  858. package/dist/search/url-unwrap.js.map +1 -0
  859. package/dist/search/v1/context-rank.d.ts +13 -0
  860. package/dist/search/v1/context-rank.d.ts.map +1 -0
  861. package/dist/search/v1/context-rank.js +74 -0
  862. package/dist/search/v1/context-rank.js.map +1 -0
  863. package/dist/search/v1/engine-base.d.ts +27 -0
  864. package/dist/search/v1/engine-base.d.ts.map +1 -0
  865. package/dist/search/v1/engine-base.js +110 -0
  866. package/dist/search/v1/engine-base.js.map +1 -0
  867. package/dist/search/v1/intent-router.d.ts +22 -0
  868. package/dist/search/v1/intent-router.d.ts.map +1 -0
  869. package/dist/search/v1/intent-router.js +138 -0
  870. package/dist/search/v1/intent-router.js.map +1 -0
  871. package/dist/search/v1/orchestrator.d.ts +24 -0
  872. package/dist/search/v1/orchestrator.d.ts.map +1 -0
  873. package/dist/search/v1/orchestrator.js +163 -0
  874. package/dist/search/v1/orchestrator.js.map +1 -0
  875. package/dist/search/v1/recency-boost.d.ts +9 -0
  876. package/dist/search/v1/recency-boost.d.ts.map +1 -0
  877. package/dist/search/v1/recency-boost.js +37 -0
  878. package/dist/search/v1/recency-boost.js.map +1 -0
  879. package/dist/search/v1/recent-cache-dedup.d.ts +6 -0
  880. package/dist/search/v1/recent-cache-dedup.d.ts.map +1 -0
  881. package/dist/search/v1/recent-cache-dedup.js +85 -0
  882. package/dist/search/v1/recent-cache-dedup.js.map +1 -0
  883. package/dist/search/v1/rss/feed-config.d.ts +21 -0
  884. package/dist/search/v1/rss/feed-config.d.ts.map +1 -0
  885. package/dist/search/v1/rss/feed-config.js +90 -0
  886. package/dist/search/v1/rss/feed-config.js.map +1 -0
  887. package/dist/search/v1/rss/feed-parser.d.ts +14 -0
  888. package/dist/search/v1/rss/feed-parser.d.ts.map +1 -0
  889. package/dist/search/v1/rss/feed-parser.js +104 -0
  890. package/dist/search/v1/rss/feed-parser.js.map +1 -0
  891. package/dist/search/v1/rss/feed-poller.d.ts +22 -0
  892. package/dist/search/v1/rss/feed-poller.d.ts.map +1 -0
  893. package/dist/search/v1/rss/feed-poller.js +102 -0
  894. package/dist/search/v1/rss/feed-poller.js.map +1 -0
  895. package/dist/search/v1/rss/feed-store.d.ts +30 -0
  896. package/dist/search/v1/rss/feed-store.d.ts.map +1 -0
  897. package/dist/search/v1/rss/feed-store.js +134 -0
  898. package/dist/search/v1/rss/feed-store.js.map +1 -0
  899. package/dist/search/v1/rss/rss-engine.d.ts +6 -0
  900. package/dist/search/v1/rss/rss-engine.d.ts.map +1 -0
  901. package/dist/search/v1/rss/rss-engine.js +28 -0
  902. package/dist/search/v1/rss/rss-engine.js.map +1 -0
  903. package/dist/search/v1/v1-provider.d.ts +7 -0
  904. package/dist/search/v1/v1-provider.d.ts.map +1 -0
  905. package/dist/search/v1/v1-provider.js +68 -0
  906. package/dist/search/v1/v1-provider.js.map +1 -0
  907. package/dist/search/v1/verticals/code.d.ts +4 -0
  908. package/dist/search/v1/verticals/code.d.ts.map +1 -0
  909. package/dist/search/v1/verticals/code.js +20 -0
  910. package/dist/search/v1/verticals/code.js.map +1 -0
  911. package/dist/search/v1/verticals/docs.d.ts +4 -0
  912. package/dist/search/v1/verticals/docs.d.ts.map +1 -0
  913. package/dist/search/v1/verticals/docs.js +20 -0
  914. package/dist/search/v1/verticals/docs.js.map +1 -0
  915. package/dist/search/v1/verticals/general.d.ts +4 -0
  916. package/dist/search/v1/verticals/general.d.ts.map +1 -0
  917. package/dist/search/v1/verticals/general.js +22 -0
  918. package/dist/search/v1/verticals/general.js.map +1 -0
  919. package/dist/search/v1/verticals/news.d.ts +10 -0
  920. package/dist/search/v1/verticals/news.d.ts.map +1 -0
  921. package/dist/search/v1/verticals/news.js +52 -0
  922. package/dist/search/v1/verticals/news.js.map +1 -0
  923. package/dist/search/v1/verticals/papers.d.ts +4 -0
  924. package/dist/search/v1/verticals/papers.d.ts.map +1 -0
  925. package/dist/search/v1/verticals/papers.js +23 -0
  926. package/dist/search/v1/verticals/papers.js.map +1 -0
  927. package/dist/search/validator.js +31 -31
  928. package/dist/search/validator.js.map +1 -1
  929. package/dist/searxng/bootstrap.d.ts +30 -0
  930. package/dist/searxng/bootstrap.d.ts.map +1 -1
  931. package/dist/searxng/bootstrap.js +223 -85
  932. package/dist/searxng/bootstrap.js.map +1 -1
  933. package/dist/searxng/docker.d.ts.map +1 -1
  934. package/dist/searxng/docker.js +69 -60
  935. package/dist/searxng/docker.js.map +1 -1
  936. package/dist/searxng/process.d.ts +13 -1
  937. package/dist/searxng/process.d.ts.map +1 -1
  938. package/dist/searxng/process.js +231 -164
  939. package/dist/searxng/process.js.map +1 -1
  940. package/dist/server/backend-status.d.ts +13 -0
  941. package/dist/server/backend-status.d.ts.map +1 -0
  942. package/dist/server/backend-status.js +40 -0
  943. package/dist/server/backend-status.js.map +1 -0
  944. package/dist/server/tool-schemas.d.ts +549 -0
  945. package/dist/server/tool-schemas.d.ts.map +1 -0
  946. package/dist/server/tool-schemas.js +464 -0
  947. package/dist/server/tool-schemas.js.map +1 -0
  948. package/dist/server/warmup-on-start.d.ts +9 -0
  949. package/dist/server/warmup-on-start.d.ts.map +1 -0
  950. package/dist/server/warmup-on-start.js +55 -0
  951. package/dist/server/warmup-on-start.js.map +1 -0
  952. package/dist/server.d.ts +17 -0
  953. package/dist/server.d.ts.map +1 -1
  954. package/dist/server.js +454 -297
  955. package/dist/server.js.map +1 -1
  956. package/dist/tools/agent.d.ts +5 -0
  957. package/dist/tools/agent.d.ts.map +1 -0
  958. package/dist/tools/agent.js +128 -0
  959. package/dist/tools/agent.js.map +1 -0
  960. package/dist/tools/cache.d.ts +2 -1
  961. package/dist/tools/cache.d.ts.map +1 -1
  962. package/dist/tools/cache.js +177 -44
  963. package/dist/tools/cache.js.map +1 -1
  964. package/dist/tools/crawl.d.ts.map +1 -1
  965. package/dist/tools/crawl.js +171 -88
  966. package/dist/tools/crawl.js.map +1 -1
  967. package/dist/tools/extract.d.ts +2 -2
  968. package/dist/tools/extract.d.ts.map +1 -1
  969. package/dist/tools/extract.js +175 -59
  970. package/dist/tools/extract.js.map +1 -1
  971. package/dist/tools/fetch.d.ts +2 -2
  972. package/dist/tools/fetch.d.ts.map +1 -1
  973. package/dist/tools/fetch.js +174 -68
  974. package/dist/tools/fetch.js.map +1 -1
  975. package/dist/tools/find-similar.d.ts +5 -0
  976. package/dist/tools/find-similar.d.ts.map +1 -0
  977. package/dist/tools/find-similar.js +127 -0
  978. package/dist/tools/find-similar.js.map +1 -0
  979. package/dist/tools/research.d.ts +5 -0
  980. package/dist/tools/research.d.ts.map +1 -0
  981. package/dist/tools/research.js +107 -0
  982. package/dist/tools/research.js.map +1 -0
  983. package/dist/tools/search.d.ts +10 -2
  984. package/dist/tools/search.d.ts.map +1 -1
  985. package/dist/tools/search.js +13 -158
  986. package/dist/tools/search.js.map +1 -1
  987. package/dist/types.d.ts +350 -7
  988. package/dist/types.d.ts.map +1 -1
  989. package/dist/types.js +6 -1
  990. package/dist/types.js.map +1 -1
  991. package/dist/util/mode.d.ts +4 -0
  992. package/dist/util/mode.d.ts.map +1 -0
  993. package/dist/util/mode.js +34 -0
  994. package/dist/util/mode.js.map +1 -0
  995. package/package.json +78 -8
  996. package/dist/extraction/trafilatura.d.ts +0 -6
  997. package/dist/extraction/trafilatura.d.ts.map +0 -1
  998. package/dist/extraction/trafilatura.js +0 -105
  999. package/dist/extraction/trafilatura.js.map +0 -1
  1000. package/dist/search/flashrank.d.ts +0 -12
  1001. package/dist/search/flashrank.d.ts.map +0 -1
  1002. package/dist/search/flashrank.js +0 -63
  1003. package/dist/search/flashrank.js.map +0 -1
@@ -1 +1 @@
1
- {"version":3,"file":"markdown.js","sourceRoot":"","sources":["../../src/extraction/markdown.ts"],"names":[],"mappings":"AAAA,OAAO,eAAe,MAAM,UAAU,CAAC;AAEvC,SAAS,aAAa;IACpB,MAAM,EAAE,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;IAElF,wCAAwC;IACxC,EAAE,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;IAE/B,iDAAiD;IACjD,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE;QAClB,MAAM,EAAE,OAAO;QACf,WAAW,CAAC,QAAQ,EAAE,IAAI;YACxB,MAAM,EAAE,GAAG,IAAe,CAAC;YAC3B,MAAM,IAAI,GAAc,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC;YAC9D,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;gBAAE,OAAO,EAAE,CAAC;YAEjC,MAAM,SAAS,GAAG,CAAC,GAAY,EAAU,EAAE;gBACzC,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,CAAC;gBACzD,OAAO,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC;YACnG,CAAC,CAAC;YAEF,MAAM,SAAS,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;YAC1B,MAAM,WAAW,GAAG,SAAS,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;YAChE,MAAM,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC,CAAC;YACrE,MAAM,SAAS,GAAG,IAAI,GAAG,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC;YAEzE,IAAI,WAAW,EAAE,CAAC;gBAChB,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;gBAC/B,MAAM,KAAK,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,EAAE,SAAS,EAAE,GAAG,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC;gBAC5E,OAAO,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC;YAC5C,CAAC;YAED,MAAM,KAAK,GAAG,CAAC,SAAS,CAAC,SAAS,CAAC,EAAE,SAAS,EAAE,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC;YACjF,OAAO,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC;QAC5C,CAAC;KACF,CAAC,CAAC;IAEH,qFAAqF;IACrF,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE;QACtB,MAAM,EAAE,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC;QACrD,WAAW,CAAC,OAAO;YACjB,OAAO,OAAO,CAAC;QACjB,CAAC;KACF,CAAC,CAAC;IAEH,OAAO,EAAE,CAAC;AACZ,CAAC;AAED,MAAM,QAAQ,GAAG,aAAa,EAAE,CAAC
;AAEjC,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAC;IACrB,OAAO,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;AACjC,CAAC;AAQD,SAAS,aAAa,CAAC,KAAe;IACpC,MAAM,QAAQ,GAAc,EAAE,CAAC;IAC/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC;QACjD,IAAI,KAAK,EAAE,CAAC;YACV,QAAQ,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC,CAAC;QACjF,CAAC;IACH,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,SAAS,kBAAkB,CAAC,KAAe,EAAE,QAAmB,EAAE,UAAkB;IAClF,MAAM,OAAO,GAAG,QAAQ,CAAC,UAAU,CAAC,CAAC;IACrC,MAAM,KAAK,GAAG,OAAO,CAAC,SAAS,CAAC;IAEhC,0EAA0E;IAC1E,IAAI,GAAG,GAAG,KAAK,CAAC,MAAM,CAAC;IACvB,KAAK,IAAI,CAAC,GAAG,UAAU,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtD,IAAI,QAAQ,CAAC,CAAC,CAAC,CAAC,KAAK,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YACvC,GAAG,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAC5B,MAAM;QACR,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC5C,CAAC;AAED,MAAM,UAAU,cAAc,CAC5B,QAAgB,EAChB,OAAe,EACf,YAAY,GAAG,CAAC;IAEhB,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,QAAQ,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC;IAEtC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;IAExE,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;IACpC,MAAM,OAAO,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;IAEnD,8BAA8B;IAC9B,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,KAAK,KAAK,CAAC,CAAC;IAE/E,yDAAyD;IACzD,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,IAAI,YAAY,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC;QAClE,MAAM,EAAE,CAAC,EAAE,GAAG,YAAY,CAAC,YAAY,CAAC,CAAC;QACzC,OAAO,EAAE,OAAO,EAAE,kBAAkB,CAAC,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;IAC5E,CAAC;IAED,4EAA4E;IAC5E,MAAM,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,E
AAE,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;IAEzF,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,IAAI,YAAY,IAAI,gBAAgB,CAAC,MAAM,EAAE,CAAC;QAC7E,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;IAC/C,CAAC;IAED,MAAM,EAAE,CAAC,EAAE,GAAG,gBAAgB,CAAC,YAAY,CAAC,CAAC;IAC7C,OAAO,EAAE,OAAO,EAAE,kBAAkB,CAAC,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC;AAC5E,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,QAAgB;IACpD,MAAM,YAAY,GAAG,yBAAyB,CAAC;IAC/C,MAAM,WAAW,GAAG,8BAA8B,CAAC;IAEnD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAU,CAAC;IACjC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAU,CAAC;IAEhC,IAAI,KAA6B,CAAC;IAElC,uBAAuB;IACvB,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACtD,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACvB,CAAC;IAED,4BAA4B;IAC5B,OAAO,CAAC,KAAK,GAAG,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACrD,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACtB,CAAC;IAED,OAAO,EAAE,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;AAClE,CAAC"}
1
+ {"version":3,"sources":["../../src/extraction/markdown.ts"],"sourcesContent":["import TurndownService from 'turndown';\nimport { detectCodeLanguage } from './lang-hints.js';\n\nfunction longestBacktickRun(s: string): number {\n let max = 0;\n let cur = 0;\n for (let i = 0; i < s.length; i++) {\n if (s.charCodeAt(i) === 96) {\n cur++;\n if (cur > max) max = cur;\n } else {\n cur = 0;\n }\n }\n return max;\n}\n\nexport function buildTurndown(): TurndownService {\n const td = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });\n\n // Remove script and style tags entirely\n td.remove(['script', 'style']);\n\n // Custom rule: convert <table> to markdown table\n td.addRule('table', {\n filter: 'table',\n replacement(_content, node) {\n const el = node as Element;\n const rows: Element[] = Array.from(el.querySelectorAll('tr'));\n if (rows.length === 0) return '';\n\n const renderRow = (row: Element): string => {\n const cells = Array.from(row.querySelectorAll('th, td'));\n return '| ' + cells.map(c => c.textContent?.replace(/\\n/g, ' ').trim() ?? 
'').join(' | ') + ' |';\n };\n\n const headerRow = rows[0];\n const isHeaderRow = headerRow.querySelectorAll('th').length > 0;\n const headerCells = Array.from(headerRow.querySelectorAll('th, td'));\n const separator = '| ' + headerCells.map(() => '---').join(' | ') + ' |';\n\n if (isHeaderRow) {\n const bodyRows = rows.slice(1);\n const lines = [renderRow(headerRow), separator, ...bodyRows.map(renderRow)];\n return '\\n\\n' + lines.join('\\n') + '\\n\\n';\n }\n\n const lines = [renderRow(headerRow), separator, ...rows.slice(1).map(renderRow)];\n return '\\n\\n' + lines.join('\\n') + '\\n\\n';\n },\n });\n\n // Suppress thead/tbody/tr/th/td individually since table rule handles the whole node\n td.addRule('tableCell', {\n filter: ['thead', 'tbody', 'tfoot', 'tr', 'th', 'td'],\n replacement(content) {\n return content;\n },\n });\n\n td.addRule('codeBlockLang', {\n filter(node) {\n return node.nodeName === 'PRE' && (node as Element).querySelector('code') !== null;\n },\n replacement(_content, node) {\n const pre = node as Element;\n const code = pre.querySelector('code');\n const cls = code?.getAttribute('class') ?? pre.getAttribute('class') ?? '';\n const lang = detectCodeLanguage(cls);\n const body = code?.textContent ?? pre.textContent ?? '';\n const fence = '`'.repeat(Math.max(3, longestBacktickRun(body) + 1));\n return `\\n\\n${fence}${lang ?? 
''}\\n${body.replace(/\\n+$/, '')}\\n${fence}\\n\\n`;\n },\n });\n\n return td;\n}\n\nconst turndown = buildTurndown();\n\nexport function htmlToMarkdown(html: string): string {\n if (!html) return '';\n return turndown.turndown(html);\n}\n\nexport interface Heading {\n level: number;\n text: string;\n lineIndex: number;\n}\n\nexport function parseHeadings(lines: string[]): Heading[] {\n const headings: Heading[] = [];\n for (let i = 0; i < lines.length; i++) {\n const match = lines[i].match(/^(#{1,6})\\s+(.+)/);\n if (match) {\n headings.push({ level: match[1].length, text: match[2].trim(), lineIndex: i });\n }\n }\n return headings;\n}\n\n// Prefix-sum array of char offsets: offsets[i] is the index in\n// `lines.join('\\n')` at which lines[i] begins.\nexport function lineStartCharOffsets(lines: string[]): number[] {\n const offsets = new Array<number>(lines.length);\n let acc = 0;\n for (let i = 0; i < lines.length; i++) {\n offsets[i] = acc;\n acc += lines[i].length + 1; // +1 for the '\\n' separator\n }\n return offsets;\n}\n\nfunction extractFromHeading(lines: string[], headings: Heading[], headingIdx: number): string {\n const heading = headings[headingIdx];\n const start = heading.lineIndex;\n\n // Find the next heading of equal or higher level (lower or equal # count)\n let end = lines.length;\n for (let i = headingIdx + 1; i < headings.length; i++) {\n if (headings[i].level <= heading.level) {\n end = headings[i].lineIndex;\n break;\n }\n }\n\n return lines.slice(start, end).join('\\n');\n}\n\nexport function extractSection(\n markdown: string,\n section: string,\n sectionIndex = 0,\n): { content: string; matched: boolean } {\n const lines = markdown.split('\\n');\n const headings = parseHeadings(lines);\n\n if (headings.length === 0) return { content: markdown, matched: false };\n\n const lower = section.toLowerCase();\n const indexed = headings.map((h, i) => ({ h, i }));\n\n // Collect exact matches first\n const exactMatches = indexed.filter(({ h }) => 
h.text.toLowerCase() === lower);\n\n // If exact matches satisfy the requested index, use them\n if (exactMatches.length > 0 && sectionIndex < exactMatches.length) {\n const { i } = exactMatches[sectionIndex];\n return { content: extractFromHeading(lines, headings, i), matched: true };\n }\n\n // Fall back to substring matches (includes exact headings and partial ones)\n const substringMatches = indexed.filter(({ h }) => h.text.toLowerCase().includes(lower));\n\n if (substringMatches.length === 0 || sectionIndex >= substringMatches.length) {\n return { content: markdown, matched: false };\n }\n\n const { i } = substringMatches[sectionIndex];\n return { content: extractFromHeading(lines, headings, i), matched: true };\n}\n\nexport function extractLinksAndImages(markdown: string): { links: string[]; images: string[] } {\n const imagePattern = /!\\[[^\\]]*\\]\\(([^)]+)\\)/g;\n const linkPattern = /(?<!!)\\[[^\\]]*\\]\\(([^)]+)\\)/g;\n\n const images = new Set<string>();\n const links = new Set<string>();\n\n let match: RegExpExecArray | null;\n\n // Extract images first\n while ((match = imagePattern.exec(markdown)) !== null) {\n images.add(match[1]);\n }\n\n // Extract links (non-image)\n while ((match = linkPattern.exec(markdown)) !== null) {\n links.add(match[1]);\n }\n\n return { links: Array.from(links), images: Array.from(images) };\n}\n\nconst DECORATIVE_URL_MARKERS = [\n 'avatar',\n 'icon',\n 'logo',\n 'badge',\n 'shield',\n 'tracking',\n 'pixel',\n 'sprite',\n 'emoji',\n 'favicon',\n];\n\n// Drop `![alt](src)` tokens that look decorative. 
Heuristic only -- keep\n// images that have alt text unless the URL clearly marks them decorative.\n// Tracking pixels (tiny data-URI gifs) and empty-alt icons are removed.\nexport function filterDecorativeImages(markdown: string): string {\n if (!markdown) return markdown;\n return markdown.replace(/!\\[([^\\]]*)\\]\\(([^)]+)\\)/g, (match, alt: string, src: string) => {\n const trimmedAlt = alt.trim();\n const lowerSrc = src.toLowerCase();\n\n // Tiny animated-GIF tracking pixel / 1x1 beacons\n if (lowerSrc.startsWith('data:image/gif;base64,')) return '';\n\n // Inline SVG icon data URIs (short = tiny, likely decorative glyph)\n if (lowerSrc.startsWith('data:image/svg+xml') && src.length < 200) return '';\n\n // URL marks it as decorative regardless of alt\n for (const marker of DECORATIVE_URL_MARKERS) {\n if (lowerSrc.includes(marker)) return '';\n }\n\n // No alt text + no title = decorative\n if (!trimmedAlt) return '';\n\n return match;\n });\n}\n\n// Resolve relative `[text](path)` and `![alt](path)` targets against baseUrl.\n// Leaves absolute URLs, mailto:, tel:, javascript:, and #fragments untouched.\nexport function resolveRelativeUrls(markdown: string, baseUrl: string): string {\n if (!markdown || !baseUrl) return markdown;\n\n const rewrite = (path: string): string => {\n const trimmed = path.trim();\n if (!trimmed) return path;\n if (/^(?:https?:|mailto:|tel:|javascript:|data:)/i.test(trimmed)) return path;\n if (trimmed.startsWith('#')) {\n try {\n return new URL(trimmed, baseUrl).href;\n } catch {\n return path;\n }\n }\n if (trimmed.startsWith('//')) {\n try {\n const base = new URL(baseUrl);\n return `${base.protocol}${trimmed}`;\n } catch {\n return path;\n }\n }\n try {\n return new URL(trimmed, baseUrl).href;\n } catch {\n return path;\n }\n };\n\n // Image links first so the shared link regex does not rewrite them twice.\n let result = markdown.replace(\n /(!\\[[^\\]]*\\]\\()([^)\\s]+)(\\s*(?:\"[^\"]*\")?\\))/g,\n (_m, open, path, close) => 
`${open}${rewrite(path)}${close}`,\n );\n\n result = result.replace(\n /(^|[^!])(\\[[^\\]]*\\]\\()([^)\\s]+)(\\s*(?:\"[^\"]*\")?\\))/g,\n (_m, pre, open, path, close) => `${pre}${open}${rewrite(path)}${close}`,\n );\n\n return result;\n}\n"],"mappings":"AAAA,OAAO,qBAAqB;AAC5B,SAAS,0BAA0B;AAEnC,SAAS,mBAAmB,GAAmB;AAC7C,MAAI,MAAM;AACV,MAAI,MAAM;AACV,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,QAAI,EAAE,WAAW,CAAC,MAAM,IAAI;AAC1B;AACA,UAAI,MAAM,IAAK,OAAM;AAAA,IACvB,OAAO;AACL,YAAM;AAAA,IACR;AAAA,EACF;AACA,SAAO;AACT;AAEO,SAAS,gBAAiC;AAC/C,QAAM,KAAK,IAAI,gBAAgB,EAAE,cAAc,OAAO,gBAAgB,SAAS,CAAC;AAGhF,KAAG,OAAO,CAAC,UAAU,OAAO,CAAC;AAG7B,KAAG,QAAQ,SAAS;AAAA,IAClB,QAAQ;AAAA,IACR,YAAY,UAAU,MAAM;AAC1B,YAAM,KAAK;AACX,YAAM,OAAkB,MAAM,KAAK,GAAG,iBAAiB,IAAI,CAAC;AAC5D,UAAI,KAAK,WAAW,EAAG,QAAO;AAE9B,YAAM,YAAY,CAAC,QAAyB;AAC1C,cAAM,QAAQ,MAAM,KAAK,IAAI,iBAAiB,QAAQ,CAAC;AACvD,eAAO,OAAO,MAAM,IAAI,OAAK,EAAE,aAAa,QAAQ,OAAO,GAAG,EAAE,KAAK,KAAK,EAAE,EAAE,KAAK,KAAK,IAAI;AAAA,MAC9F;AAEA,YAAM,YAAY,KAAK,CAAC;AACxB,YAAM,cAAc,UAAU,iBAAiB,IAAI,EAAE,SAAS;AAC9D,YAAM,cAAc,MAAM,KAAK,UAAU,iBAAiB,QAAQ,CAAC;AACnE,YAAM,YAAY,OAAO,YAAY,IAAI,MAAM,KAAK,EAAE,KAAK,KAAK,IAAI;AAEpE,UAAI,aAAa;AACf,cAAM,WAAW,KAAK,MAAM,CAAC;AAC7B,cAAMA,SAAQ,CAAC,UAAU,SAAS,GAAG,WAAW,GAAG,SAAS,IAAI,SAAS,CAAC;AAC1E,eAAO,SAASA,OAAM,KAAK,IAAI,IAAI;AAAA,MACrC;AAEA,YAAM,QAAQ,CAAC,UAAU,SAAS,GAAG,WAAW,GAAG,KAAK,MAAM,CAAC,EAAE,IAAI,SAAS,CAAC;AAC/E,aAAO,SAAS,MAAM,KAAK,IAAI,IAAI;AAAA,IACrC;AAAA,EACF,CAAC;AAGD,KAAG,QAAQ,aAAa;AAAA,IACtB,QAAQ,CAAC,SAAS,SAAS,SAAS,MAAM,MAAM,IAAI;AAAA,IACpD,YAAY,SAAS;AACnB,aAAO;AAAA,IACT;AAAA,EACF,CAAC;AAED,KAAG,QAAQ,iBAAiB;AAAA,IAC1B,OAAO,MAAM;AACX,aAAO,KAAK,aAAa,SAAU,KAAiB,cAAc,MAAM,MAAM;AAAA,IAChF;AAAA,IACA,YAAY,UAAU,MAAM;AAC1B,YAAM,MAAM;AACZ,YAAM,OAAO,IAAI,cAAc,MAAM;AACrC,YAAM,MAAM,MAAM,aAAa,OAAO,KAAK,IAAI,aAAa,OAAO,KAAK;AACxE,YAAM,OAAO,mBAAmB,GAAG;AACnC,YAAM,OAAO,MAAM,eAAe,IAAI,eAAe;AACrD,YAAM,QAAQ,IAAI,OAAO,KAAK,IAAI,GAAG,mBAAmB,IAAI,IAAI,CAAC,CAAC;AAClE,aAAO;AAAA;AAAA,EAAO,KAAK,GAAG,QAAQ,EAAE;AAAA,EAAK,KAAK,QAAQ,QAA
Q,EAAE,CAAC;AAAA,EAAK,KAAK;AAAA;AAAA;AAAA,IACzE;AAAA,EACF,CAAC;AAED,SAAO;AACT;AAEA,MAAM,WAAW,cAAc;AAExB,SAAS,eAAe,MAAsB;AACnD,MAAI,CAAC,KAAM,QAAO;AAClB,SAAO,SAAS,SAAS,IAAI;AAC/B;AAQO,SAAS,cAAc,OAA4B;AACxD,QAAM,WAAsB,CAAC;AAC7B,WAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,UAAM,QAAQ,MAAM,CAAC,EAAE,MAAM,kBAAkB;AAC/C,QAAI,OAAO;AACT,eAAS,KAAK,EAAE,OAAO,MAAM,CAAC,EAAE,QAAQ,MAAM,MAAM,CAAC,EAAE,KAAK,GAAG,WAAW,EAAE,CAAC;AAAA,IAC/E;AAAA,EACF;AACA,SAAO;AACT;AAIO,SAAS,qBAAqB,OAA2B;AAC9D,QAAM,UAAU,IAAI,MAAc,MAAM,MAAM;AAC9C,MAAI,MAAM;AACV,WAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,YAAQ,CAAC,IAAI;AACb,WAAO,MAAM,CAAC,EAAE,SAAS;AAAA,EAC3B;AACA,SAAO;AACT;AAEA,SAAS,mBAAmB,OAAiB,UAAqB,YAA4B;AAC5F,QAAM,UAAU,SAAS,UAAU;AACnC,QAAM,QAAQ,QAAQ;AAGtB,MAAI,MAAM,MAAM;AAChB,WAAS,IAAI,aAAa,GAAG,IAAI,SAAS,QAAQ,KAAK;AACrD,QAAI,SAAS,CAAC,EAAE,SAAS,QAAQ,OAAO;AACtC,YAAM,SAAS,CAAC,EAAE;AAClB;AAAA,IACF;AAAA,EACF;AAEA,SAAO,MAAM,MAAM,OAAO,GAAG,EAAE,KAAK,IAAI;AAC1C;AAEO,SAAS,eACd,UACA,SACA,eAAe,GACwB;AACvC,QAAM,QAAQ,SAAS,MAAM,IAAI;AACjC,QAAM,WAAW,cAAc,KAAK;AAEpC,MAAI,SAAS,WAAW,EAAG,QAAO,EAAE,SAAS,UAAU,SAAS,MAAM;AAEtE,QAAM,QAAQ,QAAQ,YAAY;AAClC,QAAM,UAAU,SAAS,IAAI,CAAC,GAAGC,QAAO,EAAE,GAAG,GAAAA,GAAE,EAAE;AAGjD,QAAM,eAAe,QAAQ,OAAO,CAAC,EAAE,EAAE,MAAM,EAAE,KAAK,YAAY,MAAM,KAAK;AAG7E,MAAI,aAAa,SAAS,KAAK,eAAe,aAAa,QAAQ;AACjE,UAAM,EAAE,GAAAA,GAAE,IAAI,aAAa,YAAY;AACvC,WAAO,EAAE,SAAS,mBAAmB,OAAO,UAAUA,EAAC,GAAG,SAAS,KAAK;AAAA,EAC1E;AAGA,QAAM,mBAAmB,QAAQ,OAAO,CAAC,EAAE,EAAE,MAAM,EAAE,KAAK,YAAY,EAAE,SAAS,KAAK,CAAC;AAEvF,MAAI,iBAAiB,WAAW,KAAK,gBAAgB,iBAAiB,QAAQ;AAC5E,WAAO,EAAE,SAAS,UAAU,SAAS,MAAM;AAAA,EAC7C;AAEA,QAAM,EAAE,EAAE,IAAI,iBAAiB,YAAY;AAC3C,SAAO,EAAE,SAAS,mBAAmB,OAAO,UAAU,CAAC,GAAG,SAAS,KAAK;AAC1E;AAEO,SAAS,sBAAsB,UAAyD;AAC7F,QAAM,eAAe;AACrB,QAAM,cAAc;AAEpB,QAAM,SAAS,oBAAI,IAAY;AAC/B,QAAM,QAAQ,oBAAI,IAAY;AAE9B,MAAI;AAGJ,UAAQ,QAAQ,aAAa,KAAK,QAAQ,OAAO,MAAM;AACrD,WAAO,IAAI,MAAM,CAAC,CAAC;AAAA,EACrB;AAGA,UAAQ,QAAQ,YAAY,KAAK,QAAQ,OAAO,MAAM;AACpD,UAAM,IAAI,MAAM,CAAC,CAAC;AAAA,EACpB;AAEA,SAAO,EAAE,OAAO,MAAM,KAAK,KAAK,GAAG,
QAAQ,MAAM,KAAK,MAAM,EAAE;AAChE;AAEA,MAAM,yBAAyB;AAAA,EAC7B;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAKO,SAAS,uBAAuB,UAA0B;AAC/D,MAAI,CAAC,SAAU,QAAO;AACtB,SAAO,SAAS,QAAQ,6BAA6B,CAAC,OAAO,KAAa,QAAgB;AACxF,UAAM,aAAa,IAAI,KAAK;AAC5B,UAAM,WAAW,IAAI,YAAY;AAGjC,QAAI,SAAS,WAAW,wBAAwB,EAAG,QAAO;AAG1D,QAAI,SAAS,WAAW,oBAAoB,KAAK,IAAI,SAAS,IAAK,QAAO;AAG1E,eAAW,UAAU,wBAAwB;AAC3C,UAAI,SAAS,SAAS,MAAM,EAAG,QAAO;AAAA,IACxC;AAGA,QAAI,CAAC,WAAY,QAAO;AAExB,WAAO;AAAA,EACT,CAAC;AACH;AAIO,SAAS,oBAAoB,UAAkB,SAAyB;AAC7E,MAAI,CAAC,YAAY,CAAC,QAAS,QAAO;AAElC,QAAM,UAAU,CAAC,SAAyB;AACxC,UAAM,UAAU,KAAK,KAAK;AAC1B,QAAI,CAAC,QAAS,QAAO;AACrB,QAAI,+CAA+C,KAAK,OAAO,EAAG,QAAO;AACzE,QAAI,QAAQ,WAAW,GAAG,GAAG;AAC3B,UAAI;AACF,eAAO,IAAI,IAAI,SAAS,OAAO,EAAE;AAAA,MACnC,QAAQ;AACN,eAAO;AAAA,MACT;AAAA,IACF;AACA,QAAI,QAAQ,WAAW,IAAI,GAAG;AAC5B,UAAI;AACF,cAAM,OAAO,IAAI,IAAI,OAAO;AAC5B,eAAO,GAAG,KAAK,QAAQ,GAAG,OAAO;AAAA,MACnC,QAAQ;AACN,eAAO;AAAA,MACT;AAAA,IACF;AACA,QAAI;AACF,aAAO,IAAI,IAAI,SAAS,OAAO,EAAE;AAAA,IACnC,QAAQ;AACN,aAAO;AAAA,IACT;AAAA,EACF;AAGA,MAAI,SAAS,SAAS;AAAA,IACpB;AAAA,IACA,CAAC,IAAI,MAAM,MAAM,UAAU,GAAG,IAAI,GAAG,QAAQ,IAAI,CAAC,GAAG,KAAK;AAAA,EAC5D;AAEA,WAAS,OAAO;AAAA,IACd;AAAA,IACA,CAAC,IAAI,KAAK,MAAM,MAAM,UAAU,GAAG,GAAG,GAAG,IAAI,GAAG,QAAQ,IAAI,CAAC,GAAG,KAAK;AAAA,EACvE;AAEA,SAAO;AACT;","names":["lines","i"]}
@@ -7,5 +7,13 @@ export interface ExtractionOptions {
7
7
  pdfBuffer?: Buffer;
8
8
  }
9
9
  export declare function registerExtractor(extractor: Extractor): void;
10
+ /**
11
+ * @deprecated Use `getExtractProvider().extract(...)` from
12
+ * `src/providers/extract-provider.ts`. This facade remains for backwards
13
+ * compatibility with existing test mocks and benchmark runners that import
14
+ * `extractContent` directly. Will be removed after the test-mock migration.
15
+ */
10
16
  export declare function extractContent(html: string, url: string, options?: ExtractionOptions): Promise<ExtractionResult>;
17
+ export declare function mergeMetadata(base: ExtractionResult['metadata'], html: string): ExtractionResult['metadata'];
18
+ export declare function applyPostProcessing(result: ExtractionResult, url: string, html: string, options: ExtractionOptions): ExtractionResult;
11
19
  //# sourceMappingURL=pipeline.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/extraction/pipeline.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAU/D,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AASD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,SAAS,GAAG,IAAI,CAE5D;AAED,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,gBAAgB,CAAC,CAmE3B"}
1
+ {"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/extraction/pipeline.ts"],"names":[],"mappings":"AASA,OAAO,KAAK,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAI/D,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAKD,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,SAAS,GAAG,IAAI,CAE5D;AAED;;;;;GAKG;AACH,wBAAsB,cAAc,CAClC,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,iBAAsB,GAC9B,OAAO,CAAC,gBAAgB,CAAC,CAG3B;AAED,wBAAgB,aAAa,CAC3B,IAAI,EAAE,gBAAgB,CAAC,UAAU,CAAC,EAClC,IAAI,EAAE,MAAM,GACX,gBAAgB,CAAC,UAAU,CAAC,CAkB9B;AAED,wBAAgB,mBAAmB,CACjC,MAAM,EAAE,gBAAgB,EACxB,GAAG,EAAE,MAAM,EACX,IAAI,EAAE,MAAM,EACZ,OAAO,EAAE,iBAAiB,GACzB,gBAAgB,CAsBlB"}
@@ -1,95 +1,61 @@
1
- import { defuddleExtract } from './defuddle.js';
2
- import { readabilityExtract } from './readability.js';
3
- import { trafilaturaExtract, isTrafilaturaAvailable } from './trafilatura.js';
4
- import { htmlToMarkdown, extractSection, extractLinksAndImages } from './markdown.js';
5
- import { githubExtractor } from './site-extractors/github.js';
6
- import { stackoverflowExtractor } from './site-extractors/stackoverflow.js';
7
- import { mdnExtractor } from './site-extractors/mdn.js';
8
- import { docsGenericExtractor } from './site-extractors/docs-generic.js';
9
- import { createLogger } from '../logger.js';
10
- import { getConfig } from '../config.js';
11
- const log = createLogger('extract');
12
- const siteExtractors = [
13
- githubExtractor,
14
- stackoverflowExtractor,
15
- mdnExtractor,
16
- docsGenericExtractor,
17
- ];
18
- export function registerExtractor(extractor) {
19
- siteExtractors.push(extractor);
1
+ import {
2
+ extractSection,
3
+ extractLinksAndImages,
4
+ filterDecorativeImages,
5
+ resolveRelativeUrls
6
+ } from "./markdown.js";
7
+ import { extractMetadata } from "./extract.js";
8
+ import { stripBoilerplateMarkdown } from "./boilerplate.js";
9
+ import { sanitizeExtractedMarkdown } from "./markdown-sanitize.js";
10
+ import { registerSiteExtractor } from "./v1/site-extractors.js";
11
+ import { getExtractProvider } from "../providers/extract-provider.js";
12
+ function registerExtractor(extractor) {
13
+ registerSiteExtractor(extractor);
20
14
  }
21
- export async function extractContent(html, url, options = {}) {
22
- let result = null;
23
- if (options.contentType === 'application/pdf') {
24
- let pdfText = '';
25
- if (options.pdfBuffer) {
26
- try {
27
- const pdfParse = (await import('pdf-parse')).default;
28
- const parsed = await pdfParse(options.pdfBuffer);
29
- pdfText = parsed.text ?? '';
30
- }
31
- catch (err) {
32
- log.warn('pdf-parse failed', { url, error: String(err) });
33
- }
34
- }
35
- result = {
36
- title: '',
37
- markdown: pdfText,
38
- metadata: {},
39
- links: [],
40
- images: [],
41
- extractor: 'turndown',
42
- };
43
- return applyPostProcessing(result, options);
44
- }
45
- const siteExtractor = siteExtractors.find((e) => e.canHandle(url, html));
46
- if (siteExtractor) {
47
- const extracted = siteExtractor.extract(html, url);
48
- if (extracted) {
49
- result = extracted;
50
- return applyPostProcessing(result, options);
51
- }
52
- }
53
- result = await defuddleExtract(html, url);
54
- if (!result) {
55
- const config = getConfig();
56
- if (config.trafilatura !== 'never') {
57
- const trafAvailable = await isTrafilaturaAvailable();
58
- if (trafAvailable) {
59
- result = await trafilaturaExtract(html, url);
60
- if (result) {
61
- log.info('Trafilatura extraction succeeded', { url, chars: result.markdown.length });
62
- return applyPostProcessing(result, options);
63
- }
64
- }
65
- }
66
- }
67
- if (!result) {
68
- result = readabilityExtract(html, url);
69
- }
70
- if (!result) {
71
- const markdown = htmlToMarkdown(html);
72
- result = {
73
- title: '',
74
- markdown,
75
- metadata: {},
76
- links: [],
77
- images: [],
78
- extractor: 'turndown',
79
- };
80
- }
81
- return applyPostProcessing(result, options);
15
+ async function extractContent(html, url, options = {}) {
16
+ const provider = await getExtractProvider();
17
+ return provider.extract(html, url, options);
82
18
  }
83
- function applyPostProcessing(result, options) {
84
- let markdown = result.markdown;
85
- if (options.section) {
86
- const { content } = extractSection(markdown, options.section, options.sectionIndex ?? 0);
87
- markdown = content;
88
- }
89
- const { links, images } = extractLinksAndImages(markdown);
90
- if (options.maxChars && markdown.length > options.maxChars) {
91
- markdown = markdown.slice(0, options.maxChars);
92
- }
93
- return { ...result, markdown, links, images };
19
+ function mergeMetadata(base, html) {
20
+ try {
21
+ const meta = extractMetadata(html);
22
+ return {
23
+ ...meta,
24
+ // Extractor-provided fields win when set (they already inspected the article body).
25
+ description: base.description || meta.description,
26
+ author: base.author || meta.author,
27
+ date: base.date || meta.date,
28
+ language: base.language,
29
+ og_image: base.og_image ?? meta.og_image,
30
+ og_type: base.og_type ?? meta.og_type,
31
+ canonical_url: base.canonical_url ?? meta.canonical_url,
32
+ keywords: base.keywords ?? meta.keywords
33
+ };
34
+ } catch {
35
+ return base;
36
+ }
94
37
  }
38
+ function applyPostProcessing(result, url, html, options) {
39
+ let markdown = result.markdown;
40
+ markdown = resolveRelativeUrls(markdown, url);
41
+ markdown = stripBoilerplateMarkdown(markdown);
42
+ markdown = filterDecorativeImages(markdown);
43
+ markdown = sanitizeExtractedMarkdown(markdown);
44
+ if (options.section) {
45
+ const { content } = extractSection(markdown, options.section, options.sectionIndex ?? 0);
46
+ markdown = content;
47
+ }
48
+ const { links, images } = extractLinksAndImages(markdown);
49
+ const metadata = mergeMetadata(result.metadata, html);
50
+ if (options.maxChars && markdown.length > options.maxChars) {
51
+ markdown = markdown.slice(0, options.maxChars);
52
+ }
53
+ return { ...result, markdown, links, images, metadata };
54
+ }
55
+ export {
56
+ applyPostProcessing,
57
+ extractContent,
58
+ mergeMetadata,
59
+ registerExtractor
60
+ };
95
61
  //# sourceMappingURL=pipeline.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../../src/extraction/pipeline.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AAC9E,OAAO,EAAE,cAAc,EAAE,cAAc,EAAE,qBAAqB,EAAE,MAAM,eAAe,CAAC;AAEtF,OAAO,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAC9D,OAAO,EAAE,sBAAsB,EAAE,MAAM,oCAAoC,CAAC;AAC5E,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AACxD,OAAO,EAAE,oBAAoB,EAAE,MAAM,mCAAmC,CAAC;AACzE,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEzC,MAAM,GAAG,GAAG,YAAY,CAAC,SAAS,CAAC,CAAC;AAUpC,MAAM,cAAc,GAAgB;IAClC,eAAe;IACf,sBAAsB;IACtB,YAAY;IACZ,oBAAoB;CACrB,CAAC;AAEF,MAAM,UAAU,iBAAiB,CAAC,SAAoB;IACpD,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;AACjC,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,IAAY,EACZ,GAAW,EACX,UAA6B,EAAE;IAE/B,IAAI,MAAM,GAA4B,IAAI,CAAC;IAE3C,IAAI,OAAO,CAAC,WAAW,KAAK,iBAAiB,EAAE,CAAC;QAC9C,IAAI,OAAO,GAAG,EAAE,CAAC;QACjB,IAAI,OAAO,CAAC,SAAS,EAAE,CAAC;YACtB,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,CAAC,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,OAAO,CAAC;gBACrD,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;gBACjD,OAAO,GAAG,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;YAC9B,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,GAAG,CAAC,IAAI,CAAC,kBAAkB,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC5D,CAAC;QACH,CAAC;QACD,MAAM,GAAG;YACP,KAAK,EAAE,EAAE;YACT,QAAQ,EAAE,OAAO;YACjB,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,UAAU;SACtB,CAAC;QACF,OAAO,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC9C,CAAC;IAED,MAAM,aAAa,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC;IACzE,IAAI,aAAa,EAAE,CAAC;QAClB,MAAM,SAAS,GAAG,aAAa,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;QACnD,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,GAAG,SAAS,CAAC;YACnB,OAAO,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,MAAM,GAAG,MAAM,eAAe,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IAE1C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,MAAM,GAAG,SA
AS,EAAE,CAAC;QAC3B,IAAI,MAAM,CAAC,WAAW,KAAK,OAAO,EAAE,CAAC;YACnC,MAAM,aAAa,GAAG,MAAM,sBAAsB,EAAE,CAAC;YACrD,IAAI,aAAa,EAAE,CAAC;gBAClB,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;gBAC7C,IAAI,MAAM,EAAE,CAAC;oBACX,GAAG,CAAC,IAAI,CAAC,kCAAkC,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;oBACrF,OAAO,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;gBAC9C,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,GAAG,kBAAkB,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IACzC,CAAC;IAED,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,QAAQ,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;QACtC,MAAM,GAAG;YACP,KAAK,EAAE,EAAE;YACT,QAAQ;YACR,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,UAAU;SACtB,CAAC;IACJ,CAAC;IAED,OAAO,mBAAmB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;AAC9C,CAAC;AAED,SAAS,mBAAmB,CAC1B,MAAwB,EACxB,OAA0B;IAE1B,IAAI,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;IAE/B,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;QACpB,MAAM,EAAE,OAAO,EAAE,GAAG,cAAc,CAAC,QAAQ,EAAE,OAAO,CAAC,OAAO,EAAE,OAAO,CAAC,YAAY,IAAI,CAAC,CAAC,CAAC;QACzF,QAAQ,GAAG,OAAO,CAAC;IACrB,CAAC;IAED,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,qBAAqB,CAAC,QAAQ,CAAC,CAAC;IAE1D,IAAI,OAAO,CAAC,QAAQ,IAAI,QAAQ,CAAC,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC;QAC3D,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;IACjD,CAAC;IAED,OAAO,EAAE,GAAG,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC;AAChD,CAAC"}
1
+ {"version":3,"sources":["../../src/extraction/pipeline.ts"],"sourcesContent":["import {\n extractSection,\n extractLinksAndImages,\n filterDecorativeImages,\n resolveRelativeUrls,\n} from './markdown.js';\nimport { extractMetadata } from './extract.js';\nimport { stripBoilerplateMarkdown } from './boilerplate.js';\nimport { sanitizeExtractedMarkdown } from './markdown-sanitize.js';\nimport type { ExtractionResult, Extractor } from '../types.js';\nimport { registerSiteExtractor } from './v1/site-extractors.js';\nimport { getExtractProvider } from '../providers/extract-provider.js';\n\nexport interface ExtractionOptions {\n maxChars?: number;\n section?: string;\n sectionIndex?: number;\n contentType?: string;\n pdfBuffer?: Buffer;\n}\n\n// Plugin entry point — back-compat alias. `src/server.ts` imports\n// `registerExtractor` from here. The registry lives in v1/site-extractors.ts\n// so both the facade and the v1 router see the same plugin-registered extractors.\nexport function registerExtractor(extractor: Extractor): void {\n registerSiteExtractor(extractor);\n}\n\n/**\n * @deprecated Use `getExtractProvider().extract(...)` from\n * `src/providers/extract-provider.ts`. This facade remains for backwards\n * compatibility with existing test mocks and benchmark runners that import\n * `extractContent` directly. 
Will be removed after the test-mock migration.\n */\nexport async function extractContent(\n html: string,\n url: string,\n options: ExtractionOptions = {},\n): Promise<ExtractionResult> {\n const provider = await getExtractProvider();\n return provider.extract(html, url, options);\n}\n\nexport function mergeMetadata(\n base: ExtractionResult['metadata'],\n html: string,\n): ExtractionResult['metadata'] {\n try {\n const meta = extractMetadata(html);\n return {\n ...meta,\n // Extractor-provided fields win when set (they already inspected the article body).\n description: base.description || meta.description,\n author: base.author || meta.author,\n date: base.date || meta.date,\n language: base.language,\n og_image: base.og_image ?? meta.og_image,\n og_type: base.og_type ?? meta.og_type,\n canonical_url: base.canonical_url ?? meta.canonical_url,\n keywords: base.keywords ?? meta.keywords,\n };\n } catch {\n return base;\n }\n}\n\nexport function applyPostProcessing(\n result: ExtractionResult,\n url: string,\n html: string,\n options: ExtractionOptions,\n): ExtractionResult {\n let markdown = result.markdown;\n\n // Resolve relative links/images before slicing so downstream consumers get absolute URLs.\n markdown = resolveRelativeUrls(markdown, url);\n markdown = stripBoilerplateMarkdown(markdown);\n markdown = filterDecorativeImages(markdown);\n markdown = sanitizeExtractedMarkdown(markdown);\n\n if (options.section) {\n const { content } = extractSection(markdown, options.section, options.sectionIndex ?? 
0);\n markdown = content;\n }\n\n const { links, images } = extractLinksAndImages(markdown);\n const metadata = mergeMetadata(result.metadata, html);\n\n if (options.maxChars && markdown.length > options.maxChars) {\n markdown = markdown.slice(0, options.maxChars);\n }\n\n return { ...result, markdown, links, images, metadata };\n}\n"],"mappings":"AAAA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AACP,SAAS,uBAAuB;AAChC,SAAS,gCAAgC;AACzC,SAAS,iCAAiC;AAE1C,SAAS,6BAA6B;AACtC,SAAS,0BAA0B;AAa5B,SAAS,kBAAkB,WAA4B;AAC5D,wBAAsB,SAAS;AACjC;AAQA,eAAsB,eACpB,MACA,KACA,UAA6B,CAAC,GACH;AAC3B,QAAM,WAAW,MAAM,mBAAmB;AAC1C,SAAO,SAAS,QAAQ,MAAM,KAAK,OAAO;AAC5C;AAEO,SAAS,cACd,MACA,MAC8B;AAC9B,MAAI;AACF,UAAM,OAAO,gBAAgB,IAAI;AACjC,WAAO;AAAA,MACL,GAAG;AAAA;AAAA,MAEH,aAAa,KAAK,eAAe,KAAK;AAAA,MACtC,QAAQ,KAAK,UAAU,KAAK;AAAA,MAC5B,MAAM,KAAK,QAAQ,KAAK;AAAA,MACxB,UAAU,KAAK;AAAA,MACf,UAAU,KAAK,YAAY,KAAK;AAAA,MAChC,SAAS,KAAK,WAAW,KAAK;AAAA,MAC9B,eAAe,KAAK,iBAAiB,KAAK;AAAA,MAC1C,UAAU,KAAK,YAAY,KAAK;AAAA,IAClC;AAAA,EACF,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAEO,SAAS,oBACd,QACA,KACA,MACA,SACkB;AAClB,MAAI,WAAW,OAAO;AAGtB,aAAW,oBAAoB,UAAU,GAAG;AAC5C,aAAW,yBAAyB,QAAQ;AAC5C,aAAW,uBAAuB,QAAQ;AAC1C,aAAW,0BAA0B,QAAQ;AAE7C,MAAI,QAAQ,SAAS;AACnB,UAAM,EAAE,QAAQ,IAAI,eAAe,UAAU,QAAQ,SAAS,QAAQ,gBAAgB,CAAC;AACvF,eAAW;AAAA,EACb;AAEA,QAAM,EAAE,OAAO,OAAO,IAAI,sBAAsB,QAAQ;AACxD,QAAM,WAAW,cAAc,OAAO,UAAU,IAAI;AAEpD,MAAI,QAAQ,YAAY,SAAS,SAAS,QAAQ,UAAU;AAC1D,eAAW,SAAS,MAAM,GAAG,QAAQ,QAAQ;AAAA,EAC/C;AAEA,SAAO,EAAE,GAAG,QAAQ,UAAU,OAAO,QAAQ,SAAS;AACxD;","names":[]}
@@ -1,3 +1,3 @@
1
1
  import type { ExtractionResult } from '../types.js';
2
- export declare function readabilityExtract(html: string, url: string): ExtractionResult | null;
2
+ export declare function readabilityExtract(html: string, _url: string): ExtractionResult | null;
3
3
  //# sourceMappingURL=readability.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"readability.d.ts","sourceRoot":"","sources":["../../src/extraction/readability.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAIpD,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI,CA0BrF"}
1
+ {"version":3,"file":"readability.d.ts","sourceRoot":"","sources":["../../src/extraction/readability.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAIpD,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI,CAyBtF"}
@@ -1,32 +1,31 @@
1
- import { Readability } from '@mozilla/readability';
2
- import { parseHTML } from 'linkedom';
3
- import TurndownService from 'turndown';
1
+ import { Readability } from "@mozilla/readability";
2
+ import { parseHTML } from "linkedom";
3
+ import { htmlToMarkdown } from "./markdown.js";
4
4
  const MIN_CONTENT_THRESHOLD = 100;
5
- export function readabilityExtract(html, url) {
6
- try {
7
- const { document } = parseHTML(html);
8
- const reader = new Readability(document);
9
- const article = reader.parse();
10
- if (!article || !article.content)
11
- return null;
12
- const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
13
- const markdown = turndown.turndown(article.content);
14
- if (markdown.length < MIN_CONTENT_THRESHOLD)
15
- return null;
16
- return {
17
- title: article.title ?? '',
18
- markdown,
19
- metadata: {
20
- author: article.byline || undefined,
21
- language: article.lang || undefined,
22
- },
23
- links: [],
24
- images: [],
25
- extractor: 'readability',
26
- };
27
- }
28
- catch {
29
- return null;
30
- }
5
+ function readabilityExtract(html, _url) {
6
+ try {
7
+ const { document } = parseHTML(html);
8
+ const reader = new Readability(document);
9
+ const article = reader.parse();
10
+ if (!article || !article.content) return null;
11
+ const markdown = htmlToMarkdown(article.content);
12
+ if (markdown.length < MIN_CONTENT_THRESHOLD) return null;
13
+ return {
14
+ title: article.title ?? "",
15
+ markdown,
16
+ metadata: {
17
+ author: article.byline || void 0,
18
+ language: article.lang || void 0
19
+ },
20
+ links: [],
21
+ images: [],
22
+ extractor: "readability"
23
+ };
24
+ } catch {
25
+ return null;
26
+ }
31
27
  }
28
+ export {
29
+ readabilityExtract
30
+ };
32
31
  //# sourceMappingURL=readability.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"readability.js","sourceRoot":"","sources":["../../src/extraction/readability.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,eAAe,MAAM,UAAU,CAAC;AAGvC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAElC,MAAM,UAAU,kBAAkB,CAAC,IAAY,EAAE,GAAW;IAC1D,IAAI,CAAC;QACH,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QACrC,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,QAAe,CAAC,CAAC;QAChD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAC/B,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAE9C,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;QACxF,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAEpD,IAAI,QAAQ,CAAC,MAAM,GAAG,qBAAqB;YAAE,OAAO,IAAI,CAAC;QAEzD,OAAO;YACL,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,EAAE;YAC1B,QAAQ;YACR,QAAQ,EAAE;gBACR,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,SAAS;gBACnC,QAAQ,EAAE,OAAO,CAAC,IAAI,IAAI,SAAS;aACpC;YACD,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,aAAa;SACzB,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
1
+ {"version":3,"sources":["../../src/extraction/readability.ts"],"sourcesContent":["import { Readability } from '@mozilla/readability';\nimport { parseHTML } from 'linkedom';\nimport { htmlToMarkdown } from './markdown.js';\nimport type { ExtractionResult } from '../types.js';\n\nconst MIN_CONTENT_THRESHOLD = 100;\n\nexport function readabilityExtract(html: string, _url: string): ExtractionResult | null {\n try {\n const { document } = parseHTML(html);\n const reader = new Readability(document as any);\n const article = reader.parse();\n if (!article || !article.content) return null;\n\n const markdown = htmlToMarkdown(article.content);\n\n if (markdown.length < MIN_CONTENT_THRESHOLD) return null;\n\n return {\n title: article.title ?? '',\n markdown,\n metadata: {\n author: article.byline || undefined,\n language: article.lang || undefined,\n },\n links: [],\n images: [],\n extractor: 'readability',\n };\n } catch {\n return null;\n }\n}\n"],"mappings":"AAAA,SAAS,mBAAmB;AAC5B,SAAS,iBAAiB;AAC1B,SAAS,sBAAsB;AAG/B,MAAM,wBAAwB;AAEvB,SAAS,mBAAmB,MAAc,MAAuC;AACtF,MAAI;AACF,UAAM,EAAE,SAAS,IAAI,UAAU,IAAI;AACnC,UAAM,SAAS,IAAI,YAAY,QAAe;AAC9C,UAAM,UAAU,OAAO,MAAM;AAC7B,QAAI,CAAC,WAAW,CAAC,QAAQ,QAAS,QAAO;AAEzC,UAAM,WAAW,eAAe,QAAQ,OAAO;AAE/C,QAAI,SAAS,SAAS,sBAAuB,QAAO;AAEpD,WAAO;AAAA,MACL,OAAO,QAAQ,SAAS;AAAA,MACxB;AAAA,MACA,UAAU;AAAA,QACR,QAAQ,QAAQ,UAAU;AAAA,QAC1B,UAAU,QAAQ,QAAQ;AAAA,MAC5B;AAAA,MACA,OAAO,CAAC;AAAA,MACR,QAAQ,CAAC;AAAA,MACT,WAAW;AAAA,IACb;AAAA,EACF,QAAQ;AACN,WAAO;AAAA,EACT;AACF;","names":[]}
@@ -1,7 +1,19 @@
1
+ import { type LLMFallbackBudget } from './llm-fallback.js';
2
+ import type { SchemaExtractionResult } from '../types.js';
1
3
  export interface JsonSchema {
2
4
  type?: string;
3
5
  properties?: Record<string, JsonSchema>;
4
6
  items?: JsonSchema;
7
+ required?: string[];
8
+ }
9
+ export interface SchemaExtractionOpts {
10
+ signal?: AbortSignal;
11
+ budget?: LLMFallbackBudget;
5
12
  }
6
13
  export declare function extractWithSchema(html: string, schema: JsonSchema): Record<string, unknown>;
14
+ export declare function extractWithSchemaDetailed(html: string, schema: JsonSchema): SchemaExtractionResult;
15
+ export interface SchemaExtractionAsyncResult extends SchemaExtractionResult {
16
+ warnings: string[];
17
+ }
18
+ export declare function extractWithSchemaDetailedAsync(html: string, schema: JsonSchema, opts?: SchemaExtractionOpts): Promise<SchemaExtractionAsyncResult>;
7
19
  //# sourceMappingURL=schema.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/extraction/schema.ts"],"names":[],"mappings":"AAGA,MAAM,WAAW,UAAU;IACzB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;IACxC,KAAK,CAAC,EAAE,UAAU,CAAC;CACpB;AAED,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,UAAU,GACjB,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAmBzB"}
1
+ {"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/extraction/schema.ts"],"names":[],"mappings":"AAEA,OAAO,EAAkB,KAAK,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AAC3E,OAAO,KAAK,EAEV,sBAAsB,EAEvB,MAAM,aAAa,CAAC;AAErB,MAAM,WAAW,UAAU;IACzB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;IACxC,KAAK,CAAC,EAAE,UAAU,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;CACrB;AAED,MAAM,WAAW,oBAAoB;IACnC,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,MAAM,CAAC,EAAE,iBAAiB,CAAC;CAC5B;AAQD,wBAAgB,iBAAiB,CAC/B,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,UAAU,GACjB,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAEzB;AAED,wBAAgB,yBAAyB,CACvC,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,UAAU,GACjB,sBAAsB,CAsCxB;AAED,MAAM,WAAW,2BAA4B,SAAQ,sBAAsB;IACzE,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,wBAAsB,8BAA8B,CAClD,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,UAAU,EAClB,IAAI,GAAE,oBAAyB,GAC9B,OAAO,CAAC,2BAA2B,CAAC,CA+BtC"}
@@ -1,86 +1,149 @@
1
- import { parseHTML } from 'linkedom';
2
- import { extractJsonLd, matchJsonLdToSchema } from './jsonld.js';
3
- export function extractWithSchema(html, schema) {
4
- if (!html || !schema.properties)
5
- return {};
6
- const jsonLdBlocks = extractJsonLd(html);
7
- const jsonLdResult = matchJsonLdToSchema(jsonLdBlocks, schema);
8
- const { document: doc } = parseHTML(html);
9
- const heuristicResult = {};
10
- for (const [fieldName, fieldSchema] of Object.entries(schema.properties)) {
11
- if (jsonLdResult[fieldName] !== undefined)
12
- continue;
13
- const value = findFieldValue(doc, fieldName, fieldSchema);
14
- if (value !== undefined) {
15
- heuristicResult[fieldName] = value;
1
+ import { parseHTML } from "linkedom";
2
+ import { extractStructuredData } from "./structured-data.js";
3
+ import { extractWithLLM } from "./llm-fallback.js";
4
+ const PROVENANCE_PRIORITY = [
5
+ "json-ld",
6
+ "microdata",
7
+ "rdfa"
8
+ ];
9
+ function extractWithSchema(html, schema) {
10
+ return extractWithSchemaDetailed(html, schema).values;
11
+ }
12
+ function extractWithSchemaDetailed(html, schema) {
13
+ const values = {};
14
+ const provenance = {};
15
+ if (!html || !schema.properties) return { values, provenance };
16
+ const blocks = extractStructuredData(html);
17
+ for (const source of PROVENANCE_PRIORITY) {
18
+ for (const block of blocks) {
19
+ if (block.provenance !== source) continue;
20
+ for (const fieldName of Object.keys(schema.properties)) {
21
+ if (values[fieldName] !== void 0) continue;
22
+ const v = pickField(block.fields, fieldName);
23
+ if (v !== void 0) {
24
+ values[fieldName] = v;
25
+ provenance[fieldName] = source;
16
26
  }
27
+ }
28
+ }
29
+ }
30
+ const allCovered = Object.keys(schema.properties).every(
31
+ (k) => values[k] !== void 0
32
+ );
33
+ if (allCovered) return { values, provenance };
34
+ const { document: doc } = parseHTML(html);
35
+ for (const [fieldName, fieldSchema] of Object.entries(schema.properties)) {
36
+ if (values[fieldName] !== void 0) continue;
37
+ const v = findFieldValue(doc, fieldName, fieldSchema);
38
+ if (v !== void 0) {
39
+ values[fieldName] = v;
40
+ provenance[fieldName] = "heuristic";
17
41
  }
18
- return { ...jsonLdResult, ...heuristicResult };
42
+ }
43
+ return { values, provenance };
19
44
  }
20
- function findFieldValue(doc, fieldName, schema) {
21
- const normalizedName = fieldName.toLowerCase().replace(/_/g, '-');
22
- const compactName = fieldName.replace(/_/g, '').toLowerCase();
23
- const variants = [fieldName, normalizedName, compactName];
24
- if (schema.type === 'array') {
25
- return findArrayValues(doc, variants);
45
+ async function extractWithSchemaDetailedAsync(html, schema, opts = {}) {
46
+ const det = extractWithSchemaDetailed(html, schema);
47
+ const warnings = [];
48
+ if (!schema.required || schema.required.length === 0) {
49
+ return { ...det, warnings };
50
+ }
51
+ const missing = schema.required.filter((k) => det.values[k] === void 0);
52
+ if (missing.length === 0) {
53
+ return { ...det, warnings };
54
+ }
55
+ const llm = await extractWithLLM({
56
+ html,
57
+ jsonSchema: schema,
58
+ partial: det.values,
59
+ missing,
60
+ signal: opts.signal,
61
+ budget: opts.budget
62
+ });
63
+ const values = { ...det.values };
64
+ const provenance = { ...det.provenance };
65
+ for (const key of missing) {
66
+ if (llm.values[key] !== void 0 && values[key] === void 0) {
67
+ values[key] = llm.values[key];
68
+ provenance[key] = "llm";
69
+ }
70
+ }
71
+ return { values, provenance, warnings: llm.warnings };
72
+ }
73
+ function pickField(fields, name) {
74
+ if (fields[name] !== void 0) return fields[name];
75
+ for (const v of Object.values(fields)) {
76
+ if (v && typeof v === "object" && !Array.isArray(v)) {
77
+ const nested = v[name];
78
+ if (nested !== void 0) return nested;
26
79
  }
27
- return findSingleValue(doc, variants);
80
+ }
81
+ return void 0;
82
+ }
83
+ function findFieldValue(doc, fieldName, schema) {
84
+ const normalizedName = fieldName.toLowerCase().replace(/_/g, "-");
85
+ const compactName = fieldName.replace(/_/g, "").toLowerCase();
86
+ const variants = [fieldName, normalizedName, compactName];
87
+ if (schema.type === "array") {
88
+ return findArrayValues(doc, variants);
89
+ }
90
+ return findSingleValue(doc, variants);
28
91
  }
29
92
  function cssEscape(value) {
30
- return value.replace(/([^\w-])/g, '\\$1');
93
+ return value.replace(/([^\w-])/g, "\\$1");
31
94
  }
32
95
  function findSingleValue(doc, variants) {
33
- for (const name of variants) {
34
- const byItemprop = doc.querySelector(`[itemprop="${name}"]`);
35
- if (byItemprop) {
36
- const text = byItemprop.getAttribute('content') ?? byItemprop.textContent?.trim();
37
- if (text)
38
- return text;
39
- }
40
- // Substring match is intentional — heuristic best-effort for partial class names
41
- const byClass = doc.querySelector(`[class*="${name}"]`);
42
- if (byClass) {
43
- const text = byClass.textContent?.trim();
44
- if (text)
45
- return text;
46
- }
47
- const allWithAria = doc.querySelectorAll('[aria-label]');
48
- for (const el of allWithAria) {
49
- const label = el.getAttribute('aria-label')?.toLowerCase().replace(/\s+/g, '-') ?? '';
50
- if (label === name.toLowerCase()) {
51
- const text = el.textContent?.trim();
52
- if (text)
53
- return text;
54
- }
55
- }
56
- const byId = doc.querySelector(`#${cssEscape(name)}`);
57
- if (byId) {
58
- const text = byId.textContent?.trim();
59
- if (text)
60
- return text;
61
- }
62
- const byData = doc.querySelector(`[data-${name}]`);
63
- if (byData) {
64
- return byData.getAttribute(`data-${name}`) ?? byData.textContent?.trim() ?? undefined;
65
- }
96
+ for (const name of variants) {
97
+ const byItemprop = doc.querySelector(`[itemprop="${name}"]`);
98
+ if (byItemprop) {
99
+ const text = byItemprop.getAttribute("content") ?? byItemprop.textContent?.trim();
100
+ if (text) return text;
101
+ }
102
+ const byClass = doc.querySelector(`[class*="${name}"]`);
103
+ if (byClass) {
104
+ const text = byClass.textContent?.trim();
105
+ if (text) return text;
106
+ }
107
+ const allWithAria = doc.querySelectorAll("[aria-label]");
108
+ for (const el of allWithAria) {
109
+ const label = el.getAttribute("aria-label")?.toLowerCase().replace(/\s+/g, "-") ?? "";
110
+ if (label === name.toLowerCase()) {
111
+ const text = el.textContent?.trim();
112
+ if (text) return text;
113
+ }
114
+ }
115
+ const byId = doc.querySelector(`#${cssEscape(name)}`);
116
+ if (byId) {
117
+ const text = byId.textContent?.trim();
118
+ if (text) return text;
66
119
  }
67
- return undefined;
120
+ const byData = doc.querySelector(`[data-${name}]`);
121
+ if (byData) {
122
+ return byData.getAttribute(`data-${name}`) ?? byData.textContent?.trim() ?? void 0;
123
+ }
124
+ }
125
+ return void 0;
68
126
  }
69
127
  function findArrayValues(doc, variants) {
70
- for (const name of variants) {
71
- const container = doc.querySelector(`[class*="${name}"]`);
72
- if (container) {
73
- const items = container.querySelectorAll('li, [class*="item"]');
74
- if (items.length > 0) {
75
- return Array.from(items).map((el) => (el.textContent ?? '').trim()).filter(Boolean);
76
- }
77
- }
78
- const singular = name.replace(/s$/, '');
79
- const elements = doc.querySelectorAll(`[class*="${singular}"]`);
80
- if (elements.length > 1) {
81
- return Array.from(elements).map((el) => (el.textContent ?? '').trim()).filter(Boolean);
82
- }
128
+ for (const name of variants) {
129
+ const container = doc.querySelector(`[class*="${name}"]`);
130
+ if (container) {
131
+ const items = container.querySelectorAll('li, [class*="item"]');
132
+ if (items.length > 0) {
133
+ return Array.from(items).map((el) => (el.textContent ?? "").trim()).filter(Boolean);
134
+ }
135
+ }
136
+ const singular = name.replace(/s$/, "");
137
+ const elements = doc.querySelectorAll(`[class*="${singular}"]`);
138
+ if (elements.length > 1) {
139
+ return Array.from(elements).map((el) => (el.textContent ?? "").trim()).filter(Boolean);
83
140
  }
84
- return undefined;
141
+ }
142
+ return void 0;
85
143
  }
144
+ export {
145
+ extractWithSchema,
146
+ extractWithSchemaDetailed,
147
+ extractWithSchemaDetailedAsync
148
+ };
86
149
  //# sourceMappingURL=schema.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"schema.js","sourceRoot":"","sources":["../../src/extraction/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,aAAa,CAAC;AAQjE,MAAM,UAAU,iBAAiB,CAC/B,IAAY,EACZ,MAAkB;IAElB,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC,UAAU;QAAE,OAAO,EAAE,CAAC;IAE3C,MAAM,YAAY,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;IACzC,MAAM,YAAY,GAAG,mBAAmB,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC;IAE/D,MAAM,EAAE,QAAQ,EAAE,GAAG,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC1C,MAAM,eAAe,GAA4B,EAAE,CAAC;IAEpD,KAAK,MAAM,CAAC,SAAS,EAAE,WAAW,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,UAAU,CAAC,EAAE,CAAC;QACzE,IAAI,YAAY,CAAC,SAAS,CAAC,KAAK,SAAS;YAAE,SAAS;QAEpD,MAAM,KAAK,GAAG,cAAc,CAAC,GAAG,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;QAC1D,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACxB,eAAe,CAAC,SAAS,CAAC,GAAG,KAAK,CAAC;QACrC,CAAC;IACH,CAAC;IAED,OAAO,EAAE,GAAG,YAAY,EAAE,GAAG,eAAe,EAAE,CAAC;AACjD,CAAC;AAED,SAAS,cAAc,CACrB,GAAa,EACb,SAAiB,EACjB,MAAkB;IAElB,MAAM,cAAc,GAAG,SAAS,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;IAClE,MAAM,WAAW,GAAG,SAAS,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;IAC9D,MAAM,QAAQ,GAAG,CAAC,SAAS,EAAE,cAAc,EAAE,WAAW,CAAC,CAAC;IAE1D,IAAI,MAAM,CAAC,IAAI,KAAK,OAAO,EAAE,CAAC;QAC5B,OAAO,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IACxC,CAAC;IAED,OAAO,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;AACxC,CAAC;AAED,SAAS,SAAS,CAAC,KAAa;IAC9B,OAAO,KAAK,CAAC,OAAO,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;AAC5C,CAAC;AAED,SAAS,eAAe,CAAC,GAAa,EAAE,QAAkB;IACxD,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;QAC5B,MAAM,UAAU,GAAG,GAAG,CAAC,aAAa,CAAC,cAAc,IAAI,IAAI,CAAC,CAAC;QAC7D,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,UAAU,CAAC,YAAY,CAAC,SAAS,CAAC,IAAI,UAAU,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YAClF,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;QACxB,CAAC;QAED,iFAAiF;QACjF,MAAM,OAAO,GAAG,GAAG,CAAC,aAAa,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC;QACxD,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,IAAI,GAAG,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YACzC,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;QACxB,CAAC;QAED,MAAM,WAAW,GAAG,GAAG,CAAC,gBAAgB,CAAC,cAAc,CAAC,CAAC;QACzD,KAAK
,MAAM,EAAE,IAAI,WAAW,EAAE,CAAC;YAC7B,MAAM,KAAK,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,CAAC,EAAE,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC;YACtF,IAAI,KAAK,KAAK,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;gBACjC,MAAM,IAAI,GAAG,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;gBACpC,IAAI,IAAI;oBAAE,OAAO,IAAI,CAAC;YACxB,CAAC;QACH,CAAC;QAED,MAAM,IAAI,GAAG,GAAG,CAAC,aAAa,CAAC,IAAI,SAAS,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACtD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;YACtC,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;QACxB,CAAC;QAED,MAAM,MAAM,GAAG,GAAG,CAAC,aAAa,CAAC,SAAS,IAAI,GAAG,CAAC,CAAC;QACnD,IAAI,MAAM,EAAE,CAAC;YACX,OAAO,MAAM,CAAC,YAAY,CAAC,QAAQ,IAAI,EAAE,CAAC,IAAI,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,SAAS,CAAC;QACxF,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,eAAe,CAAC,GAAa,EAAE,QAAkB;IACxD,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;QAC5B,MAAM,SAAS,GAAG,GAAG,CAAC,aAAa,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC;QAC1D,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,KAAK,GAAG,SAAS,CAAC,gBAAgB,CAAC,qBAAqB,CAAC,CAAC;YAChE,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACrB,OAAO,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;YACtF,CAAC;QACH,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QACxC,MAAM,QAAQ,GAAG,GAAG,CAAC,gBAAgB,CAAC,YAAY,QAAQ,IAAI,CAAC,CAAC;QAChE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxB,OAAO,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACzF,CAAC;IACH,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC"}
1
+ {"version":3,"sources":["../../src/extraction/schema.ts"],"sourcesContent":["import { parseHTML } from 'linkedom';\nimport { extractStructuredData } from './structured-data.js';\nimport { extractWithLLM, type LLMFallbackBudget } from './llm-fallback.js';\nimport type {\n FieldProvenance,\n SchemaExtractionResult,\n StructuredDataResult,\n} from '../types.js';\n\nexport interface JsonSchema {\n type?: string;\n properties?: Record<string, JsonSchema>;\n items?: JsonSchema;\n required?: string[];\n}\n\nexport interface SchemaExtractionOpts {\n signal?: AbortSignal;\n budget?: LLMFallbackBudget;\n}\n\nconst PROVENANCE_PRIORITY: StructuredDataResult['provenance'][] = [\n 'json-ld',\n 'microdata',\n 'rdfa',\n];\n\nexport function extractWithSchema(\n html: string,\n schema: JsonSchema,\n): Record<string, unknown> {\n return extractWithSchemaDetailed(html, schema).values;\n}\n\nexport function extractWithSchemaDetailed(\n html: string,\n schema: JsonSchema,\n): SchemaExtractionResult {\n const values: Record<string, unknown> = {};\n const provenance: Record<string, FieldProvenance> = {};\n if (!html || !schema.properties) return { values, provenance };\n\n const blocks = extractStructuredData(html);\n\n for (const source of PROVENANCE_PRIORITY) {\n for (const block of blocks) {\n if (block.provenance !== source) continue;\n for (const fieldName of Object.keys(schema.properties)) {\n if (values[fieldName] !== undefined) continue;\n const v = pickField(block.fields, fieldName);\n if (v !== undefined) {\n values[fieldName] = v;\n provenance[fieldName] = source;\n }\n }\n }\n }\n\n const allCovered = Object.keys(schema.properties).every(\n (k) => values[k] !== undefined,\n );\n if (allCovered) return { values, provenance };\n\n // Heuristic fallback only for fields still missing\n const { document: doc } = parseHTML(html);\n for (const [fieldName, fieldSchema] of Object.entries(schema.properties)) {\n if (values[fieldName] !== undefined) continue;\n const v = 
findFieldValue(doc, fieldName, fieldSchema);\n if (v !== undefined) {\n values[fieldName] = v;\n provenance[fieldName] = 'heuristic';\n }\n }\n\n return { values, provenance };\n}\n\nexport interface SchemaExtractionAsyncResult extends SchemaExtractionResult {\n warnings: string[];\n}\n\nexport async function extractWithSchemaDetailedAsync(\n html: string,\n schema: JsonSchema,\n opts: SchemaExtractionOpts = {},\n): Promise<SchemaExtractionAsyncResult> {\n const det = extractWithSchemaDetailed(html, schema);\n const warnings: string[] = [];\n\n if (!schema.required || schema.required.length === 0) {\n return { ...det, warnings };\n }\n\n const missing = schema.required.filter((k) => det.values[k] === undefined);\n if (missing.length === 0) {\n return { ...det, warnings };\n }\n\n const llm = await extractWithLLM({\n html,\n jsonSchema: schema as unknown as Record<string, unknown>,\n partial: det.values,\n missing,\n signal: opts.signal,\n budget: opts.budget,\n });\n\n const values = { ...det.values };\n const provenance: Record<string, FieldProvenance> = { ...det.provenance };\n for (const key of missing) {\n if (llm.values[key] !== undefined && values[key] === undefined) {\n values[key] = llm.values[key];\n provenance[key] = 'llm';\n }\n }\n return { values, provenance, warnings: llm.warnings };\n}\n\nfunction pickField(fields: Record<string, unknown>, name: string): unknown {\n if (fields[name] !== undefined) return fields[name];\n // Shallow nested — e.g. 
JSON-LD Product.offers.price\n for (const v of Object.values(fields)) {\n if (v && typeof v === 'object' && !Array.isArray(v)) {\n const nested = (v as Record<string, unknown>)[name];\n if (nested !== undefined) return nested;\n }\n }\n return undefined;\n}\n\n// ---------- heuristic helpers (preserved from prior schema.ts) ----------\n\nfunction findFieldValue(\n doc: Document,\n fieldName: string,\n schema: JsonSchema,\n): unknown {\n const normalizedName = fieldName.toLowerCase().replace(/_/g, '-');\n const compactName = fieldName.replace(/_/g, '').toLowerCase();\n const variants = [fieldName, normalizedName, compactName];\n\n if (schema.type === 'array') {\n return findArrayValues(doc, variants);\n }\n\n return findSingleValue(doc, variants);\n}\n\nfunction cssEscape(value: string): string {\n return value.replace(/([^\\w-])/g, '\\\\$1');\n}\n\nfunction findSingleValue(doc: Document, variants: string[]): string | undefined {\n for (const name of variants) {\n const byItemprop = doc.querySelector(`[itemprop=\"${name}\"]`);\n if (byItemprop) {\n const text = byItemprop.getAttribute('content') ?? byItemprop.textContent?.trim();\n if (text) return text;\n }\n\n const byClass = doc.querySelector(`[class*=\"${name}\"]`);\n if (byClass) {\n const text = byClass.textContent?.trim();\n if (text) return text;\n }\n\n const allWithAria = doc.querySelectorAll('[aria-label]');\n for (const el of allWithAria) {\n const label = el.getAttribute('aria-label')?.toLowerCase().replace(/\\s+/g, '-') ?? '';\n if (label === name.toLowerCase()) {\n const text = el.textContent?.trim();\n if (text) return text;\n }\n }\n\n const byId = doc.querySelector(`#${cssEscape(name)}`);\n if (byId) {\n const text = byId.textContent?.trim();\n if (text) return text;\n }\n\n const byData = doc.querySelector(`[data-${name}]`);\n if (byData) {\n return byData.getAttribute(`data-${name}`) ?? byData.textContent?.trim() ?? 
undefined;\n }\n }\n\n return undefined;\n}\n\nfunction findArrayValues(doc: Document, variants: string[]): string[] | undefined {\n for (const name of variants) {\n const container = doc.querySelector(`[class*=\"${name}\"]`);\n if (container) {\n const items = container.querySelectorAll('li, [class*=\"item\"]');\n if (items.length > 0) {\n return Array.from(items).map((el) => (el.textContent ?? '').trim()).filter(Boolean);\n }\n }\n\n const singular = name.replace(/s$/, '');\n const elements = doc.querySelectorAll(`[class*=\"${singular}\"]`);\n if (elements.length > 1) {\n return Array.from(elements).map((el) => (el.textContent ?? '').trim()).filter(Boolean);\n }\n }\n\n return undefined;\n}\n"],"mappings":"AAAA,SAAS,iBAAiB;AAC1B,SAAS,6BAA6B;AACtC,SAAS,sBAA8C;AAmBvD,MAAM,sBAA4D;AAAA,EAChE;AAAA,EACA;AAAA,EACA;AACF;AAEO,SAAS,kBACd,MACA,QACyB;AACzB,SAAO,0BAA0B,MAAM,MAAM,EAAE;AACjD;AAEO,SAAS,0BACd,MACA,QACwB;AACxB,QAAM,SAAkC,CAAC;AACzC,QAAM,aAA8C,CAAC;AACrD,MAAI,CAAC,QAAQ,CAAC,OAAO,WAAY,QAAO,EAAE,QAAQ,WAAW;AAE7D,QAAM,SAAS,sBAAsB,IAAI;AAEzC,aAAW,UAAU,qBAAqB;AACxC,eAAW,SAAS,QAAQ;AAC1B,UAAI,MAAM,eAAe,OAAQ;AACjC,iBAAW,aAAa,OAAO,KAAK,OAAO,UAAU,GAAG;AACtD,YAAI,OAAO,SAAS,MAAM,OAAW;AACrC,cAAM,IAAI,UAAU,MAAM,QAAQ,SAAS;AAC3C,YAAI,MAAM,QAAW;AACnB,iBAAO,SAAS,IAAI;AACpB,qBAAW,SAAS,IAAI;AAAA,QAC1B;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,QAAM,aAAa,OAAO,KAAK,OAAO,UAAU,EAAE;AAAA,IAChD,CAAC,MAAM,OAAO,CAAC,MAAM;AAAA,EACvB;AACA,MAAI,WAAY,QAAO,EAAE,QAAQ,WAAW;AAG5C,QAAM,EAAE,UAAU,IAAI,IAAI,UAAU,IAAI;AACxC,aAAW,CAAC,WAAW,WAAW,KAAK,OAAO,QAAQ,OAAO,UAAU,GAAG;AACxE,QAAI,OAAO,SAAS,MAAM,OAAW;AACrC,UAAM,IAAI,eAAe,KAAK,WAAW,WAAW;AACpD,QAAI,MAAM,QAAW;AACnB,aAAO,SAAS,IAAI;AACpB,iBAAW,SAAS,IAAI;AAAA,IAC1B;AAAA,EACF;AAEA,SAAO,EAAE,QAAQ,WAAW;AAC9B;AAMA,eAAsB,+BACpB,MACA,QACA,OAA6B,CAAC,GACQ;AACtC,QAAM,MAAM,0BAA0B,MAAM,MAAM;AAClD,QAAM,WAAqB,CAAC;AAE5B,MAAI,CAAC,OAAO,YAAY,OAAO,SAAS,WAAW,GAAG;AACpD,WAAO,EAAE,GAAG,KAAK,SAAS;AAAA,EAC5B;AAEA,QAAM,UAAU,OAAO,SAAS,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,MAAM,MAAS;AACzE,MAAI,QAA
Q,WAAW,GAAG;AACxB,WAAO,EAAE,GAAG,KAAK,SAAS;AAAA,EAC5B;AAEA,QAAM,MAAM,MAAM,eAAe;AAAA,IAC/B;AAAA,IACA,YAAY;AAAA,IACZ,SAAS,IAAI;AAAA,IACb;AAAA,IACA,QAAQ,KAAK;AAAA,IACb,QAAQ,KAAK;AAAA,EACf,CAAC;AAED,QAAM,SAAS,EAAE,GAAG,IAAI,OAAO;AAC/B,QAAM,aAA8C,EAAE,GAAG,IAAI,WAAW;AACxE,aAAW,OAAO,SAAS;AACzB,QAAI,IAAI,OAAO,GAAG,MAAM,UAAa,OAAO,GAAG,MAAM,QAAW;AAC9D,aAAO,GAAG,IAAI,IAAI,OAAO,GAAG;AAC5B,iBAAW,GAAG,IAAI;AAAA,IACpB;AAAA,EACF;AACA,SAAO,EAAE,QAAQ,YAAY,UAAU,IAAI,SAAS;AACtD;AAEA,SAAS,UAAU,QAAiC,MAAuB;AACzE,MAAI,OAAO,IAAI,MAAM,OAAW,QAAO,OAAO,IAAI;AAElD,aAAW,KAAK,OAAO,OAAO,MAAM,GAAG;AACrC,QAAI,KAAK,OAAO,MAAM,YAAY,CAAC,MAAM,QAAQ,CAAC,GAAG;AACnD,YAAM,SAAU,EAA8B,IAAI;AAClD,UAAI,WAAW,OAAW,QAAO;AAAA,IACnC;AAAA,EACF;AACA,SAAO;AACT;AAIA,SAAS,eACP,KACA,WACA,QACS;AACT,QAAM,iBAAiB,UAAU,YAAY,EAAE,QAAQ,MAAM,GAAG;AAChE,QAAM,cAAc,UAAU,QAAQ,MAAM,EAAE,EAAE,YAAY;AAC5D,QAAM,WAAW,CAAC,WAAW,gBAAgB,WAAW;AAExD,MAAI,OAAO,SAAS,SAAS;AAC3B,WAAO,gBAAgB,KAAK,QAAQ;AAAA,EACtC;AAEA,SAAO,gBAAgB,KAAK,QAAQ;AACtC;AAEA,SAAS,UAAU,OAAuB;AACxC,SAAO,MAAM,QAAQ,aAAa,MAAM;AAC1C;AAEA,SAAS,gBAAgB,KAAe,UAAwC;AAC9E,aAAW,QAAQ,UAAU;AAC3B,UAAM,aAAa,IAAI,cAAc,cAAc,IAAI,IAAI;AAC3D,QAAI,YAAY;AACd,YAAM,OAAO,WAAW,aAAa,SAAS,KAAK,WAAW,aAAa,KAAK;AAChF,UAAI,KAAM,QAAO;AAAA,IACnB;AAEA,UAAM,UAAU,IAAI,cAAc,YAAY,IAAI,IAAI;AACtD,QAAI,SAAS;AACX,YAAM,OAAO,QAAQ,aAAa,KAAK;AACvC,UAAI,KAAM,QAAO;AAAA,IACnB;AAEA,UAAM,cAAc,IAAI,iBAAiB,cAAc;AACvD,eAAW,MAAM,aAAa;AAC5B,YAAM,QAAQ,GAAG,aAAa,YAAY,GAAG,YAAY,EAAE,QAAQ,QAAQ,GAAG,KAAK;AACnF,UAAI,UAAU,KAAK,YAAY,GAAG;AAChC,cAAM,OAAO,GAAG,aAAa,KAAK;AAClC,YAAI,KAAM,QAAO;AAAA,MACnB;AAAA,IACF;AAEA,UAAM,OAAO,IAAI,cAAc,IAAI,UAAU,IAAI,CAAC,EAAE;AACpD,QAAI,MAAM;AACR,YAAM,OAAO,KAAK,aAAa,KAAK;AACpC,UAAI,KAAM,QAAO;AAAA,IACnB;AAEA,UAAM,SAAS,IAAI,cAAc,SAAS,IAAI,GAAG;AACjD,QAAI,QAAQ;AACV,aAAO,OAAO,aAAa,QAAQ,IAAI,EAAE,KAAK,OAAO,aAAa,KAAK,KAAK;AAAA,IAC9E;AAAA,EACF;AAEA,SAAO;AACT;AAEA,SAAS,gBAAgB,KAAe,UAA0C;AAChF,aAAW,QAAQ,UAAU;AAC3B,UAAM,YAAY,IAAI,cAAc,YAAY,IAAI,IAAI;AACxD,QAAI,WAAW;AACb,YAAM,QAAQ,UAAU,iBAAiB,qBAAqB;AAC9D,UAAI,MAAM,
SAAS,GAAG;AACpB,eAAO,MAAM,KAAK,KAAK,EAAE,IAAI,CAAC,QAAQ,GAAG,eAAe,IAAI,KAAK,CAAC,EAAE,OAAO,OAAO;AAAA,MACpF;AAAA,IACF;AAEA,UAAM,WAAW,KAAK,QAAQ,MAAM,EAAE;AACtC,UAAM,WAAW,IAAI,iBAAiB,YAAY,QAAQ,IAAI;AAC9D,QAAI,SAAS,SAAS,GAAG;AACvB,aAAO,MAAM,KAAK,QAAQ,EAAE,IAAI,CAAC,QAAQ,GAAG,eAAe,IAAI,KAAK,CAAC,EAAE,OAAO,OAAO;AAAA,IACvF;AAAA,EACF;AAEA,SAAO;AACT;","names":[]}