@staticn0va/wigolo 0.1.0 → 0.1.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (982)
  1. package/LICENSE +1 -1
  2. package/README.md +195 -73
  3. package/SKILL.md +382 -0
  4. package/assets/blocks/claude-code/CLAUDE.md.block +20 -0
  5. package/assets/blocks/claude-code/wigolo-command.md +40 -0
  6. package/assets/blocks/cursor/wigolo.mdc +46 -0
  7. package/assets/blocks/gemini-cli/GEMINI.md.block +18 -0
  8. package/assets/blocks/vscode/copilot-instructions.md.block +18 -0
  9. package/assets/skills/wigolo/SKILL.md +50 -0
  10. package/assets/skills/wigolo/rules/cache-first.md +30 -0
  11. package/assets/skills/wigolo/rules/synthesis.md +43 -0
  12. package/assets/skills/wigolo-agent/SKILL.md +73 -0
  13. package/assets/skills/wigolo-crawl/SKILL.md +60 -0
  14. package/assets/skills/wigolo-extract/SKILL.md +59 -0
  15. package/assets/skills/wigolo-fetch/SKILL.md +65 -0
  16. package/assets/skills/wigolo-find-similar/SKILL.md +72 -0
  17. package/assets/skills/wigolo-research/SKILL.md +77 -0
  18. package/assets/skills/wigolo-search/SKILL.md +78 -0
  19. package/dist/agent/executor.d.ts +33 -0
  20. package/dist/agent/executor.d.ts.map +1 -0
  21. package/dist/agent/executor.js +233 -0
  22. package/dist/agent/executor.js.map +1 -0
  23. package/dist/agent/pipeline.d.ts +5 -0
  24. package/dist/agent/pipeline.d.ts.map +1 -0
  25. package/dist/agent/pipeline.js +208 -0
  26. package/dist/agent/pipeline.js.map +1 -0
  27. package/dist/agent/planner.d.ts +13 -0
  28. package/dist/agent/planner.d.ts.map +1 -0
  29. package/dist/agent/planner.js +271 -0
  30. package/dist/agent/planner.js.map +1 -0
  31. package/dist/agent/relevance.d.ts +15 -0
  32. package/dist/agent/relevance.d.ts.map +1 -0
  33. package/dist/agent/relevance.js +60 -0
  34. package/dist/agent/relevance.js.map +1 -0
  35. package/dist/cache/backfill-embeddings.d.ts +23 -0
  36. package/dist/cache/backfill-embeddings.d.ts.map +1 -0
  37. package/dist/cache/backfill-embeddings.js +105 -0
  38. package/dist/cache/backfill-embeddings.js.map +1 -0
  39. package/dist/cache/change-detector.d.ts +7 -0
  40. package/dist/cache/change-detector.d.ts.map +1 -0
  41. package/dist/cache/change-detector.js +43 -0
  42. package/dist/cache/change-detector.js.map +1 -0
  43. package/dist/cache/db.d.ts +1 -0
  44. package/dist/cache/db.d.ts.map +1 -1
  45. package/dist/cache/db.js +94 -22
  46. package/dist/cache/db.js.map +1 -1
  47. package/dist/cache/diff-summary.d.ts +2 -0
  48. package/dist/cache/diff-summary.d.ts.map +1 -0
  49. package/dist/cache/diff-summary.js +82 -0
  50. package/dist/cache/diff-summary.js.map +1 -0
  51. package/dist/cache/migrations/runner.d.ts +29 -0
  52. package/dist/cache/migrations/runner.d.ts.map +1 -0
  53. package/dist/cache/migrations/runner.js +147 -0
  54. package/dist/cache/migrations/runner.js.map +1 -0
  55. package/dist/cache/sqlite-vec-store.d.ts +42 -0
  56. package/dist/cache/sqlite-vec-store.d.ts.map +1 -0
  57. package/dist/cache/sqlite-vec-store.js +176 -0
  58. package/dist/cache/sqlite-vec-store.js.map +1 -0
  59. package/dist/cache/store.d.ts +46 -1
  60. package/dist/cache/store.d.ts.map +1 -1
  61. package/dist/cache/store.js +362 -168
  62. package/dist/cache/store.js.map +1 -1
  63. package/dist/cli/agents/antigravity.d.ts +20 -0
  64. package/dist/cli/agents/antigravity.d.ts.map +1 -0
  65. package/dist/cli/agents/antigravity.js +49 -0
  66. package/dist/cli/agents/antigravity.js.map +1 -0
  67. package/dist/cli/agents/claude-code.d.ts +25 -0
  68. package/dist/cli/agents/claude-code.d.ts.map +1 -0
  69. package/dist/cli/agents/claude-code.js +111 -0
  70. package/dist/cli/agents/claude-code.js.map +1 -0
  71. package/dist/cli/agents/cursor.d.ts +21 -0
  72. package/dist/cli/agents/cursor.d.ts.map +1 -0
  73. package/dist/cli/agents/cursor.js +58 -0
  74. package/dist/cli/agents/cursor.js.map +1 -0
  75. package/dist/cli/agents/gemini-cli.d.ts +21 -0
  76. package/dist/cli/agents/gemini-cli.d.ts.map +1 -0
  77. package/dist/cli/agents/gemini-cli.js +55 -0
  78. package/dist/cli/agents/gemini-cli.js.map +1 -0
  79. package/dist/cli/agents/registry.d.ts +21 -0
  80. package/dist/cli/agents/registry.d.ts.map +1 -0
  81. package/dist/cli/agents/registry.js +27 -0
  82. package/dist/cli/agents/registry.js.map +1 -0
  83. package/dist/cli/agents/utils.d.ts +26 -0
  84. package/dist/cli/agents/utils.d.ts.map +1 -0
  85. package/dist/cli/agents/utils.js +136 -0
  86. package/dist/cli/agents/utils.js.map +1 -0
  87. package/dist/cli/agents/vscode.d.ts +21 -0
  88. package/dist/cli/agents/vscode.d.ts.map +1 -0
  89. package/dist/cli/agents/vscode.js +62 -0
  90. package/dist/cli/agents/vscode.js.map +1 -0
  91. package/dist/cli/auth.d.ts +2 -0
  92. package/dist/cli/auth.d.ts.map +1 -0
  93. package/dist/cli/auth.js +94 -0
  94. package/dist/cli/auth.js.map +1 -0
  95. package/dist/cli/backfill.d.ts +2 -0
  96. package/dist/cli/backfill.d.ts.map +1 -0
  97. package/dist/cli/backfill.js +58 -0
  98. package/dist/cli/backfill.js.map +1 -0
  99. package/dist/cli/daemon.d.ts +6 -1
  100. package/dist/cli/daemon.d.ts.map +1 -1
  101. package/dist/cli/daemon.js +61 -3
  102. package/dist/cli/daemon.js.map +1 -1
  103. package/dist/cli/doctor.d.ts +8 -0
  104. package/dist/cli/doctor.d.ts.map +1 -0
  105. package/dist/cli/doctor.js +318 -0
  106. package/dist/cli/doctor.js.map +1 -0
  107. package/dist/cli/health.d.ts +1 -1
  108. package/dist/cli/health.d.ts.map +1 -1
  109. package/dist/cli/health.js +42 -3
  110. package/dist/cli/health.js.map +1 -1
  111. package/dist/cli/help.d.ts +6 -0
  112. package/dist/cli/help.d.ts.map +1 -0
  113. package/dist/cli/help.js +63 -0
  114. package/dist/cli/help.js.map +1 -0
  115. package/dist/cli/index.d.ts +1 -1
  116. package/dist/cli/index.d.ts.map +1 -1
  117. package/dist/cli/index.js +35 -7
  118. package/dist/cli/index.js.map +1 -1
  119. package/dist/cli/init.d.ts +2 -0
  120. package/dist/cli/init.d.ts.map +1 -0
  121. package/dist/cli/init.js +201 -0
  122. package/dist/cli/init.js.map +1 -0
  123. package/dist/cli/plugin.d.ts +5 -0
  124. package/dist/cli/plugin.d.ts.map +1 -0
  125. package/dist/cli/plugin.js +185 -0
  126. package/dist/cli/plugin.js.map +1 -0
  127. package/dist/cli/setup-mcp.d.ts +2 -0
  128. package/dist/cli/setup-mcp.d.ts.map +1 -0
  129. package/dist/cli/setup-mcp.js +114 -0
  130. package/dist/cli/setup-mcp.js.map +1 -0
  131. package/dist/cli/shell.d.ts +2 -0
  132. package/dist/cli/shell.d.ts.map +1 -0
  133. package/dist/cli/shell.js +86 -0
  134. package/dist/cli/shell.js.map +1 -0
  135. package/dist/cli/status.d.ts +2 -0
  136. package/dist/cli/status.d.ts.map +1 -0
  137. package/dist/cli/status.js +31 -0
  138. package/dist/cli/status.js.map +1 -0
  139. package/dist/cli/telemetry.d.ts +10 -0
  140. package/dist/cli/telemetry.d.ts.map +1 -0
  141. package/dist/cli/telemetry.js +56 -0
  142. package/dist/cli/telemetry.js.map +1 -0
  143. package/dist/cli/tui/agents-types.d.ts +28 -0
  144. package/dist/cli/tui/agents-types.d.ts.map +1 -0
  145. package/dist/cli/tui/agents-types.js +1 -0
  146. package/dist/cli/tui/agents-types.js.map +1 -0
  147. package/dist/cli/tui/agents.d.ts +11 -0
  148. package/dist/cli/tui/agents.d.ts.map +1 -0
  149. package/dist/cli/tui/agents.js +93 -0
  150. package/dist/cli/tui/agents.js.map +1 -0
  151. package/dist/cli/tui/banner.d.ts +3 -0
  152. package/dist/cli/tui/banner.d.ts.map +1 -0
  153. package/dist/cli/tui/banner.js +30 -0
  154. package/dist/cli/tui/banner.js.map +1 -0
  155. package/dist/cli/tui/components/AgentSelect.d.ts +13 -0
  156. package/dist/cli/tui/components/AgentSelect.d.ts.map +1 -0
  157. package/dist/cli/tui/components/AgentSelect.js +116 -0
  158. package/dist/cli/tui/components/AgentSelect.js.map +1 -0
  159. package/dist/cli/tui/components/Banner.d.ts +6 -0
  160. package/dist/cli/tui/components/Banner.d.ts.map +1 -0
  161. package/dist/cli/tui/components/Banner.js +25 -0
  162. package/dist/cli/tui/components/Banner.js.map +1 -0
  163. package/dist/cli/tui/components/BrowserSelect.d.ts +7 -0
  164. package/dist/cli/tui/components/BrowserSelect.d.ts.map +1 -0
  165. package/dist/cli/tui/components/BrowserSelect.js +19 -0
  166. package/dist/cli/tui/components/BrowserSelect.js.map +1 -0
  167. package/dist/cli/tui/components/InstallProgress.d.ts +9 -0
  168. package/dist/cli/tui/components/InstallProgress.d.ts.map +1 -0
  169. package/dist/cli/tui/components/InstallProgress.js +67 -0
  170. package/dist/cli/tui/components/InstallProgress.js.map +1 -0
  171. package/dist/cli/tui/components/SkillInstall.d.ts +14 -0
  172. package/dist/cli/tui/components/SkillInstall.d.ts.map +1 -0
  173. package/dist/cli/tui/components/SkillInstall.js +94 -0
  174. package/dist/cli/tui/components/SkillInstall.js.map +1 -0
  175. package/dist/cli/tui/components/Summary.d.ts +22 -0
  176. package/dist/cli/tui/components/Summary.d.ts.map +1 -0
  177. package/dist/cli/tui/components/Summary.js +135 -0
  178. package/dist/cli/tui/components/Summary.js.map +1 -0
  179. package/dist/cli/tui/components/SystemCheck.d.ts +8 -0
  180. package/dist/cli/tui/components/SystemCheck.d.ts.map +1 -0
  181. package/dist/cli/tui/components/SystemCheck.js +71 -0
  182. package/dist/cli/tui/components/SystemCheck.js.map +1 -0
  183. package/dist/cli/tui/components/Verification.d.ts +8 -0
  184. package/dist/cli/tui/components/Verification.d.ts.map +1 -0
  185. package/dist/cli/tui/components/Verification.js +63 -0
  186. package/dist/cli/tui/components/Verification.js.map +1 -0
  187. package/dist/cli/tui/config-writer-cli.d.ts +12 -0
  188. package/dist/cli/tui/config-writer-cli.d.ts.map +1 -0
  189. package/dist/cli/tui/config-writer-cli.js +39 -0
  190. package/dist/cli/tui/config-writer-cli.js.map +1 -0
  191. package/dist/cli/tui/config-writer-json.d.ts +16 -0
  192. package/dist/cli/tui/config-writer-json.d.ts.map +1 -0
  193. package/dist/cli/tui/config-writer-json.js +86 -0
  194. package/dist/cli/tui/config-writer-json.js.map +1 -0
  195. package/dist/cli/tui/config-writer-toml.d.ts +16 -0
  196. package/dist/cli/tui/config-writer-toml.d.ts.map +1 -0
  197. package/dist/cli/tui/config-writer-toml.js +83 -0
  198. package/dist/cli/tui/config-writer-toml.js.map +1 -0
  199. package/dist/cli/tui/config-writer.d.ts +25 -0
  200. package/dist/cli/tui/config-writer.d.ts.map +1 -0
  201. package/dist/cli/tui/config-writer.js +101 -0
  202. package/dist/cli/tui/config-writer.js.map +1 -0
  203. package/dist/cli/tui/detect-helpers.d.ts +6 -0
  204. package/dist/cli/tui/detect-helpers.d.ts.map +1 -0
  205. package/dist/cli/tui/detect-helpers.js +45 -0
  206. package/dist/cli/tui/detect-helpers.js.map +1 -0
  207. package/dist/cli/tui/extras-prompt.d.ts +7 -0
  208. package/dist/cli/tui/extras-prompt.d.ts.map +1 -0
  209. package/dist/cli/tui/extras-prompt.js +42 -0
  210. package/dist/cli/tui/extras-prompt.js.map +1 -0
  211. package/dist/cli/tui/flags-types.d.ts +19 -0
  212. package/dist/cli/tui/flags-types.d.ts.map +1 -0
  213. package/dist/cli/tui/flags-types.js +23 -0
  214. package/dist/cli/tui/flags-types.js.map +1 -0
  215. package/dist/cli/tui/flags.d.ts +5 -0
  216. package/dist/cli/tui/flags.d.ts.map +1 -0
  217. package/dist/cli/tui/flags.js +132 -0
  218. package/dist/cli/tui/flags.js.map +1 -0
  219. package/dist/cli/tui/format.d.ts +14 -0
  220. package/dist/cli/tui/format.d.ts.map +1 -0
  221. package/dist/cli/tui/format.js +37 -0
  222. package/dist/cli/tui/format.js.map +1 -0
  223. package/dist/cli/tui/hooks/useAgentDetect.d.ts +6 -0
  224. package/dist/cli/tui/hooks/useAgentDetect.d.ts.map +1 -0
  225. package/dist/cli/tui/hooks/useAgentDetect.js +19 -0
  226. package/dist/cli/tui/hooks/useAgentDetect.js.map +1 -0
  227. package/dist/cli/tui/hooks/useInstall.d.ts +14 -0
  228. package/dist/cli/tui/hooks/useInstall.d.ts.map +1 -0
  229. package/dist/cli/tui/hooks/useInstall.js +90 -0
  230. package/dist/cli/tui/hooks/useInstall.js.map +1 -0
  231. package/dist/cli/tui/hooks/useSystemCheck.d.ts +13 -0
  232. package/dist/cli/tui/hooks/useSystemCheck.d.ts.map +1 -0
  233. package/dist/cli/tui/hooks/useSystemCheck.js +95 -0
  234. package/dist/cli/tui/hooks/useSystemCheck.js.map +1 -0
  235. package/dist/cli/tui/hooks/useVerify.d.ts +14 -0
  236. package/dist/cli/tui/hooks/useVerify.d.ts.map +1 -0
  237. package/dist/cli/tui/hooks/useVerify.js +71 -0
  238. package/dist/cli/tui/hooks/useVerify.js.map +1 -0
  239. package/dist/cli/tui/ink-init.d.ts +2 -0
  240. package/dist/cli/tui/ink-init.d.ts.map +1 -0
  241. package/dist/cli/tui/ink-init.js +198 -0
  242. package/dist/cli/tui/ink-init.js.map +1 -0
  243. package/dist/cli/tui/reporter-auto.d.ts +7 -0
  244. package/dist/cli/tui/reporter-auto.d.ts.map +1 -0
  245. package/dist/cli/tui/reporter-auto.js +15 -0
  246. package/dist/cli/tui/reporter-auto.js.map +1 -0
  247. package/dist/cli/tui/reporter.d.ts +26 -0
  248. package/dist/cli/tui/reporter.d.ts.map +1 -0
  249. package/dist/cli/tui/reporter.js +32 -0
  250. package/dist/cli/tui/reporter.js.map +1 -0
  251. package/dist/cli/tui/run-command.d.ts +14 -0
  252. package/dist/cli/tui/run-command.d.ts.map +1 -0
  253. package/dist/cli/tui/run-command.js +72 -0
  254. package/dist/cli/tui/run-command.js.map +1 -0
  255. package/dist/cli/tui/select-agents.d.ts +6 -0
  256. package/dist/cli/tui/select-agents.d.ts.map +1 -0
  257. package/dist/cli/tui/select-agents.js +32 -0
  258. package/dist/cli/tui/select-agents.js.map +1 -0
  259. package/dist/cli/tui/status-agents.d.ts +11 -0
  260. package/dist/cli/tui/status-agents.d.ts.map +1 -0
  261. package/dist/cli/tui/status-agents.js +53 -0
  262. package/dist/cli/tui/status-agents.js.map +1 -0
  263. package/dist/cli/tui/status-cache.d.ts +6 -0
  264. package/dist/cli/tui/status-cache.d.ts.map +1 -0
  265. package/dist/cli/tui/status-cache.js +39 -0
  266. package/dist/cli/tui/status-cache.js.map +1 -0
  267. package/dist/cli/tui/status-format.d.ts +14 -0
  268. package/dist/cli/tui/status-format.d.ts.map +1 -0
  269. package/dist/cli/tui/status-format.js +41 -0
  270. package/dist/cli/tui/status-format.js.map +1 -0
  271. package/dist/cli/tui/status-python.d.ts +6 -0
  272. package/dist/cli/tui/status-python.d.ts.map +1 -0
  273. package/dist/cli/tui/status-python.js +30 -0
  274. package/dist/cli/tui/status-python.js.map +1 -0
  275. package/dist/cli/tui/system-check.d.ts +24 -0
  276. package/dist/cli/tui/system-check.d.ts.map +1 -0
  277. package/dist/cli/tui/system-check.js +103 -0
  278. package/dist/cli/tui/system-check.js.map +1 -0
  279. package/dist/cli/tui/tui-reporter.d.ts +19 -0
  280. package/dist/cli/tui/tui-reporter.d.ts.map +1 -0
  281. package/dist/cli/tui/tui-reporter.js +95 -0
  282. package/dist/cli/tui/tui-reporter.js.map +1 -0
  283. package/dist/cli/tui/utils/config-writer.d.ts +3 -0
  284. package/dist/cli/tui/utils/config-writer.d.ts.map +1 -0
  285. package/dist/cli/tui/utils/config-writer.js +22 -0
  286. package/dist/cli/tui/utils/config-writer.js.map +1 -0
  287. package/dist/cli/tui/utils/suppress-logs.d.ts +3 -0
  288. package/dist/cli/tui/utils/suppress-logs.d.ts.map +1 -0
  289. package/dist/cli/tui/utils/suppress-logs.js +11 -0
  290. package/dist/cli/tui/utils/suppress-logs.js.map +1 -0
  291. package/dist/cli/tui/verify-suggestions.d.ts +5 -0
  292. package/dist/cli/tui/verify-suggestions.d.ts.map +1 -0
  293. package/dist/cli/tui/verify-suggestions.js +20 -0
  294. package/dist/cli/tui/verify-suggestions.js.map +1 -0
  295. package/dist/cli/tui/verify.d.ts +14 -0
  296. package/dist/cli/tui/verify.d.ts.map +1 -0
  297. package/dist/cli/tui/verify.js +101 -0
  298. package/dist/cli/tui/verify.js.map +1 -0
  299. package/dist/cli/tui/version.d.ts +2 -0
  300. package/dist/cli/tui/version.d.ts.map +1 -0
  301. package/dist/cli/tui/version.js +14 -0
  302. package/dist/cli/tui/version.js.map +1 -0
  303. package/dist/cli/uninstall.d.ts +2 -0
  304. package/dist/cli/uninstall.d.ts.map +1 -0
  305. package/dist/cli/uninstall.js +57 -0
  306. package/dist/cli/uninstall.js.map +1 -0
  307. package/dist/cli/warmup.d.ts +10 -2
  308. package/dist/cli/warmup.d.ts.map +1 -1
  309. package/dist/cli/warmup.js +226 -93
  310. package/dist/cli/warmup.js.map +1 -1
  311. package/dist/config.d.ts +28 -2
  312. package/dist/config.d.ts.map +1 -1
  313. package/dist/config.js +106 -56
  314. package/dist/config.js.map +1 -1
  315. package/dist/crawl/crawler.d.ts +6 -0
  316. package/dist/crawl/crawler.d.ts.map +1 -1
  317. package/dist/crawl/crawler.js +210 -209
  318. package/dist/crawl/crawler.js.map +1 -1
  319. package/dist/crawl/dedup.d.ts +1 -0
  320. package/dist/crawl/dedup.d.ts.map +1 -1
  321. package/dist/crawl/dedup.js +124 -81
  322. package/dist/crawl/dedup.js.map +1 -1
  323. package/dist/crawl/etag-incremental.d.ts +43 -0
  324. package/dist/crawl/etag-incremental.d.ts.map +1 -0
  325. package/dist/crawl/etag-incremental.js +94 -0
  326. package/dist/crawl/etag-incremental.js.map +1 -0
  327. package/dist/crawl/index-to-vec.d.ts +10 -0
  328. package/dist/crawl/index-to-vec.d.ts.map +1 -0
  329. package/dist/crawl/index-to-vec.js +44 -0
  330. package/dist/crawl/index-to-vec.js.map +1 -0
  331. package/dist/crawl/mapper.js +136 -164
  332. package/dist/crawl/mapper.js.map +1 -1
  333. package/dist/crawl/rate-limiter.js +63 -66
  334. package/dist/crawl/rate-limiter.js.map +1 -1
  335. package/dist/crawl/robots.js +58 -57
  336. package/dist/crawl/robots.js.map +1 -1
  337. package/dist/crawl/sitemap-first.d.ts +12 -0
  338. package/dist/crawl/sitemap-first.d.ts.map +1 -0
  339. package/dist/crawl/sitemap-first.js +47 -0
  340. package/dist/crawl/sitemap-first.js.map +1 -0
  341. package/dist/crawl/sitemap.js +33 -32
  342. package/dist/crawl/sitemap.js.map +1 -1
  343. package/dist/crawl/url-utils.d.ts +1 -0
  344. package/dist/crawl/url-utils.d.ts.map +1 -1
  345. package/dist/crawl/url-utils.js +49 -37
  346. package/dist/crawl/url-utils.js.map +1 -1
  347. package/dist/daemon/health-check.d.ts +16 -0
  348. package/dist/daemon/health-check.d.ts.map +1 -0
  349. package/dist/daemon/health-check.js +33 -0
  350. package/dist/daemon/health-check.js.map +1 -0
  351. package/dist/daemon/http-server.d.ts +26 -0
  352. package/dist/daemon/http-server.d.ts.map +1 -0
  353. package/dist/daemon/http-server.js +275 -0
  354. package/dist/daemon/http-server.js.map +1 -0
  355. package/dist/daemon/proxy.d.ts +10 -0
  356. package/dist/daemon/proxy.d.ts.map +1 -0
  357. package/dist/daemon/proxy.js +93 -0
  358. package/dist/daemon/proxy.js.map +1 -0
  359. package/dist/embedding/embed.d.ts +59 -0
  360. package/dist/embedding/embed.d.ts.map +1 -0
  361. package/dist/embedding/embed.js +233 -0
  362. package/dist/embedding/embed.js.map +1 -0
  363. package/dist/embedding/fastembed-provider.d.ts +19 -0
  364. package/dist/embedding/fastembed-provider.d.ts.map +1 -0
  365. package/dist/embedding/fastembed-provider.js +51 -0
  366. package/dist/embedding/fastembed-provider.js.map +1 -0
  367. package/dist/embedding/key-terms.d.ts +12 -0
  368. package/dist/embedding/key-terms.d.ts.map +1 -0
  369. package/dist/embedding/key-terms.js +234 -0
  370. package/dist/embedding/key-terms.js.map +1 -0
  371. package/dist/extraction/boilerplate.d.ts +15 -0
  372. package/dist/extraction/boilerplate.d.ts.map +1 -0
  373. package/dist/extraction/boilerplate.js +52 -0
  374. package/dist/extraction/boilerplate.js.map +1 -0
  375. package/dist/extraction/defuddle.d.ts.map +1 -1
  376. package/dist/extraction/defuddle.js +27 -23
  377. package/dist/extraction/defuddle.js.map +1 -1
  378. package/dist/extraction/extract.d.ts.map +1 -1
  379. package/dist/extraction/extract.js +76 -76
  380. package/dist/extraction/extract.js.map +1 -1
  381. package/dist/extraction/jsonld.js +50 -54
  382. package/dist/extraction/jsonld.js.map +1 -1
  383. package/dist/extraction/lang-hints.d.ts +2 -0
  384. package/dist/extraction/lang-hints.d.ts.map +1 -0
  385. package/dist/extraction/lang-hints.js +30 -0
  386. package/dist/extraction/lang-hints.js.map +1 -0
  387. package/dist/extraction/llm-fallback.d.ts +17 -0
  388. package/dist/extraction/llm-fallback.d.ts.map +1 -0
  389. package/dist/extraction/llm-fallback.js +130 -0
  390. package/dist/extraction/llm-fallback.js.map +1 -0
  391. package/dist/extraction/markdown-sanitize.d.ts +2 -0
  392. package/dist/extraction/markdown-sanitize.d.ts.map +1 -0
  393. package/dist/extraction/markdown-sanitize.js +151 -0
  394. package/dist/extraction/markdown-sanitize.js.map +1 -0
  395. package/dist/extraction/markdown.d.ts +11 -0
  396. package/dist/extraction/markdown.d.ts.map +1 -1
  397. package/dist/extraction/markdown.js +195 -91
  398. package/dist/extraction/markdown.js.map +1 -1
  399. package/dist/extraction/pipeline.d.ts +8 -0
  400. package/dist/extraction/pipeline.d.ts.map +1 -1
  401. package/dist/extraction/pipeline.js +57 -91
  402. package/dist/extraction/pipeline.js.map +1 -1
  403. package/dist/extraction/readability.d.ts +1 -1
  404. package/dist/extraction/readability.d.ts.map +1 -1
  405. package/dist/extraction/readability.js +28 -29
  406. package/dist/extraction/readability.js.map +1 -1
  407. package/dist/extraction/schema.d.ts +12 -0
  408. package/dist/extraction/schema.d.ts.map +1 -1
  409. package/dist/extraction/schema.js +135 -72
  410. package/dist/extraction/schema.js.map +1 -1
  411. package/dist/extraction/site-extractors/docs-generic.d.ts.map +1 -1
  412. package/dist/extraction/site-extractors/docs-generic.js +81 -91
  413. package/dist/extraction/site-extractors/docs-generic.js.map +1 -1
  414. package/dist/extraction/site-extractors/github.d.ts.map +1 -1
  415. package/dist/extraction/site-extractors/github.js +87 -95
  416. package/dist/extraction/site-extractors/github.js.map +1 -1
  417. package/dist/extraction/site-extractors/mdn.d.ts.map +1 -1
  418. package/dist/extraction/site-extractors/mdn.js +46 -54
  419. package/dist/extraction/site-extractors/mdn.js.map +1 -1
  420. package/dist/extraction/site-extractors/stackoverflow.d.ts.map +1 -1
  421. package/dist/extraction/site-extractors/stackoverflow.js +71 -80
  422. package/dist/extraction/site-extractors/stackoverflow.js.map +1 -1
  423. package/dist/extraction/structured-data.d.ts +4 -0
  424. package/dist/extraction/structured-data.d.ts.map +1 -0
  425. package/dist/extraction/structured-data.js +173 -0
  426. package/dist/extraction/structured-data.js.map +1 -0
  427. package/dist/extraction/structured.d.ts +4 -0
  428. package/dist/extraction/structured.d.ts.map +1 -0
  429. package/dist/extraction/structured.js +163 -0
  430. package/dist/extraction/structured.js.map +1 -0
  431. package/dist/extraction/v1/classifier.d.ts +3 -0
  432. package/dist/extraction/v1/classifier.d.ts.map +1 -0
  433. package/dist/extraction/v1/classifier.js +110 -0
  434. package/dist/extraction/v1/classifier.js.map +1 -0
  435. package/dist/extraction/v1/extract-provider.d.ts +16 -0
  436. package/dist/extraction/v1/extract-provider.d.ts.map +1 -0
  437. package/dist/extraction/v1/extract-provider.js +43 -0
  438. package/dist/extraction/v1/extract-provider.js.map +1 -0
  439. package/dist/extraction/v1/local-llm.d.ts +8 -0
  440. package/dist/extraction/v1/local-llm.d.ts.map +1 -0
  441. package/dist/extraction/v1/local-llm.js +58 -0
  442. package/dist/extraction/v1/local-llm.js.map +1 -0
  443. package/dist/extraction/v1/news.d.ts +3 -0
  444. package/dist/extraction/v1/news.d.ts.map +1 -0
  445. package/dist/extraction/v1/news.js +61 -0
  446. package/dist/extraction/v1/news.js.map +1 -0
  447. package/dist/extraction/v1/product.d.ts +3 -0
  448. package/dist/extraction/v1/product.d.ts.map +1 -0
  449. package/dist/extraction/v1/product.js +166 -0
  450. package/dist/extraction/v1/product.js.map +1 -0
  451. package/dist/extraction/v1/recipe.d.ts +3 -0
  452. package/dist/extraction/v1/recipe.d.ts.map +1 -0
  453. package/dist/extraction/v1/recipe.js +136 -0
  454. package/dist/extraction/v1/recipe.js.map +1 -0
  455. package/dist/extraction/v1/routed.d.ts +17 -0
  456. package/dist/extraction/v1/routed.d.ts.map +1 -0
  457. package/dist/extraction/v1/routed.js +68 -0
  458. package/dist/extraction/v1/routed.js.map +1 -0
  459. package/dist/extraction/v1/schemas/Article.d.ts +11 -0
  460. package/dist/extraction/v1/schemas/Article.d.ts.map +1 -0
  461. package/dist/extraction/v1/schemas/Article.js +23 -0
  462. package/dist/extraction/v1/schemas/Article.js.map +1 -0
  463. package/dist/extraction/v1/schemas/CodeSnippet.d.ts +9 -0
  464. package/dist/extraction/v1/schemas/CodeSnippet.d.ts.map +1 -0
  465. package/dist/extraction/v1/schemas/CodeSnippet.js +90 -0
  466. package/dist/extraction/v1/schemas/CodeSnippet.js.map +1 -0
  467. package/dist/extraction/v1/schemas/EventListing.d.ts +10 -0
  468. package/dist/extraction/v1/schemas/EventListing.d.ts.map +1 -0
  469. package/dist/extraction/v1/schemas/EventListing.js +122 -0
  470. package/dist/extraction/v1/schemas/EventListing.js.map +1 -0
  471. package/dist/extraction/v1/schemas/Paper.d.ts +10 -0
  472. package/dist/extraction/v1/schemas/Paper.d.ts.map +1 -0
  473. package/dist/extraction/v1/schemas/Paper.js +156 -0
  474. package/dist/extraction/v1/schemas/Paper.js.map +1 -0
  475. package/dist/extraction/v1/schemas/Product.d.ts +17 -0
  476. package/dist/extraction/v1/schemas/Product.d.ts.map +1 -0
  477. package/dist/extraction/v1/schemas/Product.js +149 -0
  478. package/dist/extraction/v1/schemas/Product.js.map +1 -0
  479. package/dist/extraction/v1/schemas/Recipe.d.ts +14 -0
  480. package/dist/extraction/v1/schemas/Recipe.d.ts.map +1 -0
  481. package/dist/extraction/v1/schemas/Recipe.js +160 -0
  482. package/dist/extraction/v1/schemas/Recipe.js.map +1 -0
  483. package/dist/extraction/v1/schemas/index.d.ts +13 -0
  484. package/dist/extraction/v1/schemas/index.d.ts.map +1 -0
  485. package/dist/extraction/v1/schemas/index.js +44 -0
  486. package/dist/extraction/v1/schemas/index.js.map +1 -0
  487. package/dist/extraction/v1/site-extractors.d.ts +5 -0
  488. package/dist/extraction/v1/site-extractors.d.ts.map +1 -0
  489. package/dist/extraction/v1/site-extractors.js +31 -0
  490. package/dist/extraction/v1/site-extractors.js.map +1 -0
  491. package/dist/fetch/action-executor.d.ts +28 -0
  492. package/dist/fetch/action-executor.d.ts.map +1 -0
  493. package/dist/fetch/action-executor.js +88 -0
  494. package/dist/fetch/action-executor.js.map +1 -0
  495. package/dist/fetch/auth.d.ts +2 -1
  496. package/dist/fetch/auth.d.ts.map +1 -1
  497. package/dist/fetch/auth.js +56 -26
  498. package/dist/fetch/auth.js.map +1 -1
  499. package/dist/fetch/browser-pool.d.ts +30 -11
  500. package/dist/fetch/browser-pool.d.ts.map +1 -1
  501. package/dist/fetch/browser-pool.js +303 -127
  502. package/dist/fetch/browser-pool.js.map +1 -1
  503. package/dist/fetch/browser-selector.d.ts +17 -0
  504. package/dist/fetch/browser-selector.d.ts.map +1 -0
  505. package/dist/fetch/browser-selector.js +72 -0
  506. package/dist/fetch/browser-selector.js.map +1 -0
  507. package/dist/fetch/browser-types.d.ts +3 -0
  508. package/dist/fetch/browser-types.d.ts.map +1 -0
  509. package/dist/fetch/browser-types.js +45 -0
  510. package/dist/fetch/browser-types.js.map +1 -0
  511. package/dist/fetch/cdp-client.d.ts +9 -0
  512. package/dist/fetch/cdp-client.d.ts.map +1 -0
  513. package/dist/fetch/cdp-client.js +89 -0
  514. package/dist/fetch/cdp-client.js.map +1 -0
  515. package/dist/fetch/content-check.js +39 -46
  516. package/dist/fetch/content-check.js.map +1 -1
  517. package/dist/fetch/http-client.d.ts +4 -0
  518. package/dist/fetch/http-client.d.ts.map +1 -1
  519. package/dist/fetch/http-client.js +147 -128
  520. package/dist/fetch/http-client.js.map +1 -1
  521. package/dist/fetch/lightpanda.d.ts +28 -0
  522. package/dist/fetch/lightpanda.d.ts.map +1 -0
  523. package/dist/fetch/lightpanda.js +174 -0
  524. package/dist/fetch/lightpanda.js.map +1 -0
  525. package/dist/fetch/playwright-tier.d.ts +19 -0
  526. package/dist/fetch/playwright-tier.d.ts.map +1 -0
  527. package/dist/fetch/playwright-tier.js +76 -0
  528. package/dist/fetch/playwright-tier.js.map +1 -0
  529. package/dist/fetch/router.d.ts +49 -3
  530. package/dist/fetch/router.d.ts.map +1 -1
  531. package/dist/fetch/router.js +185 -81
  532. package/dist/fetch/router.js.map +1 -1
  533. package/dist/index.js +97 -17
  534. package/dist/index.js.map +1 -1
  535. package/dist/instructions.d.ts +31 -0
  536. package/dist/instructions.d.ts.map +1 -0
  537. package/dist/instructions.js +245 -0
  538. package/dist/instructions.js.map +1 -0
  539. package/dist/integrations/cloud/llm/anthropic.d.ts +3 -0
  540. package/dist/integrations/cloud/llm/anthropic.d.ts.map +1 -0
  541. package/dist/integrations/cloud/llm/anthropic.js +41 -0
  542. package/dist/integrations/cloud/llm/anthropic.js.map +1 -0
  543. package/dist/integrations/cloud/llm/cache.d.ts +5 -0
  544. package/dist/integrations/cloud/llm/cache.d.ts.map +1 -0
  545. package/dist/integrations/cloud/llm/cache.js +49 -0
  546. package/dist/integrations/cloud/llm/cache.js.map +1 -0
  547. package/dist/integrations/cloud/llm/gemini.d.ts +3 -0
  548. package/dist/integrations/cloud/llm/gemini.d.ts.map +1 -0
  549. package/dist/integrations/cloud/llm/gemini.js +37 -0
  550. package/dist/integrations/cloud/llm/gemini.js.map +1 -0
  551. package/dist/integrations/cloud/llm/groq.d.ts +3 -0
  552. package/dist/integrations/cloud/llm/groq.d.ts.map +1 -0
  553. package/dist/integrations/cloud/llm/groq.js +74 -0
  554. package/dist/integrations/cloud/llm/groq.js.map +1 -0
  555. package/dist/integrations/cloud/llm/hash.d.ts +3 -0
  556. package/dist/integrations/cloud/llm/hash.d.ts.map +1 -0
  557. package/dist/integrations/cloud/llm/hash.js +26 -0
  558. package/dist/integrations/cloud/llm/hash.js.map +1 -0
  559. package/dist/integrations/cloud/llm/openai.d.ts +3 -0
  560. package/dist/integrations/cloud/llm/openai.d.ts.map +1 -0
  561. package/dist/integrations/cloud/llm/openai.js +43 -0
  562. package/dist/integrations/cloud/llm/openai.js.map +1 -0
  563. package/dist/integrations/cloud/llm/select.d.ts +5 -0
  564. package/dist/integrations/cloud/llm/select.d.ts.map +1 -0
  565. package/dist/integrations/cloud/llm/select.js +30 -0
  566. package/dist/integrations/cloud/llm/select.js.map +1 -0
  567. package/dist/integrations/cloud/llm/types.d.ts +24 -0
  568. package/dist/integrations/cloud/llm/types.d.ts.map +1 -0
  569. package/dist/integrations/cloud/llm/types.js +1 -0
  570. package/dist/integrations/cloud/llm/types.js.map +1 -0
  571. package/dist/integrations/cloud/llm/validate.d.ts +6 -0
  572. package/dist/integrations/cloud/llm/validate.d.ts.map +1 -0
  573. package/dist/integrations/cloud/llm/validate.js +63 -0
  574. package/dist/integrations/cloud/llm/validate.js.map +1 -0
  575. package/dist/logger.d.ts +4 -1
  576. package/dist/logger.d.ts.map +1 -1
  577. package/dist/logger.js +71 -30
  578. package/dist/logger.js.map +1 -1
  579. package/dist/pdf-parse.d.js +1 -0
  580. package/dist/pdf-parse.d.js.map +1 -0
  581. package/dist/plugins/loader.d.ts +20 -0
  582. package/dist/plugins/loader.d.ts.map +1 -0
  583. package/dist/plugins/loader.js +157 -0
  584. package/dist/plugins/loader.js.map +1 -0
  585. package/dist/plugins/registry.d.ts +26 -0
  586. package/dist/plugins/registry.d.ts.map +1 -0
  587. package/dist/plugins/registry.js +71 -0
  588. package/dist/plugins/registry.js.map +1 -0
  589. package/dist/plugins/validate.d.ts +9 -0
  590. package/dist/plugins/validate.d.ts.map +1 -0
  591. package/dist/plugins/validate.js +79 -0
  592. package/dist/plugins/validate.js.map +1 -0
  593. package/dist/providers/embed-provider.d.ts +11 -0
  594. package/dist/providers/embed-provider.d.ts.map +1 -0
  595. package/dist/providers/embed-provider.js +24 -0
  596. package/dist/providers/embed-provider.js.map +1 -0
  597. package/dist/providers/extract-provider.d.ts +23 -0
  598. package/dist/providers/extract-provider.d.ts.map +1 -0
  599. package/dist/providers/extract-provider.js +25 -0
  600. package/dist/providers/extract-provider.js.map +1 -0
  601. package/dist/providers/rerank-provider.d.ts +16 -0
  602. package/dist/providers/rerank-provider.d.ts.map +1 -0
  603. package/dist/providers/rerank-provider.js +28 -0
  604. package/dist/providers/rerank-provider.js.map +1 -0
  605. package/dist/providers/search-provider.d.ts +25 -0
  606. package/dist/providers/search-provider.d.ts.map +1 -0
  607. package/dist/providers/search-provider.js +44 -0
  608. package/dist/providers/search-provider.js.map +1 -0
  609. package/dist/providers/vector-store.d.ts +27 -0
  610. package/dist/providers/vector-store.d.ts.map +1 -0
  611. package/dist/providers/vector-store.js +27 -0
  612. package/dist/providers/vector-store.js.map +1 -0
  613. package/dist/python-env.d.ts +9 -0
  614. package/dist/python-env.d.ts.map +1 -0
  615. package/dist/python-env.js +13 -0
  616. package/dist/python-env.js.map +1 -0
  617. package/dist/repl/commands/agent.d.ts +5 -0
  618. package/dist/repl/commands/agent.d.ts.map +1 -0
  619. package/dist/repl/commands/agent.js +62 -0
  620. package/dist/repl/commands/agent.js.map +1 -0
  621. package/dist/repl/commands/cache.d.ts +4 -0
  622. package/dist/repl/commands/cache.d.ts.map +1 -0
  623. package/dist/repl/commands/cache.js +43 -0
  624. package/dist/repl/commands/cache.js.map +1 -0
  625. package/dist/repl/commands/crawl.d.ts +7 -0
  626. package/dist/repl/commands/crawl.d.ts.map +1 -0
  627. package/dist/repl/commands/crawl.js +44 -0
  628. package/dist/repl/commands/crawl.js.map +1 -0
  629. package/dist/repl/commands/extract.d.ts +5 -0
  630. package/dist/repl/commands/extract.d.ts.map +1 -0
  631. package/dist/repl/commands/extract.js +47 -0
  632. package/dist/repl/commands/extract.js.map +1 -0
  633. package/dist/repl/commands/fetch.d.ts +5 -0
  634. package/dist/repl/commands/fetch.d.ts.map +1 -0
  635. package/dist/repl/commands/fetch.js +67 -0
  636. package/dist/repl/commands/fetch.js.map +1 -0
  637. package/dist/repl/commands/find-similar.d.ts +5 -0
  638. package/dist/repl/commands/find-similar.d.ts.map +1 -0
  639. package/dist/repl/commands/find-similar.js +74 -0
  640. package/dist/repl/commands/find-similar.js.map +1 -0
  641. package/dist/repl/commands/research.d.ts +5 -0
  642. package/dist/repl/commands/research.d.ts.map +1 -0
  643. package/dist/repl/commands/research.js +65 -0
  644. package/dist/repl/commands/research.js.map +1 -0
  645. package/dist/repl/commands/search.d.ts +5 -0
  646. package/dist/repl/commands/search.d.ts.map +1 -0
  647. package/dist/repl/commands/search.js +74 -0
  648. package/dist/repl/commands/search.js.map +1 -0
  649. package/dist/repl/commands/types.d.ts +9 -0
  650. package/dist/repl/commands/types.d.ts.map +1 -0
  651. package/dist/repl/commands/types.js +1 -0
  652. package/dist/repl/commands/types.js.map +1 -0
  653. package/dist/repl/formatters.d.ts +13 -0
  654. package/dist/repl/formatters.d.ts.map +1 -0
  655. package/dist/repl/formatters.js +283 -0
  656. package/dist/repl/formatters.js.map +1 -0
  657. package/dist/repl/parser.d.ts +9 -0
  658. package/dist/repl/parser.d.ts.map +1 -0
  659. package/dist/repl/parser.js +86 -0
  660. package/dist/repl/parser.js.map +1 -0
  661. package/dist/repl/shell.d.ts +8 -0
  662. package/dist/repl/shell.d.ts.map +1 -0
  663. package/dist/repl/shell.js +184 -0
  664. package/dist/repl/shell.js.map +1 -0
  665. package/dist/research/branch-exploration.d.ts +14 -0
  666. package/dist/research/branch-exploration.d.ts.map +1 -0
  667. package/dist/research/branch-exploration.js +100 -0
  668. package/dist/research/branch-exploration.js.map +1 -0
  669. package/dist/research/brief.d.ts +5 -0
  670. package/dist/research/brief.d.ts.map +1 -0
  671. package/dist/research/brief.js +242 -0
  672. package/dist/research/brief.js.map +1 -0
  673. package/dist/research/citation-graph.d.ts +9 -0
  674. package/dist/research/citation-graph.d.ts.map +1 -0
  675. package/dist/research/citation-graph.js +114 -0
  676. package/dist/research/citation-graph.js.map +1 -0
  677. package/dist/research/decompose.d.ts +14 -0
  678. package/dist/research/decompose.d.ts.map +1 -0
  679. package/dist/research/decompose.js +439 -0
  680. package/dist/research/decompose.js.map +1 -0
  681. package/dist/research/pipeline.d.ts +5 -0
  682. package/dist/research/pipeline.d.ts.map +1 -0
  683. package/dist/research/pipeline.js +269 -0
  684. package/dist/research/pipeline.js.map +1 -0
  685. package/dist/research/synthesis-local.d.ts +16 -0
  686. package/dist/research/synthesis-local.d.ts.map +1 -0
  687. package/dist/research/synthesis-local.js +73 -0
  688. package/dist/research/synthesis-local.js.map +1 -0
  689. package/dist/research/synthesize.d.ts +10 -0
  690. package/dist/research/synthesize.d.ts.map +1 -0
  691. package/dist/research/synthesize.js +137 -0
  692. package/dist/research/synthesize.js.map +1 -0
  693. package/dist/search/answer-synthesis.d.ts +33 -0
  694. package/dist/search/answer-synthesis.d.ts.map +1 -0
  695. package/dist/search/answer-synthesis.js +244 -0
  696. package/dist/search/answer-synthesis.js.map +1 -0
  697. package/dist/search/context-formatter.d.ts +3 -0
  698. package/dist/search/context-formatter.d.ts.map +1 -0
  699. package/dist/search/context-formatter.js +56 -0
  700. package/dist/search/context-formatter.js.map +1 -0
  701. package/dist/search/dedup.d.ts +1 -0
  702. package/dist/search/dedup.d.ts.map +1 -1
  703. package/dist/search/dedup.js +40 -32
  704. package/dist/search/dedup.js.map +1 -1
  705. package/dist/search/engines/arxiv.d.ts +7 -0
  706. package/dist/search/engines/arxiv.d.ts.map +1 -0
  707. package/dist/search/engines/arxiv.js +70 -0
  708. package/dist/search/engines/arxiv.js.map +1 -0
  709. package/dist/search/engines/bing-news.d.ts +7 -0
  710. package/dist/search/engines/bing-news.d.ts.map +1 -0
  711. package/dist/search/engines/bing-news.js +97 -0
  712. package/dist/search/engines/bing-news.js.map +1 -0
  713. package/dist/search/engines/bing.d.ts +1 -0
  714. package/dist/search/engines/bing.d.ts.map +1 -1
  715. package/dist/search/engines/bing.js +100 -44
  716. package/dist/search/engines/bing.js.map +1 -1
  717. package/dist/search/engines/devdocs.d.ts +6 -0
  718. package/dist/search/engines/devdocs.d.ts.map +1 -0
  719. package/dist/search/engines/devdocs.js +56 -0
  720. package/dist/search/engines/devdocs.js.map +1 -0
  721. package/dist/search/engines/duckduckgo.d.ts.map +1 -1
  722. package/dist/search/engines/duckduckgo.js +56 -44
  723. package/dist/search/engines/duckduckgo.js.map +1 -1
  724. package/dist/search/engines/github-code.d.ts +7 -0
  725. package/dist/search/engines/github-code.d.ts.map +1 -0
  726. package/dist/search/engines/github-code.js +55 -0
  727. package/dist/search/engines/github-code.js.map +1 -0
  728. package/dist/search/engines/hn-algolia.d.ts +7 -0
  729. package/dist/search/engines/hn-algolia.d.ts.map +1 -0
  730. package/dist/search/engines/hn-algolia.js +76 -0
  731. package/dist/search/engines/hn-algolia.js.map +1 -0
  732. package/dist/search/engines/lobsters.d.ts +7 -0
  733. package/dist/search/engines/lobsters.d.ts.map +1 -0
  734. package/dist/search/engines/lobsters.js +83 -0
  735. package/dist/search/engines/lobsters.js.map +1 -0
  736. package/dist/search/engines/mdn.d.ts +7 -0
  737. package/dist/search/engines/mdn.d.ts.map +1 -0
  738. package/dist/search/engines/mdn.js +48 -0
  739. package/dist/search/engines/mdn.js.map +1 -0
  740. package/dist/search/engines/semantic-scholar.d.ts +7 -0
  741. package/dist/search/engines/semantic-scholar.d.ts.map +1 -0
  742. package/dist/search/engines/semantic-scholar.js +69 -0
  743. package/dist/search/engines/semantic-scholar.js.map +1 -0
  744. package/dist/search/engines/stackoverflow.d.ts +7 -0
  745. package/dist/search/engines/stackoverflow.d.ts.map +1 -0
  746. package/dist/search/engines/stackoverflow.js +73 -0
  747. package/dist/search/engines/stackoverflow.js.map +1 -0
  748. package/dist/search/engines/startpage.d.ts.map +1 -1
  749. package/dist/search/engines/startpage.js +65 -46
  750. package/dist/search/engines/startpage.js.map +1 -1
  751. package/dist/search/evidence.d.ts +25 -0
  752. package/dist/search/evidence.d.ts.map +1 -0
  753. package/dist/search/evidence.js +220 -0
  754. package/dist/search/evidence.js.map +1 -0
  755. package/dist/search/filters.js +49 -55
  756. package/dist/search/filters.js.map +1 -1
  757. package/dist/search/find-similar/crawl-rank.d.ts +9 -0
  758. package/dist/search/find-similar/crawl-rank.d.ts.map +1 -0
  759. package/dist/search/find-similar/crawl-rank.js +272 -0
  760. package/dist/search/find-similar/crawl-rank.js.map +1 -0
  761. package/dist/search/find-similar/mode.d.ts +4 -0
  762. package/dist/search/find-similar/mode.d.ts.map +1 -0
  763. package/dist/search/find-similar/mode.js +12 -0
  764. package/dist/search/find-similar/mode.js.map +1 -0
  765. package/dist/search/find-similar.d.ts +5 -0
  766. package/dist/search/find-similar.d.ts.map +1 -0
  767. package/dist/search/find-similar.js +509 -0
  768. package/dist/search/find-similar.js.map +1 -0
  769. package/dist/search/highlights.d.ts +19 -0
  770. package/dist/search/highlights.d.ts.map +1 -0
  771. package/dist/search/highlights.js +167 -0
  772. package/dist/search/highlights.js.map +1 -0
  773. package/dist/search/language-filter.d.ts +29 -0
  774. package/dist/search/language-filter.d.ts.map +1 -0
  775. package/dist/search/language-filter.js +126 -0
  776. package/dist/search/language-filter.js.map +1 -0
  777. package/dist/search/legacy/searxng-orchestrator.d.ts +4 -0
  778. package/dist/search/legacy/searxng-orchestrator.d.ts.map +1 -0
  779. package/dist/search/legacy/searxng-orchestrator.js +501 -0
  780. package/dist/search/legacy/searxng-orchestrator.js.map +1 -0
  781. package/dist/search/legacy/searxng-provider.d.ts +7 -0
  782. package/dist/search/legacy/searxng-provider.d.ts.map +1 -0
  783. package/dist/search/legacy/searxng-provider.js +11 -0
  784. package/dist/search/legacy/searxng-provider.js.map +1 -0
  785. package/dist/search/multi-query.d.ts +25 -0
  786. package/dist/search/multi-query.d.ts.map +1 -0
  787. package/dist/search/multi-query.js +228 -0
  788. package/dist/search/multi-query.js.map +1 -0
  789. package/dist/search/query.js +32 -34
  790. package/dist/search/query.js.map +1 -1
  791. package/dist/search/rerank.d.ts +3 -1
  792. package/dist/search/rerank.d.ts.map +1 -1
  793. package/dist/search/rerank.js +44 -35
  794. package/dist/search/rerank.js.map +1 -1
  795. package/dist/search/reranker/authority-boost.d.ts +3 -0
  796. package/dist/search/reranker/authority-boost.d.ts.map +1 -0
  797. package/dist/search/reranker/authority-boost.js +179 -0
  798. package/dist/search/reranker/authority-boost.js.map +1 -0
  799. package/dist/search/reranker/consensus-boost.d.ts +3 -0
  800. package/dist/search/reranker/consensus-boost.d.ts.map +1 -0
  801. package/dist/search/reranker/consensus-boost.js +27 -0
  802. package/dist/search/reranker/consensus-boost.js.map +1 -0
  803. package/dist/search/reranker/recency-boost.d.ts +3 -0
  804. package/dist/search/reranker/recency-boost.d.ts.map +1 -0
  805. package/dist/search/reranker/recency-boost.js +13 -0
  806. package/dist/search/reranker/recency-boost.js.map +1 -0
  807. package/dist/search/reranker/recency.d.ts +3 -0
  808. package/dist/search/reranker/recency.d.ts.map +1 -0
  809. package/dist/search/reranker/recency.js +23 -0
  810. package/dist/search/reranker/recency.js.map +1 -0
  811. package/dist/search/reranker/transformers-rerank-provider.d.ts +12 -0
  812. package/dist/search/reranker/transformers-rerank-provider.d.ts.map +1 -0
  813. package/dist/search/reranker/transformers-rerank-provider.js +78 -0
  814. package/dist/search/reranker/transformers-rerank-provider.js.map +1 -0
  815. package/dist/search/rrf.d.ts +17 -0
  816. package/dist/search/rrf.d.ts.map +1 -0
  817. package/dist/search/rrf.js +39 -0
  818. package/dist/search/rrf.js.map +1 -0
  819. package/dist/search/sampling.d.ts +25 -0
  820. package/dist/search/sampling.d.ts.map +1 -0
  821. package/dist/search/sampling.js +52 -0
  822. package/dist/search/sampling.js.map +1 -0
  823. package/dist/search/searxng.d.ts.map +1 -1
  824. package/dist/search/searxng.js +69 -79
  825. package/dist/search/searxng.js.map +1 -1
  826. package/dist/search/tokens.d.ts +3 -0
  827. package/dist/search/tokens.d.ts.map +1 -0
  828. package/dist/search/tokens.js +39 -0
  829. package/dist/search/tokens.js.map +1 -0
  830. package/dist/search/truncate.d.ts +6 -0
  831. package/dist/search/truncate.d.ts.map +1 -0
  832. package/dist/search/truncate.js +26 -0
  833. package/dist/search/truncate.js.map +1 -0
  834. package/dist/search/url-unwrap.d.ts +3 -0
  835. package/dist/search/url-unwrap.d.ts.map +1 -0
  836. package/dist/search/url-unwrap.js +43 -0
  837. package/dist/search/url-unwrap.js.map +1 -0
  838. package/dist/search/v1/context-rank.d.ts +13 -0
  839. package/dist/search/v1/context-rank.d.ts.map +1 -0
  840. package/dist/search/v1/context-rank.js +74 -0
  841. package/dist/search/v1/context-rank.js.map +1 -0
  842. package/dist/search/v1/engine-base.d.ts +27 -0
  843. package/dist/search/v1/engine-base.d.ts.map +1 -0
  844. package/dist/search/v1/engine-base.js +110 -0
  845. package/dist/search/v1/engine-base.js.map +1 -0
  846. package/dist/search/v1/intent-router.d.ts +22 -0
  847. package/dist/search/v1/intent-router.d.ts.map +1 -0
  848. package/dist/search/v1/intent-router.js +138 -0
  849. package/dist/search/v1/intent-router.js.map +1 -0
  850. package/dist/search/v1/orchestrator.d.ts +24 -0
  851. package/dist/search/v1/orchestrator.d.ts.map +1 -0
  852. package/dist/search/v1/orchestrator.js +163 -0
  853. package/dist/search/v1/orchestrator.js.map +1 -0
  854. package/dist/search/v1/recency-boost.d.ts +9 -0
  855. package/dist/search/v1/recency-boost.d.ts.map +1 -0
  856. package/dist/search/v1/recency-boost.js +37 -0
  857. package/dist/search/v1/recency-boost.js.map +1 -0
  858. package/dist/search/v1/recent-cache-dedup.d.ts +6 -0
  859. package/dist/search/v1/recent-cache-dedup.d.ts.map +1 -0
  860. package/dist/search/v1/recent-cache-dedup.js +85 -0
  861. package/dist/search/v1/recent-cache-dedup.js.map +1 -0
  862. package/dist/search/v1/rss/feed-config.d.ts +21 -0
  863. package/dist/search/v1/rss/feed-config.d.ts.map +1 -0
  864. package/dist/search/v1/rss/feed-config.js +90 -0
  865. package/dist/search/v1/rss/feed-config.js.map +1 -0
  866. package/dist/search/v1/rss/feed-parser.d.ts +14 -0
  867. package/dist/search/v1/rss/feed-parser.d.ts.map +1 -0
  868. package/dist/search/v1/rss/feed-parser.js +104 -0
  869. package/dist/search/v1/rss/feed-parser.js.map +1 -0
  870. package/dist/search/v1/rss/feed-poller.d.ts +22 -0
  871. package/dist/search/v1/rss/feed-poller.d.ts.map +1 -0
  872. package/dist/search/v1/rss/feed-poller.js +102 -0
  873. package/dist/search/v1/rss/feed-poller.js.map +1 -0
  874. package/dist/search/v1/rss/feed-store.d.ts +30 -0
  875. package/dist/search/v1/rss/feed-store.d.ts.map +1 -0
  876. package/dist/search/v1/rss/feed-store.js +134 -0
  877. package/dist/search/v1/rss/feed-store.js.map +1 -0
  878. package/dist/search/v1/rss/rss-engine.d.ts +6 -0
  879. package/dist/search/v1/rss/rss-engine.d.ts.map +1 -0
  880. package/dist/search/v1/rss/rss-engine.js +28 -0
  881. package/dist/search/v1/rss/rss-engine.js.map +1 -0
  882. package/dist/search/v1/v1-provider.d.ts +7 -0
  883. package/dist/search/v1/v1-provider.d.ts.map +1 -0
  884. package/dist/search/v1/v1-provider.js +68 -0
  885. package/dist/search/v1/v1-provider.js.map +1 -0
  886. package/dist/search/v1/verticals/code.d.ts +4 -0
  887. package/dist/search/v1/verticals/code.d.ts.map +1 -0
  888. package/dist/search/v1/verticals/code.js +20 -0
  889. package/dist/search/v1/verticals/code.js.map +1 -0
  890. package/dist/search/v1/verticals/docs.d.ts +4 -0
  891. package/dist/search/v1/verticals/docs.d.ts.map +1 -0
  892. package/dist/search/v1/verticals/docs.js +20 -0
  893. package/dist/search/v1/verticals/docs.js.map +1 -0
  894. package/dist/search/v1/verticals/general.d.ts +4 -0
  895. package/dist/search/v1/verticals/general.d.ts.map +1 -0
  896. package/dist/search/v1/verticals/general.js +22 -0
  897. package/dist/search/v1/verticals/general.js.map +1 -0
  898. package/dist/search/v1/verticals/news.d.ts +10 -0
  899. package/dist/search/v1/verticals/news.d.ts.map +1 -0
  900. package/dist/search/v1/verticals/news.js +52 -0
  901. package/dist/search/v1/verticals/news.js.map +1 -0
  902. package/dist/search/v1/verticals/papers.d.ts +4 -0
  903. package/dist/search/v1/verticals/papers.d.ts.map +1 -0
  904. package/dist/search/v1/verticals/papers.js +23 -0
  905. package/dist/search/v1/verticals/papers.js.map +1 -0
  906. package/dist/search/validator.js +31 -31
  907. package/dist/search/validator.js.map +1 -1
  908. package/dist/searxng/bootstrap.d.ts +30 -0
  909. package/dist/searxng/bootstrap.d.ts.map +1 -1
  910. package/dist/searxng/bootstrap.js +223 -85
  911. package/dist/searxng/bootstrap.js.map +1 -1
  912. package/dist/searxng/docker.d.ts.map +1 -1
  913. package/dist/searxng/docker.js +69 -60
  914. package/dist/searxng/docker.js.map +1 -1
  915. package/dist/searxng/process.d.ts +13 -1
  916. package/dist/searxng/process.d.ts.map +1 -1
  917. package/dist/searxng/process.js +231 -164
  918. package/dist/searxng/process.js.map +1 -1
  919. package/dist/server/backend-status.d.ts +13 -0
  920. package/dist/server/backend-status.d.ts.map +1 -0
  921. package/dist/server/backend-status.js +40 -0
  922. package/dist/server/backend-status.js.map +1 -0
  923. package/dist/server/tool-schemas.d.ts +549 -0
  924. package/dist/server/tool-schemas.d.ts.map +1 -0
  925. package/dist/server/tool-schemas.js +464 -0
  926. package/dist/server/tool-schemas.js.map +1 -0
  927. package/dist/server/warmup-on-start.d.ts +9 -0
  928. package/dist/server/warmup-on-start.d.ts.map +1 -0
  929. package/dist/server/warmup-on-start.js +55 -0
  930. package/dist/server/warmup-on-start.js.map +1 -0
  931. package/dist/server.d.ts +17 -0
  932. package/dist/server.d.ts.map +1 -1
  933. package/dist/server.js +454 -297
  934. package/dist/server.js.map +1 -1
  935. package/dist/tools/agent.d.ts +5 -0
  936. package/dist/tools/agent.d.ts.map +1 -0
  937. package/dist/tools/agent.js +128 -0
  938. package/dist/tools/agent.js.map +1 -0
  939. package/dist/tools/cache.d.ts +2 -1
  940. package/dist/tools/cache.d.ts.map +1 -1
  941. package/dist/tools/cache.js +175 -44
  942. package/dist/tools/cache.js.map +1 -1
  943. package/dist/tools/crawl.d.ts.map +1 -1
  944. package/dist/tools/crawl.js +171 -88
  945. package/dist/tools/crawl.js.map +1 -1
  946. package/dist/tools/extract.d.ts +2 -2
  947. package/dist/tools/extract.d.ts.map +1 -1
  948. package/dist/tools/extract.js +175 -59
  949. package/dist/tools/extract.js.map +1 -1
  950. package/dist/tools/fetch.d.ts +2 -2
  951. package/dist/tools/fetch.d.ts.map +1 -1
  952. package/dist/tools/fetch.js +161 -68
  953. package/dist/tools/fetch.js.map +1 -1
  954. package/dist/tools/find-similar.d.ts +5 -0
  955. package/dist/tools/find-similar.d.ts.map +1 -0
  956. package/dist/tools/find-similar.js +127 -0
  957. package/dist/tools/find-similar.js.map +1 -0
  958. package/dist/tools/research.d.ts +5 -0
  959. package/dist/tools/research.d.ts.map +1 -0
  960. package/dist/tools/research.js +107 -0
  961. package/dist/tools/research.js.map +1 -0
  962. package/dist/tools/search.d.ts +10 -2
  963. package/dist/tools/search.d.ts.map +1 -1
  964. package/dist/tools/search.js +13 -158
  965. package/dist/tools/search.js.map +1 -1
  966. package/dist/types.d.ts +350 -7
  967. package/dist/types.d.ts.map +1 -1
  968. package/dist/types.js +6 -1
  969. package/dist/types.js.map +1 -1
  970. package/dist/util/mode.d.ts +4 -0
  971. package/dist/util/mode.d.ts.map +1 -0
  972. package/dist/util/mode.js +34 -0
  973. package/dist/util/mode.js.map +1 -0
  974. package/package.json +78 -8
  975. package/dist/extraction/trafilatura.d.ts +0 -6
  976. package/dist/extraction/trafilatura.d.ts.map +0 -1
  977. package/dist/extraction/trafilatura.js +0 -105
  978. package/dist/extraction/trafilatura.js.map +0 -1
  979. package/dist/search/flashrank.d.ts +0 -12
  980. package/dist/search/flashrank.d.ts.map +0 -1
  981. package/dist/search/flashrank.js +0 -63
  982. package/dist/search/flashrank.js.map +0 -1
@@ -1,88 +1,79 @@
1
- import { parseHTML } from 'linkedom';
2
- import TurndownService from 'turndown';
3
- const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
1
+ import { parseHTML } from "linkedom";
2
+ import { htmlToMarkdown } from "../markdown.js";
4
3
  function parseVotes(el) {
5
- if (!el)
6
- return 0;
7
- const voteEl = el.querySelector('.js-vote-count');
8
- const val = voteEl?.getAttribute('data-value') ?? voteEl?.textContent?.trim() ?? '0';
9
- return parseInt(val, 10) || 0;
4
+ if (!el) return 0;
5
+ const voteEl = el.querySelector(".js-vote-count");
6
+ const val = voteEl?.getAttribute("data-value") ?? voteEl?.textContent?.trim() ?? "0";
7
+ return parseInt(val, 10) || 0;
10
8
  }
11
9
  function parseAnswers(document) {
12
- const answerEls = document.querySelectorAll('#answers .answer');
13
- const answers = [];
14
- for (const el of Array.from(answerEls)) {
15
- const accepted = el.classList.contains('accepted-answer');
16
- const votes = parseVotes(el);
17
- const bodyEl = el.querySelector('.s-prose, .js-post-body, .post-text');
18
- const bodyHtml = bodyEl ? bodyEl.innerHTML : '';
19
- answers.push({ accepted, votes, bodyHtml });
20
- }
21
- return answers;
10
+ const answerEls = document.querySelectorAll("#answers .answer");
11
+ const answers = [];
12
+ for (const el of Array.from(answerEls)) {
13
+ const accepted = el.classList.contains("accepted-answer");
14
+ const votes = parseVotes(el);
15
+ const bodyEl = el.querySelector(".s-prose, .js-post-body, .post-text");
16
+ const bodyHtml = bodyEl ? bodyEl.innerHTML : "";
17
+ answers.push({ accepted, votes, bodyHtml });
18
+ }
19
+ return answers;
22
20
  }
23
21
  function buildMarkdown(title, tags, votes, questionHtml, answers) {
24
- const tagLine = `Tags: ${tags.join(', ')} | Votes: ${votes}`;
25
- const questionMd = turndown.turndown(questionHtml).trim();
26
- const sections = [
27
- `# ${title}`,
28
- tagLine,
29
- '',
30
- questionMd,
31
- ];
32
- const accepted = answers.filter((a) => a.accepted);
33
- const others = answers.filter((a) => !a.accepted).sort((a, b) => b.votes - a.votes);
34
- const ordered = [...accepted, ...others];
35
- for (const answer of ordered) {
36
- const heading = answer.accepted
37
- ? `## Accepted Answer (Votes: ${answer.votes})`
38
- : `## Answer (Votes: ${answer.votes})`;
39
- const bodyMd = turndown.turndown(answer.bodyHtml).trim();
40
- sections.push('---', '', heading, '', bodyMd);
41
- }
42
- return sections.join('\n\n');
22
+ const tagLine = `Tags: ${tags.join(", ")} | Votes: ${votes}`;
23
+ const questionMd = htmlToMarkdown(questionHtml).trim();
24
+ const sections = [
25
+ `# ${title}`,
26
+ tagLine,
27
+ "",
28
+ questionMd
29
+ ];
30
+ const accepted = answers.filter((a) => a.accepted);
31
+ const others = answers.filter((a) => !a.accepted).sort((a, b) => b.votes - a.votes);
32
+ const ordered = [...accepted, ...others];
33
+ for (const answer of ordered) {
34
+ const heading = answer.accepted ? `## Accepted Answer (Votes: ${answer.votes})` : `## Answer (Votes: ${answer.votes})`;
35
+ const bodyMd = htmlToMarkdown(answer.bodyHtml).trim();
36
+ sections.push("---", "", heading, "", bodyMd);
37
+ }
38
+ return sections.join("\n\n");
43
39
  }
44
- export const stackoverflowExtractor = {
45
- name: 'stackoverflow',
46
- canHandle(url) {
47
- try {
48
- const hostname = new URL(url).hostname;
49
- return hostname === 'stackoverflow.com' ||
50
- hostname.endsWith('.stackoverflow.com') ||
51
- hostname === 'stackexchange.com' ||
52
- hostname.endsWith('.stackexchange.com');
53
- }
54
- catch {
55
- return false;
56
- }
57
- },
58
- extract(html, url) {
59
- if (!html)
60
- return null;
61
- const { document } = parseHTML(html);
62
- const titleEl = document.querySelector('.question-hyperlink');
63
- if (!titleEl)
64
- return null;
65
- const title = titleEl.textContent?.trim() ?? '';
66
- if (!title)
67
- return null;
68
- const questionBodyEl = document.querySelector('#question .s-prose, #question .js-post-body, #question .post-text');
69
- if (!questionBodyEl)
70
- return null;
71
- const questionHtml = questionBodyEl.innerHTML;
72
- const tagEls = document.querySelectorAll('.js-post-tag-list-wrapper .post-tag, .post-taglist .post-tag');
73
- const tags = Array.from(tagEls).map((el) => el.textContent?.trim() ?? '').filter(Boolean);
74
- const questionEl = document.querySelector('#question');
75
- const votes = parseVotes(questionEl);
76
- const answers = parseAnswers(document);
77
- const markdown = buildMarkdown(title, tags, votes, questionHtml, answers);
78
- return {
79
- title,
80
- markdown,
81
- metadata: {},
82
- links: [],
83
- images: [],
84
- extractor: 'site-specific',
85
- };
86
- },
40
+ const stackoverflowExtractor = {
41
+ name: "stackoverflow",
42
+ canHandle(url) {
43
+ try {
44
+ const hostname = new URL(url).hostname;
45
+ return hostname === "stackoverflow.com" || hostname.endsWith(".stackoverflow.com") || hostname === "stackexchange.com" || hostname.endsWith(".stackexchange.com");
46
+ } catch {
47
+ return false;
48
+ }
49
+ },
50
+ extract(html, _url) {
51
+ if (!html) return null;
52
+ const { document } = parseHTML(html);
53
+ const titleEl = document.querySelector(".question-hyperlink");
54
+ if (!titleEl) return null;
55
+ const title = titleEl.textContent?.trim() ?? "";
56
+ if (!title) return null;
57
+ const questionBodyEl = document.querySelector("#question .s-prose, #question .js-post-body, #question .post-text");
58
+ if (!questionBodyEl) return null;
59
+ const questionHtml = questionBodyEl.innerHTML;
60
+ const tagEls = document.querySelectorAll(".js-post-tag-list-wrapper .post-tag, .post-taglist .post-tag");
61
+ const tags = Array.from(tagEls).map((el) => el.textContent?.trim() ?? "").filter(Boolean);
62
+ const questionEl = document.querySelector("#question");
63
+ const votes = parseVotes(questionEl);
64
+ const answers = parseAnswers(document);
65
+ const markdown = buildMarkdown(title, tags, votes, questionHtml, answers);
66
+ return {
67
+ title,
68
+ markdown,
69
+ metadata: {},
70
+ links: [],
71
+ images: [],
72
+ extractor: "site-specific"
73
+ };
74
+ }
75
+ };
76
+ export {
77
+ stackoverflowExtractor
87
78
  };
88
79
  //# sourceMappingURL=stackoverflow.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"stackoverflow.js","sourceRoot":"","sources":["../../../src/extraction/site-extractors/stackoverflow.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AACrC,OAAO,eAAe,MAAM,UAAU,CAAC;AAGvC,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC,CAAC;AAQxF,SAAS,UAAU,CAAC,EAAkB;IACpC,IAAI,CAAC,EAAE;QAAE,OAAO,CAAC,CAAC;IAClB,MAAM,MAAM,GAAG,EAAE,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC;IAClD,MAAM,GAAG,GAAG,MAAM,EAAE,YAAY,CAAC,YAAY,CAAC,IAAI,MAAM,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,GAAG,CAAC;IACrF,OAAO,QAAQ,CAAC,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;AAChC,CAAC;AAED,SAAS,YAAY,CAAC,QAAkB;IACtC,MAAM,SAAS,GAAG,QAAQ,CAAC,gBAAgB,CAAC,kBAAkB,CAAC,CAAC;IAChE,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,KAAK,MAAM,EAAE,IAAI,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QACvC,MAAM,QAAQ,GAAG,EAAE,CAAC,SAAS,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;QAC1D,MAAM,KAAK,GAAG,UAAU,CAAC,EAAa,CAAC,CAAC;QACxC,MAAM,MAAM,GAAG,EAAE,CAAC,aAAa,CAAC,qCAAqC,CAAC,CAAC;QACvE,MAAM,QAAQ,GAAG,MAAM,CAAC,CAAC,CAAE,MAAkB,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7D,OAAO,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;IAC9C,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,aAAa,CACpB,KAAa,EACb,IAAc,EACd,KAAa,EACb,YAAoB,EACpB,OAAiB;IAEjB,MAAM,OAAO,GAAG,SAAS,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,KAAK,EAAE,CAAC;IAC7D,MAAM,UAAU,GAAG,QAAQ,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC,IAAI,EAAE,CAAC;IAE1D,MAAM,QAAQ,GAAa;QACzB,KAAK,KAAK,EAAE;QACZ,OAAO;QACP,EAAE;QACF,UAAU;KACX,CAAC;IAEF,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;IACnD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;IACpF,MAAM,OAAO,GAAG,CAAC,GAAG,QAAQ,EAAE,GAAG,MAAM,CAAC,CAAC;IAEzC,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,MAAM,OAAO,GAAG,MAAM,CAAC,QAAQ;YAC7B,CAAC,CAAC,8BAA8B,MAAM,CAAC,KAAK,GAAG;YAC/C,CAAC,CAAC,qBAAqB,MAAM,CAAC,KAAK,GAAG,CAAC;QACzC,MAAM,MAAM,GAAG,QAAQ,CAAC,QAAQ,CAAC
,MAAM,CAAC,QAAQ,CAAC,CAAC,IAAI,EAAE,CAAC;QACzD,QAAQ,CAAC,IAAI,CAAC,KAAK,EAAE,EAAE,EAAE,OAAO,EAAE,EAAE,EAAE,MAAM,CAAC,CAAC;IAChD,CAAC;IAED,OAAO,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC/B,CAAC;AAED,MAAM,CAAC,MAAM,sBAAsB,GAAc;IAC/C,IAAI,EAAE,eAAe;IAErB,SAAS,CAAC,GAAW;QACnB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACvC,OAAO,QAAQ,KAAK,mBAAmB;gBACrC,QAAQ,CAAC,QAAQ,CAAC,oBAAoB,CAAC;gBACvC,QAAQ,KAAK,mBAAmB;gBAChC,QAAQ,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC;QAC5C,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED,OAAO,CAAC,IAAY,EAAE,GAAW;QAC/B,IAAI,CAAC,IAAI;YAAE,OAAO,IAAI,CAAC;QAEvB,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAErC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,qBAAqB,CAAC,CAAC;QAC9D,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAE1B,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAChD,IAAI,CAAC,KAAK;YAAE,OAAO,IAAI,CAAC;QAExB,MAAM,cAAc,GAAG,QAAQ,CAAC,aAAa,CAAC,mEAAmE,CAAC,CAAC;QACnH,IAAI,CAAC,cAAc;YAAE,OAAO,IAAI,CAAC;QAEjC,MAAM,YAAY,GAAI,cAA0B,CAAC,SAAS,CAAC;QAE3D,MAAM,MAAM,GAAG,QAAQ,CAAC,gBAAgB,CAAC,8DAA8D,CAAC,CAAC;QACzG,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAE1F,MAAM,UAAU,GAAG,QAAQ,CAAC,aAAa,CAAC,WAAW,CAAC,CAAC;QACvD,MAAM,KAAK,GAAG,UAAU,CAAC,UAA4B,CAAC,CAAC;QAEvD,MAAM,OAAO,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;QAEvC,MAAM,QAAQ,GAAG,aAAa,CAAC,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,YAAY,EAAE,OAAO,CAAC,CAAC;QAE1E,OAAO;YACL,KAAK;YACL,QAAQ;YACR,QAAQ,EAAE,EAAE;YACZ,KAAK,EAAE,EAAE;YACT,MAAM,EAAE,EAAE;YACV,SAAS,EAAE,eAAe;SAC3B,CAAC;IACJ,CAAC;CACF,CAAC"}
1
+ {"version":3,"sources":["../../../src/extraction/site-extractors/stackoverflow.ts"],"sourcesContent":["import { parseHTML } from 'linkedom';\nimport { htmlToMarkdown } from '../markdown.js';\nimport type { Extractor, ExtractionResult } from '../../types.js';\n\ninterface Answer {\n accepted: boolean;\n votes: number;\n bodyHtml: string;\n}\n\nfunction parseVotes(el: Element | null): number {\n if (!el) return 0;\n const voteEl = el.querySelector('.js-vote-count');\n const val = voteEl?.getAttribute('data-value') ?? voteEl?.textContent?.trim() ?? '0';\n return parseInt(val, 10) || 0;\n}\n\nfunction parseAnswers(document: Document): Answer[] {\n const answerEls = document.querySelectorAll('#answers .answer');\n const answers: Answer[] = [];\n\n for (const el of Array.from(answerEls)) {\n const accepted = el.classList.contains('accepted-answer');\n const votes = parseVotes(el as Element);\n const bodyEl = el.querySelector('.s-prose, .js-post-body, .post-text');\n const bodyHtml = bodyEl ? (bodyEl as Element).innerHTML : '';\n answers.push({ accepted, votes, bodyHtml });\n }\n\n return answers;\n}\n\nfunction buildMarkdown(\n title: string,\n tags: string[],\n votes: number,\n questionHtml: string,\n answers: Answer[],\n): string {\n const tagLine = `Tags: ${tags.join(', ')} | Votes: ${votes}`;\n const questionMd = htmlToMarkdown(questionHtml).trim();\n\n const sections: string[] = [\n `# ${title}`,\n tagLine,\n '',\n questionMd,\n ];\n\n const accepted = answers.filter((a) => a.accepted);\n const others = answers.filter((a) => !a.accepted).sort((a, b) => b.votes - a.votes);\n const ordered = [...accepted, ...others];\n\n for (const answer of ordered) {\n const heading = answer.accepted\n ? 
`## Accepted Answer (Votes: ${answer.votes})`\n : `## Answer (Votes: ${answer.votes})`;\n const bodyMd = htmlToMarkdown(answer.bodyHtml).trim();\n sections.push('---', '', heading, '', bodyMd);\n }\n\n return sections.join('\\n\\n');\n}\n\nexport const stackoverflowExtractor: Extractor = {\n name: 'stackoverflow',\n\n canHandle(url: string): boolean {\n try {\n const hostname = new URL(url).hostname;\n return hostname === 'stackoverflow.com' ||\n hostname.endsWith('.stackoverflow.com') ||\n hostname === 'stackexchange.com' ||\n hostname.endsWith('.stackexchange.com');\n } catch {\n return false;\n }\n },\n\n extract(html: string, _url: string): ExtractionResult | null {\n if (!html) return null;\n\n const { document } = parseHTML(html);\n\n const titleEl = document.querySelector('.question-hyperlink');\n if (!titleEl) return null;\n\n const title = titleEl.textContent?.trim() ?? '';\n if (!title) return null;\n\n const questionBodyEl = document.querySelector('#question .s-prose, #question .js-post-body, #question .post-text');\n if (!questionBodyEl) return null;\n\n const questionHtml = (questionBodyEl as Element).innerHTML;\n\n const tagEls = document.querySelectorAll('.js-post-tag-list-wrapper .post-tag, .post-taglist .post-tag');\n const tags = Array.from(tagEls).map((el) => el.textContent?.trim() ?? 
'').filter(Boolean);\n\n const questionEl = document.querySelector('#question');\n const votes = parseVotes(questionEl as Element | null);\n\n const answers = parseAnswers(document);\n\n const markdown = buildMarkdown(title, tags, votes, questionHtml, answers);\n\n return {\n title,\n markdown,\n metadata: {},\n links: [],\n images: [],\n extractor: 'site-specific',\n };\n },\n};\n"],"mappings":"AAAA,SAAS,iBAAiB;AAC1B,SAAS,sBAAsB;AAS/B,SAAS,WAAW,IAA4B;AAC9C,MAAI,CAAC,GAAI,QAAO;AAChB,QAAM,SAAS,GAAG,cAAc,gBAAgB;AAChD,QAAM,MAAM,QAAQ,aAAa,YAAY,KAAK,QAAQ,aAAa,KAAK,KAAK;AACjF,SAAO,SAAS,KAAK,EAAE,KAAK;AAC9B;AAEA,SAAS,aAAa,UAA8B;AAClD,QAAM,YAAY,SAAS,iBAAiB,kBAAkB;AAC9D,QAAM,UAAoB,CAAC;AAE3B,aAAW,MAAM,MAAM,KAAK,SAAS,GAAG;AACtC,UAAM,WAAW,GAAG,UAAU,SAAS,iBAAiB;AACxD,UAAM,QAAQ,WAAW,EAAa;AACtC,UAAM,SAAS,GAAG,cAAc,qCAAqC;AACrE,UAAM,WAAW,SAAU,OAAmB,YAAY;AAC1D,YAAQ,KAAK,EAAE,UAAU,OAAO,SAAS,CAAC;AAAA,EAC5C;AAEA,SAAO;AACT;AAEA,SAAS,cACP,OACA,MACA,OACA,cACA,SACQ;AACR,QAAM,UAAU,SAAS,KAAK,KAAK,IAAI,CAAC,aAAa,KAAK;AAC1D,QAAM,aAAa,eAAe,YAAY,EAAE,KAAK;AAErD,QAAM,WAAqB;AAAA,IACzB,KAAK,KAAK;AAAA,IACV;AAAA,IACA;AAAA,IACA;AAAA,EACF;AAEA,QAAM,WAAW,QAAQ,OAAO,CAAC,MAAM,EAAE,QAAQ;AACjD,QAAM,SAAS,QAAQ,OAAO,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,QAAQ,EAAE,KAAK;AAClF,QAAM,UAAU,CAAC,GAAG,UAAU,GAAG,MAAM;AAEvC,aAAW,UAAU,SAAS;AAC5B,UAAM,UAAU,OAAO,WACnB,8BAA8B,OAAO,KAAK,MAC1C,qBAAqB,OAAO,KAAK;AACrC,UAAM,SAAS,eAAe,OAAO,QAAQ,EAAE,KAAK;AACpD,aAAS,KAAK,OAAO,IAAI,SAAS,IAAI,MAAM;AAAA,EAC9C;AAEA,SAAO,SAAS,KAAK,MAAM;AAC7B;AAEO,MAAM,yBAAoC;AAAA,EAC/C,MAAM;AAAA,EAEN,UAAU,KAAsB;AAC9B,QAAI;AACF,YAAM,WAAW,IAAI,IAAI,GAAG,EAAE;AAC9B,aAAO,aAAa,uBAClB,SAAS,SAAS,oBAAoB,KACtC,aAAa,uBACb,SAAS,SAAS,oBAAoB;AAAA,IAC1C,QAAQ;AACN,aAAO;AAAA,IACT;AAAA,EACF;AAAA,EAEA,QAAQ,MAAc,MAAuC;AAC3D,QAAI,CAAC,KAAM,QAAO;AAElB,UAAM,EAAE,SAAS,IAAI,UAAU,IAAI;AAEnC,UAAM,UAAU,SAAS,cAAc,qBAAqB;AAC5D,QAAI,CAAC,QAAS,QAAO;AAErB,UAAM,QAAQ,QAAQ,aAAa,KAAK,KAAK;AAC7C,QAAI,CAAC,MAAO,QAAO;AAEnB,UAAM,iBAAiB,SAAS,cAAc,mEAAmE;AACjH,QAAI,CAAC,eAAg
B,QAAO;AAE5B,UAAM,eAAgB,eAA2B;AAEjD,UAAM,SAAS,SAAS,iBAAiB,8DAA8D;AACvG,UAAM,OAAO,MAAM,KAAK,MAAM,EAAE,IAAI,CAAC,OAAO,GAAG,aAAa,KAAK,KAAK,EAAE,EAAE,OAAO,OAAO;AAExF,UAAM,aAAa,SAAS,cAAc,WAAW;AACrD,UAAM,QAAQ,WAAW,UAA4B;AAErD,UAAM,UAAU,aAAa,QAAQ;AAErC,UAAM,WAAW,cAAc,OAAO,MAAM,OAAO,cAAc,OAAO;AAExE,WAAO;AAAA,MACL;AAAA,MACA;AAAA,MACA,UAAU,CAAC;AAAA,MACX,OAAO,CAAC;AAAA,MACR,QAAQ,CAAC;AAAA,MACT,WAAW;AAAA,IACb;AAAA,EACF;AACF;","names":[]}
@@ -0,0 +1,4 @@
1
+ import type { StructuredDataResult } from '../types.js';
2
+ export declare function extractStructuredData(html: string): StructuredDataResult[];
3
+ export declare const KNOWN_SCHEMA_TYPES: ReadonlySet<string>;
4
+ //# sourceMappingURL=structured-data.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"structured-data.d.ts","sourceRoot":"","sources":["../../src/extraction/structured-data.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,aAAa,CAAC;AAaxD,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,GAAG,oBAAoB,EAAE,CAQ1E;AA8KD,eAAO,MAAM,kBAAkB,EAAE,WAAW,CAAC,MAAM,CAAe,CAAC"}
@@ -0,0 +1,173 @@
1
+ import { parseHTML } from "linkedom";
2
+ import { createLogger } from "../logger.js";
3
+ const log = createLogger("structured-data");
4
+ const KNOWN_TYPES = /* @__PURE__ */ new Set([
5
+ "Article",
6
+ "Product",
7
+ "Recipe",
8
+ "BreadcrumbList",
9
+ "Organization",
10
+ "Person"
11
+ ]);
12
/**
 * Collects every structured-data record found in an HTML string.
 *
 * Results are concatenated in a fixed order: JSON-LD blocks first, then
 * microdata items, then RDFa items. Falsy input yields an empty array
 * without parsing anything.
 *
 * @param html Raw HTML markup.
 * @returns Array of `{ provenance, type, fields }` records.
 */
function extractStructuredData(html) {
  if (!html) return [];
  const { document } = parseHTML(html);
  return [
    ...extractJsonLdBlocks(document),
    ...extractMicrodataBlocks(document),
    ...extractRdfaBlocks(document)
  ];
}
21
/**
 * Reads every `<script type="application/ld+json">` element and turns each
 * typed JSON-LD node into a record.
 *
 * Blocks that fail `JSON.parse` are logged with `log.warn` and skipped, so a
 * single malformed block does not discard the rest of the page. Nodes without
 * a usable `@type` are dropped. Keys starting with `@` are JSON-LD keywords
 * and are not copied into `fields`.
 *
 * @param doc Parsed document exposing `querySelectorAll`.
 * @returns Array of `{ provenance: "json-ld", type, fields }` records.
 */
function extractJsonLdBlocks(doc) {
  const out = [];
  for (const script of doc.querySelectorAll('script[type="application/ld+json"]')) {
    const text = script.textContent?.trim();
    if (!text) continue;
    let parsed;
    try {
      parsed = JSON.parse(text);
    } catch (err) {
      log.warn("Failed to parse JSON-LD block", { error: String(err) });
      continue;
    }
    for (const node of flattenJsonLd(parsed)) {
      const type = normalizeType(node["@type"]);
      if (!type) continue;
      // Object.fromEntries defines own data properties, so a page-supplied
      // "__proto__" key is kept as plain data. Plain assignment would instead
      // invoke the prototype setter and replace the record's prototype.
      const fields = Object.fromEntries(
        Object.entries(node).filter(([key]) => !key.startsWith("@"))
      );
      out.push({ provenance: "json-ld", type, fields });
    }
  }
  return out;
}
47
/**
 * Flattens a parsed JSON-LD value into a flat list of node objects.
 *
 * Arrays are flattened recursively. An object whose `@graph` is an array is
 * replaced by the flattened contents of that graph. Any other object is
 * returned as a single-element list. Non-object values produce no nodes.
 *
 * @param value Any value produced by `JSON.parse`.
 * @returns Flat array of node objects.
 */
function flattenJsonLd(value) {
  if (Array.isArray(value)) {
    return value.flatMap((entry) => flattenJsonLd(entry));
  }
  if (value === null || typeof value !== "object") {
    return [];
  }
  const graph = value["@graph"];
  return Array.isArray(graph)
    ? graph.flatMap((entry) => flattenJsonLd(entry))
    : [value];
}
54
/**
 * Reduces a JSON-LD `@type` value to a short type name.
 *
 * For a string, the text after the last `/` is returned (the whole string
 * when it contains no `/`); an empty result becomes `null`. For an array,
 * the first element that normalizes to a non-empty name wins. Anything else
 * yields `null`.
 *
 * @param raw The raw `@type` value.
 * @returns The short type name, or `null` when none can be derived.
 */
function normalizeType(raw) {
  if (Array.isArray(raw)) {
    for (const candidate of raw) {
      const resolved = normalizeType(candidate);
      if (resolved) return resolved;
    }
    return null;
  }
  if (typeof raw !== "string") return null;
  const segments = raw.split("/");
  return segments[segments.length - 1] || null;
}
67
/**
 * Builds one record per top-level microdata item in the document.
 *
 * Only `[itemscope]` elements without an `[itemscope]` ancestor are treated
 * as top-level. Items for which `readMicrodataNode` returns `null` are
 * omitted.
 *
 * @param doc Parsed document exposing `querySelectorAll`.
 * @returns Array of microdata records in document order.
 */
function extractMicrodataBlocks(doc) {
  const records = [];
  for (const candidate of Array.from(doc.querySelectorAll("[itemscope]"))) {
    if (hasItemscopeAncestor(candidate)) continue;
    const record = readMicrodataNode(candidate);
    if (record) records.push(record);
  }
  return records;
}
78
+ function hasItemscopeAncestor(el) {
79
+ let cur = el.parentElement;
80
+ while (cur) {
81
+ if (cur.hasAttribute("itemscope")) return true;
82
+ cur = cur.parentElement;
83
+ }
84
+ return false;
85
+ }
86
/**
 * Converts one `[itemscope]` element into a microdata record.
 *
 * The type is the text after the last `/` of the `itemtype` attribute. When
 * that attribute is missing or produces an empty name, `null` is returned
 * and no properties are collected.
 *
 * @param el The item's root element.
 * @returns `{ provenance: "microdata", type, fields }` or `null`.
 */
function readMicrodataNode(el) {
  const itemtype = el.getAttribute("itemtype");
  const type = itemtype ? itemtype.split("/").pop() : "";
  if (!type) return null;

  const fields = {};
  collectItemprops(el, fields);
  return { provenance: "microdata", type, fields };
}
94
/**
 * Walks the descendants of `root` breadth-first and merges every `itemprop`
 * value into `target`.
 *
 * - `itemprop` is treated as a whitespace-separated list of property names;
 *   the element's value is merged under each distinct name.
 * - An element with `itemscope` is a nested item: when it also has an
 *   `itemprop`, its own properties are collected into a fresh object used as
 *   the value. The walk never descends into it from the parent, so its
 *   properties cannot leak into `target`.
 * - Otherwise the value is the first present of the `content`, `href` and
 *   `src` attributes, falling back to the trimmed text content.
 *
 * @param root Element whose descendants are scanned (root itself is not).
 * @param target Plain object receiving the properties; mutated in place.
 */
function collectItemprops(root, target) {
  // Index cursor instead of shift(): same visiting order, no re-indexing cost.
  const queue = Array.from(root.children);
  for (let i = 0; i < queue.length; i++) {
    const el = queue[i];
    const isItem = el.hasAttribute("itemscope");
    const names = (el.getAttribute("itemprop") ?? "").split(/\s+/).filter(Boolean);
    if (names.length > 0) {
      let value;
      if (isItem) {
        const nested = {};
        collectItemprops(el, nested);
        value = nested;
      } else {
        value =
          el.getAttribute("content") ??
          el.getAttribute("href") ??
          el.getAttribute("src") ??
          (el.textContent ?? "").trim();
      }
      // A Set avoids merging the same value twice for itemprop="a a".
      for (const name of new Set(names)) mergeProp(target, name, value);
    }
    if (isItem) continue;
    for (const child of el.children) queue.push(child);
  }
}
114
+ function mergeProp(target, prop, value) {
115
+ if (target[prop] === void 0) {
116
+ target[prop] = value;
117
+ return;
118
+ }
119
+ if (Array.isArray(target[prop])) {
120
+ target[prop].push(value);
121
+ return;
122
+ }
123
+ target[prop] = [target[prop], value];
124
+ }
125
/**
 * Builds one record per top-level RDFa item in the document.
 *
 * Only `[typeof]` elements without a `[typeof]` ancestor are treated as
 * top-level. The record type is taken from the first whitespace-separated
 * token of the `typeof` attribute, reduced to the text after its last `:` or
 * `/`. Elements that yield an empty type are skipped.
 *
 * @param doc Parsed document exposing `querySelectorAll`.
 * @returns Array of `{ provenance: "rdfa", type, fields }` records.
 */
function extractRdfaBlocks(doc) {
  const out = [];
  for (const el of Array.from(doc.querySelectorAll("[typeof]"))) {
    if (hasTypeofAncestor(el)) continue;
    // Trim first: without it, leading whitespace makes the first token ""
    // and an otherwise valid item would be dropped.
    const firstToken = (el.getAttribute("typeof") ?? "").trim().split(/\s+/)[0] ?? "";
    const type = firstToken.split(/[:/]/).pop() ?? "";
    if (!type) continue;
    const fields = {};
    collectRdfaProps(el, fields);
    out.push({ provenance: "rdfa", type, fields });
  }
  return out;
}
139
+ function hasTypeofAncestor(el) {
140
+ let cur = el.parentElement;
141
+ while (cur) {
142
+ if (cur.hasAttribute("typeof")) return true;
143
+ cur = cur.parentElement;
144
+ }
145
+ return false;
146
+ }
147
/**
 * Walks the descendants of `root` breadth-first and merges every RDFa
 * `property` value into `target`.
 *
 * - `property` is treated as a whitespace-separated list of terms. Each term
 *   is reduced to the text after its last `:` or `/`; when that is empty
 *   (e.g. a term ending in `/`), the full term is used instead. The value is
 *   merged once under each distinct resulting name.
 * - An element with `typeof` is a nested item: when it also has a `property`,
 *   its own properties are collected into a fresh object used as the value.
 *   The walk never descends into it from the parent.
 * - Otherwise the value is the first present of the `content`, `href` and
 *   `resource` attributes, falling back to the trimmed text content.
 *
 * @param root Element whose descendants are scanned (root itself is not).
 * @param target Plain object receiving the properties; mutated in place.
 */
function collectRdfaProps(root, target) {
  // Index cursor instead of shift(): same visiting order, no re-indexing cost.
  const queue = Array.from(root.children);
  for (let i = 0; i < queue.length; i++) {
    const el = queue[i];
    const isTyped = el.hasAttribute("typeof");
    const terms = (el.getAttribute("property") ?? "").split(/\s+/).filter(Boolean);
    if (terms.length > 0) {
      let value;
      if (isTyped) {
        const nested = {};
        collectRdfaProps(el, nested);
        value = nested;
      } else {
        value =
          el.getAttribute("content") ??
          el.getAttribute("href") ??
          el.getAttribute("resource") ??
          (el.textContent ?? "").trim();
      }
      // `||` rather than `??`: pop() yields "" (not undefined) for a term
      // that ends in a separator.
      const names = new Set(terms.map((term) => term.split(/[:/]/).pop() || term));
      for (const name of names) mergeProp(target, name, value);
    }
    if (isTyped) continue;
    for (const child of el.children) queue.push(child);
  }
}
168
+ const KNOWN_SCHEMA_TYPES = KNOWN_TYPES;
169
+ export {
170
+ KNOWN_SCHEMA_TYPES,
171
+ extractStructuredData
172
+ };
173
+ //# sourceMappingURL=structured-data.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/extraction/structured-data.ts"],"sourcesContent":["import { parseHTML } from 'linkedom';\nimport { createLogger } from '../logger.js';\nimport type { StructuredDataResult } from '../types.js';\n\nconst log = createLogger('structured-data');\n\nconst KNOWN_TYPES = new Set([\n 'Article',\n 'Product',\n 'Recipe',\n 'BreadcrumbList',\n 'Organization',\n 'Person',\n]);\n\nexport function extractStructuredData(html: string): StructuredDataResult[] {\n if (!html) return [];\n const { document: doc } = parseHTML(html);\n const out: StructuredDataResult[] = [];\n out.push(...extractJsonLdBlocks(doc));\n out.push(...extractMicrodataBlocks(doc));\n out.push(...extractRdfaBlocks(doc));\n return out;\n}\n\nfunction extractJsonLdBlocks(doc: Document): StructuredDataResult[] {\n const out: StructuredDataResult[] = [];\n const scripts = doc.querySelectorAll('script[type=\"application/ld+json\"]');\n for (const script of scripts) {\n const text = script.textContent?.trim();\n if (!text) continue;\n let parsed: unknown;\n try {\n parsed = JSON.parse(text);\n } catch (err) {\n log.warn('Failed to parse JSON-LD block', { error: String(err) });\n continue;\n }\n for (const node of flattenJsonLd(parsed)) {\n const type = normalizeType(node['@type']);\n if (!type) continue;\n const fields: Record<string, unknown> = {};\n for (const [k, v] of Object.entries(node)) {\n if (k.startsWith('@')) continue;\n fields[k] = v;\n }\n out.push({ provenance: 'json-ld', type, fields });\n }\n }\n return out;\n}\n\nfunction flattenJsonLd(value: unknown): Record<string, unknown>[] {\n if (!value || typeof value !== 'object') return [];\n if (Array.isArray(value)) return value.flatMap(flattenJsonLd);\n const obj = value as Record<string, unknown>;\n if (Array.isArray(obj['@graph'])) return obj['@graph'].flatMap(flattenJsonLd);\n return [obj];\n}\n\nfunction normalizeType(raw: unknown): string | null {\n if (typeof raw === 'string') {\n const tail = 
raw.split('/').pop()!;\n return tail || null;\n }\n if (Array.isArray(raw)) {\n for (const t of raw) {\n const norm = normalizeType(t);\n if (norm) return norm;\n }\n }\n return null;\n}\n\nfunction extractMicrodataBlocks(doc: Document): StructuredDataResult[] {\n const out: StructuredDataResult[] = [];\n // Top-level itemscopes only — nested itemscopes are walked into as fields.\n const all = Array.from(doc.querySelectorAll('[itemscope]'));\n const tops = all.filter((el) => !hasItemscopeAncestor(el));\n for (const el of tops) {\n const node = readMicrodataNode(el);\n if (!node) continue;\n out.push(node);\n }\n return out;\n}\n\nfunction hasItemscopeAncestor(el: Element): boolean {\n let cur = el.parentElement;\n while (cur) {\n if (cur.hasAttribute('itemscope')) return true;\n cur = cur.parentElement;\n }\n return false;\n}\n\nfunction readMicrodataNode(el: Element): StructuredDataResult | null {\n const itemtype = el.getAttribute('itemtype') ?? '';\n const type = itemtype ? itemtype.split('/').pop()! : '';\n if (!type) return null;\n const fields: Record<string, unknown> = {};\n // Walk descendants but stop crossing into nested itemscopes (handle them as nested objects)\n collectItemprops(el, fields);\n return { provenance: 'microdata', type, fields };\n}\n\nfunction collectItemprops(root: Element, target: Record<string, unknown>): void {\n const stack: Element[] = Array.from(root.children);\n while (stack.length) {\n const el = stack.shift()!;\n const prop = el.getAttribute('itemprop');\n if (prop) {\n let value: unknown;\n if (el.hasAttribute('itemscope')) {\n const nested: Record<string, unknown> = {};\n collectItemprops(el, nested);\n value = nested;\n } else {\n value =\n el.getAttribute('content') ??\n el.getAttribute('href') ??\n el.getAttribute('src') ??\n (el.textContent ?? '').trim();\n }\n mergeProp(target, prop, value);\n }\n // Always stop at any itemscope: it is an independent item, regardless of\n // whether it carries an itemprop. 
Otherwise its descendants' itemprops\n // would leak into the parent record.\n if (el.hasAttribute('itemscope')) continue;\n for (const c of el.children) stack.push(c);\n }\n}\n\nfunction mergeProp(target: Record<string, unknown>, prop: string, value: unknown): void {\n if (target[prop] === undefined) {\n target[prop] = value;\n return;\n }\n if (Array.isArray(target[prop])) {\n (target[prop] as unknown[]).push(value);\n return;\n }\n target[prop] = [target[prop], value];\n}\n\nfunction extractRdfaBlocks(doc: Document): StructuredDataResult[] {\n const out: StructuredDataResult[] = [];\n const all = Array.from(doc.querySelectorAll('[typeof]'));\n const tops = all.filter((el) => !hasTypeofAncestor(el));\n for (const el of tops) {\n const typeAttr = el.getAttribute('typeof') ?? '';\n const type = typeAttr.split(/\\s+/)[0]?.split(/[:/]/).pop() ?? '';\n if (!type) continue;\n const fields: Record<string, unknown> = {};\n collectRdfaProps(el, fields);\n out.push({ provenance: 'rdfa', type, fields });\n }\n return out;\n}\n\nfunction hasTypeofAncestor(el: Element): boolean {\n let cur = el.parentElement;\n while (cur) {\n if (cur.hasAttribute('typeof')) return true;\n cur = cur.parentElement;\n }\n return false;\n}\n\nfunction collectRdfaProps(root: Element, target: Record<string, unknown>): void {\n const stack: Element[] = Array.from(root.children);\n while (stack.length) {\n const el = stack.shift()!;\n const prop = el.getAttribute('property');\n if (prop) {\n const propName = prop.split(/[:/]/).pop() ?? prop;\n let value: unknown;\n if (el.hasAttribute('typeof')) {\n const nested: Record<string, unknown> = {};\n collectRdfaProps(el, nested);\n value = nested;\n } else {\n value =\n el.getAttribute('content') ??\n el.getAttribute('href') ??\n el.getAttribute('resource') ??\n (el.textContent ?? 
'').trim();\n }\n mergeProp(target, propName, value);\n }\n // Always stop at any nested typeof, regardless of property — independent item.\n if (el.hasAttribute('typeof')) continue;\n for (const c of el.children) stack.push(c);\n }\n}\n\nexport const KNOWN_SCHEMA_TYPES: ReadonlySet<string> = KNOWN_TYPES;\n"],"mappings":"AAAA,SAAS,iBAAiB;AAC1B,SAAS,oBAAoB;AAG7B,MAAM,MAAM,aAAa,iBAAiB;AAE1C,MAAM,cAAc,oBAAI,IAAI;AAAA,EAC1B;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAEM,SAAS,sBAAsB,MAAsC;AAC1E,MAAI,CAAC,KAAM,QAAO,CAAC;AACnB,QAAM,EAAE,UAAU,IAAI,IAAI,UAAU,IAAI;AACxC,QAAM,MAA8B,CAAC;AACrC,MAAI,KAAK,GAAG,oBAAoB,GAAG,CAAC;AACpC,MAAI,KAAK,GAAG,uBAAuB,GAAG,CAAC;AACvC,MAAI,KAAK,GAAG,kBAAkB,GAAG,CAAC;AAClC,SAAO;AACT;AAEA,SAAS,oBAAoB,KAAuC;AAClE,QAAM,MAA8B,CAAC;AACrC,QAAM,UAAU,IAAI,iBAAiB,oCAAoC;AACzE,aAAW,UAAU,SAAS;AAC5B,UAAM,OAAO,OAAO,aAAa,KAAK;AACtC,QAAI,CAAC,KAAM;AACX,QAAI;AACJ,QAAI;AACF,eAAS,KAAK,MAAM,IAAI;AAAA,IAC1B,SAAS,KAAK;AACZ,UAAI,KAAK,iCAAiC,EAAE,OAAO,OAAO,GAAG,EAAE,CAAC;AAChE;AAAA,IACF;AACA,eAAW,QAAQ,cAAc,MAAM,GAAG;AACxC,YAAM,OAAO,cAAc,KAAK,OAAO,CAAC;AACxC,UAAI,CAAC,KAAM;AACX,YAAM,SAAkC,CAAC;AACzC,iBAAW,CAAC,GAAG,CAAC,KAAK,OAAO,QAAQ,IAAI,GAAG;AACzC,YAAI,EAAE,WAAW,GAAG,EAAG;AACvB,eAAO,CAAC,IAAI;AAAA,MACd;AACA,UAAI,KAAK,EAAE,YAAY,WAAW,MAAM,OAAO,CAAC;AAAA,IAClD;AAAA,EACF;AACA,SAAO;AACT;AAEA,SAAS,cAAc,OAA2C;AAChE,MAAI,CAAC,SAAS,OAAO,UAAU,SAAU,QAAO,CAAC;AACjD,MAAI,MAAM,QAAQ,KAAK,EAAG,QAAO,MAAM,QAAQ,aAAa;AAC5D,QAAM,MAAM;AACZ,MAAI,MAAM,QAAQ,IAAI,QAAQ,CAAC,EAAG,QAAO,IAAI,QAAQ,EAAE,QAAQ,aAAa;AAC5E,SAAO,CAAC,GAAG;AACb;AAEA,SAAS,cAAc,KAA6B;AAClD,MAAI,OAAO,QAAQ,UAAU;AAC3B,UAAM,OAAO,IAAI,MAAM,GAAG,EAAE,IAAI;AAChC,WAAO,QAAQ;AAAA,EACjB;AACA,MAAI,MAAM,QAAQ,GAAG,GAAG;AACtB,eAAW,KAAK,KAAK;AACnB,YAAM,OAAO,cAAc,CAAC;AAC5B,UAAI,KAAM,QAAO;AAAA,IACnB;AAAA,EACF;AACA,SAAO;AACT;AAEA,SAAS,uBAAuB,KAAuC;AACrE,QAAM,MAA8B,CAAC;AAErC,QAAM,MAAM,MAAM,KAAK,IAAI,iBAAiB,aAAa,CAAC;AAC1D,QAAM,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC,qBAAqB,EAAE,CAAC;AACzD,aAAW,MAAM,MAAM;AACrB,UAAM,OAAO,kBAAkB,EAAE;
AACjC,QAAI,CAAC,KAAM;AACX,QAAI,KAAK,IAAI;AAAA,EACf;AACA,SAAO;AACT;AAEA,SAAS,qBAAqB,IAAsB;AAClD,MAAI,MAAM,GAAG;AACb,SAAO,KAAK;AACV,QAAI,IAAI,aAAa,WAAW,EAAG,QAAO;AAC1C,UAAM,IAAI;AAAA,EACZ;AACA,SAAO;AACT;AAEA,SAAS,kBAAkB,IAA0C;AACnE,QAAM,WAAW,GAAG,aAAa,UAAU,KAAK;AAChD,QAAM,OAAO,WAAW,SAAS,MAAM,GAAG,EAAE,IAAI,IAAK;AACrD,MAAI,CAAC,KAAM,QAAO;AAClB,QAAM,SAAkC,CAAC;AAEzC,mBAAiB,IAAI,MAAM;AAC3B,SAAO,EAAE,YAAY,aAAa,MAAM,OAAO;AACjD;AAEA,SAAS,iBAAiB,MAAe,QAAuC;AAC9E,QAAM,QAAmB,MAAM,KAAK,KAAK,QAAQ;AACjD,SAAO,MAAM,QAAQ;AACnB,UAAM,KAAK,MAAM,MAAM;AACvB,UAAM,OAAO,GAAG,aAAa,UAAU;AACvC,QAAI,MAAM;AACR,UAAI;AACJ,UAAI,GAAG,aAAa,WAAW,GAAG;AAChC,cAAM,SAAkC,CAAC;AACzC,yBAAiB,IAAI,MAAM;AAC3B,gBAAQ;AAAA,MACV,OAAO;AACL,gBACE,GAAG,aAAa,SAAS,KACzB,GAAG,aAAa,MAAM,KACtB,GAAG,aAAa,KAAK,MACpB,GAAG,eAAe,IAAI,KAAK;AAAA,MAChC;AACA,gBAAU,QAAQ,MAAM,KAAK;AAAA,IAC/B;AAIA,QAAI,GAAG,aAAa,WAAW,EAAG;AAClC,eAAW,KAAK,GAAG,SAAU,OAAM,KAAK,CAAC;AAAA,EAC3C;AACF;AAEA,SAAS,UAAU,QAAiC,MAAc,OAAsB;AACtF,MAAI,OAAO,IAAI,MAAM,QAAW;AAC9B,WAAO,IAAI,IAAI;AACf;AAAA,EACF;AACA,MAAI,MAAM,QAAQ,OAAO,IAAI,CAAC,GAAG;AAC/B,IAAC,OAAO,IAAI,EAAgB,KAAK,KAAK;AACtC;AAAA,EACF;AACA,SAAO,IAAI,IAAI,CAAC,OAAO,IAAI,GAAG,KAAK;AACrC;AAEA,SAAS,kBAAkB,KAAuC;AAChE,QAAM,MAA8B,CAAC;AACrC,QAAM,MAAM,MAAM,KAAK,IAAI,iBAAiB,UAAU,CAAC;AACvD,QAAM,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC,kBAAkB,EAAE,CAAC;AACtD,aAAW,MAAM,MAAM;AACrB,UAAM,WAAW,GAAG,aAAa,QAAQ,KAAK;AAC9C,UAAM,OAAO,SAAS,MAAM,KAAK,EAAE,CAAC,GAAG,MAAM,MAAM,EAAE,IAAI,KAAK;AAC9D,QAAI,CAAC,KAAM;AACX,UAAM,SAAkC,CAAC;AACzC,qBAAiB,IAAI,MAAM;AAC3B,QAAI,KAAK,EAAE,YAAY,QAAQ,MAAM,OAAO,CAAC;AAAA,EAC/C;AACA,SAAO;AACT;AAEA,SAAS,kBAAkB,IAAsB;AAC/C,MAAI,MAAM,GAAG;AACb,SAAO,KAAK;AACV,QAAI,IAAI,aAAa,QAAQ,EAAG,QAAO;AACvC,UAAM,IAAI;AAAA,EACZ;AACA,SAAO;AACT;AAEA,SAAS,iBAAiB,MAAe,QAAuC;AAC9E,QAAM,QAAmB,MAAM,KAAK,KAAK,QAAQ;AACjD,SAAO,MAAM,QAAQ;AACnB,UAAM,KAAK,MAAM,MAAM;AACvB,UAAM,OAAO,GAAG,aAAa,UAAU;AACvC,QAAI,MAAM;AACR,YAAM,WAAW,KAAK,MAAM,MAAM,EAAE,IAAI,KAAK;AAC7C,UAAI;AACJ,UAAI,GAAG,aAAa,QAAQ,GAAG;AAC7B,cAAM,SAAkC,CAAC;AACzC,yBAAiB,IA
AI,MAAM;AAC3B,gBAAQ;AAAA,MACV,OAAO;AACL,gBACE,GAAG,aAAa,SAAS,KACzB,GAAG,aAAa,MAAM,KACtB,GAAG,aAAa,UAAU,MACzB,GAAG,eAAe,IAAI,KAAK;AAAA,MAChC;AACA,gBAAU,QAAQ,UAAU,KAAK;AAAA,IACnC;AAEA,QAAI,GAAG,aAAa,QAAQ,EAAG;AAC/B,eAAW,KAAK,GAAG,SAAU,OAAM,KAAK,CAAC;AAAA,EAC3C;AACF;AAEO,MAAM,qBAA0C;","names":[]}
@@ -0,0 +1,4 @@
1
+ import type { StructuredData } from '../types.js';
2
+ export declare function extractStructured(html: string): StructuredData;
3
+ export type { TableData } from '../types.js';
4
+ //# sourceMappingURL=structured.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"structured.d.ts","sourceRoot":"","sources":["../../src/extraction/structured.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAA2C,MAAM,aAAa,CAAC;AAU3F,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,cAAc,CAgB9D;AAuLD,YAAY,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC"}
@@ -0,0 +1,163 @@
1
+ import { parseHTML } from "linkedom";
2
+ import { extractTables } from "./extract.js";
3
+ import { extractJsonLd } from "./jsonld.js";
4
+ const MAX_VALUE_LEN = 400;
5
+ const MAX_ITEMS_PER_TYPE = 200;
6
/**
 * Gathers all structured-data views of an HTML page into one object.
 *
 * Tables and JSON-LD are extracted from the raw HTML string by the imported
 * `extractTables` / `extractJsonLd` helpers. Definitions, chart hints and
 * key/value pairs are extracted from the parsed document.
 *
 * @param html Raw HTML markup.
 * @returns `{ tables, definitions, jsonld, chart_hints, key_value_pairs }`.
 */
function extractStructured(html) {
  const parsed = parseHTML(html);
  const page = parsed.document;
  const tables = extractTables(html);
  const jsonld = extractJsonLd(html);
  return {
    tables,
    definitions: extractDefinitions(page),
    jsonld,
    chart_hints: extractChartHints(page),
    key_value_pairs: extractKeyValuePairs(page)
  };
}
21
/**
 * Extracts term/description pairs from every `<dl>` in the document.
 *
 * Each `<dt>` starts a new term; the text of the following `<dd>` elements
 * is joined with spaces to form its description. A term with no non-empty
 * `<dd>` is dropped. `<dd>` elements appearing before any `<dt>` are ignored.
 * Groups wrapped in a `<div>` directly inside the `<dl>` are handled by
 * looking one level into the wrapper.
 *
 * Collection stops once `MAX_ITEMS_PER_TYPE` pairs have been gathered.
 *
 * @param doc Parsed document exposing `querySelectorAll`.
 * @returns Array of `{ term, description }`, both passed through `truncate`.
 */
function extractDefinitions(doc) {
  const out = [];
  for (const dl of doc.querySelectorAll("dl")) {
    let pending = null;
    const buffer = [];
    // Emits the term currently being assembled (if it has a description)
    // and resets the accumulator.
    const flush = () => {
      if (pending !== null && buffer.length > 0) {
        out.push({
          term: pending,
          description: truncate(buffer.join(" ").trim())
        });
      }
      pending = null;
      buffer.length = 0;
    };
    // HTML allows dt/dd groups to be wrapped in <div> inside a <dl>; unwrap
    // one level so those groups are seen the same as direct children.
    const entries = Array.from(dl.children).flatMap((child) =>
      child.tagName.toLowerCase() === "div" ? Array.from(child.children) : [child]
    );
    for (const entry of entries) {
      const tag = entry.tagName.toLowerCase();
      if (tag === "dt") {
        flush();
        pending = truncate((entry.textContent ?? "").trim());
      } else if (tag === "dd" && pending !== null) {
        const text = (entry.textContent ?? "").trim();
        if (text) buffer.push(text);
      }
      if (out.length >= MAX_ITEMS_PER_TYPE) break;
    }
    flush();
    if (out.length >= MAX_ITEMS_PER_TYPE) break;
  }
  return out;
}
54
/**
 * Collects accessibility-derived hints about charts and figures.
 *
 * Pass 1 visits every `<svg>`: it reads the `<title>` child text, the
 * `aria-label` and `role` attributes, and the `<figcaption>` of the nearest
 * enclosing `<figure>`. SVGs with none of title / aria-label / figcaption
 * are skipped.
 *
 * Pass 2 visits every `<figure>` that contains no `<svg>` and has a
 * non-empty `<figcaption>`.
 *
 * Hints are de-duplicated on their (title, aria_label, figcaption) text as
 * stored, i.e. after `truncate`. The combined result never exceeds
 * `MAX_ITEMS_PER_TYPE` entries.
 *
 * @param doc Parsed document exposing `querySelectorAll`.
 * @returns Array of hint objects with optional `title`, `aria_label`,
 *   `figcaption`, plus `type_hint` from `inferChartType`.
 */
function extractChartHints(doc) {
  const out = [];
  const seen = new Set();

  for (const svg of doc.querySelectorAll("svg")) {
    if (out.length >= MAX_ITEMS_PER_TYPE) break;
    const title = (svg.querySelector("title")?.textContent ?? "").trim();
    const aria_label = (svg.getAttribute("aria-label") ?? "").trim();
    const role = (svg.getAttribute("role") ?? "").trim();
    let figcaption;
    const figParent = closestFigure(svg);
    if (figParent) {
      const cap = figParent.querySelector("figcaption");
      figcaption = cap?.textContent?.trim() || void 0;
    }
    if (!title && !aria_label && !figcaption) continue;
    const hint = {
      ...title ? { title: truncate(title) } : {},
      ...aria_label ? { aria_label: truncate(aria_label) } : {},
      ...figcaption ? { figcaption: truncate(figcaption) } : {},
      type_hint: inferChartType(title, aria_label, role, figcaption)
    };
    const key = `${hint.title ?? ""}|${hint.aria_label ?? ""}|${hint.figcaption ?? ""}`;
    if (seen.has(key)) continue;
    seen.add(key);
    out.push(hint);
  }

  for (const fig of doc.querySelectorAll("figure")) {
    // Check the cap before pushing: pass 1 may already have filled the list.
    if (out.length >= MAX_ITEMS_PER_TYPE) break;
    if (fig.querySelector("svg")) continue;
    const cap = fig.querySelector("figcaption")?.textContent?.trim();
    if (!cap) continue;
    // Key on the stored (truncated) caption so both passes dedupe alike.
    const caption = truncate(cap);
    const key = `||${caption}`;
    if (seen.has(key)) continue;
    seen.add(key);
    out.push({
      figcaption: caption,
      type_hint: inferChartType("", "", "", cap)
    });
  }

  return out;
}
95
+ function closestFigure(el) {
96
+ let cur = el.parentElement;
97
+ while (cur) {
98
+ if (cur.tagName.toLowerCase() === "figure") return cur;
99
+ cur = cur.parentElement;
100
+ }
101
+ return null;
102
+ }
103
/**
 * Guesses a coarse visualization category from descriptive text.
 *
 * The four inputs are joined with spaces, lower-cased, and tested against
 * whole-word keyword lists in priority order: chart, then diagram, then
 * graph. If nothing matches, a `role` of exactly `"img"` still counts as
 * `"chart"`. Otherwise the result is `undefined`.
 *
 * @param title SVG title text.
 * @param ariaLabel `aria-label` text.
 * @param role `role` attribute value.
 * @param figcaption Optional caption text.
 * @returns `"chart"`, `"diagram"`, `"graph"`, or `undefined`.
 */
function inferChartType(title, ariaLabel, role, figcaption) {
  const haystack = [title, ariaLabel, role, figcaption ?? ""].join(" ").toLowerCase();
  const rules = [
    { pattern: /\b(chart|bar|line|pie|donut|scatter|area|histogram)\b/, label: "chart" },
    { pattern: /\b(diagram|flow|architecture|topology)\b/, label: "diagram" },
    { pattern: /\b(graph|network|tree)\b/, label: "graph" }
  ];
  const hit = rules.find((rule) => rule.pattern.test(haystack));
  if (hit) return hit.label;
  return role === "img" ? "chart" : void 0;
}
111
/**
 * Harvests "label -> value" pairs from four kinds of markup, in this order:
 *
 * 1. `[itemprop]` elements: key is the attribute, value is `content` or the
 *    text content (source `"microdata"`).
 * 2. Elements with `data-label`/`data-key` plus `data-value`
 *    (source `"data-attr"`).
 * 3. Elements whose class contains `row`, `spec` or `field` and that hold
 *    both a label-like and a value-like descendant
 *    (source `"comparison-grid"`); keys over 100 characters or equal to the
 *    value are rejected.
 * 4. `<li>` / `<p>` text of at most 300 characters shaped like
 *    `Key: value`, with the key starting with an uppercase letter
 *    (source `"text-pattern"`).
 *
 * Values are passed through `truncate`. Pairs are de-duplicated via
 * `pushUnique`. The function returns as soon as `MAX_ITEMS_PER_TYPE` pairs
 * are held.
 *
 * @param doc Parsed document exposing `querySelectorAll`.
 * @returns Array of `{ key, value, source }`.
 */
function extractKeyValuePairs(doc) {
  const out = [];
  const seen = new Set();
  // Adds a pair (subject to de-duplication) and reports whether the cap is hit.
  const record = (key, rawValue, source) => {
    pushUnique(out, seen, { key, value: truncate(rawValue), source });
    return out.length >= MAX_ITEMS_PER_TYPE;
  };

  for (const node of doc.querySelectorAll("[itemprop]")) {
    const name = node.getAttribute("itemprop") ?? "";
    const content = (node.getAttribute("content") ?? node.textContent ?? "").trim();
    if (name && content && record(name, content, "microdata")) return out;
  }

  for (const node of doc.querySelectorAll("[data-label][data-value], [data-key][data-value]")) {
    const name = (node.getAttribute("data-label") ?? node.getAttribute("data-key") ?? "").trim();
    const content = (node.getAttribute("data-value") ?? "").trim();
    if (name && content && record(name, content, "data-attr")) return out;
  }

  for (const row of doc.querySelectorAll('[class*="row"], [class*="spec"], [class*="field"]')) {
    const labelEl = row.querySelector('[class*="label"], [class*="name"], [class*="key"], dt, th');
    const valueEl = row.querySelector('[class*="value"], [class*="data"], dd, td');
    if (!labelEl || !valueEl) continue;
    const name = (labelEl.textContent ?? "").trim();
    const content = (valueEl.textContent ?? "").trim();
    if (!name || !content || name === content || name.length > 100) continue;
    if (record(name, content, "comparison-grid")) return out;
  }

  for (const node of doc.querySelectorAll("li, p")) {
    const line = (node.textContent ?? "").trim();
    if (!line || line.length > 300) continue;
    const parts = line.match(/^([A-Z][A-Za-z0-9 _-]{1,40}):\s+(.+)$/);
    if (parts && record(parts[1].trim(), parts[2].trim(), "text-pattern")) return out;
  }

  return out;
}
149
+ function pushUnique(list, seen, pair) {
150
+ const key = `${pair.key.toLowerCase()}|${pair.value.toLowerCase()}`;
151
+ if (seen.has(key)) return;
152
+ seen.add(key);
153
+ list.push(pair);
154
+ }
155
/**
 * Normalizes whitespace and caps the length of a text value.
 *
 * Every run of whitespace is collapsed to a single space and the ends are
 * trimmed. If the result is longer than `MAX_VALUE_LEN`, it is cut to
 * `MAX_VALUE_LEN - 1` characters and an ellipsis character is appended, so
 * the returned string is never longer than `MAX_VALUE_LEN`.
 *
 * @param text Input string.
 * @returns The normalized, possibly shortened string.
 */
function truncate(text) {
  const normalized = text.replace(/\s+/g, " ").trim();
  return normalized.length > MAX_VALUE_LEN
    ? normalized.slice(0, MAX_VALUE_LEN - 1) + "\u2026"
    : normalized;
}
160
+ export {
161
+ extractStructured
162
+ };
163
+ //# sourceMappingURL=structured.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/extraction/structured.ts"],"sourcesContent":["import { parseHTML } from 'linkedom';\nimport type { StructuredData, DefinitionPair, ChartHint, KeyValuePair } from '../types.js';\nimport { extractTables } from './extract.js';\nimport { extractJsonLd } from './jsonld.js';\n\nconst MAX_VALUE_LEN = 400;\nconst MAX_ITEMS_PER_TYPE = 200;\n\n// Entry point. Collects every structured-data pattern we can surface from\n// raw HTML so host LLMs receive a rich, schema-free brief without needing\n// to re-parse the page.\nexport function extractStructured(html: string): StructuredData {\n const { document: doc } = parseHTML(html);\n\n const tables = extractTables(html);\n const jsonld = extractJsonLd(html);\n const definitions = extractDefinitions(doc);\n const chart_hints = extractChartHints(doc);\n const key_value_pairs = extractKeyValuePairs(doc);\n\n return {\n tables,\n definitions,\n jsonld,\n chart_hints,\n key_value_pairs,\n };\n}\n\n// <dl><dt>Term</dt><dd>Description</dd></dl> is the canonical key-value\n// structure on the web; we also handle multiple <dd> per <dt> by joining.\nfunction extractDefinitions(doc: Document): DefinitionPair[] {\n const out: DefinitionPair[] = [];\n const dlists = doc.querySelectorAll('dl');\n for (const dl of dlists) {\n const children = Array.from(dl.children);\n let pending: string | null = null;\n const buffer: string[] = [];\n const flush = () => {\n if (pending !== null && buffer.length > 0) {\n out.push({\n term: pending,\n description: truncate(buffer.join(' ').trim()),\n });\n }\n pending = null;\n buffer.length = 0;\n };\n for (const c of children) {\n const tag = c.tagName.toLowerCase();\n if (tag === 'dt') {\n flush();\n pending = truncate((c.textContent ?? '').trim());\n } else if (tag === 'dd' && pending !== null) {\n const text = (c.textContent ?? 
'').trim();\n if (text) buffer.push(text);\n }\n if (out.length >= MAX_ITEMS_PER_TYPE) break;\n }\n flush();\n if (out.length >= MAX_ITEMS_PER_TYPE) break;\n }\n return out;\n}\n\n// SVG / figure accessibility hints are the cheapest way to surface\n// chart structure when the chart itself is rendered by JS. Host LLMs\n// use these to describe a data viz without needing the underlying data.\nfunction extractChartHints(doc: Document): ChartHint[] {\n const out: ChartHint[] = [];\n const seen = new Set<string>();\n\n for (const svg of doc.querySelectorAll('svg')) {\n const title = (svg.querySelector('title')?.textContent ?? '').trim();\n const aria_label = (svg.getAttribute('aria-label') ?? '').trim();\n const role = (svg.getAttribute('role') ?? '').trim();\n\n let figcaption: string | undefined;\n const figParent = closestFigure(svg);\n if (figParent) {\n const cap = figParent.querySelector('figcaption');\n figcaption = cap?.textContent?.trim() || undefined;\n }\n\n if (!title && !aria_label && !figcaption) continue;\n\n const hint: ChartHint = {\n ...(title ? { title: truncate(title) } : {}),\n ...(aria_label ? { aria_label: truncate(aria_label) } : {}),\n ...(figcaption ? { figcaption: truncate(figcaption) } : {}),\n type_hint: inferChartType(title, aria_label, role, figcaption),\n };\n const key = `${hint.title ?? ''}|${hint.aria_label ?? ''}|${hint.figcaption ?? 
''}`;\n if (seen.has(key)) continue;\n seen.add(key);\n out.push(hint);\n if (out.length >= MAX_ITEMS_PER_TYPE) break;\n }\n\n // <figure><figcaption> without SVG still surfaces dataviz context for\n // pages that render charts as images or canvas.\n for (const fig of doc.querySelectorAll('figure')) {\n if (fig.querySelector('svg')) continue; // already handled above\n const cap = fig.querySelector('figcaption')?.textContent?.trim();\n if (!cap) continue;\n const key = `||${cap}`;\n if (seen.has(key)) continue;\n seen.add(key);\n out.push({\n figcaption: truncate(cap),\n type_hint: inferChartType('', '', '', cap),\n });\n if (out.length >= MAX_ITEMS_PER_TYPE) break;\n }\n\n return out;\n}\n\nfunction closestFigure(el: Element): Element | null {\n let cur: Element | null = el.parentElement;\n while (cur) {\n if (cur.tagName.toLowerCase() === 'figure') return cur;\n cur = cur.parentElement;\n }\n return null;\n}\n\nfunction inferChartType(\n title: string,\n ariaLabel: string,\n role: string,\n figcaption?: string,\n): ChartHint['type_hint'] {\n const all = [title, ariaLabel, role, figcaption ?? ''].join(' ').toLowerCase();\n if (/\\b(chart|bar|line|pie|donut|scatter|area|histogram)\\b/.test(all)) return 'chart';\n if (/\\b(diagram|flow|architecture|topology)\\b/.test(all)) return 'diagram';\n if (/\\b(graph|network|tree)\\b/.test(all)) return 'graph';\n if (role === 'img') return 'chart';\n return undefined;\n}\n\n// Comparison grids, spec sheets, and product info boxes often use\n// explicit \"Label: Value\" or [data-label] patterns. We harvest those\n// plus <meta name=...> pairs not already covered by extractMetadata.\nfunction extractKeyValuePairs(doc: Document): KeyValuePair[] {\n const out: KeyValuePair[] = [];\n const seen = new Set<string>();\n\n // Microdata itemprop pairs\n for (const el of doc.querySelectorAll('[itemprop]')) {\n const key = el.getAttribute('itemprop') ?? '';\n const value = (el.getAttribute('content') ?? el.textContent ?? 
'').trim();\n if (!key || !value) continue;\n pushUnique(out, seen, { key, value: truncate(value), source: 'microdata' });\n if (out.length >= MAX_ITEMS_PER_TYPE) return out;\n }\n\n // data-* attributes where the name looks meaningful (>= 3 chars after data-)\n for (const el of doc.querySelectorAll('[data-label][data-value], [data-key][data-value]')) {\n const key = (el.getAttribute('data-label') ?? el.getAttribute('data-key') ?? '').trim();\n const value = (el.getAttribute('data-value') ?? '').trim();\n if (!key || !value) continue;\n pushUnique(out, seen, { key, value: truncate(value), source: 'data-attr' });\n if (out.length >= MAX_ITEMS_PER_TYPE) return out;\n }\n\n // Comparison grid rows: <div class=\"row\"><div class=\"label\">X</div><div class=\"value\">Y</div></div>\n for (const row of doc.querySelectorAll('[class*=\"row\"], [class*=\"spec\"], [class*=\"field\"]')) {\n const label = row.querySelector('[class*=\"label\"], [class*=\"name\"], [class*=\"key\"], dt, th');\n const value = row.querySelector('[class*=\"value\"], [class*=\"data\"], dd, td');\n if (!label || !value) continue;\n const k = (label.textContent ?? '').trim();\n const v = (value.textContent ?? '').trim();\n if (!k || !v || k === v) continue;\n if (k.length > 100 || v.length === 0) continue;\n pushUnique(out, seen, { key: k, value: truncate(v), source: 'comparison-grid' });\n if (out.length >= MAX_ITEMS_PER_TYPE) return out;\n }\n\n // \"Key: Value\" text patterns within <li>/<p> — cheap heuristic for spec sheets\n for (const el of doc.querySelectorAll('li, p')) {\n const text = (el.textContent ?? 
'').trim();\n if (!text || text.length > 300) continue;\n const m = text.match(/^([A-Z][A-Za-z0-9 _-]{1,40}):\\s+(.+)$/);\n if (!m) continue;\n pushUnique(out, seen, { key: m[1].trim(), value: truncate(m[2].trim()), source: 'text-pattern' });\n if (out.length >= MAX_ITEMS_PER_TYPE) return out;\n }\n\n return out;\n}\n\nfunction pushUnique(\n list: KeyValuePair[],\n seen: Set<string>,\n pair: KeyValuePair,\n): void {\n const key = `${pair.key.toLowerCase()}|${pair.value.toLowerCase()}`;\n if (seen.has(key)) return;\n seen.add(key);\n list.push(pair);\n}\n\nfunction truncate(text: string): string {\n const collapsed = text.replace(/\\s+/g, ' ').trim();\n if (collapsed.length <= MAX_VALUE_LEN) return collapsed;\n return collapsed.slice(0, MAX_VALUE_LEN - 1) + '…';\n}\n\n// Re-export types for callers that only import from this module.\nexport type { TableData } from '../types.js';\n"],"mappings":"AAAA,SAAS,iBAAiB;AAE1B,SAAS,qBAAqB;AAC9B,SAAS,qBAAqB;AAE9B,MAAM,gBAAgB;AACtB,MAAM,qBAAqB;AAKpB,SAAS,kBAAkB,MAA8B;AAC9D,QAAM,EAAE,UAAU,IAAI,IAAI,UAAU,IAAI;AAExC,QAAM,SAAS,cAAc,IAAI;AACjC,QAAM,SAAS,cAAc,IAAI;AACjC,QAAM,cAAc,mBAAmB,GAAG;AAC1C,QAAM,cAAc,kBAAkB,GAAG;AACzC,QAAM,kBAAkB,qBAAqB,GAAG;AAEhD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAIA,SAAS,mBAAmB,KAAiC;AAC3D,QAAM,MAAwB,CAAC;AAC/B,QAAM,SAAS,IAAI,iBAAiB,IAAI;AACxC,aAAW,MAAM,QAAQ;AACvB,UAAM,WAAW,MAAM,KAAK,GAAG,QAAQ;AACvC,QAAI,UAAyB;AAC7B,UAAM,SAAmB,CAAC;AAC1B,UAAM,QAAQ,MAAM;AAClB,UAAI,YAAY,QAAQ,OAAO,SAAS,GAAG;AACzC,YAAI,KAAK;AAAA,UACP,MAAM;AAAA,UACN,aAAa,SAAS,OAAO,KAAK,GAAG,EAAE,KAAK,CAAC;AAAA,QAC/C,CAAC;AAAA,MACH;AACA,gBAAU;AACV,aAAO,SAAS;AAAA,IAClB;AACA,eAAW,KAAK,UAAU;AACxB,YAAM,MAAM,EAAE,QAAQ,YAAY;AAClC,UAAI,QAAQ,MAAM;AAChB,cAAM;AACN,kBAAU,UAAU,EAAE,eAAe,IAAI,KAAK,CAAC;AAAA,MACjD,WAAW,QAAQ,QAAQ,YAAY,MAAM;AAC3C,cAAM,QAAQ,EAAE,eAAe,IAAI,KAAK;AACxC,YAAI,KAAM,QAAO,KAAK,IAAI;AAAA,MAC5B;AACA,UAAI,IAAI,UAAU,mBAAoB;AAAA,IACxC;AACA,UAAM;AACN,QAAI,IAAI,UAAU,mBAAoB;AAAA,EACxC;AACA,SAAO;AACT;AAKA,SAAS,k
BAAkB,KAA4B;AACrD,QAAM,MAAmB,CAAC;AAC1B,QAAM,OAAO,oBAAI,IAAY;AAE7B,aAAW,OAAO,IAAI,iBAAiB,KAAK,GAAG;AAC7C,UAAM,SAAS,IAAI,cAAc,OAAO,GAAG,eAAe,IAAI,KAAK;AACnE,UAAM,cAAc,IAAI,aAAa,YAAY,KAAK,IAAI,KAAK;AAC/D,UAAM,QAAQ,IAAI,aAAa,MAAM,KAAK,IAAI,KAAK;AAEnD,QAAI;AACJ,UAAM,YAAY,cAAc,GAAG;AACnC,QAAI,WAAW;AACb,YAAM,MAAM,UAAU,cAAc,YAAY;AAChD,mBAAa,KAAK,aAAa,KAAK,KAAK;AAAA,IAC3C;AAEA,QAAI,CAAC,SAAS,CAAC,cAAc,CAAC,WAAY;AAE1C,UAAM,OAAkB;AAAA,MACtB,GAAI,QAAQ,EAAE,OAAO,SAAS,KAAK,EAAE,IAAI,CAAC;AAAA,MAC1C,GAAI,aAAa,EAAE,YAAY,SAAS,UAAU,EAAE,IAAI,CAAC;AAAA,MACzD,GAAI,aAAa,EAAE,YAAY,SAAS,UAAU,EAAE,IAAI,CAAC;AAAA,MACzD,WAAW,eAAe,OAAO,YAAY,MAAM,UAAU;AAAA,IAC/D;AACA,UAAM,MAAM,GAAG,KAAK,SAAS,EAAE,IAAI,KAAK,cAAc,EAAE,IAAI,KAAK,cAAc,EAAE;AACjF,QAAI,KAAK,IAAI,GAAG,EAAG;AACnB,SAAK,IAAI,GAAG;AACZ,QAAI,KAAK,IAAI;AACb,QAAI,IAAI,UAAU,mBAAoB;AAAA,EACxC;AAIA,aAAW,OAAO,IAAI,iBAAiB,QAAQ,GAAG;AAChD,QAAI,IAAI,cAAc,KAAK,EAAG;AAC9B,UAAM,MAAM,IAAI,cAAc,YAAY,GAAG,aAAa,KAAK;AAC/D,QAAI,CAAC,IAAK;AACV,UAAM,MAAM,KAAK,GAAG;AACpB,QAAI,KAAK,IAAI,GAAG,EAAG;AACnB,SAAK,IAAI,GAAG;AACZ,QAAI,KAAK;AAAA,MACP,YAAY,SAAS,GAAG;AAAA,MACxB,WAAW,eAAe,IAAI,IAAI,IAAI,GAAG;AAAA,IAC3C,CAAC;AACD,QAAI,IAAI,UAAU,mBAAoB;AAAA,EACxC;AAEA,SAAO;AACT;AAEA,SAAS,cAAc,IAA6B;AAClD,MAAI,MAAsB,GAAG;AAC7B,SAAO,KAAK;AACV,QAAI,IAAI,QAAQ,YAAY,MAAM,SAAU,QAAO;AACnD,UAAM,IAAI;AAAA,EACZ;AACA,SAAO;AACT;AAEA,SAAS,eACP,OACA,WACA,MACA,YACwB;AACxB,QAAM,MAAM,CAAC,OAAO,WAAW,MAAM,cAAc,EAAE,EAAE,KAAK,GAAG,EAAE,YAAY;AAC7E,MAAI,wDAAwD,KAAK,GAAG,EAAG,QAAO;AAC9E,MAAI,2CAA2C,KAAK,GAAG,EAAG,QAAO;AACjE,MAAI,2BAA2B,KAAK,GAAG,EAAG,QAAO;AACjD,MAAI,SAAS,MAAO,QAAO;AAC3B,SAAO;AACT;AAKA,SAAS,qBAAqB,KAA+B;AAC3D,QAAM,MAAsB,CAAC;AAC7B,QAAM,OAAO,oBAAI,IAAY;AAG7B,aAAW,MAAM,IAAI,iBAAiB,YAAY,GAAG;AACnD,UAAM,MAAM,GAAG,aAAa,UAAU,KAAK;AAC3C,UAAM,SAAS,GAAG,aAAa,SAAS,KAAK,GAAG,eAAe,IAAI,KAAK;AACxE,QAAI,CAAC,OAAO,CAAC,MAAO;AACpB,eAAW,KAAK,MAAM,EAAE,KAAK,OAAO,SAAS,KAAK,GAAG,QAAQ,YAAY,CAAC;AAC1E,QAAI,IAAI,UAAU,mBAAoB,QAAO;AAAA,EAC/C;AAGA,aAAW,MAAM,IAAI,iBAAiB,kDAAkD,GAAG;AACzF,UAAM,OAAO,GAAG,aAA
a,YAAY,KAAK,GAAG,aAAa,UAAU,KAAK,IAAI,KAAK;AACtF,UAAM,SAAS,GAAG,aAAa,YAAY,KAAK,IAAI,KAAK;AACzD,QAAI,CAAC,OAAO,CAAC,MAAO;AACpB,eAAW,KAAK,MAAM,EAAE,KAAK,OAAO,SAAS,KAAK,GAAG,QAAQ,YAAY,CAAC;AAC1E,QAAI,IAAI,UAAU,mBAAoB,QAAO;AAAA,EAC/C;AAGA,aAAW,OAAO,IAAI,iBAAiB,mDAAmD,GAAG;AAC3F,UAAM,QAAQ,IAAI,cAAc,2DAA2D;AAC3F,UAAM,QAAQ,IAAI,cAAc,2CAA2C;AAC3E,QAAI,CAAC,SAAS,CAAC,MAAO;AACtB,UAAM,KAAK,MAAM,eAAe,IAAI,KAAK;AACzC,UAAM,KAAK,MAAM,eAAe,IAAI,KAAK;AACzC,QAAI,CAAC,KAAK,CAAC,KAAK,MAAM,EAAG;AACzB,QAAI,EAAE,SAAS,OAAO,EAAE,WAAW,EAAG;AACtC,eAAW,KAAK,MAAM,EAAE,KAAK,GAAG,OAAO,SAAS,CAAC,GAAG,QAAQ,kBAAkB,CAAC;AAC/E,QAAI,IAAI,UAAU,mBAAoB,QAAO;AAAA,EAC/C;AAGA,aAAW,MAAM,IAAI,iBAAiB,OAAO,GAAG;AAC9C,UAAM,QAAQ,GAAG,eAAe,IAAI,KAAK;AACzC,QAAI,CAAC,QAAQ,KAAK,SAAS,IAAK;AAChC,UAAM,IAAI,KAAK,MAAM,uCAAuC;AAC5D,QAAI,CAAC,EAAG;AACR,eAAW,KAAK,MAAM,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,GAAG,OAAO,SAAS,EAAE,CAAC,EAAE,KAAK,CAAC,GAAG,QAAQ,eAAe,CAAC;AAChG,QAAI,IAAI,UAAU,mBAAoB,QAAO;AAAA,EAC/C;AAEA,SAAO;AACT;AAEA,SAAS,WACP,MACA,MACA,MACM;AACN,QAAM,MAAM,GAAG,KAAK,IAAI,YAAY,CAAC,IAAI,KAAK,MAAM,YAAY,CAAC;AACjE,MAAI,KAAK,IAAI,GAAG,EAAG;AACnB,OAAK,IAAI,GAAG;AACZ,OAAK,KAAK,IAAI;AAChB;AAEA,SAAS,SAAS,MAAsB;AACtC,QAAM,YAAY,KAAK,QAAQ,QAAQ,GAAG,EAAE,KAAK;AACjD,MAAI,UAAU,UAAU,cAAe,QAAO;AAC9C,SAAO,UAAU,MAAM,GAAG,gBAAgB,CAAC,IAAI;AACjD;","names":[]}
@@ -0,0 +1,3 @@
1
+ export type ContentType = 'news' | 'recipe' | 'product' | 'code' | 'docs' | 'generic';
2
+ export declare function classifyContent(url: string, html: string): ContentType;
3
+ //# sourceMappingURL=classifier.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../../../src/extraction/v1/classifier.ts"],"names":[],"mappings":"AAGA,MAAM,MAAM,WAAW,GAAG,MAAM,GAAG,QAAQ,GAAG,SAAS,GAAG,MAAM,GAAG,MAAM,GAAG,SAAS,CAAC;AAuBtF,wBAAgB,eAAe,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,WAAW,CAgBtE"}