@de-otio/chaoskb-client 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (355) hide show
  1. package/dist/cli/agent-registry/config-merger.d.ts +28 -0
  2. package/dist/cli/agent-registry/config-merger.d.ts.map +1 -0
  3. package/dist/cli/agent-registry/config-merger.js +90 -0
  4. package/dist/cli/agent-registry/config-merger.js.map +1 -0
  5. package/dist/cli/agent-registry/detector.d.ts +7 -0
  6. package/dist/cli/agent-registry/detector.d.ts.map +1 -0
  7. package/dist/cli/agent-registry/detector.js +100 -0
  8. package/dist/cli/agent-registry/detector.js.map +1 -0
  9. package/dist/cli/agent-registry/index.d.ts +26 -0
  10. package/dist/cli/agent-registry/index.d.ts.map +1 -0
  11. package/dist/cli/agent-registry/index.js +77 -0
  12. package/dist/cli/agent-registry/index.js.map +1 -0
  13. package/dist/cli/agent-registry/path-validator.d.ts +11 -0
  14. package/dist/cli/agent-registry/path-validator.d.ts.map +1 -0
  15. package/dist/cli/agent-registry/path-validator.js +69 -0
  16. package/dist/cli/agent-registry/path-validator.js.map +1 -0
  17. package/dist/cli/agent-registry/registry.json +108 -0
  18. package/dist/cli/agent-registry/types.d.ts +29 -0
  19. package/dist/cli/agent-registry/types.d.ts.map +1 -0
  20. package/dist/cli/agent-registry/types.js +2 -0
  21. package/dist/cli/agent-registry/types.js.map +1 -0
  22. package/dist/cli/bootstrap-lock.d.ts +7 -0
  23. package/dist/cli/bootstrap-lock.d.ts.map +1 -0
  24. package/dist/cli/bootstrap-lock.js +62 -0
  25. package/dist/cli/bootstrap-lock.js.map +1 -0
  26. package/dist/cli/bootstrap.d.ts +23 -0
  27. package/dist/cli/bootstrap.d.ts.map +1 -0
  28. package/dist/cli/bootstrap.js +438 -0
  29. package/dist/cli/bootstrap.js.map +1 -0
  30. package/dist/cli/commands/config.d.ts +13 -0
  31. package/dist/cli/commands/config.d.ts.map +1 -0
  32. package/dist/cli/commands/config.js +244 -0
  33. package/dist/cli/commands/config.js.map +1 -0
  34. package/dist/cli/commands/devices.d.ts +21 -0
  35. package/dist/cli/commands/devices.d.ts.map +1 -0
  36. package/dist/cli/commands/devices.js +229 -0
  37. package/dist/cli/commands/devices.js.map +1 -0
  38. package/dist/cli/commands/export.d.ts +12 -0
  39. package/dist/cli/commands/export.d.ts.map +1 -0
  40. package/dist/cli/commands/export.js +183 -0
  41. package/dist/cli/commands/export.js.map +1 -0
  42. package/dist/cli/commands/import.d.ts +26 -0
  43. package/dist/cli/commands/import.d.ts.map +1 -0
  44. package/dist/cli/commands/import.js +311 -0
  45. package/dist/cli/commands/import.js.map +1 -0
  46. package/dist/cli/commands/kb.d.ts +39 -0
  47. package/dist/cli/commands/kb.d.ts.map +1 -0
  48. package/dist/cli/commands/kb.js +138 -0
  49. package/dist/cli/commands/kb.js.map +1 -0
  50. package/dist/cli/commands/project.d.ts +6 -0
  51. package/dist/cli/commands/project.d.ts.map +1 -0
  52. package/dist/cli/commands/project.js +115 -0
  53. package/dist/cli/commands/project.js.map +1 -0
  54. package/dist/cli/commands/projects.d.ts +33 -0
  55. package/dist/cli/commands/projects.d.ts.map +1 -0
  56. package/dist/cli/commands/projects.js +189 -0
  57. package/dist/cli/commands/projects.js.map +1 -0
  58. package/dist/cli/commands/register.d.ts +8 -0
  59. package/dist/cli/commands/register.d.ts.map +1 -0
  60. package/dist/cli/commands/register.js +146 -0
  61. package/dist/cli/commands/register.js.map +1 -0
  62. package/dist/cli/commands/rotate-key.d.ts +16 -0
  63. package/dist/cli/commands/rotate-key.d.ts.map +1 -0
  64. package/dist/cli/commands/rotate-key.js +197 -0
  65. package/dist/cli/commands/rotate-key.js.map +1 -0
  66. package/dist/cli/commands/setup-sync.d.ts +2 -0
  67. package/dist/cli/commands/setup-sync.d.ts.map +1 -0
  68. package/dist/cli/commands/setup-sync.js +165 -0
  69. package/dist/cli/commands/setup-sync.js.map +1 -0
  70. package/dist/cli/commands/setup.d.ts +12 -0
  71. package/dist/cli/commands/setup.d.ts.map +1 -0
  72. package/dist/cli/commands/setup.js +39 -0
  73. package/dist/cli/commands/setup.js.map +1 -0
  74. package/dist/cli/commands/status.d.ts +5 -0
  75. package/dist/cli/commands/status.d.ts.map +1 -0
  76. package/dist/cli/commands/status.js +96 -0
  77. package/dist/cli/commands/status.js.map +1 -0
  78. package/dist/cli/commands/uninstall.d.ts +4 -0
  79. package/dist/cli/commands/uninstall.d.ts.map +1 -0
  80. package/dist/cli/commands/uninstall.js +85 -0
  81. package/dist/cli/commands/uninstall.js.map +1 -0
  82. package/dist/cli/commands/unregister.d.ts +2 -0
  83. package/dist/cli/commands/unregister.d.ts.map +1 -0
  84. package/dist/cli/commands/unregister.js +46 -0
  85. package/dist/cli/commands/unregister.js.map +1 -0
  86. package/dist/cli/device-metadata.d.ts +15 -0
  87. package/dist/cli/device-metadata.d.ts.map +1 -0
  88. package/dist/cli/device-metadata.js +58 -0
  89. package/dist/cli/device-metadata.js.map +1 -0
  90. package/dist/cli/github.d.ts +38 -0
  91. package/dist/cli/github.d.ts.map +1 -0
  92. package/dist/cli/github.js +159 -0
  93. package/dist/cli/github.js.map +1 -0
  94. package/dist/cli/guide-hashes.json +13 -0
  95. package/dist/cli/index.d.ts +3 -0
  96. package/dist/cli/index.d.ts.map +1 -0
  97. package/dist/cli/index.js +226 -0
  98. package/dist/cli/index.js.map +1 -0
  99. package/dist/cli/mcp-server.d.ts +205 -0
  100. package/dist/cli/mcp-server.d.ts.map +1 -0
  101. package/dist/cli/mcp-server.js +366 -0
  102. package/dist/cli/mcp-server.js.map +1 -0
  103. package/dist/cli/tools/kb-delete.d.ts +10 -0
  104. package/dist/cli/tools/kb-delete.d.ts.map +1 -0
  105. package/dist/cli/tools/kb-delete.js +28 -0
  106. package/dist/cli/tools/kb-delete.js.map +1 -0
  107. package/dist/cli/tools/kb-ingest.d.ts +13 -0
  108. package/dist/cli/tools/kb-ingest.d.ts.map +1 -0
  109. package/dist/cli/tools/kb-ingest.js +72 -0
  110. package/dist/cli/tools/kb-ingest.js.map +1 -0
  111. package/dist/cli/tools/kb-list.d.ts +20 -0
  112. package/dist/cli/tools/kb-list.d.ts.map +1 -0
  113. package/dist/cli/tools/kb-list.js +24 -0
  114. package/dist/cli/tools/kb-list.js.map +1 -0
  115. package/dist/cli/tools/kb-query-shared.d.ts +27 -0
  116. package/dist/cli/tools/kb-query-shared.d.ts.map +1 -0
  117. package/dist/cli/tools/kb-query-shared.js +28 -0
  118. package/dist/cli/tools/kb-query-shared.js.map +1 -0
  119. package/dist/cli/tools/kb-query.d.ts +20 -0
  120. package/dist/cli/tools/kb-query.d.ts.map +1 -0
  121. package/dist/cli/tools/kb-query.js +109 -0
  122. package/dist/cli/tools/kb-query.js.map +1 -0
  123. package/dist/cli/tools/kb-summary.d.ts +29 -0
  124. package/dist/cli/tools/kb-summary.d.ts.map +1 -0
  125. package/dist/cli/tools/kb-summary.js +89 -0
  126. package/dist/cli/tools/kb-summary.js.map +1 -0
  127. package/dist/cli/tools/kb-sync-status.d.ts +7 -0
  128. package/dist/cli/tools/kb-sync-status.d.ts.map +1 -0
  129. package/dist/cli/tools/kb-sync-status.js +48 -0
  130. package/dist/cli/tools/kb-sync-status.js.map +1 -0
  131. package/dist/crypto/aad.d.ts +8 -0
  132. package/dist/crypto/aad.d.ts.map +1 -0
  133. package/dist/crypto/aad.js +11 -0
  134. package/dist/crypto/aad.js.map +1 -0
  135. package/dist/crypto/aead.d.ts +21 -0
  136. package/dist/crypto/aead.d.ts.map +1 -0
  137. package/dist/crypto/aead.js +43 -0
  138. package/dist/crypto/aead.js.map +1 -0
  139. package/dist/crypto/argon2.d.ts +11 -0
  140. package/dist/crypto/argon2.d.ts.map +1 -0
  141. package/dist/crypto/argon2.js +33 -0
  142. package/dist/crypto/argon2.js.map +1 -0
  143. package/dist/crypto/blob-id.d.ts +6 -0
  144. package/dist/crypto/blob-id.d.ts.map +1 -0
  145. package/dist/crypto/blob-id.js +33 -0
  146. package/dist/crypto/blob-id.js.map +1 -0
  147. package/dist/crypto/canonical-json.d.ts +6 -0
  148. package/dist/crypto/canonical-json.d.ts.map +1 -0
  149. package/dist/crypto/canonical-json.js +88 -0
  150. package/dist/crypto/canonical-json.js.map +1 -0
  151. package/dist/crypto/commitment.d.ts +12 -0
  152. package/dist/crypto/commitment.d.ts.map +1 -0
  153. package/dist/crypto/commitment.js +37 -0
  154. package/dist/crypto/commitment.js.map +1 -0
  155. package/dist/crypto/encryption-service.d.ts +19 -0
  156. package/dist/crypto/encryption-service.d.ts.map +1 -0
  157. package/dist/crypto/encryption-service.js +38 -0
  158. package/dist/crypto/encryption-service.js.map +1 -0
  159. package/dist/crypto/envelope-cbor.d.ts +37 -0
  160. package/dist/crypto/envelope-cbor.d.ts.map +1 -0
  161. package/dist/crypto/envelope-cbor.js +124 -0
  162. package/dist/crypto/envelope-cbor.js.map +1 -0
  163. package/dist/crypto/envelope.d.ts +34 -0
  164. package/dist/crypto/envelope.d.ts.map +1 -0
  165. package/dist/crypto/envelope.js +160 -0
  166. package/dist/crypto/envelope.js.map +1 -0
  167. package/dist/crypto/hkdf.d.ts +16 -0
  168. package/dist/crypto/hkdf.d.ts.map +1 -0
  169. package/dist/crypto/hkdf.js +33 -0
  170. package/dist/crypto/hkdf.js.map +1 -0
  171. package/dist/crypto/index.d.ts +15 -0
  172. package/dist/crypto/index.d.ts.map +1 -0
  173. package/dist/crypto/index.js +15 -0
  174. package/dist/crypto/index.js.map +1 -0
  175. package/dist/crypto/invite.d.ts +31 -0
  176. package/dist/crypto/invite.d.ts.map +1 -0
  177. package/dist/crypto/invite.js +137 -0
  178. package/dist/crypto/invite.js.map +1 -0
  179. package/dist/crypto/keyring.d.ts +37 -0
  180. package/dist/crypto/keyring.d.ts.map +1 -0
  181. package/dist/crypto/keyring.js +219 -0
  182. package/dist/crypto/keyring.js.map +1 -0
  183. package/dist/crypto/known-keys.d.ts +34 -0
  184. package/dist/crypto/known-keys.d.ts.map +1 -0
  185. package/dist/crypto/known-keys.js +106 -0
  186. package/dist/crypto/known-keys.js.map +1 -0
  187. package/dist/crypto/project-keys.d.ts +26 -0
  188. package/dist/crypto/project-keys.d.ts.map +1 -0
  189. package/dist/crypto/project-keys.js +69 -0
  190. package/dist/crypto/project-keys.js.map +1 -0
  191. package/dist/crypto/secure-buffer.d.ts +31 -0
  192. package/dist/crypto/secure-buffer.d.ts.map +1 -0
  193. package/dist/crypto/secure-buffer.js +61 -0
  194. package/dist/crypto/secure-buffer.js.map +1 -0
  195. package/dist/crypto/ssh-agent.d.ts +16 -0
  196. package/dist/crypto/ssh-agent.d.ts.map +1 -0
  197. package/dist/crypto/ssh-agent.js +225 -0
  198. package/dist/crypto/ssh-agent.js.map +1 -0
  199. package/dist/crypto/ssh-keys.d.ts +19 -0
  200. package/dist/crypto/ssh-keys.d.ts.map +1 -0
  201. package/dist/crypto/ssh-keys.js +121 -0
  202. package/dist/crypto/ssh-keys.js.map +1 -0
  203. package/dist/crypto/tiers/enhanced.d.ts +25 -0
  204. package/dist/crypto/tiers/enhanced.d.ts.map +1 -0
  205. package/dist/crypto/tiers/enhanced.js +56 -0
  206. package/dist/crypto/tiers/enhanced.js.map +1 -0
  207. package/dist/crypto/tiers/maximum.d.ts +19 -0
  208. package/dist/crypto/tiers/maximum.d.ts.map +1 -0
  209. package/dist/crypto/tiers/maximum.js +25 -0
  210. package/dist/crypto/tiers/maximum.js.map +1 -0
  211. package/dist/crypto/tiers/standard.d.ts +27 -0
  212. package/dist/crypto/tiers/standard.d.ts.map +1 -0
  213. package/dist/crypto/tiers/standard.js +147 -0
  214. package/dist/crypto/tiers/standard.js.map +1 -0
  215. package/dist/crypto/types.d.ts +169 -0
  216. package/dist/crypto/types.d.ts.map +1 -0
  217. package/dist/crypto/types.js +11 -0
  218. package/dist/crypto/types.js.map +1 -0
  219. package/dist/pipeline/chunker.d.ts +27 -0
  220. package/dist/pipeline/chunker.d.ts.map +1 -0
  221. package/dist/pipeline/chunker.js +96 -0
  222. package/dist/pipeline/chunker.js.map +1 -0
  223. package/dist/pipeline/content-pipeline.d.ts +24 -0
  224. package/dist/pipeline/content-pipeline.d.ts.map +1 -0
  225. package/dist/pipeline/content-pipeline.js +49 -0
  226. package/dist/pipeline/content-pipeline.js.map +1 -0
  227. package/dist/pipeline/embedder.d.ts +49 -0
  228. package/dist/pipeline/embedder.d.ts.map +1 -0
  229. package/dist/pipeline/embedder.js +195 -0
  230. package/dist/pipeline/embedder.js.map +1 -0
  231. package/dist/pipeline/extract.d.ts +17 -0
  232. package/dist/pipeline/extract.d.ts.map +1 -0
  233. package/dist/pipeline/extract.js +70 -0
  234. package/dist/pipeline/extract.js.map +1 -0
  235. package/dist/pipeline/fetch.d.ts +26 -0
  236. package/dist/pipeline/fetch.d.ts.map +1 -0
  237. package/dist/pipeline/fetch.js +91 -0
  238. package/dist/pipeline/fetch.js.map +1 -0
  239. package/dist/pipeline/index.d.ts +10 -0
  240. package/dist/pipeline/index.d.ts.map +1 -0
  241. package/dist/pipeline/index.js +10 -0
  242. package/dist/pipeline/index.js.map +1 -0
  243. package/dist/pipeline/model-manager.d.ts +57 -0
  244. package/dist/pipeline/model-manager.d.ts.map +1 -0
  245. package/dist/pipeline/model-manager.js +234 -0
  246. package/dist/pipeline/model-manager.js.map +1 -0
  247. package/dist/pipeline/search.d.ts +37 -0
  248. package/dist/pipeline/search.d.ts.map +1 -0
  249. package/dist/pipeline/search.js +65 -0
  250. package/dist/pipeline/search.js.map +1 -0
  251. package/dist/pipeline/tokenizer.d.ts +29 -0
  252. package/dist/pipeline/tokenizer.d.ts.map +1 -0
  253. package/dist/pipeline/tokenizer.js +54 -0
  254. package/dist/pipeline/tokenizer.js.map +1 -0
  255. package/dist/pipeline/types.d.ts +86 -0
  256. package/dist/pipeline/types.d.ts.map +1 -0
  257. package/dist/pipeline/types.js +2 -0
  258. package/dist/pipeline/types.js.map +1 -0
  259. package/dist/pipeline/wordpiece-tokenizer.d.ts +60 -0
  260. package/dist/pipeline/wordpiece-tokenizer.d.ts.map +1 -0
  261. package/dist/pipeline/wordpiece-tokenizer.js +251 -0
  262. package/dist/pipeline/wordpiece-tokenizer.js.map +1 -0
  263. package/dist/storage/chunk-repo.d.ts +29 -0
  264. package/dist/storage/chunk-repo.d.ts.map +1 -0
  265. package/dist/storage/chunk-repo.js +115 -0
  266. package/dist/storage/chunk-repo.js.map +1 -0
  267. package/dist/storage/database-manager.d.ts +17 -0
  268. package/dist/storage/database-manager.d.ts.map +1 -0
  269. package/dist/storage/database-manager.js +100 -0
  270. package/dist/storage/database-manager.js.map +1 -0
  271. package/dist/storage/database.d.ts +10 -0
  272. package/dist/storage/database.d.ts.map +1 -0
  273. package/dist/storage/database.js +34 -0
  274. package/dist/storage/database.js.map +1 -0
  275. package/dist/storage/embedding-index.d.ts +22 -0
  276. package/dist/storage/embedding-index.d.ts.map +1 -0
  277. package/dist/storage/embedding-index.js +78 -0
  278. package/dist/storage/embedding-index.js.map +1 -0
  279. package/dist/storage/index.d.ts +10 -0
  280. package/dist/storage/index.d.ts.map +1 -0
  281. package/dist/storage/index.js +10 -0
  282. package/dist/storage/index.js.map +1 -0
  283. package/dist/storage/kb-database.d.ts +11 -0
  284. package/dist/storage/kb-database.d.ts.map +1 -0
  285. package/dist/storage/kb-database.js +24 -0
  286. package/dist/storage/kb-database.js.map +1 -0
  287. package/dist/storage/schema.d.ts +6 -0
  288. package/dist/storage/schema.d.ts.map +1 -0
  289. package/dist/storage/schema.js +122 -0
  290. package/dist/storage/schema.js.map +1 -0
  291. package/dist/storage/source-repo.d.ts +20 -0
  292. package/dist/storage/source-repo.d.ts.map +1 -0
  293. package/dist/storage/source-repo.js +120 -0
  294. package/dist/storage/source-repo.js.map +1 -0
  295. package/dist/storage/sync-status-repo.d.ts +15 -0
  296. package/dist/storage/sync-status-repo.d.ts.map +1 -0
  297. package/dist/storage/sync-status-repo.js +40 -0
  298. package/dist/storage/sync-status-repo.js.map +1 -0
  299. package/dist/storage/types.d.ts +139 -0
  300. package/dist/storage/types.d.ts.map +1 -0
  301. package/dist/storage/types.js +9 -0
  302. package/dist/storage/types.js.map +1 -0
  303. package/dist/sync/canary.d.ts +14 -0
  304. package/dist/sync/canary.d.ts.map +1 -0
  305. package/dist/sync/canary.js +53 -0
  306. package/dist/sync/canary.js.map +1 -0
  307. package/dist/sync/full-sync.d.ts +16 -0
  308. package/dist/sync/full-sync.d.ts.map +1 -0
  309. package/dist/sync/full-sync.js +91 -0
  310. package/dist/sync/full-sync.js.map +1 -0
  311. package/dist/sync/http-client.d.ts +28 -0
  312. package/dist/sync/http-client.d.ts.map +1 -0
  313. package/dist/sync/http-client.js +90 -0
  314. package/dist/sync/http-client.js.map +1 -0
  315. package/dist/sync/incremental-sync.d.ts +17 -0
  316. package/dist/sync/incremental-sync.d.ts.map +1 -0
  317. package/dist/sync/incremental-sync.js +155 -0
  318. package/dist/sync/incremental-sync.js.map +1 -0
  319. package/dist/sync/index.d.ts +12 -0
  320. package/dist/sync/index.d.ts.map +1 -0
  321. package/dist/sync/index.js +12 -0
  322. package/dist/sync/index.js.map +1 -0
  323. package/dist/sync/quota.d.ts +17 -0
  324. package/dist/sync/quota.d.ts.map +1 -0
  325. package/dist/sync/quota.js +48 -0
  326. package/dist/sync/quota.js.map +1 -0
  327. package/dist/sync/sequence.d.ts +21 -0
  328. package/dist/sync/sequence.d.ts.map +1 -0
  329. package/dist/sync/sequence.js +49 -0
  330. package/dist/sync/sequence.js.map +1 -0
  331. package/dist/sync/ssh-signer.d.ts +59 -0
  332. package/dist/sync/ssh-signer.d.ts.map +1 -0
  333. package/dist/sync/ssh-signer.js +241 -0
  334. package/dist/sync/ssh-signer.js.map +1 -0
  335. package/dist/sync/sync-service.d.ts +48 -0
  336. package/dist/sync/sync-service.d.ts.map +1 -0
  337. package/dist/sync/sync-service.js +116 -0
  338. package/dist/sync/sync-service.js.map +1 -0
  339. package/dist/sync/types.d.ts +106 -0
  340. package/dist/sync/types.d.ts.map +1 -0
  341. package/dist/sync/types.js +2 -0
  342. package/dist/sync/types.js.map +1 -0
  343. package/dist/sync/upload-queue.d.ts +40 -0
  344. package/dist/sync/upload-queue.d.ts.map +1 -0
  345. package/dist/sync/upload-queue.js +148 -0
  346. package/dist/sync/upload-queue.js.map +1 -0
  347. package/dist/sync/verification.d.ts +17 -0
  348. package/dist/sync/verification.d.ts.map +1 -0
  349. package/dist/sync/verification.js +25 -0
  350. package/dist/sync/verification.js.map +1 -0
  351. package/dist/vitest.config.d.ts +3 -0
  352. package/dist/vitest.config.d.ts.map +1 -0
  353. package/dist/vitest.config.js +16 -0
  354. package/dist/vitest.config.js.map +1 -0
  355. package/package.json +68 -0
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../crypto/types.ts"],"names":[],"mappings":"AAqBA,uCAAuC;AACvC,MAAM,CAAN,IAAY,YAOX;AAPD,WAAY,YAAY;IACtB,+EAA+E;IAC/E,qCAAqB,CAAA;IACrB,yFAAyF;IACzF,qCAAqB,CAAA;IACrB,kDAAkD;IAClD,mCAAmB,CAAA;AACrB,CAAC,EAPW,YAAY,KAAZ,YAAY,QAOvB"}
@@ -0,0 +1,27 @@
1
+ /**
2
+ * Text chunking for the content pipeline.
3
+ *
4
+ * Splits extracted text into overlapping chunks of approximately
5
+ * `maxTokens` tokens, breaking on sentence boundaries where possible.
6
+ */
7
+ import type { Chunk } from './types.js';
8
+ /** Tuning knobs for `chunkText`; callers may supply any subset of them. */
9
+ export interface ChunkConfig {
10
+ /** Target maximum tokens per chunk (default: 500); one longer sentence is kept whole. */
11
+ maxTokens: number;
12
+ /** Approximate token overlap between consecutive chunks (default: 50). */
13
+ overlapTokens: number;
14
+ }
15
+ /**
16
+ * Split text into overlapping chunks of approximately `maxTokens` tokens.
17
+ *
18
+ * Splitting is performed on sentence boundaries where possible. If a
19
+ * single sentence exceeds `maxTokens`, it is included as-is in its own
20
+ * chunk (no mid-sentence splitting).
21
+ *
22
+ * @param text - The text to split; empty or whitespace-only text yields no chunks.
23
+ * @param config - Optional overrides; omitted fields use the defaults (500 / 50).
24
+ * @returns Array of chunks with content, index, token count, and byte offset.
25
+ */
26
+ export declare function chunkText(text: string, config?: Partial<ChunkConfig>): Chunk[];
27
+ //# sourceMappingURL=chunker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunker.d.ts","sourceRoot":"","sources":["../../pipeline/chunker.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AAExC,qCAAqC;AACrC,MAAM,WAAW,WAAW;IAC1B,+CAA+C;IAC/C,SAAS,EAAE,MAAM,CAAC;IAClB,8DAA8D;IAC9D,aAAa,EAAE,MAAM,CAAC;CACvB;AAaD;;;;;;;;;;GAUG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,GAAG,KAAK,EAAE,CAoE9E"}
@@ -0,0 +1,96 @@
1
+ /**
2
+ * Text chunking for the content pipeline.
3
+ *
4
+ * Splits extracted text into overlapping chunks of approximately
5
+ * `maxTokens` tokens, breaking on sentence boundaries where possible.
6
+ */
7
+ import { countTokens } from './tokenizer.js';
8
+ const DEFAULT_CONFIG = { // fallbacks applied per field when the caller omits it
9
+ maxTokens: 500,
10
+ overlapTokens: 50,
11
+ };
12
+ /**
13
+ * Sentence-boundary regex. Matches the whitespace run that follows `.`, `!`,
14
+ * or `?` (lookbehind), so splitting keeps the punctuation with the sentence.
15
+ */
16
+ const SENTENCE_BOUNDARY_RE = /(?<=[.!?])\s+/;
17
+ /**
18
+ * Split text into overlapping chunks of approximately `maxTokens` tokens.
19
+ *
20
+ * Splitting is performed on sentence boundaries where possible. If a
21
+ * single sentence exceeds `maxTokens`, it is included as-is in its own
22
+ * chunk (no mid-sentence splitting).
23
+ *
24
+ * @param text - The text to split; empty or whitespace-only text yields no chunks.
25
+ * @param config - Optional overrides; missing fields fall back to DEFAULT_CONFIG.
26
+ * @returns Array of chunks with content, index, token count, and byte offset.
27
+ */
28
+ export function chunkText(text, config) {
29
+ const maxTokens = config?.maxTokens ?? DEFAULT_CONFIG.maxTokens;
30
+ const overlapTokens = config?.overlapTokens ?? DEFAULT_CONFIG.overlapTokens;
31
+ if (!text || text.trim().length === 0) {
32
+ return [];
33
+ }
34
+ // Split on whitespace runs that follow `.`, `!`, or `?`; drop empty pieces
35
+ const sentences = text.split(SENTENCE_BOUNDARY_RE).filter((s) => s.length > 0);
36
+ if (sentences.length === 0) {
37
+ return [];
38
+ }
39
+ // Count tokens once per sentence; reused for both packing and overlap
40
+ const sentenceTokens = sentences.map((s) => countTokens(s));
41
+ const chunks = [];
42
+ let sentenceIdx = 0;
43
+ while (sentenceIdx < sentences.length) {
44
+ // Greedily pack consecutive sentences until the next one would exceed maxTokens
45
+ const chunkSentences = [];
46
+ let chunkTokenCount = 0;
47
+ const startSentenceIdx = sentenceIdx;
48
+ while (sentenceIdx < sentences.length) {
49
+ const stc = sentenceTokens[sentenceIdx];
50
+ // Stop before a sentence that would push the chunk past maxTokens, but never
51
+ // leave a chunk empty: an oversized first sentence is still taken whole.
52
+ if (chunkTokenCount + stc > maxTokens && chunkSentences.length > 0) {
53
+ break;
54
+ }
55
+ chunkSentences.push(sentences[sentenceIdx]);
56
+ chunkTokenCount += stc;
57
+ sentenceIdx++;
58
+ }
59
+ const content = chunkSentences.join(' ');
60
+ // Estimated byte offset of the chunk's first sentence: UTF-8 lengths of the earlier
61
+ // sentences plus 1 byte per separator (inexact when the original gap was wider)
62
+ const byteOffset = computeByteOffset(sentences, startSentenceIdx);
63
+ chunks.push({
64
+ content,
65
+ index: chunks.length,
66
+ tokenCount: countTokens(content), // recounted on the joined text, not chunkTokenCount
67
+ byteOffset,
68
+ });
69
+ // Apply overlap: rewind over trailing sentences until they cover overlapTokens
70
+ if (sentenceIdx < sentences.length) {
71
+ let overlapCount = 0;
72
+ let backtrack = sentenceIdx - 1;
73
+ while (backtrack > startSentenceIdx && overlapCount < overlapTokens) {
74
+ overlapCount += sentenceTokens[backtrack];
75
+ backtrack--;
76
+ }
77
+ // Resume after `backtrack`; it never drops below startSentenceIdx, so each pass advances
78
+ sentenceIdx = backtrack + 1;
79
+ }
80
+ }
81
+ return chunks;
82
+ }
83
+ /**
84
+ * Estimate the UTF-8 byte offset of the sentence at `targetIdx` by summing the
85
+ * byte lengths of all earlier sentences plus one byte per separator.
86
+ */
87
+ function computeByteOffset(sentences, targetIdx) {
88
+ let offset = 0;
89
+ for (let i = 0; i < targetIdx; i++) {
90
+ offset += Buffer.byteLength(sentences[i], 'utf-8');
91
+ // Assume exactly one 1-byte separator after each earlier sentence
92
+ offset += 1; // NOTE(review): the split regex consumes `\s+`, so wider gaps undercount
93
+ }
94
+ return offset;
95
+ }
96
+ //# sourceMappingURL=chunker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunker.js","sourceRoot":"","sources":["../../pipeline/chunker.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAW7C,MAAM,cAAc,GAAgB;IAClC,SAAS,EAAE,GAAG;IACd,aAAa,EAAE,EAAE;CAClB,CAAC;AAEF;;;GAGG;AACH,MAAM,oBAAoB,GAAG,eAAe,CAAC;AAE7C;;;;;;;;;;GAUG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY,EAAE,MAA6B;IACnE,MAAM,SAAS,GAAG,MAAM,EAAE,SAAS,IAAI,cAAc,CAAC,SAAS,CAAC;IAChE,MAAM,aAAa,GAAG,MAAM,EAAE,aAAa,IAAI,cAAc,CAAC,aAAa,CAAC;IAE5E,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,uBAAuB;IACvB,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAE/E,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,6CAA6C;IAC7C,MAAM,cAAc,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;IAE5D,MAAM,MAAM,GAAY,EAAE,CAAC;IAC3B,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,OAAO,WAAW,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC;QACtC,0DAA0D;QAC1D,MAAM,cAAc,GAAa,EAAE,CAAC;QACpC,IAAI,eAAe,GAAG,CAAC,CAAC;QACxB,MAAM,gBAAgB,GAAG,WAAW,CAAC;QAErC,OAAO,WAAW,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC;YACtC,MAAM,GAAG,GAAG,cAAc,CAAC,WAAW,CAAC,CAAC;YAExC,wEAAwE;YACxE,2EAA2E;YAC3E,IAAI,eAAe,GAAG,GAAG,GAAG,SAAS,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACnE,MAAM;YACR,CAAC;YAED,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC,CAAC;YAC5C,eAAe,IAAI,GAAG,CAAC;YACvB,WAAW,EAAE,CAAC;QAChB,CAAC;QAED,MAAM,OAAO,GAAG,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAEzC,oFAAoF;QACpF,yCAAyC;QACzC,MAAM,UAAU,GAAG,iBAAiB,CAAC,SAAS,EAAE,gBAAgB,CAAC,CAAC;QAElE,MAAM,CAAC,IAAI,CAAC;YACV,OAAO;YACP,KAAK,EAAE,MAAM,CAAC,MAAM;YACpB,UAAU,EAAE,WAAW,CAAC,OAAO,CAAC;YAChC,UAAU;SACX,CAAC,CAAC;QAEH,oEAAoE;QACpE,IAAI,WAAW,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC;YACnC,IAAI,YAAY,GAAG,CAAC,CAAC;YACrB,IAAI,SAAS,GAAG,WAAW,GAAG,CAAC,CAAC;YAChC,OAAO,SAAS,GAAG,gBAAgB,IAAI,YAAY,GAAG,aAAa,EAAE,CAAC;gBACpE,YAAY,IAAI,cAAc,CAAC,SAAS,CAAC,CAAC;gBAC1C,SAAS,EAAE,CAAC;YACd,CAAC;YACD,2EAA2E;YAC3E,WAA
W,GAAG,SAAS,GAAG,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,SAAS,iBAAiB,CAAC,SAAmB,EAAE,SAAiB;IAC/D,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;QACnC,MAAM,IAAI,MAAM,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;QACnD,yDAAyD;QACzD,MAAM,IAAI,CAAC,CAAC,CAAC,8BAA8B;IAC7C,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,24 @@
1
+ import type { Embedder } from './embedder.js';
2
+ import type { Chunk, EmbeddedChunk, EmbeddingVector, ExtractedContent, IContentPipeline, PipelineConfig } from './types.js';
3
+ /**
4
+ * Concrete implementation of IContentPipeline.
5
+ *
6
+ * Orchestrates fetching, extraction, chunking, embedding, and search
7
+ * by delegating to the standalone pipeline functions and an injected Embedder.
8
+ */
9
+ export declare class ContentPipeline implements IContentPipeline {
10
+ private readonly config;
11
+ private readonly embedder;
12
+ constructor(config: Partial<PipelineConfig>, embedder: Embedder);
13
+ /** Fetch `url` using the pipeline config and extract content from the returned HTML. */
14
+ fetchAndExtract(url: string): Promise<ExtractedContent>;
15
+ /** Split text into overlapping chunks (config values, else 500 max / 50 overlap tokens). */
16
+ chunk(text: string): Chunk[];
17
+ /** Embed a single text string via the injected Embedder. */
18
+ embed(text: string): Promise<EmbeddingVector>;
19
+ /** Embed all chunk contents in one batch and zip results into EmbeddedChunk[]. */
20
+ embedChunks(chunks: Chunk[]): Promise<EmbeddedChunk[]>;
21
+ /** Search `embeddings` for the top-K matches to `query`, returning only their indices. */
22
+ search(query: EmbeddingVector, embeddings: EmbeddingVector[], topK: number): number[];
23
+ }
24
+ //# sourceMappingURL=content-pipeline.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"content-pipeline.d.ts","sourceRoot":"","sources":["../../pipeline/content-pipeline.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AAI9C,OAAO,KAAK,EACV,KAAK,EACL,aAAa,EACb,eAAe,EACf,gBAAgB,EAChB,gBAAgB,EAChB,cAAc,EACf,MAAM,YAAY,CAAC;AAEpB;;;;;GAKG;AACH,qBAAa,eAAgB,YAAW,gBAAgB;IACtD,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA0B;IACjD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAW;gBAExB,MAAM,EAAE,OAAO,CAAC,cAAc,CAAC,EAAE,QAAQ,EAAE,QAAQ;IAK/D,wDAAwD;IAClD,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC;IAK7D,0CAA0C;IAC1C,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,KAAK,EAAE;IAO5B,kCAAkC;IAC5B,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC;IAInD,kEAAkE;IAC5D,WAAW,CAAC,MAAM,EAAE,KAAK,EAAE,GAAG,OAAO,CAAC,aAAa,EAAE,CAAC;IAS5D,6EAA6E;IAC7E,MAAM,CAAC,KAAK,EAAE,eAAe,EAAE,UAAU,EAAE,eAAe,EAAE,EAAE,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE;CAItF"}
@@ -0,0 +1,49 @@
1
+ import { chunkText } from './chunker.js';
2
+ import { extractContent } from './extract.js';
3
+ import { fetchUrl } from './fetch.js';
4
+ import { searchEmbeddings } from './search.js';
5
+ /**
6
+ * Concrete implementation of IContentPipeline.
7
+ *
8
+ * Orchestrates fetching, extraction, chunking, embedding, and search
9
+ * by delegating to the standalone pipeline functions and an injected Embedder.
10
+ */
11
+ export class ContentPipeline {
12
+ config;
13
+ embedder;
14
+ constructor(config, embedder) {
15
+ this.config = config;
16
+ this.embedder = embedder;
17
+ }
18
+ /** Fetch `url` with the pipeline config, then extract content using the final URL. */
19
+ async fetchAndExtract(url) {
20
+ const result = await fetchUrl(url, this.config);
21
+ return extractContent(result.html, result.finalUrl);
22
+ }
23
+ /** Split text into overlapping chunks; unset config values fall back to 500 / 50. */
24
+ chunk(text) {
25
+ return chunkText(text, {
26
+ maxTokens: this.config.maxChunkTokens ?? 500,
27
+ overlapTokens: this.config.overlapTokens ?? 50,
28
+ });
29
+ }
30
+ /** Embed a single text string by delegating to the injected embedder. */
31
+ async embed(text) {
32
+ return this.embedder.embed(text);
33
+ }
34
+ /** Embed all chunk contents in one batch and attach a vector and model tag to each chunk. */
35
+ async embedChunks(chunks) {
36
+ const vectors = await this.embedder.embedBatch(chunks.map((c) => c.content));
37
+ return chunks.map((chunk, i) => ({
38
+ ...chunk,
39
+ embedding: vectors[i], // assumes embedBatch returns one vector per input, in order (TODO confirm)
40
+ model: 'snowflake-arctic-embed-s@384', // hard-coded tag, not read from the embedder
41
+ }));
42
+ }
43
+ /** Run `searchEmbeddings` for the top-K results and return only their indices. */
44
+ search(query, embeddings, topK) {
45
+ const results = searchEmbeddings(query, embeddings, topK);
46
+ return results.map((r) => r.index);
47
+ }
48
+ }
49
+ //# sourceMappingURL=content-pipeline.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"content-pipeline.js","sourceRoot":"","sources":["../../pipeline/content-pipeline.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEzC,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACtC,OAAO,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAU/C;;;;;GAKG;AACH,MAAM,OAAO,eAAe;IACT,MAAM,CAA0B;IAChC,QAAQ,CAAW;IAEpC,YAAY,MAA+B,EAAE,QAAkB;QAC7D,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC3B,CAAC;IAED,wDAAwD;IACxD,KAAK,CAAC,eAAe,CAAC,GAAW;QAC/B,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QAChD,OAAO,cAAc,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,QAAQ,CAAC,CAAC;IACtD,CAAC;IAED,0CAA0C;IAC1C,KAAK,CAAC,IAAY;QAChB,OAAO,SAAS,CAAC,IAAI,EAAE;YACrB,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,cAAc,IAAI,GAAG;YAC5C,aAAa,EAAE,IAAI,CAAC,MAAM,CAAC,aAAa,IAAI,EAAE;SAC/C,CAAC,CAAC;IACL,CAAC;IAED,kCAAkC;IAClC,KAAK,CAAC,KAAK,CAAC,IAAY;QACtB,OAAO,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,CAAC;IAED,kEAAkE;IAClE,KAAK,CAAC,WAAW,CAAC,MAAe;QAC/B,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC;QAC7E,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;YAC/B,GAAG,KAAK;YACR,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC;YACrB,KAAK,EAAE,8BAA8B;SACtC,CAAC,CAAC,CAAC;IACN,CAAC;IAED,6EAA6E;IAC7E,MAAM,CAAC,KAAsB,EAAE,UAA6B,EAAE,IAAY;QACxE,MAAM,OAAO,GAAG,gBAAgB,CAAC,KAAK,EAAE,UAAU,EAAE,IAAI,CAAC,CAAC;QAC1D,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;IACrC,CAAC;CACF"}
@@ -0,0 +1,49 @@
1
+ /**
2
+ * ONNX Runtime embedding using snowflake-arctic-embed-s.
3
+ *
4
+ * Provides a high-level Embedder class that loads an ONNX model and
5
+ * produces 384-dimensional embedding vectors from text input.
6
+ *
7
+ * Uses a real BERT WordPiece tokenizer with the model's vocabulary
8
+ * for proper subword tokenization and meaningful embeddings.
9
+ */
10
+ import type { EmbeddingVector } from './types.js';
11
+ /**
12
+ * Embedder wraps an ONNX inference session and a tokenizer vocabulary for producing text embeddings.
13
+ */
14
+ export declare class Embedder {
15
+ private session;
16
+ private vocab;
17
+ readonly modelPath: string;
18
+ private readonly vocabPath;
19
+ /**
20
+ * @param modelPath - Absolute path to the ONNX model file.
21
+ * @param vocabPath - Absolute path to the vocab.txt file. Optional; the implementation defaults to a vocab.txt in the same directory as the model.
22
+ */
23
+ constructor(modelPath: string, vocabPath?: string);
24
+ /**
25
+ * Load the ONNX model and the vocabulary into memory. Called lazily on the first
26
+ * embed() or embedBatch() call, but can be called explicitly to pre-warm.
27
+ */
28
+ initialize(): Promise<void>;
29
+ /**
30
+ * Embed a single text string into a 384-dimensional vector (delegates to embedBatch()).
31
+ *
32
+ * @param text - The text to embed.
33
+ * @param prefix - Optional prefix prepended verbatim to the text (e.g., "query: " for search queries).
34
+ * @returns An L2-normalized Float32Array of 384 dimensions.
35
+ */
36
+ embed(text: string, prefix?: string): Promise<EmbeddingVector>;
37
+ /**
38
+ * Embed multiple texts in a single batch (one model run for the whole array).
39
+ *
40
+ * @param texts - Array of text strings to embed.
41
+ * @returns Array of L2-normalized Float32Array embeddings, one per input text, in input order.
42
+ */
43
+ embedBatch(texts: string[]): Promise<EmbeddingVector[]>;
44
+ /**
45
+ * Release the ONNX session (best-effort); a later embed call initializes it again.
46
+ */
47
+ dispose(): void;
48
+ }
49
+ //# sourceMappingURL=embedder.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"embedder.d.ts","sourceRoot":"","sources":["../../pipeline/embedder.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAGH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAMlD;;GAEG;AACH,qBAAa,QAAQ;IACnB,OAAO,CAAC,OAAO,CAAqC;IACpD,OAAO,CAAC,KAAK,CAA2B;IACxC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IAEnC;;;OAGG;gBACS,SAAS,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM;IAMjD;;;OAGG;IACG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAqBjC;;;;;;OAMG;IACG,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC;IAKpE;;;;;OAKG;IACG,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC;IAoH7D;;OAEG;IACH,OAAO,IAAI,IAAI;CAQhB"}
@@ -0,0 +1,195 @@
1
+ /**
2
+ * ONNX Runtime embedding using snowflake-arctic-embed-s.
3
+ *
4
+ * Provides a high-level Embedder class that loads an ONNX model and
5
+ * produces 384-dimensional embedding vectors from text input.
6
+ *
7
+ * Uses a real BERT WordPiece tokenizer with the model's vocabulary
8
+ * for proper subword tokenization and meaningful embeddings.
9
+ */
10
+ import * as ort from 'onnxruntime-node';
11
+ import { loadVocabulary, tokenize } from './wordpiece-tokenizer.js';
12
/** Maximum sequence length for the model. */
const MAX_SEQ_LENGTH = 512;
/**
 * Embedder wraps an ONNX inference session for producing text embeddings.
 *
 * The session and the tokenizer vocabulary are loaded lazily on the first
 * embedding request, or eagerly via initialize().
 */
export class Embedder {
    /** ONNX inference session; null until initialize() succeeds or after dispose(). */
    session = null;
    /** Tokenizer vocabulary; null until initialize() succeeds. */
    vocab = null;
    modelPath;
    vocabPath;
    /**
     * @param modelPath - Absolute path to the ONNX model file.
     * @param vocabPath - Absolute path to the vocab.txt file. When omitted, the
     *   last path segment of modelPath is replaced with "vocab.txt".
     */
    constructor(modelPath, vocabPath) {
        this.modelPath = modelPath;
        // Default: vocab.txt in the same directory as the model
        this.vocabPath = vocabPath ?? modelPath.replace(/[^/\\]+$/, 'vocab.txt');
    }
    /**
     * Load the ONNX model and the vocabulary into memory. Called lazily on the
     * first embed()/embedBatch() call, but can be called explicitly to pre-warm.
     *
     * @throws If the vocabulary cannot be loaded or the model cannot be opened.
     */
    async initialize() {
        if (this.session && this.vocab) {
            return;
        }
        // Resolve both resources into locals and publish them together: a failure
        // in either step must not leave a session without a vocabulary, which
        // embedBatch() would otherwise mistake for a ready state.
        const vocab = this.vocab ?? loadVocabulary(this.vocabPath);
        let session = this.session;
        if (!session) {
            try {
                session = await ort.InferenceSession.create(this.modelPath, {
                    executionProviders: ['cpu'],
                });
            }
            catch (error) {
                const msg = error instanceof Error ? error.message : String(error);
                throw new Error(`Failed to load ONNX model at ${this.modelPath}: ${msg}. ` +
                    'Ensure the model file exists and is a valid ONNX model.');
            }
        }
        this.vocab = vocab;
        this.session = session;
    }
    /**
     * Embed a single text string into a 384-dimensional vector.
     *
     * @param text - The text to embed.
     * @param prefix - Optional prefix prepended verbatim to the text
     *   (e.g., "query: " for search queries).
     * @returns An L2-normalized Float32Array.
     */
    async embed(text, prefix) {
        const results = await this.embedBatch([prefix ? `${prefix}${text}` : text]);
        return results[0];
    }
    /**
     * Embed multiple texts in a single batch.
     *
     * @param texts - Array of text strings to embed.
     * @returns Array of L2-normalized Float32Array embeddings, one per input
     *   text, in input order. An empty input yields an empty array without
     *   loading or running the model.
     * @throws If initialization fails, the model returns no output, or the
     *   output tensor is neither 2- nor 3-dimensional.
     */
    async embedBatch(texts) {
        // Nothing to embed: avoid building zero-sized tensors and running the model.
        if (texts.length === 0) {
            return [];
        }
        if (!this.session || !this.vocab) {
            await this.initialize();
        }
        const session = this.session;
        const vocab = this.vocab;
        const batchSize = texts.length;
        // Tokenize each text into input IDs using real WordPiece tokenizer
        const tokenIds = texts.map((text) => tokenize(text, vocab, MAX_SEQ_LENGTH));
        const maxLen = tokenIds.reduce((longest, ids) => Math.max(longest, ids.length), 0);
        // Row-major [batch, maxLen] buffers. Typed arrays start zero-filled, which
        // is both the PAD token id (0) and the "ignore" attention value, so only
        // the real tokens need to be written; the tokenizer's arrays stay untouched.
        const inputIdsFlat = new BigInt64Array(batchSize * maxLen);
        const attentionMaskFlat = new BigInt64Array(batchSize * maxLen);
        tokenIds.forEach((ids, row) => {
            const offset = row * maxLen;
            inputIdsFlat.set(ids, offset);
            attentionMaskFlat.fill(1n, offset, offset + ids.length);
        });
        // Create ONNX tensors
        const feeds = {
            input_ids: new ort.Tensor('int64', inputIdsFlat, [batchSize, maxLen]),
            attention_mask: new ort.Tensor('int64', attentionMaskFlat, [batchSize, maxLen]),
        };
        // Some models expect token_type_ids
        if (session.inputNames.includes('token_type_ids')) {
            const tokenTypeIds = new BigInt64Array(batchSize * maxLen); // all zeros
            feeds['token_type_ids'] = new ort.Tensor('int64', tokenTypeIds, [batchSize, maxLen]);
        }
        // Run inference
        const results = await session.run(feeds);
        // The model may output under various names; try common ones, then fall
        // back to whatever output comes first.
        const outputKey = ['sentence_embedding', 'last_hidden_state'].find((name) => results[name]) ??
            Object.keys(results)[0];
        const outputTensor = outputKey === undefined ? undefined : results[outputKey];
        if (!outputTensor) {
            throw new Error('ONNX model returned no output tensors');
        }
        const outputData = outputTensor.data;
        const outputDims = outputTensor.dims;
        if (outputDims.length === 3) {
            // [batch, seq_len, dim] — mean pooling over the real (unpadded) tokens,
            // which occupy the first ids.length positions of each row.
            const seqLen = outputDims[1];
            const dim = outputDims[2];
            return tokenIds.map((ids, b) => {
                const embedding = new Float32Array(dim);
                const tokenCount = Math.min(ids.length, seqLen);
                for (let s = 0; s < tokenCount; s++) {
                    const base = (b * seqLen + s) * dim;
                    for (let d = 0; d < dim; d++) {
                        embedding[d] += outputData[base + d];
                    }
                }
                // Average
                if (tokenCount > 0) {
                    for (let d = 0; d < dim; d++) {
                        embedding[d] /= tokenCount;
                    }
                }
                return l2Normalize(embedding);
            });
        }
        if (outputDims.length === 2) {
            // [batch, dim] — already pooled; copy each row out and normalize it.
            const dim = outputDims[1];
            return tokenIds.map((_ids, b) => {
                const embedding = new Float32Array(dim);
                for (let d = 0; d < dim; d++) {
                    embedding[d] = outputData[b * dim + d];
                }
                return l2Normalize(embedding);
            });
        }
        throw new Error(`Unexpected output tensor shape: [${outputDims.join(', ')}]`);
    }
    /**
     * Release the ONNX session and free memory. The next embed call will
     * initialize a fresh session.
     */
    dispose() {
        const session = this.session;
        if (!session) {
            return;
        }
        // Drop our reference first so a throwing release() cannot leave a stale
        // session behind.
        this.session = null;
        // InferenceSession doesn't have a sync dispose in all versions;
        // release is best-effort. Where release() returns a promise, swallow a
        // rejection deliberately so it does not surface as an unhandled rejection.
        const pending = session.release?.();
        if (pending && typeof pending.then === 'function') {
            pending.then(undefined, () => { });
        }
    }
}
179
/**
 * Scale a vector to unit Euclidean length.
 *
 * The input is modified in place and is also the return value. A vector
 * whose norm is not greater than zero (e.g. all zeros) is returned unchanged.
 */
function l2Normalize(vec) {
    const squaredSum = vec.reduce((acc, value) => acc + value * value, 0);
    const magnitude = Math.sqrt(squaredSum);
    if (!(magnitude > 0)) {
        return vec;
    }
    vec.forEach((value, idx) => {
        vec[idx] = value / magnitude;
    });
    return vec;
}
195
+ //# sourceMappingURL=embedder.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"embedder.js","sourceRoot":"","sources":["../../pipeline/embedder.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,KAAK,GAAG,MAAM,kBAAkB,CAAC;AAExC,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAmB,MAAM,0BAA0B,CAAC;AAErF,6CAA6C;AAC7C,MAAM,cAAc,GAAG,GAAG,CAAC;AAE3B;;GAEG;AACH,MAAM,OAAO,QAAQ;IACX,OAAO,GAAgC,IAAI,CAAC;IAC5C,KAAK,GAAsB,IAAI,CAAC;IAC/B,SAAS,CAAS;IACV,SAAS,CAAS;IAEnC;;;OAGG;IACH,YAAY,SAAiB,EAAE,SAAkB;QAC/C,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;QAC3B,wDAAwD;QACxD,IAAI,CAAC,SAAS,GAAG,SAAS,IAAI,SAAS,CAAC,OAAO,CAAC,UAAU,EAAE,WAAW,CAAC,CAAC;IAC3E,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,UAAU;QACd,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,OAAO;QACT,CAAC;QAED,IAAI,CAAC;YACH,IAAI,CAAC,OAAO,GAAG,MAAM,GAAG,CAAC,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,SAAS,EAAE;gBAC/D,kBAAkB,EAAE,CAAC,KAAK,CAAC;aAC5B,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,KAAc,EAAE,CAAC;YACxB,MAAM,GAAG,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACnE,MAAM,IAAI,KAAK,CACb,gCAAgC,IAAI,CAAC,SAAS,KAAK,GAAG,IAAI;gBACxD,yDAAyD,CAC5D,CAAC;QACJ,CAAC;QAED,kBAAkB;QAClB,IAAI,CAAC,KAAK,GAAG,cAAc,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC9C,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,KAAK,CAAC,IAAY,EAAE,MAAe;QACvC,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,GAAG,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;QAC5E,OAAO,OAAO,CAAC,CAAC,CAAC,CAAC;IACpB,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,UAAU,CAAC,KAAe;QAC9B,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;QAC1B,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAAQ,CAAC;QAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAM,CAAC;QAC1B,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC;QAE/B,mEAAmE;QACnE,MAAM,WAAW,GAAe,EAAE,CAAC;QACnC,MAAM,gBAAgB,GAAe,EAAE,CAAC;QACxC,IAAI,MAAM,GAAG,CAAC,CAAC;QAEf,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,GAAG,GAAG,QAAQ,CAAC,IAAI,EAAE,KAAK,EAAE,cAAc,CAAC,CAAC;YAClD,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACtB,gBAAgB,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;YACzC,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,EAAE,GAAG,C
AAC,MAAM,CAAC,CAAC;QACxC,CAAC;QAED,wBAAwB;QACxB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,OAAO,WAAW,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,MAAM,EAAE,CAAC;gBACtC,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,YAAY;gBACrC,gBAAgB,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAC/B,CAAC;QACH,CAAC;QAED,4BAA4B;QAC5B,MAAM,YAAY,GAAG,IAAI,aAAa,CAAC,SAAS,GAAG,MAAM,CAAC,CAAC;QAC3D,MAAM,iBAAiB,GAAG,IAAI,aAAa,CAAC,SAAS,GAAG,MAAM,CAAC,CAAC;QAEhE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAChC,YAAY,CAAC,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBACjD,iBAAiB,CAAC,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC7D,CAAC;QACH,CAAC;QAED,sBAAsB;QACtB,MAAM,KAAK,GAA+B;YACxC,SAAS,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,YAAY,EAAE,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;YACrE,cAAc,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,iBAAiB,EAAE,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;SAChF,CAAC;QAEF,oCAAoC;QACpC,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACtC,IAAI,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,CAAC;YAC1C,MAAM,YAAY,GAAG,IAAI,aAAa,CAAC,SAAS,GAAG,MAAM,CAAC,CAAC,CAAC,YAAY;YACxE,KAAK,CAAC,gBAAgB,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,YAAY,EAAE,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,CAAC;QACvF,CAAC;QAED,gBAAgB;QAChB,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAEzC,iCAAiC;QACjC,4DAA4D;QAC5D,MAAM,SAAS,GACb,OAAO,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC;YACtD,OAAO,CAAC,mBAAmB,CAAC,CAAC,CAAC,CAAC,mBAAmB,CAAC,CAAC;gBACpD,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;QAE1B,MAAM,YAAY,GAAG,OAAO,CAAC,SAAS,CAAC,CAAC;QACxC,MAAM,UAAU,GAAG,YAAY,CAAC,IAAoB,CAAC;QACrD,MAAM,UAAU,GAAG,YAAY,CAAC,IAAyB,CAAC;QAE1D,6DAA6D;QAC7D,0CAA0C;QAC1C,MAAM,UAAU,GAAsB,EAAE,CAAC;QAEzC,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC5B,2DAA2D;YAC3D,MAAM,MAAM,GAAG,UAAU,CAAC,CAAC,CAAW,CAAC;YACvC,MAAM,GAAG,GAAG,UAAU,CAAC,CAAC,CAAW,CAAC;YAEpC,KAAK,IAAI,C
AAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACnC,MAAM,SAAS,GAAG,IAAI,YAAY,CAAC,GAAG,CAAC,CAAC;gBACxC,IAAI,UAAU,GAAG,CAAC,CAAC;gBAEnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;oBAChC,IAAI,gBAAgB,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC;wBAClC,UAAU,EAAE,CAAC;wBACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;4BAC7B,SAAS,CAAC,CAAC,CAAC,IAAI,UAAU,CAAC,CAAC,GAAG,MAAM,GAAG,GAAG,GAAG,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC,CAAC;wBAC7D,CAAC;oBACH,CAAC;gBACH,CAAC;gBAED,UAAU;gBACV,IAAI,UAAU,GAAG,CAAC,EAAE,CAAC;oBACnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;wBAC7B,SAAS,CAAC,CAAC,CAAC,IAAI,UAAU,CAAC;oBAC7B,CAAC;gBACH,CAAC;gBAED,eAAe;gBACf,UAAU,CAAC,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC,CAAC;YAC1C,CAAC;QACH,CAAC;aAAM,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACnC,gCAAgC;YAChC,MAAM,GAAG,GAAG,UAAU,CAAC,CAAC,CAAW,CAAC;YACpC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACnC,MAAM,SAAS,GAAG,IAAI,YAAY,CAAC,GAAG,CAAC,CAAC;gBACxC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;oBAC7B,SAAS,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC,CAAC;gBACzC,CAAC;gBACD,UAAU,CAAC,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC,CAAC;YAC1C,CAAC;QACH,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,KAAK,CAAC,oCAAoC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAChF,CAAC;QAED,OAAO,UAAU,CAAC;IACpB,CAAC;IAED;;OAEG;IACH,OAAO;QACL,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,gEAAgE;YAChE,0BAA0B;YAC1B,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC;YACzB,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;IACH,CAAC;CACF;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,GAAiB;IACpC,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,KAAK,IAAI,GAAG,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;IAC3B,CAAC;IACD,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC9B,IAAI,IAAI,GAAG,CAAC,EAAE,CAAC;QACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,
CAAC;YACpC,GAAG,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;QACjB,CAAC;IACH,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC"}
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Content extraction from HTML using Mozilla Readability.
3
+ *
4
+ * Parses HTML with `linkedom` and runs it through Readability to pull out
5
+ * the main article content, stripped of navigation, ads, and boilerplate.
6
+ */
7
+ import type { ExtractedContent } from './types.js';
8
+ /**
9
+ * Extract the main article content from an HTML string.
10
+ *
11
+ * @param html - The raw HTML string to extract content from.
12
+ * @param url - The source URL; included in error messages and copied into the result (the implementation does not use it to resolve relative links).
13
+ * @returns Extracted content with title, cleaned plain text, the given URL, and the UTF-8 byte length of the text.
14
+ * @throws If the HTML is empty or no text content can be extracted.
15
+ */
16
+ export declare function extractContent(html: string, url: string): ExtractedContent;
17
+ //# sourceMappingURL=extract.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extract.d.ts","sourceRoot":"","sources":["../../pipeline/extract.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AAEnD;;;;;;;GAOG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,gBAAgB,CAoD1E"}
@@ -0,0 +1,70 @@
1
+ /**
2
+ * Content extraction from HTML using Mozilla Readability.
3
+ *
4
+ * Parses HTML with `linkedom` and runs it through Readability to pull out
5
+ * the main article content, stripped of navigation, ads, and boilerplate.
6
+ */
7
+ import { Readability } from '@mozilla/readability';
8
+ import { parseHTML } from 'linkedom';
9
/**
 * Pull the readable text out of an HTML document.
 *
 * Readability is tried first. When it yields no text, the function falls back
 * to the text of <body> with <script> and <style> elements removed.
 *
 * @param html - Raw HTML markup.
 * @param url - Source URL; reported in error messages and returned unchanged.
 * @returns Object with title, cleaned text content, url, and the UTF-8 byte
 *   length of the content.
 * @throws If html is empty/blank or no text can be extracted.
 */
export function extractContent(html, url) {
    if (!html || html.trim().length === 0) {
        throw new Error(`Empty HTML content from ${url}`);
    }
    // First choice: let Readability pick out the article.
    const article = new Readability(parseHTML(html).document).parse();
    const articleText = article ? article.textContent : null;
    let title = '';
    let rawContent;
    if (articleText && articleText.trim().length > 0) {
        title = article.title || '';
        rawContent = articleText;
    }
    else {
        // Fallback: plain body text. Fragments without a <body> are wrapped in a
        // full document so that linkedom creates a body element to read from.
        const hasBody = html.includes('<body');
        const { document: bodyDoc } = parseHTML(hasBody ? html : `<html><body>${html}</body></html>`);
        for (const node of bodyDoc.querySelectorAll('script, style')) {
            node.remove();
        }
        rawContent = bodyDoc.querySelector('body')?.textContent ?? '';
        if (rawContent.trim().length === 0) {
            throw new Error(`No extractable content from ${url}`);
        }
    }
    // No title yet: read the <title> tag from a fresh parse of the input.
    if (!title) {
        title = parseHTML(html).document.querySelector('title')?.textContent?.trim() ?? '';
    }
    // Normalize whitespace; reject input that cleans down to nothing.
    const content = cleanText(rawContent);
    if (content.length === 0) {
        throw new Error(`No extractable content from ${url}`);
    }
    return { title, content, url, byteLength: Buffer.byteLength(content, 'utf-8') };
}
60
/**
 * Clean extracted text by collapsing whitespace and trimming.
 *
 * Steps, in order: normalize CR/CRLF line endings to LF, collapse runs of
 * tabs/spaces to one space, trim every line, collapse three or more
 * consecutive newlines to a single blank line, and trim the whole string.
 *
 * Lines are trimmed BEFORE newline runs are collapsed: whitespace-only lines
 * (common in DOM textContent) only become empty after trimming, so collapsing
 * first would leave arbitrarily long runs of blank lines in the output.
 *
 * @param text - Raw text to clean.
 * @returns The cleaned text; may be empty.
 */
function cleanText(text) {
    return text
        .replace(/\r\n?/g, '\n') // normalize line endings
        .replace(/[\t ]+/g, ' ') // collapse horizontal whitespace
        .replace(/^ +| +$/gm, '') // trim each line
        .replace(/\n{3,}/g, '\n\n') // collapse excessive newlines
        .trim();
}
70
+ //# sourceMappingURL=extract.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extract.js","sourceRoot":"","sources":["../../pipeline/extract.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAGrC;;;;;;;GAOG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY,EAAE,GAAW;IACtD,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,KAAK,CAAC,2BAA2B,GAAG,EAAE,CAAC,CAAC;IACpD,CAAC;IAED,MAAM,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAErC,iCAAiC;IACjC,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,QAAe,CAAC,CAAC;IAChD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;IAE/B,IAAI,KAAa,CAAC;IAClB,IAAI,UAAkB,CAAC;IAEvB,IAAI,OAAO,IAAI,OAAO,CAAC,WAAW,IAAI,OAAO,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5E,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,EAAE,CAAC;QAC5B,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC;IACnC,CAAC;SAAM,CAAC;QACN,8DAA8D;QAC9D,yEAAyE;QACzE,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,eAAe,IAAI,gBAAgB,CAAC;QACxF,MAAM,EAAE,QAAQ,EAAE,WAAW,EAAE,GAAG,SAAS,CAAC,WAAW,CAAC,CAAC;QACzD,KAAK,MAAM,EAAE,IAAI,WAAW,CAAC,gBAAgB,CAAC,eAAe,CAAC,EAAE,CAAC;YAC/D,EAAE,CAAC,MAAM,EAAE,CAAC;QACd,CAAC;QACD,MAAM,IAAI,GAAG,WAAW,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QAC/C,UAAU,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QAEhD,IAAI,UAAU,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CAAC,+BAA+B,GAAG,EAAE,CAAC,CAAC;QACxD,CAAC;QAED,KAAK,GAAG,EAAE,CAAC;IACb,CAAC;IAED,qDAAqD;IACrD,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QAC/C,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;QAChD,KAAK,GAAG,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC7C,CAAC;IAED,0DAA0D;IAC1D,MAAM,OAAO,GAAG,SAAS,CAAC,UAAU,CAAC,CAAC;IAEtC,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,+BAA+B,GAAG,EAAE,CAAC,CAAC;IACxD,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,CAAC,UAAU,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;IAEvD,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,EAAE,UAAU,EAAE,CAAC;AAC7C,CAAC;AAED;;GAEG;AACH,SAAS,SAAS,CAAC
,IAAY;IAC7B,OAAO,IAAI;SACR,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAO,iCAAiC;SAC/D,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAI,8BAA8B;SAC5D,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAM,iBAAiB;SAC/C,IAAI,EAAE,CAAC;AACZ,CAAC"}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * URL fetching for the content pipeline.
3
+ *
4
+ * Uses Node.js built-in `fetch` (available since Node 18) with
5
+ * configurable timeout, redirect limits, and user-agent.
6
+ */
7
+ import type { PipelineConfig } from './types.js';
8
+ /** Result of a successful URL fetch, as resolved by fetchUrl(). */
9
+ export interface FetchResult {
10
+ /** Raw HTML body, as a string. */
11
+ html: string;
12
+ /** Final URL after any redirects. */
13
+ finalUrl: string;
14
+ /** Content-Type header value of the response. */
15
+ contentType: string;
16
+ }
17
+ /**
18
+ * Fetch the HTML content of a URL.
19
+ *
20
+ * @param url - The URL to fetch.
21
+ * @param config - Optional partial pipeline config overrides (NOTE(review): presumably the timeout, redirect-limit and user-agent settings named in the file header; confirm against PipelineConfig).
22
+ * @returns A promise resolving to a FetchResult: the HTML content, final URL, and content type.
23
+ * @throws On network errors, non-2xx status codes, or non-HTML content.
24
+ */
25
+ export declare function fetchUrl(url: string, config?: Partial<PipelineConfig>): Promise<FetchResult>;
26
+ //# sourceMappingURL=fetch.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fetch.d.ts","sourceRoot":"","sources":["../../pipeline/fetch.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AASjD,wCAAwC;AACxC,MAAM,WAAW,WAAW;IAC1B,qBAAqB;IACrB,IAAI,EAAE,MAAM,CAAC;IACb,qCAAqC;IACrC,QAAQ,EAAE,MAAM,CAAC;IACjB,iCAAiC;IACjC,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;;;;;;GAOG;AACH,wBAAsB,QAAQ,CAC5B,GAAG,EAAE,MAAM,EACX,MAAM,CAAC,EAAE,OAAO,CAAC,cAAc,CAAC,GAC/B,OAAO,CAAC,WAAW,CAAC,CA6EtB"}