vectra 0.12.3 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (375) hide show
  1. package/README.md +92 -100
  2. package/lib/BrowserWebFetcher.d.ts +75 -0
  3. package/lib/BrowserWebFetcher.d.ts.map +1 -0
  4. package/lib/BrowserWebFetcher.js +290 -0
  5. package/lib/BrowserWebFetcher.js.map +1 -0
  6. package/lib/FileFetcher.d.ts.map +1 -1
  7. package/lib/FileFetcher.js +25 -15
  8. package/lib/FileFetcher.js.map +1 -1
  9. package/lib/FileFetcher.spec.d.ts +2 -0
  10. package/lib/FileFetcher.spec.d.ts.map +1 -0
  11. package/lib/FileFetcher.spec.js +244 -0
  12. package/lib/FileFetcher.spec.js.map +1 -0
  13. package/lib/FolderWatcher.d.ts +91 -0
  14. package/lib/FolderWatcher.d.ts.map +1 -0
  15. package/lib/FolderWatcher.js +304 -0
  16. package/lib/FolderWatcher.js.map +1 -0
  17. package/lib/FolderWatcher.spec.d.ts +2 -0
  18. package/lib/FolderWatcher.spec.d.ts.map +1 -0
  19. package/lib/FolderWatcher.spec.js +308 -0
  20. package/lib/FolderWatcher.spec.js.map +1 -0
  21. package/lib/GPT3Tokenizer.spec.d.ts +2 -0
  22. package/lib/GPT3Tokenizer.spec.d.ts.map +1 -0
  23. package/lib/GPT3Tokenizer.spec.js +45 -0
  24. package/lib/GPT3Tokenizer.spec.js.map +1 -0
  25. package/lib/ItemSelector.d.ts.map +1 -1
  26. package/lib/ItemSelector.js +19 -8
  27. package/lib/ItemSelector.js.map +1 -1
  28. package/lib/ItemSelector.spec.d.ts +2 -0
  29. package/lib/ItemSelector.spec.d.ts.map +1 -0
  30. package/lib/ItemSelector.spec.js +204 -0
  31. package/lib/ItemSelector.spec.js.map +1 -0
  32. package/lib/LocalDocument.d.ts +1 -1
  33. package/lib/LocalDocument.d.ts.map +1 -1
  34. package/lib/LocalDocument.js +5 -45
  35. package/lib/LocalDocument.js.map +1 -1
  36. package/lib/LocalDocument.spec.d.ts +2 -0
  37. package/lib/LocalDocument.spec.d.ts.map +1 -0
  38. package/lib/LocalDocument.spec.js +214 -0
  39. package/lib/LocalDocument.spec.js.map +1 -0
  40. package/lib/LocalDocumentIndex.d.ts +20 -0
  41. package/lib/LocalDocumentIndex.d.ts.map +1 -1
  42. package/lib/LocalDocumentIndex.js +16 -52
  43. package/lib/LocalDocumentIndex.js.map +1 -1
  44. package/lib/LocalDocumentIndex.spec.d.ts +2 -0
  45. package/lib/LocalDocumentIndex.spec.d.ts.map +1 -0
  46. package/lib/LocalDocumentIndex.spec.js +494 -0
  47. package/lib/LocalDocumentIndex.spec.js.map +1 -0
  48. package/lib/LocalDocumentResult.d.ts +32 -11
  49. package/lib/LocalDocumentResult.d.ts.map +1 -1
  50. package/lib/LocalDocumentResult.js +305 -257
  51. package/lib/LocalDocumentResult.js.map +1 -1
  52. package/lib/LocalDocumentResult.spec.d.ts +2 -0
  53. package/lib/LocalDocumentResult.spec.d.ts.map +1 -0
  54. package/lib/LocalDocumentResult.spec.js +373 -0
  55. package/lib/LocalDocumentResult.spec.js.map +1 -0
  56. package/lib/LocalEmbeddings.d.ts +59 -0
  57. package/lib/LocalEmbeddings.d.ts.map +1 -0
  58. package/lib/LocalEmbeddings.js +101 -0
  59. package/lib/LocalEmbeddings.js.map +1 -0
  60. package/lib/LocalEmbeddings.spec.d.ts +2 -0
  61. package/lib/LocalEmbeddings.spec.d.ts.map +1 -0
  62. package/lib/LocalEmbeddings.spec.js +155 -0
  63. package/lib/LocalEmbeddings.spec.js.map +1 -0
  64. package/lib/LocalIndex.d.ts +27 -18
  65. package/lib/LocalIndex.d.ts.map +1 -1
  66. package/lib/LocalIndex.js +109 -105
  67. package/lib/LocalIndex.js.map +1 -1
  68. package/lib/LocalIndex.spec.js +434 -43
  69. package/lib/LocalIndex.spec.js.map +1 -1
  70. package/lib/OpenAIEmbeddings.d.ts +4 -6
  71. package/lib/OpenAIEmbeddings.d.ts.map +1 -1
  72. package/lib/OpenAIEmbeddings.js +16 -24
  73. package/lib/OpenAIEmbeddings.js.map +1 -1
  74. package/lib/OpenAIEmbeddings.spec.d.ts +2 -0
  75. package/lib/OpenAIEmbeddings.spec.d.ts.map +1 -0
  76. package/lib/OpenAIEmbeddings.spec.js +298 -0
  77. package/lib/OpenAIEmbeddings.spec.js.map +1 -0
  78. package/lib/TextSplitter.d.ts +2 -0
  79. package/lib/TextSplitter.d.ts.map +1 -1
  80. package/lib/TextSplitter.js +154 -111
  81. package/lib/TextSplitter.js.map +1 -1
  82. package/lib/TextSplitter.spec.js +289 -61
  83. package/lib/TextSplitter.spec.js.map +1 -1
  84. package/lib/TransformersEmbeddings.d.ts +121 -0
  85. package/lib/TransformersEmbeddings.d.ts.map +1 -0
  86. package/lib/TransformersEmbeddings.js +176 -0
  87. package/lib/TransformersEmbeddings.js.map +1 -0
  88. package/lib/TransformersEmbeddings.spec.d.ts +2 -0
  89. package/lib/TransformersEmbeddings.spec.d.ts.map +1 -0
  90. package/lib/TransformersEmbeddings.spec.js +198 -0
  91. package/lib/TransformersEmbeddings.spec.js.map +1 -0
  92. package/lib/TransformersTokenizer.d.ts +33 -0
  93. package/lib/TransformersTokenizer.d.ts.map +1 -0
  94. package/lib/TransformersTokenizer.js +44 -0
  95. package/lib/TransformersTokenizer.js.map +1 -0
  96. package/lib/TransformersTokenizer.spec.d.ts +2 -0
  97. package/lib/TransformersTokenizer.spec.d.ts.map +1 -0
  98. package/lib/TransformersTokenizer.spec.js +112 -0
  99. package/lib/TransformersTokenizer.spec.js.map +1 -0
  100. package/lib/WebFetcher.d.ts +1 -2
  101. package/lib/WebFetcher.d.ts.map +1 -1
  102. package/lib/WebFetcher.js +58 -54
  103. package/lib/WebFetcher.js.map +1 -1
  104. package/lib/WebFetcher.spec.d.ts +2 -0
  105. package/lib/WebFetcher.spec.d.ts.map +1 -0
  106. package/lib/WebFetcher.spec.js +263 -0
  107. package/lib/WebFetcher.spec.js.map +1 -0
  108. package/lib/browser.d.ts +30 -0
  109. package/lib/browser.d.ts.map +1 -0
  110. package/lib/browser.js +52 -0
  111. package/lib/browser.js.map +1 -0
  112. package/lib/codecs/IndexCodec.d.ts +37 -0
  113. package/lib/codecs/IndexCodec.d.ts.map +1 -0
  114. package/lib/codecs/IndexCodec.js +3 -0
  115. package/lib/codecs/IndexCodec.js.map +1 -0
  116. package/lib/codecs/JsonCodec.d.ts +19 -0
  117. package/lib/codecs/JsonCodec.d.ts.map +1 -0
  118. package/lib/codecs/JsonCodec.js +35 -0
  119. package/lib/codecs/JsonCodec.js.map +1 -0
  120. package/lib/codecs/JsonCodec.spec.d.ts +2 -0
  121. package/lib/codecs/JsonCodec.spec.d.ts.map +1 -0
  122. package/lib/codecs/JsonCodec.spec.js +66 -0
  123. package/lib/codecs/JsonCodec.spec.js.map +1 -0
  124. package/lib/codecs/LocalIndex.protobuf.spec.d.ts +2 -0
  125. package/lib/codecs/LocalIndex.protobuf.spec.d.ts.map +1 -0
  126. package/lib/codecs/LocalIndex.protobuf.spec.js +108 -0
  127. package/lib/codecs/LocalIndex.protobuf.spec.js.map +1 -0
  128. package/lib/codecs/ProtobufCodec.d.ts +20 -0
  129. package/lib/codecs/ProtobufCodec.d.ts.map +1 -0
  130. package/lib/codecs/ProtobufCodec.js +225 -0
  131. package/lib/codecs/ProtobufCodec.js.map +1 -0
  132. package/lib/codecs/ProtobufCodec.spec.d.ts +2 -0
  133. package/lib/codecs/ProtobufCodec.spec.d.ts.map +1 -0
  134. package/lib/codecs/ProtobufCodec.spec.js +155 -0
  135. package/lib/codecs/ProtobufCodec.spec.js.map +1 -0
  136. package/lib/codecs/index.d.ts +5 -0
  137. package/lib/codecs/index.d.ts.map +1 -0
  138. package/lib/codecs/index.js +21 -0
  139. package/lib/codecs/index.js.map +1 -0
  140. package/lib/codecs/migrateIndex.d.ts +24 -0
  141. package/lib/codecs/migrateIndex.d.ts.map +1 -0
  142. package/lib/codecs/migrateIndex.js +119 -0
  143. package/lib/codecs/migrateIndex.js.map +1 -0
  144. package/lib/codecs/migrateIndex.spec.d.ts +2 -0
  145. package/lib/codecs/migrateIndex.spec.d.ts.map +1 -0
  146. package/lib/codecs/migrateIndex.spec.js +151 -0
  147. package/lib/codecs/migrateIndex.spec.js.map +1 -0
  148. package/lib/codecs/schemas/index.proto +34 -0
  149. package/lib/index.d.ts +9 -1
  150. package/lib/index.d.ts.map +1 -1
  151. package/lib/index.js +9 -1
  152. package/lib/index.js.map +1 -1
  153. package/lib/internals/Colorize.d.ts.map +1 -1
  154. package/lib/internals/Colorize.js +20 -15
  155. package/lib/internals/Colorize.js.map +1 -1
  156. package/lib/server/IndexManager.d.ts +78 -0
  157. package/lib/server/IndexManager.d.ts.map +1 -0
  158. package/lib/server/IndexManager.js +259 -0
  159. package/lib/server/IndexManager.js.map +1 -0
  160. package/lib/server/VectraServer.d.ts +40 -0
  161. package/lib/server/VectraServer.d.ts.map +1 -0
  162. package/lib/server/VectraServer.js +151 -0
  163. package/lib/server/VectraServer.js.map +1 -0
  164. package/lib/server/VectraServer.spec.d.ts +2 -0
  165. package/lib/server/VectraServer.spec.d.ts.map +1 -0
  166. package/lib/server/VectraServer.spec.js +322 -0
  167. package/lib/server/VectraServer.spec.js.map +1 -0
  168. package/lib/server/handlers/documentHandlers.d.ts +15 -0
  169. package/lib/server/handlers/documentHandlers.d.ts.map +1 -0
  170. package/lib/server/handlers/documentHandlers.js +95 -0
  171. package/lib/server/handlers/documentHandlers.js.map +1 -0
  172. package/lib/server/handlers/helpers.d.ts +23 -0
  173. package/lib/server/handlers/helpers.d.ts.map +1 -0
  174. package/lib/server/handlers/helpers.js +138 -0
  175. package/lib/server/handlers/helpers.js.map +1 -0
  176. package/lib/server/handlers/index.d.ts +8 -0
  177. package/lib/server/handlers/index.d.ts.map +1 -0
  178. package/lib/server/handlers/index.js +22 -0
  179. package/lib/server/handlers/index.js.map +1 -0
  180. package/lib/server/handlers/indexHandlers.d.ts +14 -0
  181. package/lib/server/handlers/indexHandlers.d.ts.map +1 -0
  182. package/lib/server/handlers/indexHandlers.js +85 -0
  183. package/lib/server/handlers/indexHandlers.js.map +1 -0
  184. package/lib/server/handlers/itemHandlers.d.ts +34 -0
  185. package/lib/server/handlers/itemHandlers.d.ts.map +1 -0
  186. package/lib/server/handlers/itemHandlers.js +166 -0
  187. package/lib/server/handlers/itemHandlers.js.map +1 -0
  188. package/lib/server/handlers/lifecycleHandlers.d.ts +11 -0
  189. package/lib/server/handlers/lifecycleHandlers.d.ts.map +1 -0
  190. package/lib/server/handlers/lifecycleHandlers.js +31 -0
  191. package/lib/server/handlers/lifecycleHandlers.js.map +1 -0
  192. package/lib/server/handlers/queryHandlers.d.ts +27 -0
  193. package/lib/server/handlers/queryHandlers.d.ts.map +1 -0
  194. package/lib/server/handlers/queryHandlers.js +135 -0
  195. package/lib/server/handlers/queryHandlers.js.map +1 -0
  196. package/lib/server/handlers/statsHandlers.d.ts +17 -0
  197. package/lib/server/handlers/statsHandlers.d.ts.map +1 -0
  198. package/lib/server/handlers/statsHandlers.js +81 -0
  199. package/lib/server/handlers/statsHandlers.js.map +1 -0
  200. package/lib/server/index.d.ts +4 -0
  201. package/lib/server/index.d.ts.map +1 -0
  202. package/lib/server/index.js +23 -0
  203. package/lib/server/index.js.map +1 -0
  204. package/lib/storage/FileStorage.d.ts +92 -0
  205. package/lib/storage/FileStorage.d.ts.map +1 -0
  206. package/lib/storage/FileStorage.js +3 -0
  207. package/lib/storage/FileStorage.js.map +1 -0
  208. package/lib/storage/FileStorageUtilities.d.ts +36 -0
  209. package/lib/storage/FileStorageUtilities.d.ts.map +1 -0
  210. package/lib/storage/FileStorageUtilities.js +91 -0
  211. package/lib/storage/FileStorageUtilities.js.map +1 -0
  212. package/lib/storage/FileStorageUtilities.spec.d.ts +2 -0
  213. package/lib/storage/FileStorageUtilities.spec.d.ts.map +1 -0
  214. package/lib/storage/FileStorageUtilities.spec.js +98 -0
  215. package/lib/storage/FileStorageUtilities.spec.js.map +1 -0
  216. package/lib/storage/FileType.d.ts +29 -0
  217. package/lib/storage/FileType.d.ts.map +1 -0
  218. package/lib/storage/FileType.js +38 -0
  219. package/lib/storage/FileType.js.map +1 -0
  220. package/lib/storage/IndexedDBStorage.d.ts +47 -0
  221. package/lib/storage/IndexedDBStorage.d.ts.map +1 -0
  222. package/lib/storage/IndexedDBStorage.js +347 -0
  223. package/lib/storage/IndexedDBStorage.js.map +1 -0
  224. package/lib/storage/LocalFileStorage.browser.d.ts +19 -0
  225. package/lib/storage/LocalFileStorage.browser.d.ts.map +1 -0
  226. package/lib/storage/LocalFileStorage.browser.js +43 -0
  227. package/lib/storage/LocalFileStorage.browser.js.map +1 -0
  228. package/lib/storage/LocalFileStorage.d.ts +23 -0
  229. package/lib/storage/LocalFileStorage.d.ts.map +1 -0
  230. package/lib/storage/LocalFileStorage.js +152 -0
  231. package/lib/storage/LocalFileStorage.js.map +1 -0
  232. package/lib/storage/LocalFileStorage.spec.d.ts +2 -0
  233. package/lib/storage/LocalFileStorage.spec.d.ts.map +1 -0
  234. package/lib/storage/LocalFileStorage.spec.js +249 -0
  235. package/lib/storage/LocalFileStorage.spec.js.map +1 -0
  236. package/lib/storage/VirtualFileStorage.d.ts +18 -0
  237. package/lib/storage/VirtualFileStorage.d.ts.map +1 -0
  238. package/lib/storage/VirtualFileStorage.js +178 -0
  239. package/lib/storage/VirtualFileStorage.js.map +1 -0
  240. package/lib/storage/VirtualFileStorage.spec.d.ts +2 -0
  241. package/lib/storage/VirtualFileStorage.spec.d.ts.map +1 -0
  242. package/lib/storage/VirtualFileStorage.spec.js +302 -0
  243. package/lib/storage/VirtualFileStorage.spec.js.map +1 -0
  244. package/lib/storage/index.d.ts +6 -0
  245. package/lib/storage/index.d.ts.map +1 -0
  246. package/lib/storage/index.js +22 -0
  247. package/lib/storage/index.js.map +1 -0
  248. package/lib/templates/templates/csharp/README.md +48 -0
  249. package/lib/templates/templates/csharp/VectraClient.cs +234 -0
  250. package/lib/templates/templates/go/README.md +71 -0
  251. package/lib/templates/templates/go/vectra_client.go +322 -0
  252. package/lib/templates/templates/java/README.md +81 -0
  253. package/lib/templates/templates/java/VectraClient.java +232 -0
  254. package/lib/templates/templates/python/README.md +37 -0
  255. package/lib/templates/templates/python/vectra_client.py +279 -0
  256. package/lib/templates/templates/rust/Cargo.toml +14 -0
  257. package/lib/templates/templates/rust/README.md +39 -0
  258. package/lib/templates/templates/rust/build.rs +4 -0
  259. package/lib/templates/templates/rust/lib.rs +284 -0
  260. package/lib/templates/templates/typescript/README.md +96 -0
  261. package/lib/templates/templates/typescript/VectraClient.ts +374 -0
  262. package/lib/templates/typescript/VectraClient.d.ts +114 -0
  263. package/lib/templates/typescript/VectraClient.d.ts.map +1 -0
  264. package/lib/templates/typescript/VectraClient.js +328 -0
  265. package/lib/templates/typescript/VectraClient.js.map +1 -0
  266. package/lib/types.d.ts +7 -0
  267. package/lib/types.d.ts.map +1 -1
  268. package/lib/utils/index.d.ts +2 -0
  269. package/lib/utils/index.d.ts.map +1 -0
  270. package/lib/utils/index.js +18 -0
  271. package/lib/utils/index.js.map +1 -0
  272. package/lib/utils/pathUtils.d.ts +40 -0
  273. package/lib/utils/pathUtils.d.ts.map +1 -0
  274. package/lib/utils/pathUtils.js +98 -0
  275. package/lib/utils/pathUtils.js.map +1 -0
  276. package/lib/vectra-cli.d.ts.map +1 -1
  277. package/lib/vectra-cli.generate.spec.d.ts +2 -0
  278. package/lib/vectra-cli.generate.spec.d.ts.map +1 -0
  279. package/lib/vectra-cli.generate.spec.js +112 -0
  280. package/lib/vectra-cli.generate.spec.js.map +1 -0
  281. package/lib/vectra-cli.js +446 -9
  282. package/lib/vectra-cli.js.map +1 -1
  283. package/lib/vectra-cli.spec.d.ts +1 -0
  284. package/lib/vectra-cli.spec.d.ts.map +1 -0
  285. package/lib/vectra-cli.spec.js +2 -0
  286. package/lib/vectra-cli.spec.js.map +1 -0
  287. package/package.json +89 -16
  288. package/proto/vectra_service.proto +276 -0
  289. package/src/BrowserWebFetcher.ts +345 -0
  290. package/src/FileFetcher.spec.ts +234 -0
  291. package/src/FileFetcher.ts +37 -25
  292. package/src/FolderWatcher.spec.ts +288 -0
  293. package/src/FolderWatcher.ts +304 -0
  294. package/src/GPT3Tokenizer.spec.ts +50 -0
  295. package/src/ItemSelector.spec.ts +252 -0
  296. package/src/ItemSelector.ts +163 -150
  297. package/src/LocalDocument.spec.ts +211 -0
  298. package/src/LocalDocument.ts +88 -94
  299. package/src/LocalDocumentIndex.spec.ts +481 -0
  300. package/src/LocalDocumentIndex.ts +39 -40
  301. package/src/LocalDocumentResult.spec.ts +373 -0
  302. package/src/LocalDocumentResult.ts +489 -319
  303. package/src/LocalEmbeddings.spec.ts +138 -0
  304. package/src/LocalEmbeddings.ts +120 -0
  305. package/src/LocalIndex.spec.ts +808 -323
  306. package/src/LocalIndex.ts +479 -430
  307. package/src/OpenAIEmbeddings.spec.ts +354 -0
  308. package/src/OpenAIEmbeddings.ts +26 -27
  309. package/src/TextSplitter.spec.ts +320 -65
  310. package/src/TextSplitter.ts +172 -115
  311. package/src/TransformersEmbeddings.spec.ts +188 -0
  312. package/src/TransformersEmbeddings.ts +232 -0
  313. package/src/TransformersTokenizer.spec.ts +143 -0
  314. package/src/TransformersTokenizer.ts +45 -0
  315. package/src/WebFetcher.spec.ts +288 -0
  316. package/src/WebFetcher.ts +184 -186
  317. package/src/browser.ts +69 -0
  318. package/src/codecs/IndexCodec.ts +40 -0
  319. package/src/codecs/JsonCodec.spec.ts +70 -0
  320. package/src/codecs/JsonCodec.ts +37 -0
  321. package/src/codecs/LocalIndex.protobuf.spec.ts +115 -0
  322. package/src/codecs/ProtobufCodec.spec.ts +166 -0
  323. package/src/codecs/ProtobufCodec.ts +193 -0
  324. package/src/codecs/index.ts +4 -0
  325. package/src/codecs/migrateIndex.spec.ts +176 -0
  326. package/src/codecs/migrateIndex.ts +125 -0
  327. package/src/codecs/schemas/index.proto +34 -0
  328. package/src/index.ts +9 -1
  329. package/src/internals/Colorize.ts +19 -16
  330. package/src/server/IndexManager.ts +243 -0
  331. package/src/server/VectraServer.spec.ts +303 -0
  332. package/src/server/VectraServer.ts +156 -0
  333. package/src/server/handlers/documentHandlers.ts +59 -0
  334. package/src/server/handlers/helpers.ts +93 -0
  335. package/src/server/handlers/index.ts +7 -0
  336. package/src/server/handlers/indexHandlers.ts +44 -0
  337. package/src/server/handlers/itemHandlers.ts +140 -0
  338. package/src/server/handlers/lifecycleHandlers.ts +26 -0
  339. package/src/server/handlers/queryHandlers.ts +96 -0
  340. package/src/server/handlers/statsHandlers.ts +38 -0
  341. package/src/server/index.ts +3 -0
  342. package/src/storage/FileStorage.ts +105 -0
  343. package/src/storage/FileStorageUtilities.spec.ts +106 -0
  344. package/src/storage/FileStorageUtilities.ts +77 -0
  345. package/src/storage/FileType.ts +61 -0
  346. package/src/storage/IndexedDBStorage.ts +365 -0
  347. package/src/storage/LocalFileStorage.browser.ts +52 -0
  348. package/src/storage/LocalFileStorage.spec.ts +292 -0
  349. package/src/storage/LocalFileStorage.ts +98 -0
  350. package/src/storage/VirtualFileStorage.spec.ts +307 -0
  351. package/src/storage/VirtualFileStorage.ts +169 -0
  352. package/src/storage/index.ts +5 -0
  353. package/src/templates/csharp/README.md +48 -0
  354. package/src/templates/csharp/VectraClient.cs +234 -0
  355. package/src/templates/go/README.md +71 -0
  356. package/src/templates/go/vectra_client.go +322 -0
  357. package/src/templates/java/README.md +81 -0
  358. package/src/templates/java/VectraClient.java +232 -0
  359. package/src/templates/python/README.md +37 -0
  360. package/src/templates/python/vectra_client.py +279 -0
  361. package/src/templates/rust/Cargo.toml +14 -0
  362. package/src/templates/rust/README.md +39 -0
  363. package/src/templates/rust/build.rs +4 -0
  364. package/src/templates/rust/lib.rs +284 -0
  365. package/src/templates/typescript/README.md +96 -0
  366. package/src/templates/typescript/VectraClient.ts +374 -0
  367. package/src/types.ts +131 -123
  368. package/src/utils/index.ts +1 -0
  369. package/src/utils/pathUtils.ts +106 -0
  370. package/src/vectra-cli.generate.spec.ts +72 -0
  371. package/src/vectra-cli.spec.ts +0 -0
  372. package/src/vectra-cli.ts +687 -246
  373. package/README.draft.md +0 -499
  374. package/README.draft.outline.md +0 -160
  375. package/README.research.md +0 -2159
@@ -1,8 +1,6 @@
1
1
  import { GPT3Tokenizer } from "./GPT3Tokenizer";
2
2
  import { TextChunk, Tokenizer } from "./types";
3
3
 
4
- const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
5
-
6
4
  export interface TextSplitterConfig {
7
5
  separators: string[];
8
6
  keepSeparators: boolean;
@@ -22,17 +20,14 @@ export class TextSplitter {
22
20
  chunkOverlap: 40,
23
21
  } as TextSplitterConfig, config);
24
22
 
25
- // Create a default tokenizer if none is provided
26
23
  if (!this._config.tokenizer) {
27
24
  this._config.tokenizer = new GPT3Tokenizer();
28
25
  }
29
26
 
30
- // Use default separators if none are provided
31
27
  if (!this._config.separators || this._config.separators.length === 0) {
32
28
  this._config.separators = this.getSeparators(this._config.docType);
33
29
  }
34
30
 
35
- // Validate the config settings
36
31
  if (this._config.chunkSize < 1) {
37
32
  throw new Error("chunkSize must be >= 1");
38
33
  } else if (this._config.chunkOverlap < 0) {
@@ -43,30 +38,19 @@ export class TextSplitter {
43
38
  }
44
39
 
45
40
  public split(text: string): TextChunk[] {
46
- // Get basic chunks
47
41
  const chunks = this.recursiveSplit(text, this._config.separators, 0);
48
42
 
49
- const that = this;
50
- function getOverlapTokens(tokens?: number[]): number[] {
51
- if (tokens != undefined) {
52
- const len = tokens.length > that._config.chunkOverlap ? that._config.chunkOverlap : tokens.length;
53
- return tokens.slice(0, len);
54
- } else {
55
- return [];
56
- }
57
- }
58
-
59
- // Add overlap tokens and text to the start and end of each chunk
60
43
  if (this._config.chunkOverlap > 0) {
61
- for (let i = 1; i < chunks.length; i++) {
62
- const previousChunk = chunks[i - 1];
63
- const chunk = chunks[i];
64
- const nextChunk = i < chunks.length - 1 ? chunks[i + 1] : undefined;
65
-
66
- // Use copies to avoid reversing in place (preserve token order in previous chunks)
67
- const prevTokensCopy = previousChunk.tokens.slice();
68
- chunk.startOverlap = getOverlapTokens(prevTokensCopy.reverse()).reverse();
69
- chunk.endOverlap = getOverlapTokens(nextChunk?.tokens);
44
+ for (let i = 0; i < chunks.length - 1; i++) {
45
+ const current = chunks[i];
46
+ const next = chunks[i + 1];
47
+
48
+ const currTokensCopy = current.tokens.slice();
49
+ const trailing = currTokensCopy.reverse().slice(0, this._config.chunkOverlap).reverse();
50
+ next.startOverlap = trailing;
51
+
52
+ const leadLen = Math.min(this._config.chunkOverlap, next.tokens.length);
53
+ current.endOverlap = next.tokens.slice(0, leadLen);
70
54
  }
71
55
  }
72
56
 
@@ -74,132 +58,205 @@ export class TextSplitter {
74
58
  }
75
59
 
76
60
  private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
77
- const chunks: TextChunk[] = [];
61
+ if (text.length === 0) return [];
78
62
 
79
- if (text.length > 0) {
80
- // Split text into parts
81
- let parts: string[];
82
- let separator = '';
63
+ if (separators.length > 0) {
64
+ const sep = separators[0];
83
65
  const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
84
66
 
85
- if (separators.length > 0) {
86
- // Split by separator
87
- separator = separators[0];
88
- parts = separator == ' ' ? this.splitBySpaces(text) : text.split(separator);
89
- } else {
90
- // Cut text in half
91
- const half = Math.floor(text.length / 2);
92
- parts = [text.substring(0, half), text.substring(half)];
93
- }
67
+ const parts = sep === ' ' ? this.splitBySpaces(text) : text.split(sep);
68
+ const out: TextChunk[] = [];
94
69
 
95
- // Iterate over parts
70
+ let pos = startPos;
96
71
  for (let i = 0; i < parts.length; i++) {
97
- const lastChunk = (i === parts.length - 1);
98
-
99
- // Get chunk text and endPos
100
- let chunk = parts[i];
101
- const endPos = (startPos + (chunk.length - 1)) + (lastChunk ? 0 : separator.length);
72
+ const lastPart = (i === parts.length - 1);
73
+ let piece = parts[i];
102
74
 
103
- if (this._config.keepSeparators && !lastChunk) {
104
- chunk += separator;
75
+ if (this._config.keepSeparators && !lastPart) {
76
+ piece += sep;
105
77
  }
106
78
 
107
- // Keep chunks that contain any non-whitespace; drop whitespace-only
108
- if (!/\S/.test(chunk)) {
109
- // drop whitespace-only chunks
110
- startPos = endPos + 1;
79
+ if (!/\S/.test(piece)) {
80
+ const consumed = parts[i].length + (lastPart ? 0 : sep.length);
81
+ pos += consumed;
111
82
  continue;
112
83
  }
113
84
 
114
- // Optimization to avoid encoding really large chunks
115
- if (chunk.length / 6 > this._config.chunkSize) {
116
- // Break the text into smaller chunks
117
- const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
118
- chunks.push(...subChunks);
85
+ const sub = this.recursiveSplit(piece, nextSeparators, pos);
86
+ if (sub.length > 0) {
87
+ out.push(...sub);
119
88
  } else {
120
- // Encode chunk text
121
- const tokens = this._config.tokenizer.encode(chunk);
122
- if (tokens.length > this._config.chunkSize) {
123
- // Break the text into smaller chunks
124
- const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
125
- chunks.push(...subChunks);
126
- } else {
127
- // Append chunk to output
128
- chunks.push({
129
- text: chunk,
130
- tokens: tokens,
131
- startPos: startPos,
132
- endPos: endPos,
133
- startOverlap: [],
134
- endOverlap: [],
135
- });
136
- }
89
+ out.push(...this.finalizeToChunks(piece, pos));
137
90
  }
138
91
 
139
- // Update startPos
140
- startPos = endPos + 1;
92
+ const consumed = parts[i].length + (lastPart ? 0 : sep.length);
93
+ pos += consumed;
141
94
  }
95
+
96
+ const joiner =
97
+ this._config.keepSeparators
98
+ ? ''
99
+ : (sep !== ' ' && (sep.includes('\n') || sep.includes('\t')) ? ' ' : '');
100
+
101
+ return this.combineChunks(out, joiner);
142
102
  }
143
103
 
144
- return this.combineChunks(chunks);
104
+ return this.combineChunks(this.finalizeToChunks(text, startPos), '');
145
105
  }
146
106
 
147
- private combineChunks(chunks: TextChunk[]): TextChunk[] {
148
- const combinedChunks: TextChunk[] = [];
149
- let currentChunk: TextChunk | undefined;
150
- let currentLength = 0;
107
+ // Strip inline punctuation-only runs when keepSeparators=false.
108
+ // Only removes runs that touch non-whitespace on at least one side (inline),
109
+ // preserving standalone lines like '---' or '***' that are separated by whitespace/newlines.
110
+ private stripInlineSeparators(s: string): string {
111
+ if (this._config.keepSeparators || s.length === 0) return s;
112
+ const re = /(-{3,}|\*{3,}|={3,}|_{3,})/g;
113
+ let out = '';
114
+ let lastIndex = 0;
115
+ let m: RegExpExecArray | null;
116
+ while ((m = re.exec(s)) !== null) {
117
+ const start = m.index;
118
+ const end = start + m[0].length;
119
+ const left = start > 0 ? s[start - 1] : undefined;
120
+ const right = end < s.length ? s[end] : undefined;
121
+ const leftNonWS = left !== undefined && !/\s/.test(left);
122
+ const rightNonWS = right !== undefined && !/\s/.test(right);
123
+ // Inline if touching non-whitespace on at least one side
124
+ if (leftNonWS || rightNonWS) {
125
+ out += s.slice(lastIndex, start);
126
+ lastIndex = end; // drop the run
127
+ }
128
+ }
129
+ out += s.slice(lastIndex);
130
+ return out;
131
+ }
151
132
 
152
- // When not keeping separators, we previously inserted a space between merged chunks.
153
- // We will still use a space for normal merges, but we will prevent merging punctuation-only
154
- // separator chunks (e.g., '---', '***', '====') to preserve them as standalone.
155
- const separator = this._config.keepSeparators ? '' : ' ';
133
+ // Produce one or more chunks under budget.
134
+ private finalizeToChunks(text: string, startPos: number): TextChunk[] {
135
+ const chunks: TextChunk[] = [];
136
+ const tokens = this._config.tokenizer.encode(text);
137
+
138
+ // Token-budget splitting
139
+ if (tokens.length > this._config.chunkSize) {
140
+ let remaining = tokens.slice();
141
+ let pos = startPos;
142
+
143
+ while (remaining.length > 0) {
144
+ const span = remaining.splice(0, this._config.chunkSize);
145
+ const original = this._config.tokenizer.decode(span);
146
+
147
+ const leadingWSMatch = original.match(/^\s+/);
148
+ const leadingWSLen = leadingWSMatch ? leadingWSMatch[0].length : 0;
149
+
150
+ let sliceText = leadingWSLen > 0 ? original.slice(leadingWSLen) : original;
151
+ if (sliceText.length === 0) {
152
+ pos += original.length;
153
+ continue;
154
+ }
155
+
156
+ // Drop inline punctuation-only runs if configured
157
+ const stripped = this.stripInlineSeparators(sliceText);
158
+
159
+ const sliceStart = pos + leadingWSLen;
160
+ const sliceEnd = sliceStart + stripped.length - 1;
161
+
162
+ const spanTokens = this._config.tokenizer.encode(stripped);
163
+
164
+ chunks.push({
165
+ text: stripped,
166
+ tokens: spanTokens,
167
+ startPos: sliceStart,
168
+ endPos: sliceEnd,
169
+ startOverlap: [],
170
+ endOverlap: [],
171
+ });
172
+
173
+ pos += original.length;
174
+ }
175
+ return chunks;
176
+ }
177
+
178
+ // If text fits but is a very long unbroken string with no configured separators, fall back to char windows
179
+ if (text.length > this._config.chunkSize) {
180
+ const hasWhitespace = /\s/.test(text);
181
+ const hasAnyConfiguredSep = (this._config.separators || []).some(s => s && text.includes(s));
182
+
183
+ if (!hasWhitespace && !hasAnyConfiguredSep) {
184
+ let pos = startPos;
185
+ for (let off = 0; off < text.length; off += this._config.chunkSize) {
186
+ const slice = text.slice(off, off + this._config.chunkSize);
187
+ const stripped = this.stripInlineSeparators(slice);
188
+ const sliceTokens = this._config.tokenizer.encode(stripped);
189
+ const sliceStart = pos;
190
+ const sliceEnd = sliceStart + stripped.length - 1;
191
+ chunks.push({
192
+ text: stripped,
193
+ tokens: sliceTokens,
194
+ startPos: sliceStart,
195
+ endPos: sliceEnd,
196
+ startOverlap: [],
197
+ endOverlap: [],
198
+ });
199
+ pos = sliceEnd + 1;
200
+ }
201
+ return chunks;
202
+ }
203
+ }
204
+
205
+ const stripped = this.stripInlineSeparators(text);
206
+ const outTokens = this._config.tokenizer.encode(stripped);
207
+
208
+ chunks.push({
209
+ text: stripped,
210
+ tokens: outTokens,
211
+ startPos,
212
+ endPos: startPos + stripped.length - 1,
213
+ startOverlap: [],
214
+ endOverlap: [],
215
+ });
216
+ return chunks;
217
+ }
218
+
219
+ private combineChunks(chunks: TextChunk[], joiner: string): TextChunk[] {
220
+ const combined: TextChunk[] = [];
221
+ let current: TextChunk | undefined;
156
222
 
157
223
  const isWhitespaceOnly = (t: string) => !/\S/.test(t);
158
224
  const isPunctuationOnly = (t: string) => /\S/.test(t) && !/[a-zA-Z0-9]/.test(t);
159
225
 
160
226
  for (let i = 0; i < chunks.length; i++) {
161
- const chunk = chunks[i];
162
-
163
- if (!currentChunk) {
164
- currentChunk = chunk;
165
- currentLength = chunk.tokens.length;
227
+ const next = chunks[i];
228
+ if (!current) {
229
+ current = next;
166
230
  continue;
167
231
  }
168
232
 
169
- // If either the current or next chunk is punctuation-only (non-whitespace, no alphanumeric),
170
- // do not merge; keep them as separate chunks to preserve separators like '---'.
171
- if (isPunctuationOnly(currentChunk.text) || isPunctuationOnly(chunk.text)) {
172
- combinedChunks.push(currentChunk);
173
- currentChunk = chunk;
174
- currentLength = chunk.tokens.length;
233
+ // Keep punctuation-only chunks standalone
234
+ if (isPunctuationOnly(current.text) || isPunctuationOnly(next.text)) {
235
+ combined.push(current);
236
+ current = next;
175
237
  continue;
176
238
  }
177
239
 
178
- // Normal merge path constrained by token budget
179
- const length = currentChunk.tokens.length + chunk.tokens.length;
180
- if (length > this._config.chunkSize) {
181
- combinedChunks.push(currentChunk);
182
- currentChunk = chunk;
183
- currentLength = chunk.tokens.length;
240
+ const tokenLength = current.tokens.length + next.tokens.length;
241
+ const textLength = current.text.length + (joiner ? joiner.length : 0) + next.text.length;
242
+
243
+ if (tokenLength > this._config.chunkSize || textLength > this._config.chunkSize) {
244
+ combined.push(current);
245
+ current = next;
184
246
  } else {
185
- // Only insert separator if neither chunk is whitespace-only (defensive)
186
- const joiner = (!this._config.keepSeparators && !isWhitespaceOnly(currentChunk.text) && !isWhitespaceOnly(chunk.text)) ? separator : '';
187
- currentChunk.text += joiner + chunk.text;
188
- currentChunk.endPos = chunk.endPos;
189
- currentChunk.tokens.push(...chunk.tokens);
190
- currentLength += chunk.tokens.length;
247
+ const sep = (!this._config.keepSeparators && !isWhitespaceOnly(current.text) && !isWhitespaceOnly(next.text)) ? joiner : '';
248
+ current.text += sep + next.text;
249
+ current.endPos = next.endPos;
250
+ current.tokens.push(...next.tokens);
191
251
  }
192
252
  }
193
253
 
194
- if (currentChunk) {
195
- combinedChunks.push(currentChunk);
196
- }
197
-
198
- return combinedChunks;
254
+ if (current) combined.push(current);
255
+ return combined;
199
256
  }
200
257
 
258
+ // Token-window splitting utility used for the ' ' logical separator
201
259
  private splitBySpaces(text: string): string[] {
202
- // Split text by tokens and return parts
203
260
  const parts: string[] = [];
204
261
  let tokens = this._config.tokenizer.encode(text);
205
262
 
@@ -486,4 +543,4 @@ export class TextSplitter {
486
543
  ];
487
544
  }
488
545
  }
489
- }
546
+ }
@@ -0,0 +1,188 @@
1
+ import { strict as assert } from 'node:assert';
2
+ import { describe, it, beforeEach, afterEach } from 'mocha';
3
+ import sinon from 'sinon';
4
+ import { EmbeddingsModel } from './types';
5
+ import * as transformersModule from '@huggingface/transformers';
6
+
7
+ describe('TransformersEmbeddings', () => {
8
+ let TransformersEmbeddings: any; // class under test; assigned in beforeEach from the dynamic import
9
+ let mockExtractor: sinon.SinonStub; // stand-in for the feature-extraction pipeline returned by pipeline()
10
+ let mockTokenizer: any; // object holding the __call__ and decode stubs
11
+ let sandbox: sinon.SinonSandbox; // owns every stub created for a test; restored in afterEach
12
+ let pipelineStub: sinon.SinonStub; // replaces `pipeline` on the imported @huggingface/transformers module object
13
+
14
+ beforeEach(async () => { // rebuild the sandbox, mocks and pipeline stub before every test
15
+ sandbox = sinon.createSandbox();
16
+
17
+ // Create mock tokenizer: __call__ returns fixed input_ids (1, 2, 3) as a BigInt64Array; decode() always returns 'decoded text'
18
+ mockTokenizer = {
19
+ __call__: sandbox.stub().returns({
20
+ input_ids: { data: BigInt64Array.from([BigInt(1), BigInt(2), BigInt(3)]) }
21
+ }),
22
+ decode: sandbox.stub().returns('decoded text')
23
+ };
24
+ // Make it callable: a function forwarding to __call__, with the mock's own properties (__call__, decode) copied onto it
25
+ const callableTokenizer = Object.assign(
26
+ (...args: any[]) => mockTokenizer.__call__(...args),
27
+ mockTokenizer
28
+ );
29
+
30
+ // Create mock extractor (feature extraction pipeline): returns a flat Float32Array plus dims [batchSize, 4], each row being 0.1, 0.2, 0.3, 0.4
31
+ mockExtractor = sandbox.stub().callsFake(async (inputs: string | string[]) => {
32
+ const inputArray = Array.isArray(inputs) ? inputs : [inputs]; // a bare string is treated as a batch of one
33
+ const batchSize = inputArray.length;
34
+ const embeddingDim = 4;
35
+
36
+ const data = new Float32Array(batchSize * embeddingDim);
37
+ for (let i = 0; i < batchSize; i++) {
38
+ data[i * embeddingDim] = 0.1;
39
+ data[i * embeddingDim + 1] = 0.2;
40
+ data[i * embeddingDim + 2] = 0.3;
41
+ data[i * embeddingDim + 3] = 0.4;
42
+ }
43
+
44
+ return {
45
+ data: data,
46
+ dims: [batchSize, embeddingDim]
47
+ };
48
+ });
49
+
50
+ // Attach tokenizer to the mock extractor so pipeline result has .tokenizer
51
+ (mockExtractor as any).tokenizer = callableTokenizer;
52
+
53
+ // Stub the pipeline function from @huggingface/transformers so it resolves to the mock extractor
54
+ pipelineStub = sandbox.stub(transformersModule, 'pipeline' as any).resolves(mockExtractor);
55
+
56
+ // Import TransformersEmbeddings after the stub is installed. NOTE(review): dynamic import() results are normally cached by the module loader, so the module is presumably evaluated only once rather than "fresh" per test — confirm
57
+ const mod = await import('./TransformersEmbeddings');
58
+ TransformersEmbeddings = mod.TransformersEmbeddings;
59
+ });
60
+
61
+ afterEach(() => {
62
+ sandbox.restore(); // undo every stub created through the sandbox, including the pipeline stub
63
+ });
64
+
65
+ describe('create()', () => { // factory behavior: defaults, custom options, and the runtime shape of the result
66
+ it('creates instance with default options', async () => {
67
+ const embeddings = await TransformersEmbeddings.create();
68
+
69
+ assert.equal(embeddings.maxTokens, 512, 'default maxTokens should be 512');
70
+ assert.equal(embeddings.model, 'Xenova/all-MiniLM-L6-v2', 'default model should be all-MiniLM-L6-v2');
71
+
72
+ // Verify pipeline was called once with the task name and the default model id (any further arguments are not inspected)
73
+ assert.ok(pipelineStub.calledOnce, 'pipeline should be called once');
74
+ assert.equal(pipelineStub.firstCall.args[0], 'feature-extraction');
75
+ assert.equal(pipelineStub.firstCall.args[1], 'Xenova/all-MiniLM-L6-v2');
76
+ });
77
+
78
+ it('creates instance with custom options', async () => {
79
+ const embeddings = await TransformersEmbeddings.create({
80
+ model: 'Xenova/bge-small-en-v1.5',
81
+ maxTokens: 256,
82
+ device: 'cpu',
83
+ normalize: false,
84
+ pooling: 'cls'
85
+ }); // NOTE(review): device, normalize and pooling are passed but not asserted in this test
86
+
87
+ assert.equal(embeddings.maxTokens, 256);
88
+ assert.equal(embeddings.model, 'Xenova/bge-small-en-v1.5');
89
+ });
90
+
91
+ it('implements EmbeddingsModel interface', async () => {
92
+ const embeddings: EmbeddingsModel = await TransformersEmbeddings.create(); // NOTE(review): TransformersEmbeddings is declared `any`, so this annotation is not compile-checked; only the typeof assertions below verify the shape
93
+
94
+ assert.equal(typeof embeddings.maxTokens, 'number');
95
+ assert.equal(typeof embeddings.createEmbeddings, 'function');
96
+ });
97
+ });
98
+
99
+ describe('createEmbeddings()', () => { // exercises createEmbeddings against the mock extractor built in beforeEach
100
+ it('generates embeddings for single string', async () => {
101
+ const embeddings = await TransformersEmbeddings.create();
102
+ const result = await embeddings.createEmbeddings('hello world');
103
+
104
+ assert.equal(result.status, 'success');
105
+ assert.ok(result.output, 'output should be defined');
106
+ assert.equal(result.output!.length, 1, 'should have one embedding');
107
+ assert.equal(result.output![0].length, 4, 'embedding should have 4 dimensions');
108
+ const expected = [0.1, 0.2, 0.3, 0.4]; // the row the mock extractor writes for every input
109
+ result.output![0].forEach((val: number, i: number) => {
110
+ assert.ok(Math.abs(val - expected[i]) < 0.001, `value ${val} should be close to ${expected[i]}`); // compared with a tolerance; the mock stores values in a Float32Array
111
+ });
112
+ assert.equal(result.model, 'Xenova/all-MiniLM-L6-v2');
113
+ });
114
+
115
+ it('generates embeddings for string array', async () => {
116
+ const embeddings = await TransformersEmbeddings.create();
117
+ const result = await embeddings.createEmbeddings(['hello', 'world']);
118
+
119
+ assert.equal(result.status, 'success');
120
+ assert.ok(result.output, 'output should be defined');
121
+ assert.equal(result.output!.length, 2, 'should have two embeddings');
122
+
123
+ assert.equal(mockExtractor.callCount, 1); // the whole array must reach the extractor in a single call
124
+ assert.deepEqual(mockExtractor.firstCall.args[0], ['hello', 'world']);
125
+ });
126
+
127
+ it('passes pooling and normalize options to extractor', async () => {
128
+ const embeddings = await TransformersEmbeddings.create({
129
+ pooling: 'cls',
130
+ normalize: false
131
+ });
132
+ await embeddings.createEmbeddings('test');
133
+
134
+ assert.ok(mockExtractor.calledOnce);
135
+ const options = mockExtractor.firstCall.args[1]; // options object is expected as the extractor's second argument
136
+ assert.equal(options.pooling, 'cls');
137
+ assert.equal(options.normalize, false);
138
+ });
139
+
140
+ it('returns error status on failure', async () => {
141
+ mockExtractor.rejects(new Error('Model inference failed')); // configure the extractor stub to reject for this test
142
+
143
+ const embeddings = await TransformersEmbeddings.create();
144
+ const result = await embeddings.createEmbeddings('test');
145
+
146
+ assert.equal(result.status, 'error'); // the failure is expected as a result object, not a thrown error
147
+ assert.ok(result.message?.includes('Model inference failed'));
148
+ });
149
+
150
+ it('handles empty string input', async () => {
151
+ const embeddings = await TransformersEmbeddings.create();
152
+ const result = await embeddings.createEmbeddings(''); // a bare empty string; exactly one embedding is expected back
153
+
154
+ assert.equal(result.status, 'success');
155
+ assert.ok(result.output);
156
+ assert.equal(result.output!.length, 1);
157
+ });
158
+
159
+ it('handles empty array input', async () => {
160
+ const embeddings = await TransformersEmbeddings.create();
161
+ const result = await embeddings.createEmbeddings([]); // NOTE(review): whether the extractor is invoked for [] is not asserted here
162
+
163
+ assert.equal(result.status, 'success');
164
+ assert.ok(result.output);
165
+ assert.equal(result.output!.length, 0);
166
+ });
167
+ });
168
+
169
+ describe('getTokenizer()', () => { // checks that getTokenizer() yields an object with encode/decode
170
+ it('returns a TransformersTokenizer instance', async () => {
171
+ const embeddings = await TransformersEmbeddings.create();
172
+ const tokenizer = embeddings.getTokenizer(); // verified by shape below (encode/decode are functions), not by instanceof
173
+
174
+ assert.ok(tokenizer, 'tokenizer should be defined');
175
+ assert.equal(typeof tokenizer.encode, 'function');
176
+ assert.equal(typeof tokenizer.decode, 'function');
177
+ });
178
+
179
+ it('returns consistent tokenizer across calls', async () => {
180
+ const embeddings = await TransformersEmbeddings.create();
181
+ const tokenizer1 = embeddings.getTokenizer();
182
+ const tokenizer2 = embeddings.getTokenizer();
183
+
184
+ assert.ok(tokenizer1);
185
+ assert.ok(tokenizer2); // NOTE(review): both results are only checked for truthiness; they are never compared with each other
186
+ });
187
+ });
188
+ });