vectra 0.12.2 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (392) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +92 -100
  3. package/bin/vectra.js +3 -0
  4. package/lib/BrowserWebFetcher.d.ts +75 -0
  5. package/lib/BrowserWebFetcher.d.ts.map +1 -0
  6. package/lib/BrowserWebFetcher.js +290 -0
  7. package/lib/BrowserWebFetcher.js.map +1 -0
  8. package/lib/FileFetcher.d.ts +5 -0
  9. package/lib/FileFetcher.d.ts.map +1 -0
  10. package/lib/FileFetcher.js +89 -0
  11. package/lib/FileFetcher.js.map +1 -0
  12. package/lib/FileFetcher.spec.d.ts +2 -0
  13. package/lib/FileFetcher.spec.d.ts.map +1 -0
  14. package/lib/FileFetcher.spec.js +244 -0
  15. package/lib/FileFetcher.spec.js.map +1 -0
  16. package/lib/FolderWatcher.d.ts +91 -0
  17. package/lib/FolderWatcher.d.ts.map +1 -0
  18. package/lib/FolderWatcher.js +304 -0
  19. package/lib/FolderWatcher.js.map +1 -0
  20. package/lib/FolderWatcher.spec.d.ts +2 -0
  21. package/lib/FolderWatcher.spec.d.ts.map +1 -0
  22. package/lib/FolderWatcher.spec.js +308 -0
  23. package/lib/FolderWatcher.spec.js.map +1 -0
  24. package/lib/GPT3Tokenizer.d.ts +9 -0
  25. package/lib/GPT3Tokenizer.spec.d.ts +2 -0
  26. package/lib/GPT3Tokenizer.spec.d.ts.map +1 -0
  27. package/lib/GPT3Tokenizer.spec.js +45 -0
  28. package/lib/GPT3Tokenizer.spec.js.map +1 -0
  29. package/lib/ItemSelector.d.ts +41 -0
  30. package/lib/ItemSelector.d.ts.map +1 -0
  31. package/lib/ItemSelector.js +179 -0
  32. package/lib/ItemSelector.js.map +1 -0
  33. package/lib/ItemSelector.spec.d.ts +2 -0
  34. package/lib/ItemSelector.spec.d.ts.map +1 -0
  35. package/lib/ItemSelector.spec.js +204 -0
  36. package/lib/ItemSelector.spec.js.map +1 -0
  37. package/lib/LocalDocument.d.ts +54 -0
  38. package/lib/LocalDocument.d.ts.map +1 -1
  39. package/lib/LocalDocument.js +116 -0
  40. package/lib/LocalDocument.js.map +1 -0
  41. package/lib/LocalDocument.spec.d.ts +2 -0
  42. package/lib/LocalDocument.spec.d.ts.map +1 -0
  43. package/lib/LocalDocument.spec.js +214 -0
  44. package/lib/LocalDocument.spec.js.map +1 -0
  45. package/lib/LocalDocumentIndex.d.ts +152 -0
  46. package/lib/LocalDocumentIndex.d.ts.map +1 -1
  47. package/lib/LocalDocumentIndex.js +420 -0
  48. package/lib/LocalDocumentIndex.js.map +1 -0
  49. package/lib/LocalDocumentIndex.spec.d.ts +2 -0
  50. package/lib/LocalDocumentIndex.spec.d.ts.map +1 -0
  51. package/lib/LocalDocumentIndex.spec.js +494 -0
  52. package/lib/LocalDocumentIndex.spec.js.map +1 -0
  53. package/lib/LocalDocumentResult.d.ts +66 -0
  54. package/lib/LocalDocumentResult.d.ts.map +1 -1
  55. package/lib/LocalDocumentResult.js +376 -0
  56. package/lib/LocalDocumentResult.js.map +1 -0
  57. package/lib/LocalDocumentResult.spec.d.ts +2 -0
  58. package/lib/LocalDocumentResult.spec.d.ts.map +1 -0
  59. package/lib/LocalDocumentResult.spec.js +373 -0
  60. package/lib/LocalDocumentResult.spec.js.map +1 -0
  61. package/lib/LocalEmbeddings.d.ts +59 -0
  62. package/lib/LocalEmbeddings.d.ts.map +1 -0
  63. package/lib/LocalEmbeddings.js +101 -0
  64. package/lib/LocalEmbeddings.js.map +1 -0
  65. package/lib/LocalEmbeddings.spec.d.ts +2 -0
  66. package/lib/LocalEmbeddings.spec.d.ts.map +1 -0
  67. package/lib/LocalEmbeddings.spec.js +155 -0
  68. package/lib/LocalEmbeddings.spec.js.map +1 -0
  69. package/lib/LocalIndex.d.ts +159 -0
  70. package/lib/LocalIndex.d.ts.map +1 -1
  71. package/lib/LocalIndex.js +519 -0
  72. package/lib/LocalIndex.js.map +1 -0
  73. package/lib/LocalIndex.spec.d.ts +2 -0
  74. package/lib/LocalIndex.spec.js +611 -9
  75. package/lib/LocalIndex.spec.js.map +1 -1
  76. package/lib/OpenAIEmbeddings.d.ts +124 -0
  77. package/lib/OpenAIEmbeddings.d.ts.map +1 -0
  78. package/lib/OpenAIEmbeddings.js +166 -0
  79. package/lib/OpenAIEmbeddings.js.map +1 -0
  80. package/lib/OpenAIEmbeddings.spec.d.ts +2 -0
  81. package/lib/OpenAIEmbeddings.spec.d.ts.map +1 -0
  82. package/lib/OpenAIEmbeddings.spec.js +298 -0
  83. package/lib/OpenAIEmbeddings.spec.js.map +1 -0
  84. package/lib/TextSplitter.d.ts +21 -0
  85. package/lib/TextSplitter.d.ts.map +1 -1
  86. package/lib/TextSplitter.js +500 -0
  87. package/lib/TextSplitter.js.map +1 -0
  88. package/lib/TextSplitter.spec.d.ts +2 -0
  89. package/lib/TextSplitter.spec.d.ts.map +1 -0
  90. package/lib/TextSplitter.spec.js +337 -0
  91. package/lib/TextSplitter.spec.js.map +1 -0
  92. package/lib/TransformersEmbeddings.d.ts +121 -0
  93. package/lib/TransformersEmbeddings.d.ts.map +1 -0
  94. package/lib/TransformersEmbeddings.js +176 -0
  95. package/lib/TransformersEmbeddings.js.map +1 -0
  96. package/lib/TransformersEmbeddings.spec.d.ts +2 -0
  97. package/lib/TransformersEmbeddings.spec.d.ts.map +1 -0
  98. package/lib/TransformersEmbeddings.spec.js +198 -0
  99. package/lib/TransformersEmbeddings.spec.js.map +1 -0
  100. package/lib/TransformersTokenizer.d.ts +33 -0
  101. package/lib/TransformersTokenizer.d.ts.map +1 -0
  102. package/lib/TransformersTokenizer.js +44 -0
  103. package/lib/TransformersTokenizer.js.map +1 -0
  104. package/lib/TransformersTokenizer.spec.d.ts +2 -0
  105. package/lib/TransformersTokenizer.spec.d.ts.map +1 -0
  106. package/lib/TransformersTokenizer.spec.js +112 -0
  107. package/lib/TransformersTokenizer.spec.js.map +1 -0
  108. package/lib/WebFetcher.d.ts +14 -0
  109. package/lib/WebFetcher.d.ts.map +1 -0
  110. package/lib/WebFetcher.js +238 -0
  111. package/lib/WebFetcher.js.map +1 -0
  112. package/lib/WebFetcher.spec.d.ts +2 -0
  113. package/lib/WebFetcher.spec.d.ts.map +1 -0
  114. package/lib/WebFetcher.spec.js +263 -0
  115. package/lib/WebFetcher.spec.js.map +1 -0
  116. package/lib/browser.d.ts +30 -0
  117. package/lib/browser.d.ts.map +1 -0
  118. package/lib/browser.js +52 -0
  119. package/lib/browser.js.map +1 -0
  120. package/lib/codecs/IndexCodec.d.ts +37 -0
  121. package/lib/codecs/IndexCodec.d.ts.map +1 -0
  122. package/lib/codecs/IndexCodec.js +3 -0
  123. package/lib/codecs/IndexCodec.js.map +1 -0
  124. package/lib/codecs/JsonCodec.d.ts +19 -0
  125. package/lib/codecs/JsonCodec.d.ts.map +1 -0
  126. package/lib/codecs/JsonCodec.js +35 -0
  127. package/lib/codecs/JsonCodec.js.map +1 -0
  128. package/lib/codecs/JsonCodec.spec.d.ts +2 -0
  129. package/lib/codecs/JsonCodec.spec.d.ts.map +1 -0
  130. package/lib/codecs/JsonCodec.spec.js +66 -0
  131. package/lib/codecs/JsonCodec.spec.js.map +1 -0
  132. package/lib/codecs/LocalIndex.protobuf.spec.d.ts +2 -0
  133. package/lib/codecs/LocalIndex.protobuf.spec.d.ts.map +1 -0
  134. package/lib/codecs/LocalIndex.protobuf.spec.js +108 -0
  135. package/lib/codecs/LocalIndex.protobuf.spec.js.map +1 -0
  136. package/lib/codecs/ProtobufCodec.d.ts +20 -0
  137. package/lib/codecs/ProtobufCodec.d.ts.map +1 -0
  138. package/lib/codecs/ProtobufCodec.js +225 -0
  139. package/lib/codecs/ProtobufCodec.js.map +1 -0
  140. package/lib/codecs/ProtobufCodec.spec.d.ts +2 -0
  141. package/lib/codecs/ProtobufCodec.spec.d.ts.map +1 -0
  142. package/lib/codecs/ProtobufCodec.spec.js +155 -0
  143. package/lib/codecs/ProtobufCodec.spec.js.map +1 -0
  144. package/lib/codecs/index.d.ts +5 -0
  145. package/lib/codecs/index.d.ts.map +1 -0
  146. package/lib/codecs/index.js +21 -0
  147. package/lib/codecs/index.js.map +1 -0
  148. package/lib/codecs/migrateIndex.d.ts +24 -0
  149. package/lib/codecs/migrateIndex.d.ts.map +1 -0
  150. package/lib/codecs/migrateIndex.js +119 -0
  151. package/lib/codecs/migrateIndex.js.map +1 -0
  152. package/lib/codecs/migrateIndex.spec.d.ts +2 -0
  153. package/lib/codecs/migrateIndex.spec.d.ts.map +1 -0
  154. package/lib/codecs/migrateIndex.spec.js +151 -0
  155. package/lib/codecs/migrateIndex.spec.js.map +1 -0
  156. package/lib/codecs/schemas/index.proto +34 -0
  157. package/lib/index.d.ts +20 -0
  158. package/lib/index.d.ts.map +1 -1
  159. package/lib/index.js +36 -0
  160. package/lib/index.js.map +1 -0
  161. package/lib/internals/Colorize.d.ts +14 -0
  162. package/lib/internals/Colorize.d.ts.map +1 -0
  163. package/lib/internals/Colorize.js +69 -0
  164. package/lib/internals/Colorize.js.map +1 -0
  165. package/lib/internals/index.d.ts +3 -0
  166. package/lib/internals/index.d.ts.map +1 -0
  167. package/lib/internals/index.js +19 -0
  168. package/lib/internals/index.js.map +1 -0
  169. package/lib/internals/types.d.ts +43 -0
  170. package/lib/internals/types.d.ts.map +1 -0
  171. package/lib/internals/types.js +3 -0
  172. package/lib/internals/types.js.map +1 -0
  173. package/lib/server/IndexManager.d.ts +78 -0
  174. package/lib/server/IndexManager.d.ts.map +1 -0
  175. package/lib/server/IndexManager.js +259 -0
  176. package/lib/server/IndexManager.js.map +1 -0
  177. package/lib/server/VectraServer.d.ts +40 -0
  178. package/lib/server/VectraServer.d.ts.map +1 -0
  179. package/lib/server/VectraServer.js +151 -0
  180. package/lib/server/VectraServer.js.map +1 -0
  181. package/lib/server/VectraServer.spec.d.ts +2 -0
  182. package/lib/server/VectraServer.spec.d.ts.map +1 -0
  183. package/lib/server/VectraServer.spec.js +322 -0
  184. package/lib/server/VectraServer.spec.js.map +1 -0
  185. package/lib/server/handlers/documentHandlers.d.ts +15 -0
  186. package/lib/server/handlers/documentHandlers.d.ts.map +1 -0
  187. package/lib/server/handlers/documentHandlers.js +95 -0
  188. package/lib/server/handlers/documentHandlers.js.map +1 -0
  189. package/lib/server/handlers/helpers.d.ts +23 -0
  190. package/lib/server/handlers/helpers.d.ts.map +1 -0
  191. package/lib/server/handlers/helpers.js +138 -0
  192. package/lib/server/handlers/helpers.js.map +1 -0
  193. package/lib/server/handlers/index.d.ts +8 -0
  194. package/lib/server/handlers/index.d.ts.map +1 -0
  195. package/lib/server/handlers/index.js +22 -0
  196. package/lib/server/handlers/index.js.map +1 -0
  197. package/lib/server/handlers/indexHandlers.d.ts +14 -0
  198. package/lib/server/handlers/indexHandlers.d.ts.map +1 -0
  199. package/lib/server/handlers/indexHandlers.js +85 -0
  200. package/lib/server/handlers/indexHandlers.js.map +1 -0
  201. package/lib/server/handlers/itemHandlers.d.ts +34 -0
  202. package/lib/server/handlers/itemHandlers.d.ts.map +1 -0
  203. package/lib/server/handlers/itemHandlers.js +166 -0
  204. package/lib/server/handlers/itemHandlers.js.map +1 -0
  205. package/lib/server/handlers/lifecycleHandlers.d.ts +11 -0
  206. package/lib/server/handlers/lifecycleHandlers.d.ts.map +1 -0
  207. package/lib/server/handlers/lifecycleHandlers.js +31 -0
  208. package/lib/server/handlers/lifecycleHandlers.js.map +1 -0
  209. package/lib/server/handlers/queryHandlers.d.ts +27 -0
  210. package/lib/server/handlers/queryHandlers.d.ts.map +1 -0
  211. package/lib/server/handlers/queryHandlers.js +135 -0
  212. package/lib/server/handlers/queryHandlers.js.map +1 -0
  213. package/lib/server/handlers/statsHandlers.d.ts +17 -0
  214. package/lib/server/handlers/statsHandlers.d.ts.map +1 -0
  215. package/lib/server/handlers/statsHandlers.js +81 -0
  216. package/lib/server/handlers/statsHandlers.js.map +1 -0
  217. package/lib/server/index.d.ts +4 -0
  218. package/lib/server/index.d.ts.map +1 -0
  219. package/lib/server/index.js +23 -0
  220. package/lib/server/index.js.map +1 -0
  221. package/lib/storage/FileStorage.d.ts +92 -0
  222. package/lib/storage/FileStorage.d.ts.map +1 -0
  223. package/lib/storage/FileStorage.js +3 -0
  224. package/lib/storage/FileStorage.js.map +1 -0
  225. package/lib/storage/FileStorageUtilities.d.ts +36 -0
  226. package/lib/storage/FileStorageUtilities.d.ts.map +1 -0
  227. package/lib/storage/FileStorageUtilities.js +91 -0
  228. package/lib/storage/FileStorageUtilities.js.map +1 -0
  229. package/lib/storage/FileStorageUtilities.spec.d.ts +2 -0
  230. package/lib/storage/FileStorageUtilities.spec.d.ts.map +1 -0
  231. package/lib/storage/FileStorageUtilities.spec.js +98 -0
  232. package/lib/storage/FileStorageUtilities.spec.js.map +1 -0
  233. package/lib/storage/FileType.d.ts +29 -0
  234. package/lib/storage/FileType.d.ts.map +1 -0
  235. package/lib/storage/FileType.js +38 -0
  236. package/lib/storage/FileType.js.map +1 -0
  237. package/lib/storage/IndexedDBStorage.d.ts +47 -0
  238. package/lib/storage/IndexedDBStorage.d.ts.map +1 -0
  239. package/lib/storage/IndexedDBStorage.js +347 -0
  240. package/lib/storage/IndexedDBStorage.js.map +1 -0
  241. package/lib/storage/LocalFileStorage.browser.d.ts +19 -0
  242. package/lib/storage/LocalFileStorage.browser.d.ts.map +1 -0
  243. package/lib/storage/LocalFileStorage.browser.js +43 -0
  244. package/lib/storage/LocalFileStorage.browser.js.map +1 -0
  245. package/lib/storage/LocalFileStorage.d.ts +23 -0
  246. package/lib/storage/LocalFileStorage.d.ts.map +1 -0
  247. package/lib/storage/LocalFileStorage.js +152 -0
  248. package/lib/storage/LocalFileStorage.js.map +1 -0
  249. package/lib/storage/LocalFileStorage.spec.d.ts +2 -0
  250. package/lib/storage/LocalFileStorage.spec.d.ts.map +1 -0
  251. package/lib/storage/LocalFileStorage.spec.js +249 -0
  252. package/lib/storage/LocalFileStorage.spec.js.map +1 -0
  253. package/lib/storage/VirtualFileStorage.d.ts +18 -0
  254. package/lib/storage/VirtualFileStorage.d.ts.map +1 -0
  255. package/lib/storage/VirtualFileStorage.js +178 -0
  256. package/lib/storage/VirtualFileStorage.js.map +1 -0
  257. package/lib/storage/VirtualFileStorage.spec.d.ts +2 -0
  258. package/lib/storage/VirtualFileStorage.spec.d.ts.map +1 -0
  259. package/lib/storage/VirtualFileStorage.spec.js +302 -0
  260. package/lib/storage/VirtualFileStorage.spec.js.map +1 -0
  261. package/lib/storage/index.d.ts +6 -0
  262. package/lib/storage/index.d.ts.map +1 -0
  263. package/lib/storage/index.js +22 -0
  264. package/lib/storage/index.js.map +1 -0
  265. package/lib/templates/templates/csharp/README.md +48 -0
  266. package/lib/templates/templates/csharp/VectraClient.cs +234 -0
  267. package/lib/templates/templates/go/README.md +71 -0
  268. package/lib/templates/templates/go/vectra_client.go +322 -0
  269. package/lib/templates/templates/java/README.md +81 -0
  270. package/lib/templates/templates/java/VectraClient.java +232 -0
  271. package/lib/templates/templates/python/README.md +37 -0
  272. package/lib/templates/templates/python/vectra_client.py +279 -0
  273. package/lib/templates/templates/rust/Cargo.toml +14 -0
  274. package/lib/templates/templates/rust/README.md +39 -0
  275. package/lib/templates/templates/rust/build.rs +4 -0
  276. package/lib/templates/templates/rust/lib.rs +284 -0
  277. package/lib/templates/templates/typescript/README.md +96 -0
  278. package/lib/templates/templates/typescript/VectraClient.ts +374 -0
  279. package/lib/templates/typescript/VectraClient.d.ts +114 -0
  280. package/lib/templates/typescript/VectraClient.d.ts.map +1 -0
  281. package/lib/templates/typescript/VectraClient.js +328 -0
  282. package/lib/templates/typescript/VectraClient.js.map +1 -0
  283. package/lib/types.d.ts +153 -0
  284. package/lib/types.d.ts.map +1 -0
  285. package/lib/types.js +3 -0
  286. package/lib/types.js.map +1 -0
  287. package/lib/utils/index.d.ts +2 -0
  288. package/lib/utils/index.d.ts.map +1 -0
  289. package/lib/utils/index.js +18 -0
  290. package/lib/utils/index.js.map +1 -0
  291. package/lib/utils/pathUtils.d.ts +40 -0
  292. package/lib/utils/pathUtils.d.ts.map +1 -0
  293. package/lib/utils/pathUtils.js +98 -0
  294. package/lib/utils/pathUtils.js.map +1 -0
  295. package/lib/vectra-cli.d.ts +2 -0
  296. package/lib/vectra-cli.d.ts.map +1 -1
  297. package/lib/vectra-cli.generate.spec.d.ts +2 -0
  298. package/lib/vectra-cli.generate.spec.d.ts.map +1 -0
  299. package/lib/vectra-cli.generate.spec.js +112 -0
  300. package/lib/vectra-cli.generate.spec.js.map +1 -0
  301. package/lib/vectra-cli.js +760 -0
  302. package/lib/vectra-cli.js.map +1 -0
  303. package/lib/vectra-cli.spec.d.ts +1 -0
  304. package/lib/vectra-cli.spec.d.ts.map +1 -0
  305. package/lib/vectra-cli.spec.js +2 -0
  306. package/lib/vectra-cli.spec.js.map +1 -0
  307. package/package.json +91 -16
  308. package/proto/vectra_service.proto +276 -0
  309. package/src/BrowserWebFetcher.ts +345 -0
  310. package/src/FileFetcher.spec.ts +234 -0
  311. package/src/FileFetcher.ts +37 -25
  312. package/src/FolderWatcher.spec.ts +288 -0
  313. package/src/FolderWatcher.ts +304 -0
  314. package/src/GPT3Tokenizer.spec.ts +50 -0
  315. package/src/ItemSelector.spec.ts +252 -0
  316. package/src/ItemSelector.ts +163 -150
  317. package/src/LocalDocument.spec.ts +211 -0
  318. package/src/LocalDocument.ts +88 -94
  319. package/src/LocalDocumentIndex.spec.ts +481 -0
  320. package/src/LocalDocumentIndex.ts +39 -40
  321. package/src/LocalDocumentResult.spec.ts +373 -0
  322. package/src/LocalDocumentResult.ts +489 -319
  323. package/src/LocalEmbeddings.spec.ts +138 -0
  324. package/src/LocalEmbeddings.ts +120 -0
  325. package/src/LocalIndex.spec.ts +808 -66
  326. package/src/LocalIndex.ts +479 -429
  327. package/src/OpenAIEmbeddings.spec.ts +354 -0
  328. package/src/OpenAIEmbeddings.ts +26 -27
  329. package/src/TextSplitter.spec.ts +342 -0
  330. package/src/TextSplitter.ts +517 -532
  331. package/src/TransformersEmbeddings.spec.ts +188 -0
  332. package/src/TransformersEmbeddings.ts +232 -0
  333. package/src/TransformersTokenizer.spec.ts +143 -0
  334. package/src/TransformersTokenizer.ts +45 -0
  335. package/src/WebFetcher.spec.ts +288 -0
  336. package/src/WebFetcher.ts +184 -186
  337. package/src/browser.ts +69 -0
  338. package/src/codecs/IndexCodec.ts +40 -0
  339. package/src/codecs/JsonCodec.spec.ts +70 -0
  340. package/src/codecs/JsonCodec.ts +37 -0
  341. package/src/codecs/LocalIndex.protobuf.spec.ts +115 -0
  342. package/src/codecs/ProtobufCodec.spec.ts +166 -0
  343. package/src/codecs/ProtobufCodec.ts +193 -0
  344. package/src/codecs/index.ts +4 -0
  345. package/src/codecs/migrateIndex.spec.ts +176 -0
  346. package/src/codecs/migrateIndex.ts +125 -0
  347. package/src/codecs/schemas/index.proto +34 -0
  348. package/src/index.ts +9 -1
  349. package/src/internals/Colorize.ts +19 -16
  350. package/src/server/IndexManager.ts +243 -0
  351. package/src/server/VectraServer.spec.ts +303 -0
  352. package/src/server/VectraServer.ts +156 -0
  353. package/src/server/handlers/documentHandlers.ts +59 -0
  354. package/src/server/handlers/helpers.ts +93 -0
  355. package/src/server/handlers/index.ts +7 -0
  356. package/src/server/handlers/indexHandlers.ts +44 -0
  357. package/src/server/handlers/itemHandlers.ts +140 -0
  358. package/src/server/handlers/lifecycleHandlers.ts +26 -0
  359. package/src/server/handlers/queryHandlers.ts +96 -0
  360. package/src/server/handlers/statsHandlers.ts +38 -0
  361. package/src/server/index.ts +3 -0
  362. package/src/storage/FileStorage.ts +105 -0
  363. package/src/storage/FileStorageUtilities.spec.ts +106 -0
  364. package/src/storage/FileStorageUtilities.ts +77 -0
  365. package/src/storage/FileType.ts +61 -0
  366. package/src/storage/IndexedDBStorage.ts +365 -0
  367. package/src/storage/LocalFileStorage.browser.ts +52 -0
  368. package/src/storage/LocalFileStorage.spec.ts +292 -0
  369. package/src/storage/LocalFileStorage.ts +98 -0
  370. package/src/storage/VirtualFileStorage.spec.ts +307 -0
  371. package/src/storage/VirtualFileStorage.ts +169 -0
  372. package/src/storage/index.ts +5 -0
  373. package/src/templates/csharp/README.md +48 -0
  374. package/src/templates/csharp/VectraClient.cs +234 -0
  375. package/src/templates/go/README.md +71 -0
  376. package/src/templates/go/vectra_client.go +322 -0
  377. package/src/templates/java/README.md +81 -0
  378. package/src/templates/java/VectraClient.java +232 -0
  379. package/src/templates/python/README.md +37 -0
  380. package/src/templates/python/vectra_client.py +279 -0
  381. package/src/templates/rust/Cargo.toml +14 -0
  382. package/src/templates/rust/README.md +39 -0
  383. package/src/templates/rust/build.rs +4 -0
  384. package/src/templates/rust/lib.rs +284 -0
  385. package/src/templates/typescript/README.md +96 -0
  386. package/src/templates/typescript/VectraClient.ts +374 -0
  387. package/src/types.ts +131 -123
  388. package/src/utils/index.ts +1 -0
  389. package/src/utils/pathUtils.ts +106 -0
  390. package/src/vectra-cli.generate.spec.ts +72 -0
  391. package/src/vectra-cli.spec.ts +0 -0
  392. package/src/vectra-cli.ts +687 -246
@@ -1,561 +1,546 @@
1
1
  import { GPT3Tokenizer } from "./GPT3Tokenizer";
2
2
  import { TextChunk, Tokenizer } from "./types";
3
3
 
4
- const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
5
-
6
4
  export interface TextSplitterConfig {
7
- separators: string[];
8
- keepSeparators: boolean;
9
- chunkSize: number;
10
- chunkOverlap: number;
11
- tokenizer: Tokenizer;
12
- docType?: string;
5
+ separators: string[];
6
+ keepSeparators: boolean;
7
+ chunkSize: number;
8
+ chunkOverlap: number;
9
+ tokenizer: Tokenizer;
10
+ docType?: string;
13
11
  }
14
12
 
15
13
  export class TextSplitter {
16
- private readonly _config: TextSplitterConfig;
17
-
18
- public constructor(config?: Partial<TextSplitterConfig>) {
19
- this._config = Object.assign({
20
- keepSeparators: false,
21
- chunkSize: 400,
22
- chunkOverlap: 40,
23
- } as TextSplitterConfig, config);
24
-
25
- // Create a default tokenizer if none is provided
26
- if (!this._config.tokenizer) {
27
- this._config.tokenizer = new GPT3Tokenizer();
28
- }
14
+ private readonly _config: TextSplitterConfig;
29
15
 
30
- // Use default separators if none are provided
31
- if (!this._config.separators || this._config.separators.length === 0) {
32
- this._config.separators = this.getSeparators(this._config.docType);
33
- }
16
+ public constructor(config?: Partial<TextSplitterConfig>) {
17
+ this._config = Object.assign({
18
+ keepSeparators: false,
19
+ chunkSize: 400,
20
+ chunkOverlap: 40,
21
+ } as TextSplitterConfig, config);
34
22
 
35
- // Validate the config settings
36
- if (this._config.chunkSize < 1) {
37
- throw new Error("chunkSize must be >= 1");
38
- } else if (this._config.chunkOverlap < 0) {
39
- throw new Error("chunkOverlap must be >= 0");
40
- } else if (this._config.chunkOverlap > this._config.chunkSize) {
41
- throw new Error("chunkOverlap must be <= chunkSize");
42
- }
23
+ if (!this._config.tokenizer) {
24
+ this._config.tokenizer = new GPT3Tokenizer();
25
+ }
26
+
27
+ if (!this._config.separators || this._config.separators.length === 0) {
28
+ this._config.separators = this.getSeparators(this._config.docType);
29
+ }
30
+
31
+ if (this._config.chunkSize < 1) {
32
+ throw new Error("chunkSize must be >= 1");
33
+ } else if (this._config.chunkOverlap < 0) {
34
+ throw new Error("chunkOverlap must be >= 0");
35
+ } else if (this._config.chunkOverlap > this._config.chunkSize) {
36
+ throw new Error("chunkOverlap must be <= chunkSize");
37
+ }
38
+ }
39
+
40
+ public split(text: string): TextChunk[] {
41
+ const chunks = this.recursiveSplit(text, this._config.separators, 0);
42
+
43
+ if (this._config.chunkOverlap > 0) {
44
+ for (let i = 0; i < chunks.length - 1; i++) {
45
+ const current = chunks[i];
46
+ const next = chunks[i + 1];
47
+
48
+ const currTokensCopy = current.tokens.slice();
49
+ const trailing = currTokensCopy.reverse().slice(0, this._config.chunkOverlap).reverse();
50
+ next.startOverlap = trailing;
51
+
52
+ const leadLen = Math.min(this._config.chunkOverlap, next.tokens.length);
53
+ current.endOverlap = next.tokens.slice(0, leadLen);
54
+ }
43
55
  }
44
56
 
45
- public split(text: string): TextChunk[] {
46
- // Get basic chunks
47
- const chunks = this.recursiveSplit(text, this._config.separators, 0);
48
-
49
- const that = this;
50
- function getOverlapTokens(tokens?: number[]): number[] {
51
- if (tokens != undefined) {
52
- const len = tokens.length > that._config.chunkOverlap ? that._config.chunkOverlap : tokens.length;
53
- return tokens.slice(0, len);
54
- } else {
55
- return [];
56
- }
57
+ return chunks;
58
+ }
59
+
60
+ private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
61
+ if (text.length === 0) return [];
62
+
63
+ if (separators.length > 0) {
64
+ const sep = separators[0];
65
+ const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
66
+
67
+ const parts = sep === ' ' ? this.splitBySpaces(text) : text.split(sep);
68
+ const out: TextChunk[] = [];
69
+
70
+ let pos = startPos;
71
+ for (let i = 0; i < parts.length; i++) {
72
+ const lastPart = (i === parts.length - 1);
73
+ let piece = parts[i];
74
+
75
+ if (this._config.keepSeparators && !lastPart) {
76
+ piece += sep;
57
77
  }
58
78
 
59
- // Add overlap tokens and text to the start and end of each chunk
60
- if (this._config.chunkOverlap > 0) {
61
- for (let i = 1; i < chunks.length; i++) {
62
- const previousChunk = chunks[i - 1];
63
- const chunk = chunks[i];
64
- const nextChunk = i < chunks.length - 1 ? chunks[i + 1] : undefined;
65
- chunk.startOverlap = getOverlapTokens(previousChunk.tokens.reverse()).reverse();
66
- chunk.endOverlap = getOverlapTokens(nextChunk?.tokens);
67
- }
79
+ if (!/\S/.test(piece)) {
80
+ const consumed = parts[i].length + (lastPart ? 0 : sep.length);
81
+ pos += consumed;
82
+ continue;
68
83
  }
69
84
 
70
- return chunks;
85
+ const sub = this.recursiveSplit(piece, nextSeparators, pos);
86
+ if (sub.length > 0) {
87
+ out.push(...sub);
88
+ } else {
89
+ out.push(...this.finalizeToChunks(piece, pos));
90
+ }
91
+
92
+ const consumed = parts[i].length + (lastPart ? 0 : sep.length);
93
+ pos += consumed;
94
+ }
95
+
96
+ const joiner =
97
+ this._config.keepSeparators
98
+ ? ''
99
+ : (sep !== ' ' && (sep.includes('\n') || sep.includes('\t')) ? ' ' : '');
100
+
101
+ return this.combineChunks(out, joiner);
71
102
  }
72
103
 
73
- private recursiveSplit(text: string, separators: string[], startPos: number): TextChunk[] {
74
- const chunks: TextChunk[] = [];
75
- if (text.length > 0) {
76
- // Split text into parts
77
- let parts: string[];
78
- let separator = '';
79
- const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
80
- if (separators.length > 0) {
81
- // Split by separator
82
- separator = separators[0];
83
- parts = separator == ' ' ? this.splitBySpaces(text) : text.split(separator);
84
- } else {
85
- // Cut text in half
86
- const half = Math.floor(text.length / 2);
87
- parts = [text.substring(0, half), text.substring(half)];
88
- }
89
-
90
- // Iterate over parts
91
- for (let i = 0; i < parts.length; i++) {
92
- const lastChunk = (i === parts.length - 1);
93
-
94
- // Get chunk text and endPos
95
- let chunk = parts[i];
96
- const endPos = (startPos + (chunk.length - 1)) + (lastChunk ? 0 : separator.length);
97
- if (this._config.keepSeparators && !lastChunk) {
98
- chunk += separator;
99
- }
100
-
101
- // Ensure chunk contains text
102
- if (!this.containsAlphanumeric(chunk)) {
103
- continue;
104
- }
105
-
106
- // Optimization to avoid encoding really large chunks
107
- if (chunk.length / 6 > this._config.chunkSize) {
108
- // Break the text into smaller chunks
109
- const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
110
- chunks.push(...subChunks);
111
- } else {
112
- // Encode chunk text
113
- const tokens = this._config.tokenizer.encode(chunk);
114
- if (tokens.length > this._config.chunkSize) {
115
- // Break the text into smaller chunks
116
- const subChunks = this.recursiveSplit(chunk, nextSeparators, startPos);
117
- chunks.push(...subChunks);
118
- } else {
119
- // Append chunk to output
120
- chunks.push({
121
- text: chunk,
122
- tokens: tokens,
123
- startPos: startPos,
124
- endPos: endPos,
125
- startOverlap: [],
126
- endOverlap: [],
127
- });
128
- }
129
-
130
- }
131
-
132
-
133
- // Update startPos
134
- startPos = endPos + 1;
135
- }
136
- }
104
+ return this.combineChunks(this.finalizeToChunks(text, startPos), '');
105
+ }
137
106
 
138
- return this.combineChunks(chunks);
107
+ // Strip inline punctuation-only runs when keepSeparators=false.
108
+ // Only removes runs that touch non-whitespace on at least one side (inline),
109
+ // preserving standalone lines like '---' or '***' that are separated by whitespace/newlines.
110
+ private stripInlineSeparators(s: string): string {
111
+ if (this._config.keepSeparators || s.length === 0) return s;
112
+ const re = /(-{3,}|\*{3,}|={3,}|_{3,})/g;
113
+ let out = '';
114
+ let lastIndex = 0;
115
+ let m: RegExpExecArray | null;
116
+ while ((m = re.exec(s)) !== null) {
117
+ const start = m.index;
118
+ const end = start + m[0].length;
119
+ const left = start > 0 ? s[start - 1] : undefined;
120
+ const right = end < s.length ? s[end] : undefined;
121
+ const leftNonWS = left !== undefined && !/\s/.test(left);
122
+ const rightNonWS = right !== undefined && !/\s/.test(right);
123
+ // Inline if touching non-whitespace on at least one side
124
+ if (leftNonWS || rightNonWS) {
125
+ out += s.slice(lastIndex, start);
126
+ lastIndex = end; // drop the run
127
+ }
139
128
  }
129
+ out += s.slice(lastIndex);
130
+ return out;
131
+ }
140
132
 
141
- private combineChunks(chunks: TextChunk[]): TextChunk[] {
142
- const combinedChunks: TextChunk[] = [];
143
- let currentChunk: TextChunk|undefined;
144
- let currentLength = 0;
145
- const separator = this._config.keepSeparators ? '' : ' ';
146
- for (let i = 0; i < chunks.length; i++) {
147
- const chunk = chunks[i];
148
- if (currentChunk) {
149
- const length = currentChunk.tokens.length + chunk.tokens.length;
150
- if (length > this._config.chunkSize) {
151
- combinedChunks.push(currentChunk);
152
- currentChunk = chunk;
153
- currentLength = chunk.tokens.length;
154
- } else {
155
- currentChunk.text += separator + chunk.text;
156
- currentChunk.endPos = chunk.endPos;
157
- currentChunk.tokens.push(...chunk.tokens);
158
- currentLength += chunk.tokens.length;
159
- }
160
- } else {
161
- currentChunk = chunk;
162
- currentLength = chunk.tokens.length;
163
- }
164
- }
165
- if (currentChunk) {
166
- combinedChunks.push(currentChunk);
133
+ // Produce one or more chunks under budget.
134
+ private finalizeToChunks(text: string, startPos: number): TextChunk[] {
135
+ const chunks: TextChunk[] = [];
136
+ const tokens = this._config.tokenizer.encode(text);
137
+
138
+ // Token-budget splitting
139
+ if (tokens.length > this._config.chunkSize) {
140
+ let remaining = tokens.slice();
141
+ let pos = startPos;
142
+
143
+ while (remaining.length > 0) {
144
+ const span = remaining.splice(0, this._config.chunkSize);
145
+ const original = this._config.tokenizer.decode(span);
146
+
147
+ const leadingWSMatch = original.match(/^\s+/);
148
+ const leadingWSLen = leadingWSMatch ? leadingWSMatch[0].length : 0;
149
+
150
+ let sliceText = leadingWSLen > 0 ? original.slice(leadingWSLen) : original;
151
+ if (sliceText.length === 0) {
152
+ pos += original.length;
153
+ continue;
167
154
  }
168
- return combinedChunks;
155
+
156
+ // Drop inline punctuation-only runs if configured
157
+ const stripped = this.stripInlineSeparators(sliceText);
158
+
159
+ const sliceStart = pos + leadingWSLen;
160
+ const sliceEnd = sliceStart + stripped.length - 1;
161
+
162
+ const spanTokens = this._config.tokenizer.encode(stripped);
163
+
164
+ chunks.push({
165
+ text: stripped,
166
+ tokens: spanTokens,
167
+ startPos: sliceStart,
168
+ endPos: sliceEnd,
169
+ startOverlap: [],
170
+ endOverlap: [],
171
+ });
172
+
173
+ pos += original.length;
174
+ }
175
+ return chunks;
169
176
  }
170
177
 
171
- private containsAlphanumeric(text: string): boolean {
172
- for (let i = 0; i < text.length; i++) {
173
- if (ALPHANUMERIC_CHARS.includes(text[i])) {
174
- return true;
175
- }
178
+ // If text fits but is a very long unbroken string with no configured separators, fall back to char windows
179
+ if (text.length > this._config.chunkSize) {
180
+ const hasWhitespace = /\s/.test(text);
181
+ const hasAnyConfiguredSep = (this._config.separators || []).some(s => s && text.includes(s));
182
+
183
+ if (!hasWhitespace && !hasAnyConfiguredSep) {
184
+ let pos = startPos;
185
+ for (let off = 0; off < text.length; off += this._config.chunkSize) {
186
+ const slice = text.slice(off, off + this._config.chunkSize);
187
+ const stripped = this.stripInlineSeparators(slice);
188
+ const sliceTokens = this._config.tokenizer.encode(stripped);
189
+ const sliceStart = pos;
190
+ const sliceEnd = sliceStart + stripped.length - 1;
191
+ chunks.push({
192
+ text: stripped,
193
+ tokens: sliceTokens,
194
+ startPos: sliceStart,
195
+ endPos: sliceEnd,
196
+ startOverlap: [],
197
+ endOverlap: [],
198
+ });
199
+ pos = sliceEnd + 1;
176
200
  }
177
- return false;
201
+ return chunks;
202
+ }
178
203
  }
179
204
 
180
- private splitBySpaces(text: string): string[] {
181
- // Split text by tokens and return parts
182
- const parts: string[] = [];
183
- let tokens = this._config.tokenizer.encode(text);
184
- do {
185
- if (tokens.length <= this._config.chunkSize) {
186
- parts.push(this._config.tokenizer.decode(tokens));
187
- break;
188
- } else {
189
- const span = tokens.splice(0, this._config.chunkSize);
190
- parts.push(this._config.tokenizer.decode(span));
191
- }
192
- } while (true);
193
-
194
- return parts;
205
+ const stripped = this.stripInlineSeparators(text);
206
+ const outTokens = this._config.tokenizer.encode(stripped);
207
+
208
+ chunks.push({
209
+ text: stripped,
210
+ tokens: outTokens,
211
+ startPos,
212
+ endPos: startPos + stripped.length - 1,
213
+ startOverlap: [],
214
+ endOverlap: [],
215
+ });
216
+ return chunks;
217
+ }
218
+
219
+ private combineChunks(chunks: TextChunk[], joiner: string): TextChunk[] {
220
+ const combined: TextChunk[] = [];
221
+ let current: TextChunk | undefined;
222
+
223
+ const isWhitespaceOnly = (t: string) => !/\S/.test(t);
224
+ const isPunctuationOnly = (t: string) => /\S/.test(t) && !/[a-zA-Z0-9]/.test(t);
225
+
226
+ for (let i = 0; i < chunks.length; i++) {
227
+ const next = chunks[i];
228
+ if (!current) {
229
+ current = next;
230
+ continue;
231
+ }
232
+
233
+ // Keep punctuation-only chunks standalone
234
+ if (isPunctuationOnly(current.text) || isPunctuationOnly(next.text)) {
235
+ combined.push(current);
236
+ current = next;
237
+ continue;
238
+ }
239
+
240
+ const tokenLength = current.tokens.length + next.tokens.length;
241
+ const textLength = current.text.length + (joiner ? joiner.length : 0) + next.text.length;
242
+
243
+ if (tokenLength > this._config.chunkSize || textLength > this._config.chunkSize) {
244
+ combined.push(current);
245
+ current = next;
246
+ } else {
247
+ const sep = (!this._config.keepSeparators && !isWhitespaceOnly(current.text) && !isWhitespaceOnly(next.text)) ? joiner : '';
248
+ current.text += sep + next.text;
249
+ current.endPos = next.endPos;
250
+ current.tokens.push(...next.tokens);
251
+ }
195
252
  }
196
253
 
197
- private getSeparators(docType?: string): string[] {
198
- switch (docType ?? '') {
199
- case "cpp":
200
- return [
201
- // Split along class definitions
202
- "\nclass ",
203
- // Split along function definitions
204
- "\nvoid ",
205
- "\nint ",
206
- "\nfloat ",
207
- "\ndouble ",
208
- // Split along control flow statements
209
- "\nif ",
210
- "\nfor ",
211
- "\nwhile ",
212
- "\nswitch ",
213
- "\ncase ",
214
- // Split by the normal type of lines
215
- "\n\n",
216
- "\n",
217
- " "
218
- ];
219
- case "go":
220
- return [
221
- // Split along function definitions
222
- "\nfunc ",
223
- "\nvar ",
224
- "\nconst ",
225
- "\ntype ",
226
- // Split along control flow statements
227
- "\nif ",
228
- "\nfor ",
229
- "\nswitch ",
230
- "\ncase ",
231
- // Split by the normal type of lines
232
- "\n\n",
233
- "\n",
234
- " "
235
- ];
236
- case "java":
237
- case "c#":
238
- case "csharp":
239
- case "cs":
240
- case "ts":
241
- case "tsx":
242
- case "typescript":
243
- return [
244
- // split along regions
245
- "// LLM-REGION",
246
- "/* LLM-REGION",
247
- "/** LLM-REGION",
248
- // Split along class definitions
249
- "\nclass ",
250
- // Split along method definitions
251
- "\npublic ",
252
- "\nprotected ",
253
- "\nprivate ",
254
- "\nstatic ",
255
- // Split along control flow statements
256
- "\nif ",
257
- "\nfor ",
258
- "\nwhile ",
259
- "\nswitch ",
260
- "\ncase ",
261
- // Split by the normal type of lines
262
- "\n\n",
263
- "\n",
264
- " "
265
- ];
266
- case "js":
267
- case "jsx":
268
- case "javascript":
269
- return [
270
- // split along regions
271
- "// LLM-REGION",
272
- "/* LLM-REGION",
273
- "/** LLM-REGION",
274
- // Split along class definitions
275
- "\nclass ",
276
- // Split along function definitions
277
- "\nfunction ",
278
- "\nconst ",
279
- "\nlet ",
280
- "\nvar ",
281
- "\nclass ",
282
- // Split along control flow statements
283
- "\nif ",
284
- "\nfor ",
285
- "\nwhile ",
286
- "\nswitch ",
287
- "\ncase ",
288
- "\ndefault ",
289
- // Split by the normal type of lines
290
- "\n\n",
291
- "\n",
292
- " "
293
- ];
294
- case "php":
295
- return [
296
- // Split along function definitions
297
- "\nfunction ",
298
- // Split along class definitions
299
- "\nclass ",
300
- // Split along control flow statements
301
- "\nif ",
302
- "\nforeach ",
303
- "\nwhile ",
304
- "\ndo ",
305
- "\nswitch ",
306
- "\ncase ",
307
- // Split by the normal type of lines
308
- "\n\n",
309
- "\n",
310
- " "
311
- ];
312
- case "proto":
313
- return [
314
- // Split along message definitions
315
- "\nmessage ",
316
- // Split along service definitions
317
- "\nservice ",
318
- // Split along enum definitions
319
- "\nenum ",
320
- // Split along option definitions
321
- "\noption ",
322
- // Split along import statements
323
- "\nimport ",
324
- // Split along syntax declarations
325
- "\nsyntax ",
326
- // Split by the normal type of lines
327
- "\n\n",
328
- "\n",
329
- " "
330
- ];
331
- case "python":
332
- case "py":
333
- return [
334
- // First, try to split along class definitions
335
- "\nclass ",
336
- "\ndef ",
337
- "\n\tdef ",
338
- // Now split by the normal type of lines
339
- "\n\n",
340
- "\n",
341
- " "
342
- ];
343
- case "rst":
344
- return [
345
- // Split along section titles
346
- "\n===\n",
347
- "\n---\n",
348
- "\n***\n",
349
- // Split along directive markers
350
- "\n.. ",
351
- // Split by the normal type of lines
352
- "\n\n",
353
- "\n",
354
- " "
355
- ];
356
- case "ruby":
357
- return [
358
- // Split along method definitions
359
- "\ndef ",
360
- "\nclass ",
361
- // Split along control flow statements
362
- "\nif ",
363
- "\nunless ",
364
- "\nwhile ",
365
- "\nfor ",
366
- "\ndo ",
367
- "\nbegin ",
368
- "\nrescue ",
369
- // Split by the normal type of lines
370
- "\n\n",
371
- "\n",
372
- " "
373
- ];
374
- case "rust":
375
- return [
376
- // Split along function definitions
377
- "\nfn ",
378
- "\nconst ",
379
- "\nlet ",
380
- // Split along control flow statements
381
- "\nif ",
382
- "\nwhile ",
383
- "\nfor ",
384
- "\nloop ",
385
- "\nmatch ",
386
- "\nconst ",
387
- // Split by the normal type of lines
388
- "\n\n",
389
- "\n",
390
- " "
391
- ];
392
- case "scala":
393
- return [
394
- // Split along class definitions
395
- "\nclass ",
396
- "\nobject ",
397
- // Split along method definitions
398
- "\ndef ",
399
- "\nval ",
400
- "\nvar ",
401
- // Split along control flow statements
402
- "\nif ",
403
- "\nfor ",
404
- "\nwhile ",
405
- "\nmatch ",
406
- "\ncase ",
407
- // Split by the normal type of lines
408
- "\n\n",
409
- "\n",
410
- " "
411
- ];
412
- case "swift":
413
- return [
414
- // Split along function definitions
415
- "\nfunc ",
416
- // Split along class definitions
417
- "\nclass ",
418
- "\nstruct ",
419
- "\nenum ",
420
- // Split along control flow statements
421
- "\nif ",
422
- "\nfor ",
423
- "\nwhile ",
424
- "\ndo ",
425
- "\nswitch ",
426
- "\ncase ",
427
- // Split by the normal type of lines
428
- "\n\n",
429
- "\n",
430
- " "
431
- ];
432
- case "md":
433
- case "markdown":
434
- return [
435
- // First, try to split along Markdown headings (starting with level 2)
436
- "\n## ",
437
- "\n### ",
438
- "\n#### ",
439
- "\n##### ",
440
- "\n###### ",
441
- // Note the alternative syntax for headings (below) is not handled here
442
- // Heading level 2
443
- // ---------------
444
- // End of code block
445
- "```\n\n",
446
- // Horizontal lines
447
- "\n\n***\n\n",
448
- "\n\n---\n\n",
449
- "\n\n___\n\n",
450
- // Note that this splitter doesn't handle horizontal lines defined
451
- // by *three or more* of ***, ---, or ___, but this is not handled
452
- // Github tables
453
- "<table>",
454
- // "<tr>",
455
- // "<td>",
456
- // "<td ",
457
- "\n\n",
458
- "\n",
459
- " "
460
- ];
461
- case "latex":
462
- return [
463
- // First, try to split along Latex sections
464
- "\n\\chapter{",
465
- "\n\\section{",
466
- "\n\\subsection{",
467
- "\n\\subsubsection{",
468
-
469
- // Now split by environments
470
- "\n\\begin{enumerate}",
471
- "\n\\begin{itemize}",
472
- "\n\\begin{description}",
473
- "\n\\begin{list}",
474
- "\n\\begin{quote}",
475
- "\n\\begin{quotation}",
476
- "\n\\begin{verse}",
477
- "\n\\begin{verbatim}",
478
-
479
- // Now split by math environments
480
- "\n\\begin{align}",
481
- "$$",
482
- "$",
483
-
484
- // Now split by the normal type of lines
485
- "\n\n",
486
- "\n",
487
- " "
488
- ];
489
- case "html":
490
- return [
491
- // First, try to split along HTML tags
492
- "<body>",
493
- "<div>",
494
- "<p>",
495
- "<br>",
496
- "<li>",
497
- "<h1>",
498
- "<h2>",
499
- "<h3>",
500
- "<h4>",
501
- "<h5>",
502
- "<h6>",
503
- "<span>",
504
- "<table>",
505
- "<tr>",
506
- "<td>",
507
- "<th>",
508
- "<ul>",
509
- "<ol>",
510
- "<header>",
511
- "<footer>",
512
- "<nav>",
513
- // Head
514
- "<head>",
515
- "<style>",
516
- "<script>",
517
- "<meta>",
518
- "<title>",
519
- // Normal type of lines
520
- " "
521
- ];
522
- case "sol":
523
- return [
524
- // Split along compiler informations definitions
525
- "\npragma ",
526
- "\nusing ",
527
- // Split along contract definitions
528
- "\ncontract ",
529
- "\ninterface ",
530
- "\nlibrary ",
531
- // Split along method definitions
532
- "\nconstructor ",
533
- "\ntype ",
534
- "\nfunction ",
535
- "\nevent ",
536
- "\nmodifier ",
537
- "\nerror ",
538
- "\nstruct ",
539
- "\nenum ",
540
- // Split along control flow statements
541
- "\nif ",
542
- "\nfor ",
543
- "\nwhile ",
544
- "\ndo while ",
545
- "\nassembly ",
546
- // Split by the normal type of lines
547
- "\n\n",
548
- "\n",
549
- " "
550
- ];
551
- default:
552
- return [
553
- // Split by the normal type of lines
554
- "\n\n",
555
- "\n",
556
- " ",
557
- "",
558
- ];
559
- }
254
+ if (current) combined.push(current);
255
+ return combined;
256
+ }
257
+
258
+ // Token-window splitting utility used for the ' ' logical separator
259
+ private splitBySpaces(text: string): string[] {
260
+ const parts: string[] = [];
261
+ let tokens = this._config.tokenizer.encode(text);
262
+
263
+ do {
264
+ if (tokens.length <= this._config.chunkSize) {
265
+ parts.push(this._config.tokenizer.decode(tokens));
266
+ break;
267
+ } else {
268
+ const span = tokens.splice(0, this._config.chunkSize);
269
+ parts.push(this._config.tokenizer.decode(span));
270
+ }
271
+ } while (true);
272
+
273
+ return parts;
274
+ }
275
+
276
+ private getSeparators(docType?: string): string[] {
277
+ switch (docType ?? '') {
278
+ case "cpp":
279
+ return [
280
+ "\nclass ",
281
+ "\nvoid ",
282
+ "\nint ",
283
+ "\nfloat ",
284
+ "\ndouble ",
285
+ "\nif ",
286
+ "\nfor ",
287
+ "\nwhile ",
288
+ "\nswitch ",
289
+ "\ncase ",
290
+ "\n\n",
291
+ "\n",
292
+ ];
293
+ case "go":
294
+ return [
295
+ "\nfunc ",
296
+ "\nvar ",
297
+ "\nconst ",
298
+ "\ntype ",
299
+ "\nif ",
300
+ "\nfor ",
301
+ "\nswitch ",
302
+ "\ncase ",
303
+ "\n\n",
304
+ "\n",
305
+ ];
306
+ case "java":
307
+ case "c#":
308
+ case "csharp":
309
+ case "cs":
310
+ case "ts":
311
+ case "tsx":
312
+ case "typescript":
313
+ return [
314
+ "// LLM-REGION",
315
+ "/* LLM-REGION",
316
+ "/** LLM-REGION",
317
+ "\nclass ",
318
+ "\npublic ",
319
+ "\nprotected ",
320
+ "\nprivate ",
321
+ "\nstatic ",
322
+ "\nif ",
323
+ "\nfor ",
324
+ "\nwhile ",
325
+ "\nswitch ",
326
+ "\ncase ",
327
+ "\n\n",
328
+ "\n",
329
+ " "
330
+ ];
331
+ case "js":
332
+ case "jsx":
333
+ case "javascript":
334
+ return [
335
+ "// LLM-REGION",
336
+ "/* LLM-REGION",
337
+ "/** LLM-REGION",
338
+ "\nclass ",
339
+ "\nfunction ",
340
+ "\nconst ",
341
+ "\nlet ",
342
+ "\nvar ",
343
+ "\nclass ",
344
+ "\nif ",
345
+ "\nfor ",
346
+ "\nwhile ",
347
+ "\nswitch ",
348
+ "\ncase ",
349
+ "\ndefault ",
350
+ "\n\n",
351
+ "\n",
352
+ ];
353
+ case "php":
354
+ return [
355
+ "\nfunction ",
356
+ "\nclass ",
357
+ "\nif ",
358
+ "\nforeach ",
359
+ "\nwhile ",
360
+ "\ndo ",
361
+ "\nswitch ",
362
+ "\ncase ",
363
+ "\n\n",
364
+ "\n",
365
+ ];
366
+ case "proto":
367
+ return [
368
+ "\nmessage ",
369
+ "\nservice ",
370
+ "\nenum ",
371
+ "\noption ",
372
+ "\nimport ",
373
+ "\nsyntax ",
374
+ "\n\n",
375
+ "\n",
376
+ ];
377
+ case "python":
378
+ case "py":
379
+ return [
380
+ "\nclass ",
381
+ "\ndef ",
382
+ "\n\tdef ",
383
+ "\n\n",
384
+ "\n",
385
+ ];
386
+ case "rst":
387
+ return [
388
+ "\n===\n",
389
+ "\n---\n",
390
+ "\n***\n",
391
+ "\n.. ",
392
+ "\n\n",
393
+ "\n",
394
+ ];
395
+ case "ruby":
396
+ return [
397
+ "\ndef ",
398
+ "\nclass ",
399
+ "\nif ",
400
+ "\nunless ",
401
+ "\nwhile ",
402
+ "\nfor ",
403
+ "\ndo ",
404
+ "\nbegin ",
405
+ "\nrescue ",
406
+ "\n\n",
407
+ "\n",
408
+ ];
409
+ case "rust":
410
+ return [
411
+ "\nfn ",
412
+ "\nconst ",
413
+ "\nlet ",
414
+ "\nif ",
415
+ "\nwhile ",
416
+ "\nfor ",
417
+ "\nloop ",
418
+ "\nmatch ",
419
+ "\nconst ",
420
+ "\n\n",
421
+ "\n",
422
+ ];
423
+ case "scala":
424
+ return [
425
+ "\nclass ",
426
+ "\nobject ",
427
+ "\ndef ",
428
+ "\nval ",
429
+ "\nvar ",
430
+ "\nif ",
431
+ "\nfor ",
432
+ "\nwhile ",
433
+ "\nmatch ",
434
+ "\ncase ",
435
+ "\n\n",
436
+ "\n",
437
+ ];
438
+ case "swift":
439
+ return [
440
+ "\nfunc ",
441
+ "\nclass ",
442
+ "\nstruct ",
443
+ "\nenum ",
444
+ "\nif ",
445
+ "\nfor ",
446
+ "\nwhile ",
447
+ "\ndo ",
448
+ "\nswitch ",
449
+ "\ncase ",
450
+ "\n\n",
451
+ "\n",
452
+ ];
453
+ case "md":
454
+ case "markdown":
455
+ return [
456
+ "\n## ",
457
+ "\n### ",
458
+ "\n#### ",
459
+ "\n##### ",
460
+ "\n###### ",
461
+ "```\n\n",
462
+ "\n\n***\n\n",
463
+ "\n\n---\n\n",
464
+ "\n\n___\n\n",
465
+ "<table>",
466
+ "\n\n",
467
+ "\n",
468
+ ];
469
+ case "latex":
470
+ return [
471
+ "\n\\chapter{",
472
+ "\n\\section{",
473
+ "\n\\subsection{",
474
+ "\n\\subsubsection{",
475
+ "\n\\begin{enumerate}",
476
+ "\n\\begin{itemize}",
477
+ "\n\\begin{description}",
478
+ "\n\\begin{list}",
479
+ "\n\\begin{quote}",
480
+ "\n\\begin{quotation}",
481
+ "\n\\begin{verse}",
482
+ "\n\\begin{verbatim}",
483
+ "\n\\begin{align}",
484
+ "\n\n",
485
+ "\n",
486
+ ];
487
+ case "html":
488
+ return [
489
+ "<body>",
490
+ "<div>",
491
+ "<p>",
492
+ "<br>",
493
+ "<li>",
494
+ "<h1>",
495
+ "<h2>",
496
+ "<h3>",
497
+ "<h4>",
498
+ "<h5>",
499
+ "<h6>",
500
+ "<span>",
501
+ "<table>",
502
+ "<tr>",
503
+ "<td>",
504
+ "<th>",
505
+ "<ul>",
506
+ "<ol>",
507
+ "<header>",
508
+ "<footer>",
509
+ "<nav>",
510
+ "<head>",
511
+ "<style>",
512
+ "<script>",
513
+ "<meta>",
514
+ "<title>",
515
+ ];
516
+ case "sol":
517
+ return [
518
+ "\npragma ",
519
+ "\nusing ",
520
+ "\ncontract ",
521
+ "\ninterface ",
522
+ "\nlibrary ",
523
+ "\nconstructor ",
524
+ "\ntype ",
525
+ "\nfunction ",
526
+ "\nevent ",
527
+ "\nmodifier ",
528
+ "\nerror ",
529
+ "\nstruct ",
530
+ "\nenum ",
531
+ "\nif ",
532
+ "\nfor ",
533
+ "\nwhile ",
534
+ "\ndo while ",
535
+ "\nassembly ",
536
+ "\n\n",
537
+ "\n",
538
+ ];
539
+ default:
540
+ return [
541
+ "\n\n",
542
+ "\n",
543
+ ];
560
544
  }
561
- }
545
+ }
546
+ }