@robthepcguy/rag-vault 1.7.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. package/README.md +28 -9
  2. package/dist/chunker/semantic-chunker.d.ts +0 -1
  3. package/dist/chunker/semantic-chunker.d.ts.map +1 -1
  4. package/dist/chunker/semantic-chunker.js +1 -1
  5. package/dist/chunker/semantic-chunker.js.map +1 -1
  6. package/dist/embedder/index.d.ts +5 -0
  7. package/dist/embedder/index.d.ts.map +1 -1
  8. package/dist/embedder/index.js +40 -5
  9. package/dist/embedder/index.js.map +1 -1
  10. package/dist/errors/index.d.ts +1 -1
  11. package/dist/errors/index.d.ts.map +1 -1
  12. package/dist/flywheel/feedback.d.ts +1 -1
  13. package/dist/flywheel/feedback.d.ts.map +1 -1
  14. package/dist/flywheel/feedback.js +1 -1
  15. package/dist/flywheel/feedback.js.map +1 -1
  16. package/dist/parser/pdf-filter.d.ts +3 -5
  17. package/dist/parser/pdf-filter.d.ts.map +1 -1
  18. package/dist/parser/pdf-filter.js +1 -1
  19. package/dist/parser/pdf-filter.js.map +1 -1
  20. package/dist/query/parser.d.ts +2 -6
  21. package/dist/query/parser.d.ts.map +1 -1
  22. package/dist/query/parser.js +14 -22
  23. package/dist/query/parser.js.map +1 -1
  24. package/dist/server/index.d.ts.map +1 -1
  25. package/dist/server/index.js +37 -34
  26. package/dist/server/index.js.map +1 -1
  27. package/dist/server/raw-data-utils.d.ts +0 -40
  28. package/dist/server/raw-data-utils.d.ts.map +1 -1
  29. package/dist/server/raw-data-utils.js +9 -8
  30. package/dist/server/raw-data-utils.js.map +1 -1
  31. package/dist/server/remote-transport.d.ts +2 -1
  32. package/dist/server/remote-transport.d.ts.map +1 -1
  33. package/dist/server/remote-transport.js +27 -7
  34. package/dist/server/remote-transport.js.map +1 -1
  35. package/dist/server/schemas.d.ts +5 -29
  36. package/dist/server/schemas.d.ts.map +1 -1
  37. package/dist/server/schemas.js +6 -6
  38. package/dist/server/schemas.js.map +1 -1
  39. package/dist/vectordb/index.d.ts +12 -9
  40. package/dist/vectordb/index.d.ts.map +1 -1
  41. package/dist/vectordb/index.js +242 -163
  42. package/dist/vectordb/index.js.map +1 -1
  43. package/dist/web/api-routes.d.ts.map +1 -1
  44. package/dist/web/api-routes.js +23 -10
  45. package/dist/web/api-routes.js.map +1 -1
  46. package/dist/web/database-manager.d.ts.map +1 -1
  47. package/dist/web/database-manager.js +32 -25
  48. package/dist/web/database-manager.js.map +1 -1
  49. package/dist/web/http-server.d.ts +0 -5
  50. package/dist/web/http-server.d.ts.map +1 -1
  51. package/dist/web/http-server.js +4 -8
  52. package/dist/web/http-server.js.map +1 -1
  53. package/dist/web/index.js +4 -2
  54. package/dist/web/index.js.map +1 -1
  55. package/dist/web/middleware/async-handler.d.ts +2 -1
  56. package/dist/web/middleware/async-handler.d.ts.map +1 -1
  57. package/dist/web/middleware/rate-limit.d.ts +2 -1
  58. package/dist/web/middleware/rate-limit.d.ts.map +1 -1
  59. package/dist/web/middleware/request-logger.d.ts +1 -1
  60. package/dist/web/middleware/request-logger.d.ts.map +1 -1
  61. package/package.json +1 -1
  62. package/web-ui/dist/assets/{CollectionsPage-BDmEfv3V.js → CollectionsPage-CjLs8_5j.js} +1 -1
  63. package/web-ui/dist/assets/{FilesPage-pG9HmpgQ.js → FilesPage-Bw9x9aMr.js} +1 -1
  64. package/web-ui/dist/assets/ReaderPage-JPNiOF-x.js +28 -0
  65. package/web-ui/dist/assets/{ReaderSettingsContext-CkSjqsRh.js → ReaderSettingsContext-BLFJnEne.js} +1 -1
  66. package/web-ui/dist/assets/{SearchPage-DAltjnLL.js → SearchPage-D3_Vtbdw.js} +1 -1
  67. package/web-ui/dist/assets/{SettingsPage-C6J5BITP.js → SettingsPage-BAxB2264.js} +1 -1
  68. package/web-ui/dist/assets/{StatusPage-powRGmW3.js → StatusPage-CzJZW8Gs.js} +1 -1
  69. package/web-ui/dist/assets/{UploadPage-eyfSjL4u.js → UploadPage-DW8OujeJ.js} +5 -5
  70. package/web-ui/dist/assets/index-ANt8Xo4z.js +6 -0
  71. package/web-ui/dist/assets/index-DovQIIL4.css +1 -0
  72. package/web-ui/dist/assets/motion-Brxs0UET.js +9 -0
  73. package/web-ui/dist/assets/vendor-DSXQOR6A.js +10 -0
  74. package/web-ui/dist/index.html +3 -3
  75. package/web-ui/dist/assets/ReaderPage-CwMN03NU.js +0 -28
  76. package/web-ui/dist/assets/index-BpwaiuGL.css +0 -1
  77. package/web-ui/dist/assets/index-D068MV_o.js +0 -6
  78. package/web-ui/dist/assets/motion-CKwJwI3J.js +0 -9
  79. package/web-ui/dist/assets/vendor-C2QPsZ3S.js +0 -10
@@ -10,12 +10,6 @@ export { DatabaseError } from '../errors/index.js';
10
10
  * @returns true if path is safe for use in queries
11
11
  */
12
12
  export declare function isValidFilePath(filePath: string): boolean;
13
- /**
14
- * Generate a content-based fingerprint for a chunk.
15
- * Uses SHA-256 hash of normalized text (first 16 hex chars for compactness).
16
- * This enables stable chunk identification across re-indexing.
17
- */
18
- export declare function generateChunkFingerprint(text: string): string;
19
13
  /**
20
14
  * Grouping mode for quality filtering
21
15
  * - 'similar': Only return the most similar group (stops at first distance jump)
@@ -25,7 +19,7 @@ export type GroupingMode = 'similar' | 'related';
25
19
  /**
26
20
  * VectorStore configuration
27
21
  */
28
- export interface VectorStoreConfig {
22
+ interface VectorStoreConfig {
29
23
  /** LanceDB database path */
30
24
  dbPath: string;
31
25
  /** Table name */
@@ -44,7 +38,7 @@ export interface VectorStoreConfig {
44
38
  /**
45
39
  * Document metadata
46
40
  */
47
- export interface DocumentMetadata {
41
+ interface DocumentMetadata {
48
42
  /** File name */
49
43
  fileName: string;
50
44
  /** File size in bytes */
@@ -78,7 +72,7 @@ export interface VectorChunk {
78
72
  /**
79
73
  * Search result
80
74
  */
81
- export interface SearchResult {
75
+ interface SearchResult {
82
76
  /** File path */
83
77
  filePath: string;
84
78
  /** Chunk index */
@@ -183,6 +177,15 @@ export declare class VectorStore {
183
177
  * @param filePath - File path (absolute)
184
178
  */
185
179
  deleteChunks(filePath: string): Promise<void>;
180
+ /**
181
+ * Delete chunks for a file, excluding a set of IDs.
182
+ * Used by insert-then-delete re-ingestion to remove old vectors
183
+ * while keeping newly inserted ones.
184
+ *
185
+ * @param filePath - File path whose old chunks should be removed
186
+ * @param excludeIds - Set of chunk IDs to keep (the new batch)
187
+ */
188
+ deleteChunksExcluding(filePath: string, excludeIds: Set<string>): Promise<void>;
186
189
  /**
187
190
  * Batch insert vector chunks
188
191
  *
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/vectordb/index.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAA;AA4FlD;;;;GAIG;AACH;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAOzD;AAgBD;;;;GAIG;AACH,wBAAgB,wBAAwB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAK7D;AAMD;;;;GAIG;AACH,MAAM,MAAM,YAAY,GAAG,SAAS,GAAG,SAAS,CAAA;AAEhD;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAA;IACd,iBAAiB;IACjB,SAAS,EAAE,MAAM,CAAA;IACjB,kEAAkE;IAClE,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,qDAAqD;IACrD,QAAQ,CAAC,EAAE,YAAY,CAAA;IACvB;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,CAAA;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,gBAAgB;IAChB,QAAQ,EAAE,MAAM,CAAA;IAChB,yBAAyB;IACzB,QAAQ,EAAE,MAAM,CAAA;IAChB,4BAA4B;IAC5B,QAAQ,EAAE,MAAM,CAAA;IAChB,mEAAmE;IACnE,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,sBAAsB;IACtB,EAAE,EAAE,MAAM,CAAA;IACV,2BAA2B;IAC3B,QAAQ,EAAE,MAAM,CAAA;IAChB,+BAA+B;IAC/B,UAAU,EAAE,MAAM,CAAA;IAClB,iBAAiB;IACjB,IAAI,EAAE,MAAM,CAAA;IACZ,oDAAoD;IACpD,MAAM,EAAE,MAAM,EAAE,CAAA;IAChB,eAAe;IACf,QAAQ,EAAE,gBAAgB,CAAA;IAC1B,4CAA4C;IAC5C,SAAS,EAAE,MAAM,CAAA;IACjB,uEAAuE;IACvE,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,gBAAgB;IAChB,QAAQ,EAAE,MAAM,CAAA;IAChB,kBAAkB;IAClB,UAAU,EAAE,MAAM,CAAA;IAClB,iBAAiB;IACjB,IAAI,EAAE,MAAM,CAAA;IACZ;;;;OAIG;IACH,KAAK,EAAE,MAAM,CAAA;IACb,eAAe;IACf,QAAQ,EAAE,gBAAgB,CAAA;IAC1B,sDAAsD;IACtD,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAuGD;;;;;;;;;;;;GAYG;AACH,qBAAa,WAAW;IACtB,OAAO,CAAC,EAAE,CAA0B;IACpC,OAAO,CAAC,KAAK,CAAqB;IAClC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmB;IAC1C,OAAO,CAAC,UAAU,CAAQ;IAC1B,OAAO,CAAC,eAAe,CAAI;IAC3B,OAAO,CAAC,cAAc,CAAsB;IAC5C,2DAA2D;IAC3D,OAAO,CAAC,kBAAkB,CAAgC;IAC1D,qEAAqE;IACrE,OAAO,CAAC,oBAAoB,CAAsB;IAClD,6EAA6E;IAC7E,OAAO,CAAC,oBAAoB,CAA6B;gBAE7C,MAAM,EAAE,iBAAiB;IAIrC;;OAEG;IACH,eAAe,IAAI,MAAM;IAIzB;;;OAGG;IACH,eAAe,CAAC,MAAM,EAAE,MAAM,GAAG,IAAI;IAOrC;;;;;OAKG;IACH,OAAO,CAAC,gBAAgB;IAqCxB;;;;OAIG;IACH,OAAO,CAAC,gBAAgB;IAcxB;;OAEG;IA
CH,OAAO,CAAC,gBAAgB;IAQxB;;;;;;;OAOG;IACH,OAAO,CAAC,qCAAqC;IAU7C;;;;;;OAMG;IACH,OAAO,CAAC,8BAA8B;IAoDtC;;;;;OAKG;YACW,2BAA2B;IA+CzC;;OAEG;IACG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IA6CjC;;;;OAIG;IACG,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAkDnD;;;;OAIG;IACG,YAAY,CAAC,MAAM,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IA4DxD;;;;OAIG;YACW,cAAc;IAyC5B;;;;OAIG;YACW,eAAe;IAY7B;;;;;;;;;;OAUG;IACH,OAAO,CAAC,aAAa;IA2CrB;;;;;;;;;;;;OAYG;IACG,MAAM,CAAC,WAAW,EAAE,MAAM,EAAE,EAAE,SAAS,CAAC,EAAE,MAAM,EAAE,KAAK,SAAK,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IAqE5F;;;;;;;;;;;;OAYG;IACH,OAAO,CAAC,iBAAiB;IA0CzB;;;;;;;OAOG;IACG,SAAS,CAAC,OAAO,CAAC,EAAE;QACxB,KAAK,CAAC,EAAE,MAAM,CAAA;QACd,MAAM,CAAC,EAAE,MAAM,CAAA;KAChB,GAAG,OAAO,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE,EAAE,CAAC;IA2D1E;;;;;;;;;OASG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAuC5B;;;;;OAKG;IACG,iBAAiB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IAqClE;;;;;;;;OAQG;IACG,iBAAiB,CACrB,QAAQ,EAAE,MAAM,EAChB,UAAU,EAAE,MAAM,EAClB,KAAK,SAAI,EACT,mBAAmB,UAAO,GACzB,OAAO,CAAC,YAAY,EAAE,CAAC;IA6E1B;;;;OAIG;IACG,SAAS,IAAI,OAAO,CAAC;QACzB,aAAa,EAAE,MAAM,CAAA;QACrB,UAAU,EAAE,MAAM,CAAA;QAClB,WAAW,EAAE,MAAM,CAAA;QACnB,MAAM,EAAE,MAAM,CAAA;QACd,eAAe,EAAE,OAAO,CAAA;QACxB,UAAU,EAAE,QAAQ,GAAG,aAAa,CAAA;KACrC,CAAC;CA2CH"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/vectordb/index.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAA;AA0HlD;;;;GAIG;AACH;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAOzD;AAgCD;;;;GAIG;AACH,MAAM,MAAM,YAAY,GAAG,SAAS,GAAG,SAAS,CAAA;AAEhD;;GAEG;AACH,UAAU,iBAAiB;IACzB,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAA;IACd,iBAAiB;IACjB,SAAS,EAAE,MAAM,CAAA;IACjB,kEAAkE;IAClE,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,qDAAqD;IACrD,QAAQ,CAAC,EAAE,YAAY,CAAA;IACvB;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,CAAA;CACtB;AAED;;GAEG;AACH,UAAU,gBAAgB;IACxB,gBAAgB;IAChB,QAAQ,EAAE,MAAM,CAAA;IAChB,yBAAyB;IACzB,QAAQ,EAAE,MAAM,CAAA;IAChB,4BAA4B;IAC5B,QAAQ,EAAE,MAAM,CAAA;IAChB,mEAAmE;IACnE,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,sBAAsB;IACtB,EAAE,EAAE,MAAM,CAAA;IACV,2BAA2B;IAC3B,QAAQ,EAAE,MAAM,CAAA;IAChB,+BAA+B;IAC/B,UAAU,EAAE,MAAM,CAAA;IAClB,iBAAiB;IACjB,IAAI,EAAE,MAAM,CAAA;IACZ,oDAAoD;IACpD,MAAM,EAAE,MAAM,EAAE,CAAA;IAChB,eAAe;IACf,QAAQ,EAAE,gBAAgB,CAAA;IAC1B,4CAA4C;IAC5C,SAAS,EAAE,MAAM,CAAA;IACjB,uEAAuE;IACvE,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAED;;GAEG;AACH,UAAU,YAAY;IACpB,gBAAgB;IAChB,QAAQ,EAAE,MAAM,CAAA;IAChB,kBAAkB;IAClB,UAAU,EAAE,MAAM,CAAA;IAClB,iBAAiB;IACjB,IAAI,EAAE,MAAM,CAAA;IACZ;;;;OAIG;IACH,KAAK,EAAE,MAAM,CAAA;IACb,eAAe;IACf,QAAQ,EAAE,gBAAgB,CAAA;IAC1B,sDAAsD;IACtD,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAuGD;;;;;;;;;;;;GAYG;AACH,qBAAa,WAAW;IACtB,OAAO,CAAC,EAAE,CAA0B;IACpC,OAAO,CAAC,KAAK,CAAqB;IAClC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmB;IAC1C,OAAO,CAAC,UAAU,CAAQ;IAC1B,OAAO,CAAC,eAAe,CAAI;IAC3B,OAAO,CAAC,cAAc,CAAsB;IAC5C,2DAA2D;IAC3D,OAAO,CAAC,kBAAkB,CAAgC;IAC1D,qEAAqE;IACrE,OAAO,CAAC,oBAAoB,CAAsB;IAClD,6EAA6E;IAC7E,OAAO,CAAC,oBAAoB,CAA6B;gBAE7C,MAAM,EAAE,iBAAiB;IAIrC;;OAEG;IACH,eAAe,IAAI,MAAM;IAIzB;;;OAGG;IACH,eAAe,CAAC,MAAM,EAAE,MAAM,GAAG,IAAI;IAOrC;;;;;OAKG;IACH,OAAO,CAAC,gBAAgB;IAqCxB;;;;OAIG;IACH,OAAO,CAAC,gBAAgB;IAcxB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAQxB;;;;;;;OAOG;IACH,OAAO,CAAC,qCAAqC;IAU7C;;;;;;OAMG;IACH,OAA
O,CAAC,8BAA8B;IAoDtC;;;;;OAKG;YACW,2BAA2B;IA+CzC;;OAEG;IACG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IA6CjC;;;;OAIG;IACG,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAkDnD;;;;;;;OAOG;IACG,qBAAqB,CAAC,QAAQ,EAAE,MAAM,EAAE,UAAU,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IA8CrF;;;;OAIG;IACG,YAAY,CAAC,MAAM,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IA4DxD;;;;OAIG;YACW,cAAc;IAyC5B;;;;OAIG;YACW,eAAe;IAY7B;;;;;;;;;;OAUG;IACH,OAAO,CAAC,aAAa;IA2CrB;;;;;;;;;;;;OAYG;IACG,MAAM,CAAC,WAAW,EAAE,MAAM,EAAE,EAAE,SAAS,CAAC,EAAE,MAAM,EAAE,KAAK,SAAK,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IA6E5F;;;;;;;;;;;;OAYG;IACH,OAAO,CAAC,iBAAiB;IA0CzB;;;;;;;OAOG;IACG,SAAS,CAAC,OAAO,CAAC,EAAE;QACxB,KAAK,CAAC,EAAE,MAAM,CAAA;QACd,MAAM,CAAC,EAAE,MAAM,CAAA;KAChB,GAAG,OAAO,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE,EAAE,CAAC;IA8D1E;;;;;;;;;OASG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAuC5B;;;;;OAKG;IACG,iBAAiB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IAsClE;;;;;;;;OAQG;IACG,iBAAiB,CACrB,QAAQ,EAAE,MAAM,EAChB,UAAU,EAAE,MAAM,EAClB,KAAK,SAAI,EACT,mBAAmB,UAAO,GACzB,OAAO,CAAC,YAAY,EAAE,CAAC;IAgF1B;;;;OAIG;IACG,SAAS,IAAI,OAAO,CAAC;QACzB,aAAa,EAAE,MAAM,CAAA;QACrB,UAAU,EAAE,MAAM,CAAA;QAClB,WAAW,EAAE,MAAM,CAAA;QACnB,MAAM,EAAE,MAAM,CAAA;QACd,eAAe,EAAE,OAAO,CAAA;QACxB,UAAU,EAAE,QAAQ,GAAG,aAAa,CAAA;KACrC,CAAC;CA2CH"}
@@ -1,6 +1,6 @@
1
1
  // VectorStore implementation with LanceDB integration
2
2
  import { createHash } from 'node:crypto';
3
- import { Index, connect } from '@lancedb/lancedb';
3
+ import { connect, Index } from '@lancedb/lancedb';
4
4
  import { DatabaseError } from '../errors/index.js';
5
5
  // Re-export error class for backwards compatibility
6
6
  export { DatabaseError } from '../errors/index.js';
@@ -15,7 +15,9 @@ function parseEnvNumber(envVar, defaultValue) {
15
15
  if (!value)
16
16
  return defaultValue;
17
17
  const parsed = Number.parseFloat(value);
18
- return Number.isNaN(parsed) ? defaultValue : parsed;
18
+ if (Number.isNaN(parsed) || !Number.isFinite(parsed))
19
+ return defaultValue;
20
+ return parsed;
19
21
  }
20
22
  /**
21
23
  * Parse an integer environment variable with fallback
@@ -25,7 +27,9 @@ function parseEnvInt(envVar, defaultValue) {
25
27
  if (!value)
26
28
  return defaultValue;
27
29
  const parsed = Number.parseInt(value, 10);
28
- return Number.isNaN(parsed) ? defaultValue : parsed;
30
+ if (Number.isNaN(parsed) || !Number.isFinite(parsed))
31
+ return defaultValue;
32
+ return parsed;
29
33
  }
30
34
  /**
31
35
  * Standard deviation multiplier for detecting group boundaries.
@@ -81,6 +85,27 @@ const CUSTOM_METADATA_ALL_FIELDS = '__all__';
81
85
  * Rejects paths with SQL injection attempts or path traversal.
82
86
  */
83
87
  const SAFE_PATH_REGEX = /^[a-zA-Z0-9\\/_.:\- ]+$/;
88
/**
 * Retry an async operation with exponential backoff.
 * Used for transient disk/IO errors on VectorStore reads.
 *
 * Every rejection is retried regardless of its type; the helper does not try
 * to distinguish transient from permanent failures. The wait before retry k
 * (1-based) is `baseDelayMs * 2 ** (k - 1)` milliseconds.
 *
 * @param fn - Zero-argument function returning the result (or a promise of it)
 * @param label - Prefix used in the warning logged before each retry
 * @param maxAttempts - Total number of attempts. Values below 1 (or NaN) are
 *   treated as 1 so that `fn` is always invoked at least once.
 * @param baseDelayMs - Delay before the first retry; doubled for each later retry
 * @returns The value produced by the first successful call to `fn`
 * @throws The value thrown by the final failed attempt, unchanged
 */
async function withRetry(fn, label, maxAttempts = 3, baseDelayMs = 100) {
    // Guard against 0 / negative / NaN, which would otherwise skip the loop
    // entirely and end in `throw undefined` without ever calling fn.
    const attempts = maxAttempts >= 1 ? Math.floor(maxAttempts) : 1;
    let lastError;
    for (let attempt = 1; attempt <= attempts; attempt++) {
        try {
            return await fn();
        }
        catch (error) {
            lastError = error;
            if (attempt < attempts) {
                const delayMs = baseDelayMs * 2 ** (attempt - 1);
                // Thrown values are not guaranteed to be Error instances; reading
                // `.message` on null/undefined would throw from inside this handler
                // and abort the retry loop with an unrelated TypeError.
                const reason = error instanceof Error ? error.message : String(error);
                console.warn(`${label}: attempt ${attempt}/${attempts} failed (${reason}), retrying in ${delayMs}ms...`);
                await new Promise((resolve) => setTimeout(resolve, delayMs));
            }
        }
    }
    // All attempts failed: surface the last failure exactly as it was thrown.
    throw lastError;
}
84
109
  /**
85
110
  * Validate file path to prevent SQL injection and path traversal attacks.
86
111
  * @param filePath - The file path to validate
@@ -121,7 +146,7 @@ function normalizeTextForFingerprint(text) {
121
146
  * Uses SHA-256 hash of normalized text (first 16 hex chars for compactness).
122
147
  * This enables stable chunk identification across re-indexing.
123
148
  */
124
- export function generateChunkFingerprint(text) {
149
+ function generateChunkFingerprint(text) {
125
150
  const normalized = normalizeTextForFingerprint(text);
126
151
  const hash = createHash('sha256').update(normalized, 'utf8').digest('hex');
127
152
  // Use first 16 characters (64 bits) - sufficient for practical uniqueness
@@ -507,6 +532,48 @@ export class VectorStore {
507
532
  // Ignorable errors (no matching records) are logged but not thrown
508
533
  }
509
534
  }
535
+ /**
536
+ * Delete chunks for a file, excluding a set of IDs.
537
+ * Used by insert-then-delete re-ingestion to remove old vectors
538
+ * while keeping newly inserted ones.
539
+ *
540
+ * @param filePath - File path whose old chunks should be removed
541
+ * @param excludeIds - Set of chunk IDs to keep (the new batch)
542
+ */
543
+ async deleteChunksExcluding(filePath, excludeIds) {
544
+ if (!this.table || excludeIds.size === 0) {
545
+ return;
546
+ }
547
+ if (!isValidFilePath(filePath)) {
548
+ throw new DatabaseError('Invalid file path: contains disallowed characters or patterns');
549
+ }
550
+ const escapedFilePath = filePath.replace(/'/g, "''");
551
+ try {
552
+ // Query existing chunks for this file to find old IDs
553
+ const existing = await this.table
554
+ .query()
555
+ .where(`\`filePath\` = '${escapedFilePath}'`)
556
+ .select(['id'])
557
+ .toArray();
558
+ const oldIds = existing.map((row) => row.id).filter((id) => !excludeIds.has(id));
559
+ if (oldIds.length === 0) {
560
+ return;
561
+ }
562
+ // Delete old chunks by ID
563
+ const idList = oldIds.map((id) => `'${id.replace(/'/g, "''")}'`).join(', ');
564
+ await this.table.delete(`\`id\` IN (${idList})`);
565
+ console.error(`VectorStore: Removed ${oldIds.length} old chunks for "${filePath}"`);
566
+ await this.rebuildFtsIndex();
567
+ }
568
+ catch (error) {
569
+ // Non-fatal: temporary duplicates are acceptable
570
+ const errorMessage = error.message.toLowerCase();
571
+ const isIgnorable = DELETE_IGNORABLE_PATTERNS.some((pattern) => errorMessage.includes(pattern));
572
+ if (!isIgnorable) {
573
+ throw new DatabaseError(`Failed to clean up old chunks for file: ${filePath}`, error);
574
+ }
575
+ }
576
+ }
510
577
  /**
511
578
  * Batch insert vector chunks
512
579
  *
@@ -688,55 +755,61 @@ export class VectorStore {
688
755
  if (limit < 1 || limit > 20) {
689
756
  throw new DatabaseError(`Invalid limit: expected 1-20, got ${limit}`);
690
757
  }
691
- try {
692
- // Step 1: Semantic (vector) search - always the primary search
693
- const candidateLimit = limit * HYBRID_SEARCH_CANDIDATE_MULTIPLIER;
694
- // Assumes normalized embeddings so dot behaves like cosine distance (lower is better, [0,2]).
695
- let query = this.table.vectorSearch(queryVector).distanceType('dot').limit(candidateLimit);
696
- // Apply distance threshold at query level
697
- if (this.config.maxDistance !== undefined) {
698
- query = query.distanceRange(undefined, this.config.maxDistance);
699
- }
700
- const vectorResults = await query.toArray();
701
- // Convert to SearchResult format with type validation
702
- let results = vectorResults.map((result) => toSearchResult(result));
703
- // Step 2: Apply grouping filter on vector distances (before keyword boost)
704
- // Grouping is meaningful only on semantic distances, not after keyword boost
705
- if (this.config.grouping && results.length > 1) {
706
- results = this.applyGrouping(results, this.config.grouping);
707
- }
708
- // Step 3: Apply keyword boost if enabled (with circuit breaker)
709
- const hybridWeight = this.getHybridWeight();
710
- if (this.shouldAttemptFts() && queryText && queryText.trim().length > 0 && hybridWeight > 0) {
711
- try {
712
- // Get unique filePaths from vector results to filter FTS search
713
- const uniqueFilePaths = [...new Set(results.map((r) => r.filePath))];
714
- // Build WHERE clause with IN for targeted FTS search
715
- // Use backticks for column name (required for camelCase in LanceDB)
716
- const escapedPaths = uniqueFilePaths.map((p) => `'${p.replace(/'/g, "''")}'`);
717
- const whereClause = `\`filePath\` IN (${escapedPaths.join(', ')})`;
718
- const ftsResults = await this.table
719
- .search(queryText, 'fts', 'text')
720
- .where(whereClause)
721
- .select(['filePath', 'chunkIndex', 'text', 'metadata', '_score'])
722
- .limit(results.length * 2) // Enough to cover all vector results
723
- .toArray();
724
- results = this.applyKeywordBoost(results, ftsResults, hybridWeight);
725
- // FTS succeeded - reset circuit breaker
726
- this.recordFtsSuccess();
758
+ const table = this.table;
759
+ return withRetry(async () => {
760
+ try {
761
+ // Step 1: Semantic (vector) search - always the primary search
762
+ const candidateLimit = limit * HYBRID_SEARCH_CANDIDATE_MULTIPLIER;
763
+ // Assumes normalized embeddings so dot behaves like cosine distance (lower is better, [0,2]).
764
+ let query = table.vectorSearch(queryVector).distanceType('dot').limit(candidateLimit);
765
+ // Apply distance threshold at query level
766
+ if (this.config.maxDistance !== undefined) {
767
+ query = query.distanceRange(undefined, this.config.maxDistance);
727
768
  }
728
- catch (ftsError) {
729
- // Record failure for circuit breaker (will auto-recover after cooldown)
730
- this.recordFtsFailure(ftsError);
731
- // Continue with vector-only results
769
+ const vectorResults = await query.toArray();
770
+ // Convert to SearchResult format with type validation
771
+ let results = vectorResults.map((result) => toSearchResult(result));
772
+ // Step 2: Apply grouping filter on vector distances (before keyword boost)
773
+ // Grouping is meaningful only on semantic distances, not after keyword boost
774
+ if (this.config.grouping && results.length > 1) {
775
+ results = this.applyGrouping(results, this.config.grouping);
732
776
  }
777
+ // Step 3: Apply keyword boost if enabled (with circuit breaker)
778
+ const hybridWeight = this.getHybridWeight();
779
+ if (this.shouldAttemptFts() &&
780
+ queryText &&
781
+ queryText.trim().length > 0 &&
782
+ hybridWeight > 0) {
783
+ try {
784
+ // Get unique filePaths from vector results to filter FTS search
785
+ const uniqueFilePaths = [...new Set(results.map((r) => r.filePath))];
786
+ // Build WHERE clause with IN for targeted FTS search
787
+ // Use backticks for column name (required for camelCase in LanceDB)
788
+ const escapedPaths = uniqueFilePaths.map((p) => `'${p.replace(/'/g, "''")}'`);
789
+ const whereClause = `\`filePath\` IN (${escapedPaths.join(', ')})`;
790
+ const ftsResults = await table
791
+ .search(queryText, 'fts', 'text')
792
+ .where(whereClause)
793
+ .select(['filePath', 'chunkIndex', 'text', 'metadata', '_score'])
794
+ .limit(results.length * 2) // Enough to cover all vector results
795
+ .toArray();
796
+ results = this.applyKeywordBoost(results, ftsResults, hybridWeight);
797
+ // FTS succeeded - reset circuit breaker
798
+ this.recordFtsSuccess();
799
+ }
800
+ catch (ftsError) {
801
+ // Record failure for circuit breaker (will auto-recover after cooldown)
802
+ this.recordFtsFailure(ftsError);
803
+ // Continue with vector-only results
804
+ }
805
+ }
806
+ // Return top results after all filtering and boosting
807
+ return results.slice(0, limit);
733
808
  }
734
- // Return top results after all filtering and boosting
735
- return results.slice(0, limit);
736
- }
737
- catch (error) {
738
- throw new DatabaseError('Failed to search vectors', error);
739
- }
809
+ catch (error) {
810
+ throw new DatabaseError('Failed to search vectors', error);
811
+ }
812
+ }, 'VectorStore.search');
740
813
  }
741
814
  /**
742
815
  * Apply keyword boost to rerank vector search results
@@ -798,52 +871,55 @@ export class VectorStore {
798
871
  if (!this.table) {
799
872
  return []; // Return empty array if table doesn't exist
800
873
  }
801
- try {
802
- // Retrieve all records - LanceDB doesn't support GROUP BY aggregation,
803
- // so we must fetch records and group in memory
804
- // TODO(perf): Consider caching file list or using incremental updates for very large datasets
805
- const allRecords = await this.table.query().toArray();
806
- // Group by file path
807
- const fileMap = new Map();
808
- for (const record of allRecords) {
809
- const filePath = record.filePath;
810
- const timestamp = record.timestamp;
811
- if (fileMap.has(filePath)) {
812
- const fileInfo = fileMap.get(filePath);
813
- if (fileInfo) {
814
- fileInfo.chunkCount += 1;
815
- // Keep most recent timestamp
816
- if (timestamp > fileInfo.timestamp) {
817
- fileInfo.timestamp = timestamp;
874
+ const table = this.table;
875
+ return withRetry(async () => {
876
+ try {
877
+ // Retrieve all records - LanceDB doesn't support GROUP BY aggregation,
878
+ // so we must fetch records and group in memory
879
+ // TODO(perf): Consider caching file list or using incremental updates for very large datasets
880
+ const allRecords = await table.query().toArray();
881
+ // Group by file path
882
+ const fileMap = new Map();
883
+ for (const record of allRecords) {
884
+ const filePath = record.filePath;
885
+ const timestamp = record.timestamp;
886
+ if (fileMap.has(filePath)) {
887
+ const fileInfo = fileMap.get(filePath);
888
+ if (fileInfo) {
889
+ fileInfo.chunkCount += 1;
890
+ // Keep most recent timestamp
891
+ if (timestamp > fileInfo.timestamp) {
892
+ fileInfo.timestamp = timestamp;
893
+ }
818
894
  }
819
895
  }
896
+ else {
897
+ fileMap.set(filePath, { chunkCount: 1, timestamp });
898
+ }
820
899
  }
821
- else {
822
- fileMap.set(filePath, { chunkCount: 1, timestamp });
900
+ // Convert Map to array of objects
901
+ let results = Array.from(fileMap.entries()).map(([filePath, info]) => ({
902
+ filePath,
903
+ chunkCount: info.chunkCount,
904
+ timestamp: info.timestamp,
905
+ }));
906
+ // Sort by timestamp descending (most recent first)
907
+ results.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
908
+ // Apply pagination if provided
909
+ const offset = options?.offset ?? 0;
910
+ const limit = options?.limit;
911
+ if (offset > 0) {
912
+ results = results.slice(offset);
823
913
  }
914
+ if (limit !== undefined && limit > 0) {
915
+ results = results.slice(0, limit);
916
+ }
917
+ return results;
824
918
  }
825
- // Convert Map to array of objects
826
- let results = Array.from(fileMap.entries()).map(([filePath, info]) => ({
827
- filePath,
828
- chunkCount: info.chunkCount,
829
- timestamp: info.timestamp,
830
- }));
831
- // Sort by timestamp descending (most recent first)
832
- results.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
833
- // Apply pagination if provided
834
- const offset = options?.offset ?? 0;
835
- const limit = options?.limit;
836
- if (offset > 0) {
837
- results = results.slice(offset);
838
- }
839
- if (limit !== undefined && limit > 0) {
840
- results = results.slice(0, limit);
919
+ catch (error) {
920
+ throw new DatabaseError('Failed to list files', error);
841
921
  }
842
- return results;
843
- }
844
- catch (error) {
845
- throw new DatabaseError('Failed to list files', error);
846
- }
922
+ }, 'VectorStore.listFiles');
847
923
  }
848
924
  /**
849
925
  * Close the database connection and release resources
@@ -902,30 +978,30 @@ export class VectorStore {
902
978
  if (!isValidFilePath(filePath)) {
903
979
  throw new DatabaseError(`Invalid file path: contains disallowed characters or patterns`);
904
980
  }
905
- try {
906
- const escapedFilePath = filePath.replace(/'/g, "''");
907
- const results = await this.table
908
- .query()
909
- .where(`\`filePath\` = '${escapedFilePath}'`)
910
- .toArray();
911
- // Convert to SearchResult format and sort by chunkIndex
912
- const chunks = results.map((record) => {
913
- const text = record.text;
914
- return {
915
- filePath: record.filePath,
916
- chunkIndex: record.chunkIndex,
917
- text,
918
- score: 0, // No distance score for direct retrieval
919
- metadata: record.metadata,
920
- // Include fingerprint - generate if not stored (backwards compatibility)
921
- fingerprint: record.fingerprint || generateChunkFingerprint(text),
922
- };
923
- });
924
- return chunks.sort((a, b) => a.chunkIndex - b.chunkIndex);
925
- }
926
- catch (error) {
927
- throw new DatabaseError(`Failed to get document chunks for: ${filePath}`, error);
928
- }
981
+ const table = this.table;
982
+ return withRetry(async () => {
983
+ try {
984
+ const escapedFilePath = filePath.replace(/'/g, "''");
985
+ const results = await table.query().where(`\`filePath\` = '${escapedFilePath}'`).toArray();
986
+ // Convert to SearchResult format and sort by chunkIndex
987
+ const chunks = results.map((record) => {
988
+ const text = record.text;
989
+ return {
990
+ filePath: record.filePath,
991
+ chunkIndex: record.chunkIndex,
992
+ text,
993
+ score: 0, // No distance score for direct retrieval
994
+ metadata: record.metadata,
995
+ // Include fingerprint - generate if not stored (backwards compatibility)
996
+ fingerprint: record.fingerprint || generateChunkFingerprint(text),
997
+ };
998
+ });
999
+ return chunks.sort((a, b) => a.chunkIndex - b.chunkIndex);
1000
+ }
1001
+ catch (error) {
1002
+ throw new DatabaseError(`Failed to get document chunks for: ${filePath}`, error);
1003
+ }
1004
+ }, 'VectorStore.getDocumentChunks');
929
1005
  }
930
1006
  /**
931
1007
  * Find related chunks using a chunk's stored embedding
@@ -944,59 +1020,62 @@ export class VectorStore {
944
1020
  if (!isValidFilePath(filePath)) {
945
1021
  throw new DatabaseError(`Invalid file path: contains disallowed characters or patterns`);
946
1022
  }
947
- try {
948
- // First, fetch the source chunk to get its vector
949
- const escapedFilePath = filePath.replace(/'/g, "''");
950
- const sourceResults = await this.table
951
- .query()
952
- .where(`\`filePath\` = '${escapedFilePath}' AND \`chunkIndex\` = ${chunkIndex}`)
953
- .toArray();
954
- if (sourceResults.length === 0) {
955
- return [];
956
- }
957
- const sourceChunk = sourceResults[0];
958
- const rawVector = sourceChunk?.vector;
959
- // LanceDB may return vectors as Arrow Vector or Float32Array, not plain Array
960
- // Convert to number[] for compatibility
961
- let sourceVector;
962
- if (rawVector) {
963
- if (Array.isArray(rawVector)) {
964
- sourceVector = rawVector;
1023
+ const table = this.table;
1024
+ return withRetry(async () => {
1025
+ try {
1026
+ // First, fetch the source chunk to get its vector
1027
+ const escapedFilePath = filePath.replace(/'/g, "''");
1028
+ const sourceResults = await table
1029
+ .query()
1030
+ .where(`\`filePath\` = '${escapedFilePath}' AND \`chunkIndex\` = ${chunkIndex}`)
1031
+ .toArray();
1032
+ if (sourceResults.length === 0) {
1033
+ return [];
965
1034
  }
966
- else if (typeof rawVector === 'object' && 'length' in rawVector) {
967
- // Handle Arrow Vector, Float32Array, or other array-like objects
968
- sourceVector = Array.from(rawVector);
1035
+ const sourceChunk = sourceResults[0];
1036
+ const rawVector = sourceChunk?.vector;
1037
+ // LanceDB may return vectors as Arrow Vector or Float32Array, not plain Array
1038
+ // Convert to number[] for compatibility
1039
+ let sourceVector;
1040
+ if (rawVector) {
1041
+ if (Array.isArray(rawVector)) {
1042
+ sourceVector = rawVector;
1043
+ }
1044
+ else if (typeof rawVector === 'object' && 'length' in rawVector) {
1045
+ // Handle Arrow Vector, Float32Array, or other array-like objects
1046
+ sourceVector = Array.from(rawVector);
1047
+ }
969
1048
  }
1049
+ if (!sourceVector || sourceVector.length === 0) {
1050
+ // Chunk exists but has no embedding (e.g., upload timed out mid-process)
1051
+ // Return empty results instead of throwing - allows batch operations to continue
1052
+ console.warn(`Chunk ${filePath}:${chunkIndex} has no valid vector (possibly corrupted)`);
1053
+ return [];
1054
+ }
1055
+ // Search for similar chunks using the source vector
1056
+ // Request more candidates to allow for filtering
1057
+ const candidateLimit = excludeSameDocument ? limit * 3 : limit + 1;
1058
+ let query = table.vectorSearch(sourceVector).distanceType('dot').limit(candidateLimit);
1059
+ // Apply distance threshold if configured
1060
+ if (this.config.maxDistance !== undefined) {
1061
+ query = query.distanceRange(undefined, this.config.maxDistance);
1062
+ }
1063
+ const vectorResults = await query.toArray();
1064
+ // Convert to SearchResult format with type validation
1065
+ let results = vectorResults.map((result) => toSearchResult(result));
1066
+ // Filter out the source chunk itself
1067
+ results = results.filter((r) => !(r.filePath === filePath && r.chunkIndex === chunkIndex));
1068
+ // Optionally filter out same-document chunks
1069
+ if (excludeSameDocument) {
1070
+ results = results.filter((r) => r.filePath !== filePath);
1071
+ }
1072
+ return results.slice(0, limit);
970
1073
  }
971
- if (!sourceVector || sourceVector.length === 0) {
972
- // Chunk exists but has no embedding (e.g., upload timed out mid-process)
973
- // Return empty results instead of throwing - allows batch operations to continue
974
- console.warn(`Chunk ${filePath}:${chunkIndex} has no valid vector (possibly corrupted)`);
975
- return [];
976
- }
977
- // Search for similar chunks using the source vector
978
- // Request more candidates to allow for filtering
979
- const candidateLimit = excludeSameDocument ? limit * 3 : limit + 1;
980
- let query = this.table.vectorSearch(sourceVector).distanceType('dot').limit(candidateLimit);
981
- // Apply distance threshold if configured
982
- if (this.config.maxDistance !== undefined) {
983
- query = query.distanceRange(undefined, this.config.maxDistance);
984
- }
985
- const vectorResults = await query.toArray();
986
- // Convert to SearchResult format with type validation
987
- let results = vectorResults.map((result) => toSearchResult(result));
988
- // Filter out the source chunk itself
989
- results = results.filter((r) => !(r.filePath === filePath && r.chunkIndex === chunkIndex));
990
- // Optionally filter out same-document chunks
991
- if (excludeSameDocument) {
992
- results = results.filter((r) => r.filePath !== filePath);
1074
+ catch (error) {
1075
+ const cause = error instanceof Error ? error.message : String(error);
1076
+ throw new DatabaseError(`Failed to find related chunks for: ${filePath}:${chunkIndex}: ${cause}`, error);
993
1077
  }
994
- return results.slice(0, limit);
995
- }
996
- catch (error) {
997
- const cause = error instanceof Error ? error.message : String(error);
998
- throw new DatabaseError(`Failed to find related chunks for: ${filePath}:${chunkIndex}: ${cause}`, error);
999
- }
1078
+ }, 'VectorStore.findRelatedChunks');
1000
1079
  }
1001
1080
  /**
1002
1081
  * Get system status