ydb-qdrant 4.1.0 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -402,11 +402,11 @@ curl -X POST http://localhost:8080/collections/mycol/points/delete \
402
402
  ## Notes
403
403
  - Storage layout:
404
404
  - **multi_table** (default): one YDB table per collection; metadata is tracked in `qdr__collections`.
405
- - **one_table**: a single global table `qdrant_all_points` with `(uid, point_id)` PK, where `uid` encodes tenant+collection.
405
+ - **one_table**: a single global table `qdrant_all_points` with `(uid, point_id)` PK, where `uid` encodes tenant+collection. Columns: `uid Utf8`, `point_id Utf8`, `embedding String` (binary float), `embedding_bit String` (bit‑quantized), `payload JsonDocument`.
406
406
  - Per‑collection table schema (multi_table): `point_id Utf8` (PK), `embedding String` (binary), `payload JsonDocument`.
407
407
  - Vectors are serialized with `Knn::ToBinaryStringFloat`.
408
408
  - In multi_table mode, search uses a single-phase top‑k over `embedding` with automatic YDB vector index (`emb_idx`) when available; it falls back to a table scan if the index is missing.
409
- - **Vector index auto-build** (multi_table mode only): After ≥100 points upserted + 5s quiet window, a `vector_kmeans_tree` index (levels=1, clusters=128) is built automatically. Incremental updates (<100 points) skip index rebuild. In one_table mode, vector indexes are not supported and all searches use table scans.
409
+ - **Vector index auto-build** (multi_table mode only): After ≥100 points upserted + 5s quiet window, a `vector_kmeans_tree` index (levels=1, clusters=128) is built automatically. Incremental updates (<100 points) skip index rebuild. In one_table mode, vector indexes are not supported; searches use a two‑phase approximate+exact flow over `qdrant_all_points` (bit‑quantized candidates via `embedding_bit` using the corresponding distance function, then exact re‑ranking over `embedding`). Note: For Dot metric, Phase 1 uses CosineDistance as a proxy since there is no direct distance equivalent for inner product on bit vectors.
410
410
  - **Concurrency**: During index rebuilds, YDB may return transient `Aborted`/schema metadata errors. Upserts include bounded retries with backoff to handle this automatically.
411
411
  - Filters are not yet modeled; can be added if needed.
412
412
 
@@ -1,7 +1,7 @@
1
- import { TypedValues, withSession } from "../ydb/client.js";
1
+ import { TypedValues, Types, withSession } from "../ydb/client.js";
2
2
  import { buildJsonOrEmpty, buildVectorParam } from "../ydb/helpers.js";
3
3
  import { notifyUpsert } from "../indexing/IndexScheduler.js";
4
- import { mapDistanceToKnnFn } from "../utils/distance.js";
4
+ import { mapDistanceToKnnFn, mapDistanceToBitKnnFn, } from "../utils/distance.js";
5
5
  import { withRetry, isTransientYdbError } from "../utils/retry.js";
6
6
  export async function upsertPointsOneTable(tableName, points, dimension, uid) {
7
7
  let upserted = 0;
@@ -16,11 +16,12 @@ export async function upsertPointsOneTable(tableName, points, dimension, uid) {
16
16
  DECLARE $id AS Utf8;
17
17
  DECLARE $vec AS List<Float>;
18
18
  DECLARE $payload AS JsonDocument;
19
- UPSERT INTO ${tableName} (uid, point_id, embedding, payload)
19
+ UPSERT INTO ${tableName} (uid, point_id, embedding, embedding_bit, payload)
20
20
  VALUES (
21
21
  $uid,
22
22
  $id,
23
23
  Untag(Knn::ToBinaryStringFloat($vec), "FloatVector"),
24
+ Untag(Knn::ToBinaryStringBit($vec), "BitVector"),
24
25
  $payload
25
26
  );
26
27
  `;
@@ -45,50 +46,83 @@ export async function searchPointsOneTable(tableName, queryVector, top, withPayl
45
46
  throw new Error(`Vector dimension mismatch: got ${queryVector.length}, expected ${dimension}`);
46
47
  }
47
48
  const { fn, order } = mapDistanceToKnnFn(distance);
49
+ const { fn: bitFn, order: bitOrder } = mapDistanceToBitKnnFn(distance);
48
50
  const qf = buildVectorParam(queryVector);
49
- const params = {
50
- $qf: qf,
51
- $k2: TypedValues.uint32(top),
52
- $uid: TypedValues.utf8(uid),
53
- };
54
- const query = `
55
- DECLARE $qf AS List<Float>;
56
- DECLARE $k2 AS Uint32;
57
- DECLARE $uid AS Utf8;
58
- $qbinf = Knn::ToBinaryStringFloat($qf);
59
- SELECT point_id, ${withPayload ? "payload, " : ""}${fn}(embedding, $qbinf) AS score
60
- FROM ${tableName}
61
- WHERE uid = $uid
62
- ORDER BY score ${order}
63
- LIMIT $k2;
64
- `;
65
- const rs = await withSession(async (s) => {
66
- return await s.executeQuery(query, params);
67
- });
68
- const rowset = rs.resultSets?.[0];
69
- const rows = (rowset?.rows ?? []);
70
- return rows.map((row) => {
71
- const id = row.items?.[0]?.textValue;
72
- if (typeof id !== "string") {
73
- throw new Error("point_id is missing in YDB search result");
51
+ const candidateLimit = top * 10;
52
+ const results = await withSession(async (s) => {
53
+ // Phase 1: approximate candidate selection using embedding_bit
54
+ const phase1Query = `
55
+ DECLARE $qf AS List<Float>;
56
+ DECLARE $k AS Uint32;
57
+ DECLARE $uid AS Utf8;
58
+ $qbin_bit = Knn::ToBinaryStringBit($qf);
59
+ SELECT point_id
60
+ FROM ${tableName}
61
+ WHERE uid = $uid AND embedding_bit IS NOT NULL
62
+ ORDER BY ${bitFn}(embedding_bit, $qbin_bit) ${bitOrder}
63
+ LIMIT $k;
64
+ `;
65
+ const phase1Params = {
66
+ $qf: qf,
67
+ $k: TypedValues.uint32(candidateLimit),
68
+ $uid: TypedValues.utf8(uid),
69
+ };
70
+ const rs1 = await s.executeQuery(phase1Query, phase1Params);
71
+ const rowset1 = rs1.resultSets?.[0];
72
+ const rows1 = (rowset1?.rows ?? []);
73
+ const candidateIds = rows1
74
+ .map((row) => row.items?.[0]?.textValue)
75
+ .filter((id) => typeof id === "string");
76
+ if (candidateIds.length === 0) {
77
+ return [];
74
78
  }
75
- let payload;
76
- let scoreIdx = 1;
77
- if (withPayload) {
78
- const payloadText = row.items?.[1]?.textValue;
79
- if (payloadText) {
80
- try {
81
- payload = JSON.parse(payloadText);
82
- }
83
- catch {
84
- payload = undefined;
79
+ // Phase 2: exact re-ranking on full-precision embedding for candidates only
80
+ const phase2Query = `
81
+ DECLARE $qf AS List<Float>;
82
+ DECLARE $k AS Uint32;
83
+ DECLARE $uid AS Utf8;
84
+ DECLARE $ids AS List<Utf8>;
85
+ $qbinf = Knn::ToBinaryStringFloat($qf);
86
+ SELECT point_id, ${withPayload ? "payload, " : ""}${fn}(embedding, $qbinf) AS score
87
+ FROM ${tableName}
88
+ WHERE uid = $uid AND point_id IN $ids
89
+ ORDER BY score ${order}
90
+ LIMIT $k;
91
+ `;
92
+ const idsParam = TypedValues.list(Types.UTF8, candidateIds);
93
+ const phase2Params = {
94
+ $qf: qf,
95
+ $k: TypedValues.uint32(top),
96
+ $uid: TypedValues.utf8(uid),
97
+ $ids: idsParam,
98
+ };
99
+ const rs2 = await s.executeQuery(phase2Query, phase2Params);
100
+ const rowset2 = rs2.resultSets?.[0];
101
+ const rows2 = (rowset2?.rows ?? []);
102
+ return rows2.map((row) => {
103
+ const id = row.items?.[0]?.textValue;
104
+ if (typeof id !== "string") {
105
+ throw new Error("point_id is missing in YDB search result");
106
+ }
107
+ let payload;
108
+ let scoreIdx = 1;
109
+ if (withPayload) {
110
+ const payloadText = row.items?.[1]?.textValue;
111
+ if (payloadText) {
112
+ try {
113
+ payload = JSON.parse(payloadText);
114
+ }
115
+ catch {
116
+ payload = undefined;
117
+ }
85
118
  }
119
+ scoreIdx = 2;
86
120
  }
87
- scoreIdx = 2;
88
- }
89
- const score = Number(row.items?.[scoreIdx]?.floatValue ?? row.items?.[scoreIdx]?.textValue);
90
- return { id, score, ...(payload ? { payload } : {}) };
121
+ const score = Number(row.items?.[scoreIdx]?.floatValue ?? row.items?.[scoreIdx]?.textValue);
122
+ return { id, score, ...(payload ? { payload } : {}) };
123
+ });
91
124
  });
125
+ return results;
92
126
  }
93
127
  export async function deletePointsOneTable(tableName, ids, uid) {
94
128
  let deleted = 0;
@@ -4,3 +4,14 @@ export declare function mapDistanceToKnnFn(distance: DistanceKind): {
4
4
  order: "ASC" | "DESC";
5
5
  };
6
6
  export declare function mapDistanceToIndexParam(distance: DistanceKind): string;
7
+ /**
8
+ * Maps a user-specified distance metric to a YDB Knn distance function
9
+ * suitable for bit-quantized vectors (Phase 1 approximate candidate selection).
10
+ * Always returns a distance function (lower is better, ASC ordering): the result is `{ fn, order }` with `fn` the Knn function name and `order` always "ASC".
11
+ * For Dot, falls back to CosineDistance as a proxy since there is no
12
+ * direct distance equivalent for inner product. Mapping: Cosine and Dot -> Knn::CosineDistance, Euclid -> Knn::EuclideanDistance, Manhattan -> Knn::ManhattanDistance; any other value -> Knn::CosineDistance.
13
+ */
14
+ export declare function mapDistanceToBitKnnFn(distance: DistanceKind): {
15
+ fn: string;
16
+ order: "ASC";
17
+ };
@@ -26,3 +26,25 @@ export function mapDistanceToIndexParam(distance) {
26
26
  return "cosine";
27
27
  }
28
28
  }
29
+ /**
30
+ * Maps a user-specified distance metric to a YDB Knn distance function
31
+ * suitable for bit-quantized vectors (Phase 1 approximate candidate selection).
32
+ * Always returns a distance function (lower is better, ASC ordering): the result is a fresh `{ fn, order }` object with `fn` the Knn function name and `order` always "ASC".
33
+ * For Dot, falls back to CosineDistance as a proxy since there is no
34
+ * direct distance equivalent for inner product. Mapping: Cosine and Dot -> Knn::CosineDistance, Euclid -> Knn::EuclideanDistance, Manhattan -> Knn::ManhattanDistance; any other value -> Knn::CosineDistance.
35
+ */
36
+ export function mapDistanceToBitKnnFn(distance) {
37
+ switch (distance) {
38
+ case "Cosine":
39
+ return { fn: "Knn::CosineDistance", order: "ASC" };
40
+ case "Dot":
41
+ // No direct distance equivalent; use Cosine as proxy
42
+ return { fn: "Knn::CosineDistance", order: "ASC" };
43
+ case "Euclid":
44
+ return { fn: "Knn::EuclideanDistance", order: "ASC" };
45
+ case "Manhattan":
46
+ return { fn: "Knn::ManhattanDistance", order: "ASC" };
47
+ default: // unrecognized metric: same CosineDistance result as the "Cosine" and "Dot" cases
48
+ return { fn: "Knn::CosineDistance", order: "ASC" };
49
+ }
50
+ }
@@ -30,19 +30,59 @@ export async function ensureGlobalPointsTable() {
30
30
  }
31
31
  try {
32
32
  await withSession(async (s) => {
33
+ let tableDescription = null;
33
34
  try {
34
- await s.describeTable(GLOBAL_POINTS_TABLE);
35
- globalPointsTableReady = true;
36
- return;
35
+ tableDescription = await s.describeTable(GLOBAL_POINTS_TABLE);
37
36
  }
38
37
  catch {
38
+ // Table doesn't exist, create it with all columns
39
39
  const desc = new TableDescription()
40
- .withColumns(new Column("uid", Types.UTF8), new Column("point_id", Types.UTF8), new Column("embedding", Types.BYTES), new Column("payload", Types.JSON_DOCUMENT))
40
+ .withColumns(new Column("uid", Types.UTF8), new Column("point_id", Types.UTF8), new Column("embedding", Types.BYTES), new Column("embedding_bit", Types.BYTES), new Column("payload", Types.JSON_DOCUMENT))
41
41
  .withPrimaryKeys("uid", "point_id");
42
42
  await s.createTable(GLOBAL_POINTS_TABLE, desc);
43
43
  globalPointsTableReady = true;
44
44
  logger.info(`created global points table ${GLOBAL_POINTS_TABLE}`);
45
+ return;
46
+ }
47
+ // Table exists, check if embedding_bit column is present
48
+ const columns = tableDescription.columns ?? [];
49
+ const hasEmbeddingBit = columns.some((col) => col.name === "embedding_bit");
50
+ let needsBackfill = false;
51
+ if (!hasEmbeddingBit) {
52
+ // Add the missing embedding_bit column
53
+ const alterDdl = `
54
+ ALTER TABLE ${GLOBAL_POINTS_TABLE}
55
+ ADD COLUMN embedding_bit String;
56
+ `;
57
+ await s.executeQuery(alterDdl);
58
+ logger.info(`added embedding_bit column to existing table ${GLOBAL_POINTS_TABLE}`);
59
+ needsBackfill = true;
60
+ }
61
+ else {
62
+ // Column exists; check if any legacy rows still have NULL embedding_bit
63
+ const checkNullsDdl = `
64
+ SELECT 1 AS has_null
65
+ FROM ${GLOBAL_POINTS_TABLE}
66
+ WHERE embedding_bit IS NULL
67
+ LIMIT 1;
68
+ `;
69
+ const checkRes = await s.executeQuery(checkNullsDdl);
70
+ const hasNullRows = checkRes.resultSets?.[0]?.rows &&
71
+ checkRes.resultSets[0].rows.length > 0;
72
+ needsBackfill = Boolean(hasNullRows);
73
+ }
74
+ if (needsBackfill) {
75
+ // Backfill existing rows: convert embedding to bit representation
76
+ const backfillDdl = `
77
+ UPDATE ${GLOBAL_POINTS_TABLE}
78
+ SET embedding_bit = Untag(Knn::ToBinaryStringBit(Knn::FloatFromBinaryString(embedding)), "BitVector")
79
+ WHERE embedding_bit IS NULL;
80
+ `;
81
+ await s.executeQuery(backfillDdl);
82
+ logger.info(`backfilled embedding_bit column from embedding in ${GLOBAL_POINTS_TABLE}`);
45
83
  }
84
+ // Mark table ready only after schema (and any required backfill) succeed
85
+ globalPointsTableReady = true;
46
86
  });
47
87
  }
48
88
  catch (err) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ydb-qdrant",
3
- "version": "4.1.0",
3
+ "version": "4.1.1",
4
4
  "main": "dist/package/api.js",
5
5
  "types": "dist/package/api.d.ts",
6
6
  "exports": {