@convex-dev/rag 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +371 -0
- package/dist/client/_generated/_ignore.d.ts +1 -0
- package/dist/client/_generated/_ignore.d.ts.map +1 -0
- package/dist/client/_generated/_ignore.js +3 -0
- package/dist/client/_generated/_ignore.js.map +1 -0
- package/dist/client/defaultChunker.d.ts +15 -0
- package/dist/client/defaultChunker.d.ts.map +1 -0
- package/dist/client/defaultChunker.js +148 -0
- package/dist/client/defaultChunker.js.map +1 -0
- package/dist/client/fileUtils.d.ts +24 -0
- package/dist/client/fileUtils.d.ts.map +1 -0
- package/dist/client/fileUtils.js +179 -0
- package/dist/client/fileUtils.js.map +1 -0
- package/dist/client/index.d.ts +442 -0
- package/dist/client/index.d.ts.map +1 -0
- package/dist/client/index.js +597 -0
- package/dist/client/index.js.map +1 -0
- package/dist/client/types.d.ts +29 -0
- package/dist/client/types.d.ts.map +1 -0
- package/dist/client/types.js +2 -0
- package/dist/client/types.js.map +1 -0
- package/dist/component/_generated/api.d.ts +439 -0
- package/dist/component/_generated/api.d.ts.map +1 -0
- package/dist/component/_generated/api.js +22 -0
- package/dist/component/_generated/api.js.map +1 -0
- package/dist/component/_generated/dataModel.d.ts +60 -0
- package/dist/component/_generated/server.d.ts +149 -0
- package/dist/component/_generated/server.d.ts.map +1 -0
- package/dist/component/_generated/server.js +74 -0
- package/dist/component/_generated/server.js.map +1 -0
- package/dist/component/chunks.d.ts +139 -0
- package/dist/component/chunks.d.ts.map +1 -0
- package/dist/component/chunks.js +413 -0
- package/dist/component/chunks.js.map +1 -0
- package/dist/component/convex.config.d.ts +3 -0
- package/dist/component/convex.config.d.ts.map +1 -0
- package/dist/component/convex.config.js +6 -0
- package/dist/component/convex.config.js.map +1 -0
- package/dist/component/embeddings/importance.d.ts +21 -0
- package/dist/component/embeddings/importance.d.ts.map +1 -0
- package/dist/component/embeddings/importance.js +67 -0
- package/dist/component/embeddings/importance.js.map +1 -0
- package/dist/component/embeddings/index.d.ts +23 -0
- package/dist/component/embeddings/index.d.ts.map +1 -0
- package/dist/component/embeddings/index.js +54 -0
- package/dist/component/embeddings/index.js.map +1 -0
- package/dist/component/embeddings/tables.d.ts +39 -0
- package/dist/component/embeddings/tables.d.ts.map +1 -0
- package/dist/component/embeddings/tables.js +53 -0
- package/dist/component/embeddings/tables.js.map +1 -0
- package/dist/component/entries.d.ts +167 -0
- package/dist/component/entries.d.ts.map +1 -0
- package/dist/component/entries.js +409 -0
- package/dist/component/entries.js.map +1 -0
- package/dist/component/filters.d.ts +46 -0
- package/dist/component/filters.d.ts.map +1 -0
- package/dist/component/filters.js +72 -0
- package/dist/component/filters.js.map +1 -0
- package/dist/component/namespaces.d.ts +131 -0
- package/dist/component/namespaces.d.ts.map +1 -0
- package/dist/component/namespaces.js +222 -0
- package/dist/component/namespaces.js.map +1 -0
- package/dist/component/schema.d.ts +1697 -0
- package/dist/component/schema.d.ts.map +1 -0
- package/dist/component/schema.js +88 -0
- package/dist/component/schema.js.map +1 -0
- package/dist/component/search.d.ts +20 -0
- package/dist/component/search.d.ts.map +1 -0
- package/dist/component/search.js +69 -0
- package/dist/component/search.js.map +1 -0
- package/dist/package.json +3 -0
- package/dist/react/index.d.ts +2 -0
- package/dist/react/index.d.ts.map +1 -0
- package/dist/react/index.js +6 -0
- package/dist/react/index.js.map +1 -0
- package/dist/shared.d.ts +479 -0
- package/dist/shared.d.ts.map +1 -0
- package/dist/shared.js +98 -0
- package/dist/shared.js.map +1 -0
- package/package.json +97 -0
- package/src/client/_generated/_ignore.ts +1 -0
- package/src/client/defaultChunker.test.ts +243 -0
- package/src/client/defaultChunker.ts +183 -0
- package/src/client/fileUtils.ts +179 -0
- package/src/client/index.test.ts +475 -0
- package/src/client/index.ts +1125 -0
- package/src/client/setup.test.ts +28 -0
- package/src/client/types.ts +69 -0
- package/src/component/_generated/api.d.ts +439 -0
- package/src/component/_generated/api.js +23 -0
- package/src/component/_generated/dataModel.d.ts +60 -0
- package/src/component/_generated/server.d.ts +149 -0
- package/src/component/_generated/server.js +90 -0
- package/src/component/chunks.test.ts +915 -0
- package/src/component/chunks.ts +555 -0
- package/src/component/convex.config.ts +7 -0
- package/src/component/embeddings/importance.test.ts +249 -0
- package/src/component/embeddings/importance.ts +75 -0
- package/src/component/embeddings/index.test.ts +482 -0
- package/src/component/embeddings/index.ts +99 -0
- package/src/component/embeddings/tables.ts +114 -0
- package/src/component/entries.test.ts +341 -0
- package/src/component/entries.ts +546 -0
- package/src/component/filters.ts +119 -0
- package/src/component/namespaces.ts +299 -0
- package/src/component/schema.ts +106 -0
- package/src/component/search.test.ts +445 -0
- package/src/component/search.ts +97 -0
- package/src/component/setup.test.ts +5 -0
- package/src/react/index.ts +7 -0
- package/src/shared.ts +247 -0
- package/src/vitest.config.ts +7 -0
The hunk below adds `package/src/component/chunks.ts`, the 555-line source file behind the `+555` entry in the listing above.

@@ -0,0 +1,555 @@

```ts
import { assert } from "convex-helpers";
import { paginator } from "convex-helpers/server/pagination";
import { mergedStream, stream } from "convex-helpers/server/stream";
import { paginationOptsValidator } from "convex/server";
import { convexToJson, type Infer } from "convex/values";
import {
  statuses,
  vChunk,
  vCreateChunkArgs,
  vEntry,
  vPaginationResult,
  vStatus,
  type Entry,
} from "../shared.js";
import type { Doc, Id } from "./_generated/dataModel.js";
import {
  internalQuery,
  mutation,
  query,
  type MutationCtx,
  type QueryCtx,
} from "./_generated/server.js";
import { insertEmbedding } from "./embeddings/index.js";
import { vVectorId } from "./embeddings/tables.js";
import { schema, v } from "./schema.js";
import { getPreviousEntry, publicEntry } from "./entries.js";
import {
  filterFieldsFromNumbers,
  numberedFilterFromNamedFilters,
} from "./filters.js";

const KB = 1_024;
const MB = 1_024 * KB;
const BANDWIDTH_PER_TRANSACTION_HARD_LIMIT = 8 * MB;
const BANDWIDTH_PER_TRANSACTION_SOFT_LIMIT = 4 * MB;

export const vInsertChunksArgs = v.object({
  entryId: v.id("entries"),
  startOrder: v.number(),
  chunks: v.array(vCreateChunkArgs),
});
type InsertChunksArgs = Infer<typeof vInsertChunksArgs>;

export const insert = mutation({
  args: vInsertChunksArgs,
  returns: v.object({ status: vStatus }),
  handler: insertChunks,
});

export async function insertChunks(
  ctx: MutationCtx,
  { entryId, startOrder, chunks }: InsertChunksArgs
) {
  const entry = await ctx.db.get(entryId);
  if (!entry) {
    throw new Error(`Entry ${entryId} not found`);
  }
  await ensureLatestEntryVersion(ctx, entry);

  // Get the namespace for filter conversion
  const namespace = await ctx.db.get(entry.namespaceId);
  assert(namespace, `Namespace ${entry.namespaceId} not found`);

  const previousEntry = await getPreviousEntry(ctx, entry);
  let order = startOrder;
  const chunkIds: Id<"chunks">[] = [];
  const existingChunks = await ctx.db
    .query("chunks")
    .withIndex("entryId_order", (q) =>
      q
        .eq("entryId", entryId)
        .gte("order", startOrder)
        .lt("order", startOrder + chunks.length)
    )
    .collect();
  if (existingChunks.length > 0) {
    console.debug(
      `Deleting ${existingChunks.length} existing chunks for entry ${entryId} at version ${entry.version}`
    );
  }
  // TODO: avoid writing if they're the same
  await Promise.all(
    existingChunks.map(async (c) => {
      if (c.state.kind === "ready") {
        await ctx.db.delete(c.state.embeddingId);
      }
      await ctx.db.delete(c.contentId);
      await ctx.db.delete(c._id);
    })
  );
  const numberedFilter = numberedFilterFromNamedFilters(
    entry.filterValues,
    namespace!.filterNames
  );
  for (const chunk of chunks) {
    const contentId = await ctx.db.insert("content", {
      text: chunk.content.text,
      metadata: chunk.content.metadata,
    });
    let state: Doc<"chunks">["state"] = {
      kind: "pending",
      embedding: chunk.embedding,
      importance: entry.importance,
      pendingSearchableText: chunk.searchableText,
    };
    if (!previousEntry) {
      const embeddingId = await insertEmbedding(
        ctx,
        chunk.embedding,
        entry.namespaceId,
        entry.importance,
        numberedFilter
      );
      state = {
        kind: "ready",
        embeddingId,
        searchableText: chunk.searchableText,
      };
    }
    chunkIds.push(
      await ctx.db.insert("chunks", {
        entryId,
        order,
        state,
        contentId,
        namespaceId: entry.namespaceId,
        ...filterFieldsFromNumbers(entry.namespaceId, numberedFilter),
      })
    );
    order++;
  }
  return {
    status: previousEntry ? ("pending" as const) : ("ready" as const),
  };
}
```
```ts
async function ensureLatestEntryVersion(ctx: QueryCtx, entry: Doc<"entries">) {
  if (!entry.key) {
    return true;
  }
  const newerEntry = await mergedStream(
    statuses.map((status) =>
      stream(ctx.db, schema)
        .query("entries")
        .withIndex("namespaceId_status_key_version", (q) =>
          q
            .eq("namespaceId", entry.namespaceId)
            .eq("status.kind", status)
            .eq("key", entry.key)
            .gt("version", entry.version)
        )
    ),
    ["version"]
  ).first();
  if (newerEntry) {
    console.warn(
      `Bailing from inserting chunks for entry ${entry.key} at version ${entry.version} since there's a newer version ${newerEntry.version} (status ${newerEntry.status}) creation time difference ${(newerEntry._creationTime - entry._creationTime).toFixed(0)}ms`
    );
    return false;
  }
  return true;
}

export const replaceChunksPage = mutation({
  args: v.object({
    entryId: v.id("entries"),
    startOrder: v.number(),
  }),
  returns: v.object({
    status: vStatus,
    nextStartOrder: v.number(),
  }),
  handler: async (ctx, args) => {
    const { entryId, startOrder } = args;
    const entryOrNull = await ctx.db.get(entryId);
    if (!entryOrNull) {
      throw new Error(`Entry ${entryId} not found`);
    }
    const entry = entryOrNull;
    const isLatest = await ensureLatestEntryVersion(ctx, entry);
    if (!isLatest) {
      return {
        status: "replaced" as const,
        nextStartOrder: startOrder,
      };
    }

    // Get the namespace for filter conversion
    const namespace = await ctx.db.get(entry.namespaceId);
    assert(namespace, `Namespace ${entry.namespaceId} not found`);

    const previousEntry = await getPreviousEntry(ctx, entry);
    const pendingEntries =
      entry.key && previousEntry
        ? await ctx.db
            .query("entries")
            .withIndex("namespaceId_status_key_version", (q) =>
              q
                .eq("namespaceId", entry.namespaceId)
                .eq("status.kind", "pending")
                .eq("key", entry.key)
            )
            .collect()
        : [];
    const chunkStream = mergedStream(
      [entry, ...pendingEntries, previousEntry]
        .filter((d) => d !== null)
        .map((entry) =>
          stream(ctx.db, schema)
            .query("chunks")
            .withIndex("entryId_order", (q) =>
              q.eq("entryId", entry._id).gte("order", startOrder)
            )
        ),
      ["order"]
    );
    const namespaceId = entry.namespaceId;
    const namedFilters = numberedFilterFromNamedFilters(
      entry.filterValues,
      namespace!.filterNames
    );
    async function addChunk(
      chunk: Doc<"chunks"> & { state: { kind: "pending" } }
    ) {
      const embeddingId = await insertEmbedding(
        ctx,
        chunk.state.embedding,
        namespaceId,
        entry.importance,
        namedFilters
      );
      await ctx.db.patch(chunk._id, {
        state: { kind: "ready", embeddingId },
      });
    }
    let dataUsedSoFar = 0;
    let indexToDelete = startOrder;
    let chunksToDeleteEmbeddings: Doc<"chunks">[] = [];
    let chunkToAdd: (Doc<"chunks"> & { state: { kind: "pending" } }) | null =
      null;
    async function handleBatch() {
      await Promise.all(
        chunksToDeleteEmbeddings.map(async (chunk) => {
          assert(chunk.state.kind === "ready");
          const vector = await ctx.db.get(chunk.state.embeddingId);
          assert(vector, `Vector ${chunk.state.embeddingId} not found`);
          await ctx.db.delete(chunk.state.embeddingId);
          await ctx.db.patch(chunk._id, {
            state: {
              kind: "replaced",
              embeddingId: chunk.state.embeddingId,
              vector: vector.vector,
              pendingSearchableText: chunk.state.searchableText,
            },
          });
        })
      );
      chunksToDeleteEmbeddings = [];
      if (chunkToAdd) {
        await addChunk(chunkToAdd);
      }
      chunkToAdd = null;
    }
    for await (const chunk of chunkStream) {
      if (chunk.state.kind === "pending") {
        dataUsedSoFar += await estimateChunkSize(chunk);
      } else {
        dataUsedSoFar += 17 * KB; // embedding conservative estimate
      }
      if (chunk.order > indexToDelete) {
        await handleBatch();
        indexToDelete = chunk.order;
        // delete the chunks
        // check if we're close to the limit
        // if so, bail and pick up on this chunk.order.
        if (dataUsedSoFar > BANDWIDTH_PER_TRANSACTION_SOFT_LIMIT) {
          return {
            status: "pending" as const,
            nextStartOrder: indexToDelete,
          };
        }
      }
      if (dataUsedSoFar > BANDWIDTH_PER_TRANSACTION_HARD_LIMIT) {
        return {
          status: "pending" as const,
          nextStartOrder: indexToDelete,
        };
      }
      if (chunk.state.kind === "pending") {
        if (chunk.entryId === entryId) {
          if (chunkToAdd) {
            console.warn(
              `Multiple pending chunks before changing order ${chunk.order} for entry ${entryId} version ${entry.version}: ${chunkToAdd._id} and ${chunk._id}`
            );
            await addChunk(chunkToAdd);
          }
          chunkToAdd = chunk as Doc<"chunks"> & { state: { kind: "pending" } };
        }
      } else {
        if (chunk.entryId !== entryId && chunk.state.kind === "ready") {
          chunksToDeleteEmbeddings.push(chunk);
        } else {
          console.debug(
            `Skipping adding chunk ${chunk._id} for entry ${entryId} version ${entry.version} since it's already ready`
          );
        }
      }
    }
    // handle the last batch
    await handleBatch();

    return {
      status: "ready" as const,
      nextStartOrder: 0,
    };
  },
});
```
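`replaceChunksPage` returns `status: "pending"` plus a `nextStartOrder` whenever it hits the per-transaction bandwidth budget, so a caller re-invokes it until it reports `ready` (or `replaced`, when a newer entry version has superseded this one). A minimal, hypothetical host-side driver, again assuming a `rag` mount and a host file named `convex/example.ts`; the packaged client presumably runs this loop itself when replacing an entry.

```ts
// convex/example.ts (host app) — a minimal, hypothetical driver for the
// replaceChunksPage pagination contract shown above.
import { components, internal } from "./_generated/api";
import { internalMutation } from "./_generated/server";
import { v } from "convex/values";

export const finishReplacement = internalMutation({
  args: { entryId: v.string(), startOrder: v.number() },
  handler: async (ctx, { entryId, startOrder }) => {
    const { status, nextStartOrder } = await ctx.runMutation(
      components.rag.chunks.replaceChunksPage,
      { entryId, startOrder }
    );
    if (status === "pending") {
      // Bandwidth budget exhausted for this transaction: resume where the
      // mutation left off.
      await ctx.scheduler.runAfter(0, internal.example.finishReplacement, {
        entryId,
        startOrder: nextStartOrder,
      });
    }
    // "ready": every superseded embedding has been swapped out.
    // "replaced": a newer version of the entry took over; nothing left to do.
  },
});
```

Next come the range validator and the context-window query used to turn vector-search hits back into text: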
```ts
export const vRangeResult = v.object({
  entryId: v.id("entries"),
  order: v.number(),
  startOrder: v.number(),
  content: v.array(
    v.object({
      text: v.string(),
      metadata: v.optional(v.record(v.string(), v.any())),
    })
  ),
});

export const getRangesOfChunks = internalQuery({
  args: {
    embeddingIds: v.array(vVectorId),
    chunkContext: v.object({ before: v.number(), after: v.number() }),
  },
  returns: v.object({
    ranges: v.array(v.union(v.null(), vRangeResult)),
    entries: v.array(vEntry),
  }),
  handler: async (
    ctx,
    args
  ): Promise<{
    ranges: (null | Infer<typeof vRangeResult>)[];
    entries: Entry[];
  }> => {
    const { embeddingIds, chunkContext } = args;
    const chunks = await Promise.all(
      embeddingIds.map((embeddingId) =>
        ctx.db
          .query("chunks")
          .withIndex("embeddingId", (q) =>
            q.eq("state.embeddingId", embeddingId)
          )
          .order("desc")
          .first()
      )
    );

    // Note: This preserves order of entries as they first appeared.
    const entries = (
      await Promise.all(
        Array.from(
          new Set(chunks.filter((c) => c !== null).map((c) => c.entryId))
        ).map((id) => ctx.db.get(id))
      )
    )
      .filter((d) => d !== null)
      .map(publicEntry);

    const entryOrders = chunks
      .filter((c) => c !== null)
      .map((c) => [c.entryId, c.order] as const)
      .reduce(
        (acc, [entryId, order]) => {
          if (acc[entryId]?.includes(order)) {
            // De-dupe orders
            return acc;
          }
          acc[entryId] = [...(acc[entryId] ?? []), order].sort((a, b) => a - b);
          return acc;
        },
        {} as Record<Id<"entries">, number[]>
      );

    const result: Array<Infer<typeof vRangeResult> | null> = [];

    for (const chunk of chunks) {
      if (chunk === null) {
        result.push(null);
        continue;
      }
      // Note: if we parallelize this in the future, we could have a race
      // instead we'd check that other chunks are not the same doc/order
      if (
        result.find(
          (r) => r?.entryId === chunk.entryId && r?.order === chunk.order
        )
      ) {
        // De-dupe chunks
        result.push(null);
        continue;
      }
      const entryId = chunk.entryId;
      const entry = await ctx.db.get(entryId);
      assert(entry, `Entry ${entryId} not found`);
      const otherOrders = entryOrders[entryId] ?? [chunk.order];
      const ourOrderIndex = otherOrders.indexOf(chunk.order);
      const previousOrder = otherOrders[ourOrderIndex - 1] ?? -Infinity;
      const nextOrder = otherOrders[ourOrderIndex + 1] ?? Infinity;
      // We absorb all previous context up to the previous chunk.
      const startOrder = Math.max(
        chunk.order - chunkContext.before,
        0,
        Math.min(previousOrder + 1, chunk.order)
      );
      // We stop short if the next chunk order's "before" context will cover it.
      const endOrder = Math.min(
        chunk.order + chunkContext.after + 1,
        Math.max(nextOrder - chunkContext.before, chunk.order + 1)
      );
      const contentIds: Id<"content">[] = [];
      if (startOrder === chunk.order && endOrder === chunk.order + 1) {
        contentIds.push(chunk.contentId);
      } else {
        const chunks = await ctx.db
          .query("chunks")
          .withIndex("entryId_order", (q) =>
            q
              .eq("entryId", entryId)
              .gte("order", startOrder)
              .lt("order", endOrder)
          )
          .collect();
        for (const chunk of chunks) {
          contentIds.push(chunk.contentId);
        }
      }
      const content = await Promise.all(
        contentIds.map(async (contentId) => {
          const content = await ctx.db.get(contentId);
          assert(content, `Content ${contentId} not found`);
          return { text: content.text, metadata: content.metadata };
        })
      );

      result.push({
        entryId,
        order: chunk.order,
        startOrder,
        content,
      });
    }

    return {
      ranges: result,
      entries,
    };
  },
});
```
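The start/end arithmetic above trims overlapping context windows so that neighboring matches in the same entry don't return the same chunk twice. A standalone reproduction of that arithmetic with hypothetical numbers (not part of the package):

```ts
// Pure illustration of the windowing math in getRangesOfChunks.
function windowFor(
  order: number,
  orders: number[],
  ctx: { before: number; after: number }
) {
  const i = orders.indexOf(order);
  const previousOrder = orders[i - 1] ?? -Infinity;
  const nextOrder = orders[i + 1] ?? Infinity;
  // Absorb previous context up to (but not including) the previous match.
  const startOrder = Math.max(
    order - ctx.before,
    0,
    Math.min(previousOrder + 1, order)
  );
  // Stop short if the next match's "before" context will cover the rest.
  const endOrder = Math.min(
    order + ctx.after + 1,
    Math.max(nextOrder - ctx.before, order + 1)
  );
  return { startOrder, endOrder };
}

// Two matches (orders 3 and 5) in the same entry with one chunk of context on
// each side: the first window is trimmed to [2, 4) because the second match's
// "before" context already covers order 4, which then gets [4, 7).
console.log(windowFor(3, [3, 5], { before: 1, after: 1 })); // { startOrder: 2, endOrder: 4 }
console.log(windowFor(5, [3, 5], { before: 1, after: 1 })); // { startOrder: 4, endOrder: 7 }
```

The file continues with a paginated `list` query over an entry's chunks: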
```ts
export const list = query({
  args: v.object({
    entryId: v.id("entries"),
    paginationOpts: paginationOptsValidator,
  }),
  returns: vPaginationResult(vChunk),
  handler: async (ctx, args) => {
    const { entryId, paginationOpts } = args;
    const chunks = await paginator(ctx.db, schema)
      .query("chunks")
      .withIndex("entryId_order", (q) => q.eq("entryId", entryId))
      .order("asc")
      .paginate(paginationOpts);
    return {
      ...chunks,
      page: await Promise.all(
        chunks.page.map(async (chunk) => {
          const content = await ctx.db.get(chunk.contentId);
          assert(content, `Content ${chunk.contentId} not found`);
          return publicChunk(chunk, content);
        })
      ),
    };
  },
});
```
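`list` follows the standard Convex pagination contract, so a caller pages through with `numItems` and the returned `continueCursor`. A minimal, hypothetical host-side action, again assuming a `rag` mount; each page item has the `order`/`state`/`text`/`metadata` shape produced by `publicChunk` further down the file.

```ts
// convex/example.ts (host app) — hypothetical sketch of walking every chunk
// of one entry through the raw component query.
import { components } from "./_generated/api";
import { action } from "./_generated/server";
import { v } from "convex/values";

export const dumpEntryText = action({
  args: { entryId: v.string() },
  handler: async (ctx, { entryId }) => {
    const texts: string[] = [];
    let cursor: string | null = null;
    do {
      const page = await ctx.runQuery(components.rag.chunks.list, {
        entryId,
        paginationOpts: { numItems: 100, cursor },
      });
      texts.push(...page.page.map((chunk) => chunk.text));
      cursor = page.isDone ? null : page.continueCursor;
    } while (cursor !== null);
    return texts.join("\n");
  },
});
```

The rest of the file covers the public chunk shape and page-wise deletion (plus a commented-out `findLastChunk` helper):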
```ts
// export async function findLastChunk(
//   ctx: MutationCtx,
//   entryId: Id<"entries">
// ): Promise<Chunk | null> {
//   const chunk = await ctx.db
//     .query("chunks")
//     .withIndex("entryId_order", (q) => q.eq("entryId", entryId))
//     .order("desc")
//     .first();
//   if (!chunk) {
//     return null;
//   }
//   const content = await ctx.db.get(chunk.contentId);
//   assert(content, `Content for chunk ${chunk._id} not found`);
//   return publicChunk(chunk, content);
// }

async function publicChunk(chunk: Doc<"chunks">, content: Doc<"content">) {
  return {
    order: chunk.order,
    state: chunk.state.kind,
    text: content.text,
    metadata: content.metadata,
  };
}

export async function deleteChunksPage(
  ctx: MutationCtx,
  { entryId, startOrder }: { entryId: Id<"entries">; startOrder: number }
) {
  const chunkStream = ctx.db
    .query("chunks")
    .withIndex("entryId_order", (q) =>
      q.eq("entryId", entryId).gte("order", startOrder)
    );
  let dataUsedSoFar = 0;
  for await (const chunk of chunkStream) {
    dataUsedSoFar += await estimateChunkSize(chunk);
    await ctx.db.delete(chunk._id);
    dataUsedSoFar += await estimateContentSize(ctx, chunk.contentId);
    await ctx.db.delete(chunk.contentId);
    if (dataUsedSoFar > BANDWIDTH_PER_TRANSACTION_HARD_LIMIT) {
      // TODO: schedule follow-up - workpool?
      return { isDone: false, nextStartOrder: chunk.order };
    }
  }
  return { isDone: true, nextStartOrder: -1 };
}
```
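`deleteChunksPage` is a plain helper rather than a registered function; its caller lives elsewhere in the component and is not part of this hunk. A hypothetical component-side wrapper, just to spell out the resume contract:

```ts
// Hypothetical wrapper (not in the package) showing how deleteChunksPage is
// meant to be consumed: delete as much as fits in one transaction's bandwidth
// budget, then report where to resume.
export const deleteAllChunks = mutation({
  args: { entryId: v.id("entries"), startOrder: v.number() },
  returns: v.object({ isDone: v.boolean(), nextStartOrder: v.number() }),
  handler: async (ctx, args) => {
    // The caller re-invokes with nextStartOrder until isDone is true.
    return await deleteChunksPage(ctx, args);
  },
});
```

The file closes with the size estimators behind the bandwidth accounting: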
```ts
async function estimateChunkSize(chunk: Doc<"chunks">) {
  let dataUsedSoFar = 100; // constant metadata - roughly
  if (chunk.state.kind === "pending") {
    dataUsedSoFar += chunk.state.embedding.length * 8;
  }
  return dataUsedSoFar;
}
async function estimateContentSize(ctx: QueryCtx, contentId: Id<"content">) {
  let dataUsedSoFar = 0;
  // TODO: if/when deletions don't count as bandwidth, we can remove this.
  const content = await ctx.db.get(contentId);
  if (content) {
    dataUsedSoFar += content.text.length;
    dataUsedSoFar += JSON.stringify(
      convexToJson(content.metadata ?? {})
    ).length;
  }
  return dataUsedSoFar;
}
```
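For a sense of scale: the 17 KB-per-chunk estimate used in `replaceChunksPage` roughly corresponds to a 2,048-dimension embedding at the same 8 bytes per number that `estimateChunkSize` assumes, so the 4 MB soft limit works out to roughly 240 chunks per transaction before the mutation yields and hands back `nextStartOrder`:

```ts
// Rough capacity check derived from the constants at the top of the file.
const KB = 1_024;
const MB = 1_024 * KB;
const chunksPerTransaction = Math.floor((4 * MB) / (17 * KB)); // 240
```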