kontext-engine 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -72,6 +72,11 @@ interface ChunkWithFile {
72
72
  text: string;
73
73
  exports: boolean;
74
74
  }
75
/**
 * Embedder identity associated with an index: which provider and model
 * produced the vectors, and how wide those vectors are.
 */
interface IndexEmbedderMetadata {
  /** Embedding provider identifier. */
  provider: string;
  /** Model name within the provider. */
  model: string;
  /** Vector width (number of dimensions per embedding). */
  dimensions: number;
}
75
80
  interface ChunkSearchFilters {
76
81
  name?: string;
77
82
  nameMode?: "exact" | "prefix" | "contains";
@@ -93,6 +98,7 @@ interface KontextDatabase {
93
98
  insertChunks(fileId: number, chunks: ChunkInput[]): number[];
94
99
  getChunksByFile(fileId: number): ChunkRecord[];
95
100
  getChunksByIds(ids: number[]): ChunkWithFile[];
101
+ getChunksMissingVectors(): ChunkWithFile[];
96
102
  deleteChunksByFile(fileId: number): void;
97
103
  insertDependency(sourceChunkId: number, targetChunkId: number, type: string): void;
98
104
  getDependencies(chunkId: number): {
@@ -118,6 +124,9 @@ interface KontextDatabase {
118
124
  close(): void;
119
125
  getSchemaVersion(): number;
120
126
  pragma(key: string): string;
127
+ getVectorDimensions(): number | null;
128
+ getIndexEmbedder(): IndexEmbedderMetadata | null;
129
+ setIndexEmbedder(metadata: IndexEmbedderMetadata): void;
121
130
  }
122
131
  /** Create or open a SQLite database at the given path. Initializes schema and loads sqlite-vec. */
123
132
  declare function createDatabase(dbPath: string, dimensions?: number): KontextDatabase;
@@ -420,6 +429,8 @@ interface StatusOutput {
420
429
  lastIndexed: string | null;
421
430
  languages: Map<string, number>;
422
431
  config: ProjectConfig | null;
432
+ indexEmbedder: ProjectConfig | null;
433
+ embedderWarning: string | null;
423
434
  text: string;
424
435
  }
425
436
  /** Gather index statistics: file/chunk/vector counts, languages, DB size, config. */
package/dist/index.js CHANGED
@@ -1089,6 +1089,8 @@ var DatabaseError = class extends KontextError {
1089
1089
  // src/storage/db.ts
1090
1090
  var DEFAULT_DIMENSIONS = 384;
1091
1091
  var VECTOR_DIMENSIONS_META_KEY = "vector_dimensions";
1092
+ var INDEX_EMBEDDER_PROVIDER_META_KEY = "index_embedder_provider";
1093
+ var INDEX_EMBEDDER_MODEL_META_KEY = "index_embedder_model";
1092
1094
  function createDatabase(dbPath, dimensions) {
1093
1095
  const dir = path3.dirname(dbPath);
1094
1096
  if (!fs4.existsSync(dir)) {
@@ -1121,6 +1123,16 @@ function createDatabase(dbPath, dimensions) {
1121
1123
  const stmtGetChunksByFile = db.prepare(
1122
1124
  "SELECT id, file_id as fileId, line_start as lineStart, line_end as lineEnd, type, name, parent, text, imports, exports, hash FROM chunks WHERE file_id = ? ORDER BY line_start"
1123
1125
  );
1126
+ const stmtGetChunksMissingVectors = db.prepare(
1127
+ `SELECT c.id, c.file_id as fileId, f.path as filePath, f.language,
1128
+ c.line_start as lineStart, c.line_end as lineEnd,
1129
+ c.type, c.name, c.parent, c.text, c.exports as exports
1130
+ FROM chunks c
1131
+ JOIN files f ON f.id = c.file_id
1132
+ LEFT JOIN chunk_vectors v ON v.rowid = c.id
1133
+ WHERE v.rowid IS NULL
1134
+ ORDER BY c.id`
1135
+ );
1124
1136
  const stmtGetChunkIdsByFile = db.prepare(
1125
1137
  "SELECT id FROM chunks WHERE file_id = ?"
1126
1138
  );
@@ -1150,6 +1162,10 @@ function createDatabase(dbPath, dimensions) {
1150
1162
  const stmtLastIndexed = db.prepare(
1151
1163
  "SELECT MAX(last_indexed) as lastIndexed FROM files"
1152
1164
  );
1165
+ const stmtGetMeta = db.prepare("SELECT value FROM meta WHERE key = ?");
1166
+ const stmtSetMeta = db.prepare(
1167
+ "INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)"
1168
+ );
1153
1169
  return {
1154
1170
  upsertFile(file) {
1155
1171
  const row = stmtUpsertFile.get({
@@ -1263,6 +1279,13 @@ function createDatabase(dbPath, dimensions) {
1263
1279
  exports: r.exports === 1
1264
1280
  }));
1265
1281
  },
1282
+ getChunksMissingVectors() {
1283
+ const rows = stmtGetChunksMissingVectors.all();
1284
+ return rows.map((r) => ({
1285
+ ...r,
1286
+ exports: r.exports === 1
1287
+ }));
1288
+ },
1266
1289
  searchChunks(filters, limit) {
1267
1290
  const conditions = [];
1268
1291
  const params = [];
@@ -1360,6 +1383,66 @@ function createDatabase(dbPath, dimensions) {
1360
1383
  return Object.values(result[0])[0];
1361
1384
  }
1362
1385
  return String(result);
1386
+ },
1387
+ getVectorDimensions() {
1388
+ const row = stmtGetMeta.get(VECTOR_DIMENSIONS_META_KEY);
1389
+ if (!row) return null;
1390
+ const dimensions2 = Number.parseInt(row.value, 10);
1391
+ if (!Number.isInteger(dimensions2) || dimensions2 <= 0) {
1392
+ throw new DatabaseError(
1393
+ `Invalid stored vector dimensions metadata: ${row.value}`,
1394
+ ErrorCode.DB_CORRUPTED
1395
+ );
1396
+ }
1397
+ return dimensions2;
1398
+ },
1399
+ getIndexEmbedder() {
1400
+ const providerRow = stmtGetMeta.get(INDEX_EMBEDDER_PROVIDER_META_KEY);
1401
+ const modelRow = stmtGetMeta.get(INDEX_EMBEDDER_MODEL_META_KEY);
1402
+ if (!providerRow && !modelRow) return null;
1403
+ if (!providerRow || !modelRow) {
1404
+ throw new DatabaseError(
1405
+ "Corrupted index embedder metadata: provider/model keys are incomplete.",
1406
+ ErrorCode.DB_CORRUPTED
1407
+ );
1408
+ }
1409
+ const dimensions2 = this.getVectorDimensions();
1410
+ if (dimensions2 === null) {
1411
+ throw new DatabaseError(
1412
+ "Corrupted index embedder metadata: vector dimensions are missing.",
1413
+ ErrorCode.DB_CORRUPTED
1414
+ );
1415
+ }
1416
+ return {
1417
+ provider: providerRow.value,
1418
+ model: modelRow.value,
1419
+ dimensions: dimensions2
1420
+ };
1421
+ },
1422
+ setIndexEmbedder(metadata) {
1423
+ if (!metadata.provider || !metadata.model) {
1424
+ throw new DatabaseError(
1425
+ "Invalid index embedder metadata: provider and model are required.",
1426
+ ErrorCode.DB_WRITE_FAILED
1427
+ );
1428
+ }
1429
+ if (!Number.isInteger(metadata.dimensions) || metadata.dimensions <= 0) {
1430
+ throw new DatabaseError(
1431
+ `Invalid index embedder metadata dimensions: ${String(metadata.dimensions)}`,
1432
+ ErrorCode.DB_WRITE_FAILED
1433
+ );
1434
+ }
1435
+ const vectorDimensions = this.getVectorDimensions();
1436
+ if (vectorDimensions !== null && vectorDimensions !== metadata.dimensions) {
1437
+ throw new DatabaseError(
1438
+ `Index embedder metadata dimensions (${metadata.dimensions}) do not match vector table dimensions (${vectorDimensions}).`,
1439
+ ErrorCode.DB_WRITE_FAILED
1440
+ );
1441
+ }
1442
+ db.transaction(() => {
1443
+ stmtSetMeta.run(INDEX_EMBEDDER_PROVIDER_META_KEY, metadata.provider);
1444
+ stmtSetMeta.run(INDEX_EMBEDDER_MODEL_META_KEY, metadata.model);
1445
+ })();
1363
1446
  }
1364
1447
  };
1365
1448
  }
@@ -1775,6 +1858,7 @@ var PATH_BOOST_PARTIAL = 1.2;
1775
1858
  var IMPORT_PENALTY = 0.5;
1776
1859
  var TEST_FILE_PENALTY = 0.65;
1777
1860
  var SMALL_SNIPPET_PENALTY = 0.75;
1861
+ var DATA_LITERAL_PENALTY = 0.7;
1778
1862
  var PUBLIC_API_BOOST = 1.12;
1779
1863
  var TEST_FILE_DIRECTORY_PATTERN = /(?:^|\/)(?:tests|__tests__)(?:\/|$)/;
1780
1864
  var TEST_FILE_NAME_PATTERN = /(?:^|\/)[^/]*\.(?:test|spec)\.[cm]?[jt]sx?$/;
@@ -1789,7 +1873,8 @@ function fusionMergeWithPathBoost(strategyResults, limit, pathBoostTerms) {
1789
1873
  const importAdjusted = applyImportDeprioritization(boosted);
1790
1874
  const testAdjusted = applyTestFileDeprioritization(importAdjusted);
1791
1875
  const snippetAdjusted = applySmallSnippetDeprioritization(testAdjusted);
1792
- const boostedApi = applyPublicApiBoost(snippetAdjusted);
1876
+ const dataLiteralAdjusted = applyDataLiteralDeprioritization(snippetAdjusted);
1877
+ const boostedApi = applyPublicApiBoost(dataLiteralAdjusted);
1793
1878
  const adjusted = applyFileDiversityDiminishingReturns(boostedApi);
1794
1879
  adjusted.sort((a, b) => b.score - a.score);
1795
1880
  const sliced = adjusted.slice(0, limit);
@@ -1870,6 +1955,21 @@ function applySmallSnippetDeprioritization(results) {
1870
1955
  return r;
1871
1956
  });
1872
1957
  }
1958
/**
 * Down-weight results that look like plain data literals, but only when
 * at least one non-data result with a non-zero best score is present.
 *
 * @param results scored search results (each has `score`, `type`, `text`).
 * @returns the input array unchanged when no penalty applies; otherwise a
 *   new array in which data-literal results have their score multiplied by
 *   DATA_LITERAL_PENALTY and all other results are passed through as-is.
 */
function applyDataLiteralDeprioritization(results) {
  // Classify each result exactly once; the classifier splits and scans the
  // chunk text, so repeating it per pass is wasted work.
  const dataLiteralFlags = results.map((r) => isDataLiteralChunk(r));

  // Running max instead of Math.max(...spread): same result (including NaN
  // propagation) without pushing every score onto the call stack.
  let hasNonDataLiteral = false;
  let maxNonDataScore = 0;
  for (let i = 0; i < results.length; i++) {
    if (dataLiteralFlags[i]) continue;
    hasNonDataLiteral = true;
    maxNonDataScore = Math.max(maxNonDataScore, results[i].score);
  }

  // Everything is a data literal, or no code result scored above zero:
  // penalising would only reshuffle equivalent results.
  if (!hasNonDataLiteral) return results;
  if (maxNonDataScore === 0) return results;

  return results.map(
    (r, i) => dataLiteralFlags[i] ? { ...r, score: r.score * DATA_LITERAL_PENALTY } : r
  );
}
1873
1973
  function applyPublicApiBoost(results) {
1874
1974
  return results.map((r) => {
1875
1975
  if (isPublicApiSymbol(r)) {
@@ -1904,6 +2004,23 @@ function isPublicApiSymbol(result) {
1904
2004
  const textStart = result.text.trimStart().toLowerCase();
1905
2005
  return textStart.startsWith("export ");
1906
2006
  }
2007
/**
 * Heuristic: does this result look like a constant that is mostly a data
 * literal (object/array of key-value pairs) rather than logic?
 *
 * A chunk qualifies only when all of the following hold:
 *  - its `type` is "constant";
 *  - its text contains a `{`/`}` pair or a `[`/`]` pair;
 *  - at least 2 non-blank lines contain a `key:` pattern, and those lines
 *    make up at least 35% of the non-blank lines;
 *  - quotes plus colons make up at least 4% of the non-whitespace chars.
 *
 * @param result search result with `type` and `text`.
 * @returns true when the chunk matches the data-literal shape.
 */
function isDataLiteralChunk(result) {
  const MIN_KEY_VALUE_LINES = 2;
  const MIN_KEY_VALUE_LINE_RATIO = 0.35;
  const MIN_STRUCTURAL_DENSITY = 0.04;

  if (result.type !== "constant") return false;
  const text = result.text;
  // A chunk without string text cannot be classified as a literal.
  if (typeof text !== "string") return false;

  // Cheapest test first: without bracket pairs nothing below can pass.
  const hasObjectShape = text.includes("{") && text.includes("}");
  const hasArrayShape = text.includes("[") && text.includes("]");
  if (!(hasObjectShape || hasArrayShape)) return false;

  const lines = text.split("\n").map((line) => line.trim()).filter((line) => line.length > 0);
  if (lines.length === 0) return false;

  // Optionally quoted identifier followed by a colon, anywhere in the line.
  // Built once here rather than once per line inside the callback.
  const keyValuePattern = /['"`]?[A-Za-z0-9_-]+['"`]?\s*:\s*/;
  const keyValueLineCount = lines.filter((line) => keyValuePattern.test(line)).length;
  if (keyValueLineCount < MIN_KEY_VALUE_LINES) return false;
  if (keyValueLineCount / lines.length < MIN_KEY_VALUE_LINE_RATIO) return false;

  const quoteCount = (text.match(/["'`]/g) ?? []).length;
  const colonCount = (text.match(/:/g) ?? []).length;
  const nonWhitespaceLength = text.replace(/\s+/g, "").length;
  const structuralDensity = (quoteCount + colonCount) / Math.max(nonWhitespaceLength, 1);
  return structuralDensity >= MIN_STRUCTURAL_DENSITY;
}
1907
2024
  function getFileDiversityFactor(fileOccurrence) {
1908
2025
  if (fileOccurrence <= 1) return 1;
1909
2026
  if (fileOccurrence === 2) return 0.9;
@@ -2660,10 +2777,14 @@ function validateProjectEmbedderConfig(config) {
2660
2777
  }
2661
2778
 
2662
2779
  // src/cli/commands/init.ts
2780
/**
 * Compare two embedder descriptions field by field.
 *
 * @param a object with `provider`, `model`, `dimensions`.
 * @param b object with `provider`, `model`, `dimensions`.
 * @returns true only when all three fields are strictly equal.
 */
function isSameEmbedderConfig(a, b) {
  return (
    a.provider === b.provider &&
    a.model === b.model &&
    a.dimensions === b.dimensions
  );
}
2663
2783
  var CTX_DIR2 = ".ctx";
2664
2784
  var DB_FILENAME = "index.db";
2665
2785
  var CONFIG_FILENAME2 = "config.json";
2666
2786
  var GITIGNORE_ENTRY = ".ctx/";
2787
+ var EMBEDDING_SAVE_BATCH_SIZE = 128;
2667
2788
  function ensureGitignore(projectRoot) {
2668
2789
  const gitignorePath = path5.join(projectRoot, ".gitignore");
2669
2790
  if (fs6.existsSync(gitignorePath)) {
@@ -2698,6 +2819,25 @@ function formatLanguageSummary(counts) {
2698
2819
  const entries = [...counts.entries()].sort((a, b) => b[1] - a[1]).map(([lang, count]) => `${lang}: ${count}`);
2699
2820
  return entries.join(", ");
2700
2821
  }
2822
/**
 * Embed chunks in fixed-size batches and store each batch's vectors
 * before moving on to the next batch.
 *
 * @param db database handle exposing `transaction(fn)` and
 *   `insertVector(chunkId, vector)`.
 * @param embedder object whose `embed(texts)` resolves to one vector per
 *   input text.
 * @param chunks items with `id`, `filePath`, `parent`, `text`.
 * @param log progress sink, called once per stored batch.
 * @returns number of vectors written.
 * @throws Error when the embedder returns a different number of vectors
 *   than texts it was given; errors from `embed`/`insertVector` propagate.
 */
async function embedAndPersistInBatches(db, embedder, chunks, log) {
  const total = chunks.length;
  let vectorsCreated = 0;
  for (let offset = 0; offset < total; offset += EMBEDDING_SAVE_BATCH_SIZE) {
    const batch = chunks.slice(offset, offset + EMBEDDING_SAVE_BATCH_SIZE);
    const texts = batch.map(
      (chunk) => prepareChunkText(chunk.filePath, chunk.parent, chunk.text)
    );
    const vectors = await embedder.embed(texts);
    // Without this check a short result would hand `undefined` to
    // insertVector for the trailing chunks of the batch.
    if (!Array.isArray(vectors) || vectors.length !== batch.length) {
      const received = Array.isArray(vectors) ? vectors.length : 0;
      throw new Error(
        `Embedder returned ${received} vectors for ${batch.length} chunks.`
      );
    }
    db.transaction(() => {
      for (let j = 0; j < batch.length; j++) {
        db.insertVector(batch[j].id, vectors[j]);
      }
    });
    // Count what was actually stored for this batch.
    vectorsCreated += batch.length;
    log(`  Embedding... ${vectorsCreated}/${total}`);
  }
  return vectorsCreated;
}
2701
2841
  async function runInit(projectPath, options = {}) {
2702
2842
  const log = options.log ?? console.log;
2703
2843
  const absoluteRoot = path5.resolve(projectPath);
@@ -2711,6 +2851,24 @@ async function runInit(projectPath, options = {}) {
2711
2851
  const dbPath = path5.join(ctxDir, DB_FILENAME);
2712
2852
  const db = createDatabase(dbPath, embedderConfig.dimensions);
2713
2853
  try {
2854
+ const existingEmbedder = db.getIndexEmbedder();
2855
+ if (existingEmbedder) {
2856
+ if (!isSameEmbedderConfig(existingEmbedder, embedderConfig)) {
2857
+ throw new IndexError(
2858
+ `Index embedder mismatch: index uses ${existingEmbedder.provider} (${existingEmbedder.model}, ${existingEmbedder.dimensions} dims) but config requests ${embedderConfig.provider} (${embedderConfig.model}, ${embedderConfig.dimensions} dims). Rebuild the index.`,
2859
+ ErrorCode.CONFIG_INVALID
2860
+ );
2861
+ }
2862
+ } else {
2863
+ const isEmptyIndex = db.getFileCount() === 0 && db.getChunkCount() === 0 && db.getVectorCount() === 0;
2864
+ if (isEmptyIndex) {
2865
+ db.setIndexEmbedder({
2866
+ provider: embedderConfig.provider,
2867
+ model: embedderConfig.model,
2868
+ dimensions: embedderConfig.dimensions
2869
+ });
2870
+ }
2871
+ }
2714
2872
  const discovered = await discoverFiles({
2715
2873
  root: absoluteRoot,
2716
2874
  extraIgnore: [".ctx/"]
@@ -2796,21 +2954,34 @@ async function runInit(projectPath, options = {}) {
2796
2954
  }
2797
2955
  log(` ${allChunksWithMeta.length} chunks created`);
2798
2956
  let vectorsCreated = 0;
2799
- if (!options.skipEmbedding && allChunksWithMeta.length > 0) {
2800
- const embedder = await createEmbedder(absoluteRoot);
2801
- const texts = allChunksWithMeta.map(
2802
- (cm) => prepareChunkText(cm.fileRelPath, cm.chunk.parent, cm.chunk.text)
2803
- );
2804
- const vectors = await embedder.embed(texts, (done, total) => {
2805
- log(` Embedding... ${done}/${total}`);
2806
- });
2807
- db.transaction(() => {
2808
- for (let i = 0; i < allChunksWithMeta.length; i++) {
2809
- const chunkDbId = parseInt(allChunksWithMeta[i].chunk.id, 10);
2810
- db.insertVector(chunkDbId, vectors[i]);
2957
+ if (!options.skipEmbedding) {
2958
+ const chunksMissingVectors = db.getChunksMissingVectors().map((chunk) => ({
2959
+ id: chunk.id,
2960
+ filePath: chunk.filePath,
2961
+ parent: chunk.parent,
2962
+ text: chunk.text
2963
+ }));
2964
+ if (chunksMissingVectors.length > 0) {
2965
+ log(` ${chunksMissingVectors.length} chunks need embeddings`);
2966
+ }
2967
+ if (chunksMissingVectors.length > 0) {
2968
+ const embedder = await createEmbedder(absoluteRoot);
2969
+ try {
2970
+ vectorsCreated = await embedAndPersistInBatches(
2971
+ db,
2972
+ embedder,
2973
+ chunksMissingVectors,
2974
+ log
2975
+ );
2976
+ } catch (err) {
2977
+ const total = chunksMissingVectors.length;
2978
+ throw new IndexError(
2979
+ `Embedding failed after saving ${vectorsCreated}/${total} vectors. Run "ctx init" again to resume. ${err instanceof Error ? err.message : String(err)}`,
2980
+ ErrorCode.EMBEDDER_FAILED,
2981
+ err instanceof Error ? err : void 0
2982
+ );
2811
2983
  }
2812
- });
2813
- vectorsCreated = vectors.length;
2984
+ }
2814
2985
  }
2815
2986
  const durationMs = performance.now() - start;
2816
2987
  const dbSize = fs6.existsSync(dbPath) ? fs6.statSync(dbPath).size : 0;
@@ -3302,15 +3473,54 @@ function formatStatus(projectPath, output) {
3302
3473
  lines.push(` ${label}${count} file${count !== 1 ? "s" : ""}`);
3303
3474
  }
3304
3475
  }
3305
- if (output.config) {
3306
- lines.push("");
3307
- lines.push(
3308
- ` Embedder: ${output.config.provider} (${output.config.model}, ${output.config.dimensions} dims)`
3309
- );
3476
+ const hasConfig = output.config !== null;
3477
+ const hasIndexEmbedder = output.indexEmbedder !== null;
3478
+ if (hasConfig || hasIndexEmbedder) lines.push("");
3479
+ if (hasConfig && hasIndexEmbedder) {
3480
+ const config = output.config;
3481
+ const indexEmbedder = output.indexEmbedder;
3482
+ if (!config || !indexEmbedder) {
3483
+ lines.push(" Embedder: unknown");
3484
+ } else if (isSameEmbedder(config, indexEmbedder)) {
3485
+ lines.push(
3486
+ ` Embedder: ${indexEmbedder.provider} (${indexEmbedder.model}, ${indexEmbedder.dimensions} dims)`
3487
+ );
3488
+ } else {
3489
+ lines.push(
3490
+ ` Index embedder: ${indexEmbedder.provider} (${indexEmbedder.model}, ${indexEmbedder.dimensions} dims)`
3491
+ );
3492
+ lines.push(
3493
+ ` Config embedder: ${config.provider} (${config.model}, ${config.dimensions} dims)`
3494
+ );
3495
+ }
3496
+ } else if (hasIndexEmbedder) {
3497
+ const indexEmbedder = output.indexEmbedder;
3498
+ if (!indexEmbedder) {
3499
+ lines.push(" Index embedder: unknown");
3500
+ } else {
3501
+ lines.push(
3502
+ ` Index embedder: ${indexEmbedder.provider} (${indexEmbedder.model}, ${indexEmbedder.dimensions} dims)`
3503
+ );
3504
+ }
3505
+ } else if (hasConfig) {
3506
+ const config = output.config;
3507
+ if (!config) {
3508
+ lines.push(" Config embedder: unknown");
3509
+ } else {
3510
+ lines.push(
3511
+ ` Config embedder: ${config.provider} (${config.model}, ${config.dimensions} dims)`
3512
+ );
3513
+ }
3514
+ }
3515
+ if (output.embedderWarning) {
3516
+ lines.push(` Warning: ${output.embedderWarning}`);
3310
3517
  }
3311
3518
  lines.push("");
3312
3519
  return lines.join("\n");
3313
3520
  }
3521
/**
 * Check whether two embedder descriptions refer to the same embedder.
 *
 * @param a object with `provider`, `model`, `dimensions`.
 * @param b object with `provider`, `model`, `dimensions`.
 * @returns true only when provider, model and dimensions all match strictly.
 */
function isSameEmbedder(a, b) {
  return (
    a.provider === b.provider &&
    a.model === b.model &&
    a.dimensions === b.dimensions
  );
}
3314
3524
  async function runStatus(projectPath) {
3315
3525
  const absoluteRoot = path8.resolve(projectPath);
3316
3526
  const ctxDir = path8.join(absoluteRoot, CTX_DIR5);
@@ -3325,6 +3535,8 @@ async function runStatus(projectPath) {
3325
3535
  lastIndexed: null,
3326
3536
  languages: /* @__PURE__ */ new Map(),
3327
3537
  config: null,
3538
+ indexEmbedder: null,
3539
+ embedderWarning: null,
3328
3540
  text: formatNotInitialized(absoluteRoot)
3329
3541
  };
3330
3542
  return output;
@@ -3337,7 +3549,9 @@ async function runStatus(projectPath) {
3337
3549
  const languages = db.getLanguageBreakdown();
3338
3550
  const lastIndexed = db.getLastIndexed();
3339
3551
  const config = readConfig2(ctxDir);
3552
+ const indexEmbedder = db.getIndexEmbedder();
3340
3553
  const dbSizeBytes = fs9.statSync(dbPath).size;
3554
+ const embedderWarning = config && indexEmbedder && !isSameEmbedder(config, indexEmbedder) ? `Index built with ${indexEmbedder.provider} (${indexEmbedder.dimensions} dims), config requests ${config.provider} (${config.dimensions} dims) \u2014 rebuild needed.` : null;
3341
3555
  const output = {
3342
3556
  initialized: true,
3343
3557
  fileCount,
@@ -3347,6 +3561,8 @@ async function runStatus(projectPath) {
3347
3561
  lastIndexed,
3348
3562
  languages,
3349
3563
  config,
3564
+ indexEmbedder,
3565
+ embedderWarning,
3350
3566
  text: ""
3351
3567
  };
3352
3568
  output.text = formatStatus(absoluteRoot, output);