@equinor/fusion-framework-cli-plugin-ai-index 2.0.1 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. package/CHANGELOG.md +66 -0
  2. package/dist/esm/bin/apply-metadata.js +15 -5
  3. package/dist/esm/bin/apply-metadata.js.map +1 -1
  4. package/dist/esm/bin/apply-schema.js +64 -0
  5. package/dist/esm/bin/apply-schema.js.map +1 -0
  6. package/dist/esm/bin/apply-schema.test.js +143 -0
  7. package/dist/esm/bin/apply-schema.test.js.map +1 -0
  8. package/dist/esm/bin/delete-removed-files.js +1 -1
  9. package/dist/esm/bin/delete-removed-files.js.map +1 -1
  10. package/dist/esm/bin/embed.js +265 -55
  11. package/dist/esm/bin/embed.js.map +1 -1
  12. package/dist/esm/bin/get-diff.js +5 -0
  13. package/dist/esm/bin/get-diff.js.map +1 -1
  14. package/dist/esm/create-command.js +186 -0
  15. package/dist/esm/create-command.js.map +1 -0
  16. package/dist/esm/delete-command.js +14 -2
  17. package/dist/esm/delete-command.js.map +1 -1
  18. package/dist/esm/delete-command.options.js +7 -31
  19. package/dist/esm/delete-command.options.js.map +1 -1
  20. package/dist/esm/delete-index-command.js +94 -0
  21. package/dist/esm/delete-index-command.js.map +1 -0
  22. package/dist/esm/embed-command.js +30 -0
  23. package/dist/esm/embed-command.js.map +1 -0
  24. package/dist/esm/embeddings-command.js +14 -17
  25. package/dist/esm/embeddings-command.js.map +1 -1
  26. package/dist/esm/embeddings-command.options.js +12 -43
  27. package/dist/esm/embeddings-command.options.js.map +1 -1
  28. package/dist/esm/index.js +12 -3
  29. package/dist/esm/index.js.map +1 -1
  30. package/dist/esm/schema.js +41 -0
  31. package/dist/esm/schema.js.map +1 -0
  32. package/dist/esm/search-command.js +17 -5
  33. package/dist/esm/search-command.js.map +1 -1
  34. package/dist/esm/utils/embedding-dimensions.js +37 -0
  35. package/dist/esm/utils/embedding-dimensions.js.map +1 -0
  36. package/dist/esm/utils/zod-to-azure-fields.js +120 -0
  37. package/dist/esm/utils/zod-to-azure-fields.js.map +1 -0
  38. package/dist/esm/utils/zod-to-azure-fields.test.js +112 -0
  39. package/dist/esm/utils/zod-to-azure-fields.test.js.map +1 -0
  40. package/dist/esm/version.js +1 -1
  41. package/dist/tsconfig.tsbuildinfo +1 -1
  42. package/dist/types/bin/apply-metadata.d.ts +2 -1
  43. package/dist/types/bin/apply-schema.d.ts +22 -0
  44. package/dist/types/bin/apply-schema.test.d.ts +1 -0
  45. package/dist/types/config.d.ts +14 -0
  46. package/dist/types/create-command.d.ts +6 -0
  47. package/dist/types/delete-command.options.d.ts +10 -23
  48. package/dist/types/delete-index-command.d.ts +6 -0
  49. package/dist/types/embed-command.d.ts +12 -0
  50. package/dist/types/embeddings-command.options.d.ts +10 -28
  51. package/dist/types/index.d.ts +1 -0
  52. package/dist/types/schema.d.ts +137 -0
  53. package/dist/types/utils/embedding-dimensions.d.ts +13 -0
  54. package/dist/types/utils/zod-to-azure-fields.d.ts +61 -0
  55. package/dist/types/utils/zod-to-azure-fields.test.d.ts +1 -0
  56. package/dist/types/version.d.ts +1 -1
  57. package/package.json +5 -5
  58. package/src/bin/apply-metadata.ts +20 -4
  59. package/src/bin/apply-schema.test.ts +170 -0
  60. package/src/bin/apply-schema.ts +86 -0
  61. package/src/bin/delete-removed-files.ts +1 -1
  62. package/src/bin/embed.ts +325 -77
  63. package/src/bin/get-diff.ts +5 -0
  64. package/src/config.ts +15 -0
  65. package/src/create-command.ts +218 -0
  66. package/src/delete-command.options.ts +7 -37
  67. package/src/delete-command.ts +19 -2
  68. package/src/delete-index-command.ts +121 -0
  69. package/src/embed-command.ts +44 -0
  70. package/src/embeddings-command.options.ts +12 -50
  71. package/src/embeddings-command.ts +18 -18
  72. package/src/index.ts +12 -3
  73. package/src/schema.ts +149 -0
  74. package/src/search-command.ts +22 -5
  75. package/src/utils/embedding-dimensions.ts +39 -0
  76. package/src/utils/zod-to-azure-fields.test.ts +136 -0
  77. package/src/utils/zod-to-azure-fields.ts +177 -0
  78. package/src/version.ts +1 -1
package/src/bin/embed.ts CHANGED
@@ -1,8 +1,19 @@
1
1
  import { globbyStream } from 'globby';
2
2
  import { relative } from 'node:path';
3
3
  import multimatch from 'multimatch';
4
- import { concat, from, merge, timer } from 'rxjs';
5
- import { concatMap, filter, map, mergeMap, retry, shareReplay, toArray } from 'rxjs/operators';
4
+ import { from, merge, timer } from 'rxjs';
5
+ import {
6
+ bufferCount,
7
+ bufferTime,
8
+ concatMap,
9
+ filter,
10
+ finalize,
11
+ map,
12
+ mergeMap,
13
+ retry,
14
+ shareReplay,
15
+ tap,
16
+ } from 'rxjs/operators';
6
17
 
7
18
  import { isMarkdownFile, parseMarkdownFile } from '../utils/markdown/index.js';
8
19
  import { getFileStatus, resolveProjectRoot } from '../utils/git/index.js';
@@ -11,6 +22,7 @@ import { isTypescriptFile, parseTsDocFromFileSync } from '../utils/ts-doc/index.
11
22
  import { getDiff } from './get-diff.js';
12
23
  import { createDeleteRemovedFilesStream } from './delete-removed-files.js';
13
24
  import { applyMetadata } from './apply-metadata.js';
25
+ import { applySchema } from './apply-schema.js';
14
26
  import type {
15
27
  DocumentEntry,
16
28
  EmbeddingsBinOptions,
@@ -21,6 +33,121 @@ import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/li
21
33
  import { readFileSync } from 'node:fs';
22
34
  import { generateChunkId } from '../utils/generate-chunk-id.js';
23
35
 
36
+ /** Braille spinner frames (same as ora's default). */
37
+ const SPINNER_FRAMES = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
38
+
39
+ /** Whether the process is running in a non-interactive environment (CI). */
40
+ const IS_CI = !process.stdout.isTTY || Boolean(process.env.CI);
41
+
42
+ /**
43
+ * Manages a fixed block of sticky progress lines with per-line spinners.
44
+ * Each line can be updated independently without overwriting the others.
45
+ *
46
+ * In non-interactive environments (CI) the ANSI cursor-movement dance is
47
+ * replaced with simple `console.log` lines so the output is readable in
48
+ * plain-text log viewers.
49
+ * @internal
50
+ */
51
+ class ProgressDisplay {
52
+ private lines: string[] = [];
53
+ private spinning: boolean[] = [];
54
+ private started = false;
55
+ private frame = 0;
56
+ private timer: ReturnType<typeof setInterval> | undefined;
57
+
58
+ /** Tracks last CI log time per line to throttle output. */
59
+ private lastCiLog: number[] = [];
60
+
61
+ /** Minimum interval (ms) between CI progress lines for the same line slot. */
62
+ private static CI_LOG_INTERVAL_MS = 15_000;
63
+
64
+ /** Register the line labels up front and print empty placeholders. */
65
+ start(count: number): void {
66
+ this.lines = new Array<string>(count).fill('');
67
+ this.spinning = new Array<boolean>(count).fill(false);
68
+ this.lastCiLog = new Array<number>(count).fill(0);
69
+
70
+ if (!IS_CI) {
71
+ // Print placeholder lines so the cursor block exists
72
+ for (let i = 0; i < count; i++) {
73
+ process.stdout.write('\n');
74
+ }
75
+ // Tick spinner at 80ms (same cadence as ora)
76
+ this.timer = setInterval(() => this.tick(), 80);
77
+ }
78
+ this.started = true;
79
+ }
80
+
81
+ /** Update a specific line (0-indexed) without touching the others. */
82
+ update(line: number, message: string): void {
83
+ if (!this.started) return;
84
+ this.lines[line] = message;
85
+ this.spinning[line] = true;
86
+ if (IS_CI) {
87
+ const now = Date.now();
88
+ if (now - this.lastCiLog[line] >= ProgressDisplay.CI_LOG_INTERVAL_MS) {
89
+ this.lastCiLog[line] = now;
90
+ console.log(`⏳ ${message}`);
91
+ }
92
+ return;
93
+ }
94
+ this.render(line);
95
+ }
96
+
97
+ /** Mark a line as completed — stops its spinner and shows a checkmark. */
98
+ succeed(line: number, message: string): void {
99
+ if (!this.started) return;
100
+ const text = `✅ ${message}`;
101
+ this.lines[line] = text;
102
+ this.spinning[line] = false;
103
+ if (IS_CI) {
104
+ console.log(text);
105
+ return;
106
+ }
107
+ this.render(line);
108
+ }
109
+
110
+ /** Clear all progress lines and leave the cursor on a clean line. */
111
+ clear(): void {
112
+ if (!this.started) return;
113
+ if (this.timer) clearInterval(this.timer);
114
+ if (!IS_CI) {
115
+ // Move up to the first progress line and clear each one
116
+ for (let i = 0; i < this.lines.length; i++) {
117
+ const linesUp = this.lines.length - i;
118
+ process.stdout.write(`\x1b[${linesUp}A\x1b[2K\r\x1b[${linesUp}B\r`);
119
+ }
120
+ // Move cursor up past the now-empty block
121
+ process.stdout.write(`\x1b[${this.lines.length}A\r`);
122
+ }
123
+ this.started = false;
124
+ }
125
+
126
+ /** Advance the spinner frame and re-render all spinning lines. */
127
+ private tick(): void {
128
+ this.frame = (this.frame + 1) % SPINNER_FRAMES.length;
129
+ for (let i = 0; i < this.lines.length; i++) {
130
+ if (this.spinning[i] && this.lines[i]) {
131
+ this.render(i);
132
+ }
133
+ }
134
+ }
135
+
136
+ /** Render a single line at its position. */
137
+ private render(line: number): void {
138
+ const linesUp = this.lines.length - line;
139
+ const prefix = this.spinning[line] ? SPINNER_FRAMES[this.frame] : '';
140
+ const text = this.spinning[line] ? `${prefix} ${this.lines[line]}` : this.lines[line];
141
+ process.stdout.write(`\x1b[${linesUp}A\x1b[2K\r${text}\x1b[${linesUp}B\r`);
142
+ }
143
+ }
144
+
145
+ /** Progress line indices */
146
+ const LINE_PARSE = 0;
147
+ const LINE_META = 1;
148
+ const LINE_EMBED = 2;
149
+ const LINE_INDEX = 3;
150
+
24
151
  /**
25
152
  * Default directories to skip before expensive git operations.
26
153
  * These are common build artifacts and dependencies that should be ignored.
@@ -28,6 +155,41 @@ import { generateChunkId } from '../utils/generate-chunk-id.js';
28
155
  */
29
156
  const defaultIgnore = ['node_modules', '**/node_modules/**', 'dist', '**/dist/**', '.git'];
30
157
 
158
+ /** Concurrency limit for git subprocess operations (status, log, etc.). */
159
+ const GIT_CONCURRENCY = 20;
160
+
161
+ /** Maximum parallel upsert requests to the vector store. */
162
+ const UPSERT_CONCURRENCY = 10;
163
+
164
+ /**
165
+ * Number of texts to embed per API request.
166
+ *
167
+ * Azure OpenAI accepts up to 2 048 inputs per embedding call.
168
+ * LangChain's `batchSize` default is 1 (!) so we also set it on the
169
+ * client constructor. This outer batch controls how many documents
170
+ * are grouped before handing them to the embed client.
171
+ */
172
+ const EMBED_BATCH_SIZE = 500;
173
+
174
+ /**
175
+ * Number of concurrent embedding API requests in flight.
176
+ *
177
+ * Each request now carries EMBED_BATCH_SIZE texts in a single HTTP call
178
+ * (LangChain batchSize is aligned), so 2 concurrent requests already
179
+ * saturate most Azure OpenAI TPM quotas.
180
+ */
181
+ const EMBED_BATCH_CONCURRENCY = 2;
182
+
183
+ /**
184
+ * Maximum time (ms) to wait before flushing a partial embedding batch.
185
+ * Without this, `bufferCount` waits indefinitely for a full batch, which
186
+ * starves `mergeMap` concurrency when upstream document throughput is slow.
187
+ */
188
+ const EMBED_BUFFER_FLUSH_MS = 500;
189
+
190
+ /** Maximum retry attempts for transient / rate-limit errors per chunk. */
191
+ const MAX_RETRIES = 4;
192
+
31
193
  /**
32
194
  * Main entry point for the embeddings bin.
33
195
  * Orchestrates the entire embeddings generation pipeline.
@@ -35,11 +197,28 @@ const defaultIgnore = ['node_modules', '**/node_modules/**', 'dist', '**/dist/**
35
197
  */
36
198
  export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
37
199
  const { framework, options, config, filePatterns } = binOptions;
200
+ const debug = options.debug ?? false;
38
201
 
39
- console.log(`📇 Index: ${options.azureSearchIndexName}`);
202
+ console.log(`📇 Index: ${options.indexName}`);
203
+
204
+ if (debug) {
205
+ console.debug('[debug] Embed model:', options.embedModel);
206
+ console.debug('[debug] File patterns:', filePatterns);
207
+ console.debug(
208
+ '[debug] Allowed patterns:',
209
+ config.index?.patterns ?? ['**/*.ts', '**/*.tsx', '**/*.md', '**/*.mdx'],
210
+ );
211
+ console.debug('[debug] Raw patterns:', config.index?.rawPatterns ?? []);
212
+ console.debug('[debug] Ignore patterns:', config.index?.ignore ?? defaultIgnore);
213
+ console.debug('[debug] Diff mode:', options.diff);
214
+ console.debug('[debug] Dry run:', options.dryRun);
215
+ console.debug('[debug] Clean:', options.clean);
216
+ }
217
+
218
+ const progress = new ProgressDisplay();
40
219
 
41
220
  // Handle clean operation (destructive - deletes all existing documents)
42
- const vectorStoreService = framework.ai.getService('search', options.azureSearchIndexName);
221
+ const vectorStoreService = framework.ai.useIndex(options.indexName);
43
222
  if (options.clean && !options.dryRun) {
44
223
  console.log('🧹 Cleaning vector store: deleting all existing documents...');
45
224
  // OData filter: delete all documents with non-empty source (all indexed docs)
@@ -75,8 +254,8 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
75
254
  absolute: true,
76
255
  }),
77
256
  ).pipe(
78
- // Get git status concurrently, then flatten array results
79
- mergeMap((path) => getFileStatus(path)),
257
+ // Get git status concurrently (capped to avoid spawning too many git processes)
258
+ mergeMap((path) => getFileStatus(path), GIT_CONCURRENCY),
80
259
  concatMap((files) => from(files)),
81
260
  // Share stream for multiple subscribers (removedFiles$ and indexFiles$)
82
261
  shareReplay({ refCount: true }),
@@ -92,6 +271,7 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
92
271
  ];
93
272
 
94
273
  // Process files: enrich with metadata and filter by allowed patterns
274
+ let fileCount = 0;
95
275
  const processedFiles$ = files$.pipe(
96
276
  map((file) => {
97
277
  const { filepath, status } = file;
@@ -107,8 +287,16 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
107
287
  }),
108
288
  filter((file) => {
109
289
  const matches = multimatch(file.relativePath, allowedFilePatterns);
290
+ if (debug && matches.length === 0) {
291
+ console.debug('[debug] Skipped (no pattern match):', file.relativePath);
292
+ }
110
293
  return matches.length > 0;
111
294
  }),
295
+ tap((file) => {
296
+ fileCount++;
297
+ const label = file.status === 'removed' ? '🗑️' : '📄';
298
+ progress.update(LINE_PARSE, `${label} Parsing [${fileCount}] ${file.relativePath}`);
299
+ }),
112
300
  // Share for multiple subscribers (removedFiles$, markdown$, typescript$)
113
301
  shareReplay({ refCount: true }),
114
302
  );
@@ -134,6 +322,7 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
134
322
  return false;
135
323
  };
136
324
 
325
+ let docCount = 0;
137
326
  const rawFiles$ = indexFiles$.pipe(
138
327
  filter(isRawFile),
139
328
  map((file): DocumentEntry => {
@@ -145,6 +334,8 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
145
334
  type: 'raw',
146
335
  },
147
336
  };
337
+ docCount++;
338
+ progress.update(LINE_PARSE, `📄 Parsing [${docCount}] ${file.relativePath}`);
148
339
  return { status: file.status, documents: [document] };
149
340
  }),
150
341
  );
@@ -154,6 +345,11 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
154
345
  filter((file) => isMarkdownFile(file.path)),
155
346
  mergeMap(async (file) => {
156
347
  const documents = await parseMarkdownFile(file);
348
+ docCount++;
349
+ if (debug) {
350
+ console.debug(`[debug] Markdown ${file.relativePath} → ${documents.length} chunk(s)`);
351
+ }
352
+ progress.update(LINE_PARSE, `📄 Parsing [${docCount}] ${file.relativePath}`);
157
353
  return { status: file.status, documents };
158
354
  }),
159
355
  );
@@ -163,84 +359,120 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
163
359
  filter((file) => isTypescriptFile(file.path)),
164
360
  map((file) => {
165
361
  const documents = parseTsDocFromFileSync(file);
362
+ docCount++;
363
+ if (debug) {
364
+ console.debug(`[debug] TypeScript ${file.relativePath} → ${documents.length} chunk(s)`);
365
+ }
366
+ progress.update(LINE_PARSE, `📄 Parsing [${docCount}] ${file.relativePath}`);
166
367
  return { status: file.status, documents };
167
368
  }),
168
369
  );
169
370
 
371
+ // Merge parsed streams and signal when all parsing is done
372
+ const parsed$ = merge(rawFiles$, markdown$, typescript$).pipe(
373
+ finalize(() => {
374
+ progress.succeed(LINE_PARSE, `📄 Parsed ${docCount} files`);
375
+ }),
376
+ );
377
+
170
378
  // Apply metadata to documents
171
- const applyMetadata$ = applyMetadata(merge(rawFiles$, markdown$, typescript$), config.index);
172
-
173
- // Generate embeddings with concurrency limit and retry on rate-limit (429) errors
174
- const embeddingService = framework.ai.getService('embeddings', options.openaiEmbeddingDeployment);
175
-
176
- /** Maximum parallel embedding requests to avoid hitting Azure OpenAI TPM limits. */
177
- const EMBEDDING_CONCURRENCY = 5;
178
-
179
- /** Maximum retry attempts for transient / rate-limit errors per chunk. */
180
- const MAX_RETRIES = 4;
181
-
182
- const applyEmbedding$ = applyMetadata$.pipe(
183
- mergeMap((documents) =>
184
- from(documents).pipe(
185
- // Limit concurrency to avoid overwhelming the embedding API
186
- mergeMap(
187
- (document) =>
188
- from(embeddingService.embedQuery(document.pageContent)).pipe(
189
- retry({
190
- count: MAX_RETRIES,
191
- delay: (error, retryIndex) => {
192
- // Parse Retry-After header when available (Azure sends seconds)
193
- const retryAfterSec =
194
- error?.response?.headers?.get?.('retry-after') ??
195
- error?.responseHeaders?.['retry-after'];
196
- const retryAfterMs = retryAfterSec ? Number(retryAfterSec) * 1000 : 0;
197
-
198
- // Exponential backoff: 2s, 4s, 8s, 16s — or Retry-After if larger
199
- const backoffMs = 2 ** retryIndex * 1000;
200
- const delayMs = Math.max(backoffMs, retryAfterMs);
201
-
202
- console.warn(
203
- `⏳ Retry ${retryIndex}/${MAX_RETRIES} for "${document.metadata.source}" in ${delayMs}ms`,
204
- );
205
- return timer(delayMs);
206
- },
207
- }),
208
- map((embeddings) => {
209
- console.log('embedding document', document.metadata.source);
210
- const metadata = { ...document.metadata, embedding: embeddings };
211
- return { ...document, metadata };
212
- }),
213
- ),
214
- EMBEDDING_CONCURRENCY,
215
- ),
216
- toArray(),
217
- ),
218
- ),
379
+ let metadataCount = 0;
380
+ let metadataDone = false;
381
+ const applyMetadata$ = applyMetadata(parsed$, config.index, (source) => {
382
+ metadataCount++;
383
+ progress.update(LINE_META, `🏷️ Metadata [${metadataCount}] ${source}`);
384
+ }).pipe(
385
+ finalize(() => {
386
+ metadataDone = true;
387
+ progress.succeed(LINE_META, `🏷️ Metadata ${metadataCount} documents`);
388
+ }),
219
389
  );
220
390
 
221
- // Update vector store
391
+ // Resolve promoted schema fields (if schema is configured) — runs after
392
+ // metadata enrichment so the resolver has access to git, package, and
393
+ // custom attributes from attributeProcessor
394
+ const applySchema$ = applySchema(applyMetadata$, config.index?.schema);
395
+
396
+ // Generate embeddings in batches with retry on rate-limit (429) errors
397
+ const embeddingService = framework.ai.useEmbed(options.embedModel);
398
+
399
+ let embeddedCount = 0;
400
+ let embeddingDone = false;
401
+ const applyEmbedding$ = applySchema$.pipe(
402
+ // Flatten all documents from file-level batches, then re-batch for the API
403
+ concatMap((documents) => from(documents)),
404
+ // Flush when EMBED_BATCH_SIZE docs accumulate OR after EMBED_BUFFER_FLUSH_MS,
405
+ // whichever comes first — prevents upstream starvation from blocking concurrency
406
+ bufferTime(EMBED_BUFFER_FLUSH_MS, null, EMBED_BATCH_SIZE),
407
+ filter((batch) => batch.length > 0),
408
+ mergeMap((batch) => {
409
+ if (debug) {
410
+ console.debug(`[debug] Embedding batch of ${batch.length} documents`);
411
+ }
412
+ return from(embeddingService.embedDocuments(batch.map((d) => d.pageContent))).pipe(
413
+ retry({
414
+ count: MAX_RETRIES,
415
+ delay: (error, retryIndex) => {
416
+ // Auth errors are terminal — abort immediately with actionable message
417
+ if (error?.name === 'NoAccountsError') {
418
+ console.error(
419
+ '\n🔒 Authentication expired. Run `ffc auth login` then retry with `--diff`.',
420
+ );
421
+ throw error;
422
+ }
423
+
424
+ const retryAfterSec =
425
+ error?.response?.headers?.get?.('retry-after') ??
426
+ error?.responseHeaders?.['retry-after'];
427
+ const retryAfterMs = retryAfterSec ? Number(retryAfterSec) * 1000 : 0;
428
+
429
+ const backoffMs = 2 ** retryIndex * 1000;
430
+ const delayMs = Math.max(backoffMs, retryAfterMs);
431
+
432
+ console.warn(
433
+ `\n⏳ Retry ${retryIndex}/${MAX_RETRIES} for batch of ${batch.length} in ${delayMs}ms`,
434
+ );
435
+ return timer(delayMs);
436
+ },
437
+ }),
438
+ map((allEmbeddings) => {
439
+ return batch.map((document, i) => {
440
+ embeddedCount++;
441
+ const total = metadataDone ? metadataCount : 0;
442
+ const pct = total > 0 ? ` ${Math.round((embeddedCount / total) * 100)}%` : '';
443
+ const denominator = total > 0 ? `/${total}` : '';
444
+ progress.update(
445
+ LINE_EMBED,
446
+ `🧠 Embedding [${embeddedCount}${denominator}]${pct} — ${document.metadata.source}`,
447
+ );
448
+ const metadata = { ...document.metadata, embedding: allEmbeddings[i] };
449
+ return { ...document, metadata };
450
+ });
451
+ }),
452
+ );
453
+ }, EMBED_BATCH_CONCURRENCY),
454
+ finalize(() => {
455
+ embeddingDone = true;
456
+ progress.succeed(LINE_EMBED, `🧠 Embedded ${embeddedCount} documents`);
457
+ }),
458
+ );
459
+
460
+ // Update vector store — batch documents and upsert concurrently
222
461
  const upsert$ = applyEmbedding$.pipe(
462
+ // Flatten file-level batches, then re-batch into groups of 20 for bulk upsert
463
+ concatMap((documents) => from(documents)),
464
+ bufferCount(20),
223
465
  mergeMap(async (documents) => {
224
- const vectorStoreService = framework.ai.getService('search', options.azureSearchIndexName);
466
+ const vectorStoreService = framework.ai.useIndex(options.indexName);
225
467
  if (documents.length === 0) {
226
468
  return undefined;
227
469
  }
228
- for (const document of documents) {
229
- console.log(`Adding entry [${document.id}] to vector store`, document.metadata.source);
230
- }
231
470
  if (!options.dryRun) {
232
- // For multiple chunks from same file, delete existing chunks first
233
- if (documents.length > 1) {
234
- const sources = documents
235
- .map((document) => document.metadata.source)
236
- .reduce((acc, source) => acc.add(source), new Set<string>());
237
-
238
- const filterExpression = Array.from(sources)
239
- .map((source) => `metadata/source eq '${source}'`)
240
- .join(' or ');
241
-
242
- // Fire-and-forget deletion (not awaited) - brief gap before new docs are indexed
243
- vectorStoreService.deleteDocuments({ filter: { filterExpression } });
471
+ if (debug) {
472
+ console.debug(
473
+ `[debug] Upserting batch of ${documents.length} documents:`,
474
+ documents.map((d) => d.id),
475
+ );
244
476
  }
245
477
  await vectorStoreService.addDocuments(documents);
246
478
  }
@@ -248,20 +480,23 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
248
480
  status: 'added',
249
481
  documents,
250
482
  } as UpdateVectorStoreResult;
251
- }),
483
+ }, UPSERT_CONCURRENCY),
252
484
  filter((result): result is UpdateVectorStoreResult => Boolean(result)),
253
485
  );
254
486
 
255
487
  // Execute pipeline
256
488
  // Track indexing results for reporting: deleted file paths and added document IDs
489
+ let indexedCount = 0;
257
490
  const indexingResults: { deleted: string[]; added: { source: string; id: string }[] } = {
258
491
  deleted: [],
259
492
  added: [],
260
493
  };
261
494
 
262
- // Execute pipeline: concat ensures deletions happen before additions
263
- // This subscription triggers lazy RxJS execution and tracks all results
264
- concat(delete$, upsert$).subscribe({
495
+ // Execute pipeline: merge runs deletions and additions concurrently so
496
+ // the embedding pipeline can start as soon as metadata-enriched documents
497
+ // are available, without waiting for all file discovery to complete.
498
+ progress.start(4);
499
+ merge(delete$, upsert$).subscribe({
265
500
  next: (result) => {
266
501
  // Track deleted files by relative path
267
502
  if (result.status === 'deleted') {
@@ -275,6 +510,12 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
275
510
  id: document.id,
276
511
  })),
277
512
  );
513
+ indexedCount += result.documents.length;
514
+ // Use embeddedCount as denominator — only show % once embedding is done
515
+ const total = embeddingDone ? embeddedCount : 0;
516
+ const pct = total > 0 ? ` ${Math.round((indexedCount / total) * 100)}%` : '';
517
+ const denominator = total > 0 ? `/${total}` : '';
518
+ progress.update(LINE_INDEX, `📤 Indexed [${indexedCount}${denominator}]${pct}`);
278
519
  }
279
520
  },
280
521
  error: (error) => {
@@ -282,8 +523,15 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
282
523
  process.exit(1);
283
524
  },
284
525
  complete: () => {
285
- // Pipeline completed - log results and exit
286
- console.log('🗂️ Indexing results:', indexingResults);
526
+ // Clear the progress block before final output
527
+ progress.clear();
528
+ // Pipeline completed - log summary
529
+ if (indexingResults.deleted.length > 0) {
530
+ console.log(`🗑️ Deleted: ${indexingResults.deleted.length} files`);
531
+ }
532
+ if (indexingResults.added.length > 0) {
533
+ console.log(`📥 Indexed: ${indexingResults.added.length} documents`);
534
+ }
287
535
  console.log('✅ Embeddings generation completed!');
288
536
  process.exit(0);
289
537
  },
@@ -25,6 +25,11 @@ export async function getDiff(options: CommandOptions): Promise<ChangedFile[]> {
25
25
  }
26
26
 
27
27
  console.log(`📝 Found ${changedFiles.length} changed files matching patterns`);
28
+ if (options.debug) {
29
+ for (const file of changedFiles) {
30
+ console.debug(`[debug] ${file.status}: ${file.filepath}`);
31
+ }
32
+ }
28
33
  return changedFiles;
29
34
  } catch (error) {
30
35
  console.error(`❌ Git diff error: ${error instanceof Error ? error.message : 'Unknown error'}`);
package/src/config.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
2
2
  import type { FusionAIConfig } from '@equinor/fusion-framework-cli-plugin-ai-base';
3
+ import type { IndexSchemaConfig } from './schema.js';
3
4
 
4
5
  /**
5
6
  * Index-specific configuration for Fusion AI document indexing operations.
@@ -55,7 +56,21 @@ export interface IndexConfig {
55
56
  chunkSize?: number;
56
57
  /** Number of overlapping tokens between consecutive chunks. */
57
58
  chunkOverlap?: number;
59
+ /** Explicit vector dimensions for custom embedding models not in the known model map. */
60
+ dimensions?: number;
58
61
  };
62
+
63
+ /**
64
+ * Custom index schema that promotes frequently-filtered metadata to
65
+ * top-level Azure AI Search fields.
66
+ *
67
+ * When defined, the schema resolver runs after metadata enrichment and
68
+ * places resolved values as top-level document fields in Azure Search,
69
+ * enabling direct OData filters without the `any()` operator.
70
+ *
71
+ * @see {@link IndexSchemaConfig} for details and examples.
72
+ */
73
+ schema?: IndexSchemaConfig;
59
74
  }
60
75
 
61
76
  /**