@equinor/fusion-framework-cli-plugin-ai-index 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CHANGELOG.md +52 -0
  2. package/dist/esm/bin/apply-metadata.js +15 -5
  3. package/dist/esm/bin/apply-metadata.js.map +1 -1
  4. package/dist/esm/bin/apply-schema.js +64 -0
  5. package/dist/esm/bin/apply-schema.js.map +1 -0
  6. package/dist/esm/bin/apply-schema.test.js +143 -0
  7. package/dist/esm/bin/apply-schema.test.js.map +1 -0
  8. package/dist/esm/bin/delete-removed-files.js +1 -1
  9. package/dist/esm/bin/delete-removed-files.js.map +1 -1
  10. package/dist/esm/bin/embed.js +188 -47
  11. package/dist/esm/bin/embed.js.map +1 -1
  12. package/dist/esm/create-command.js +186 -0
  13. package/dist/esm/create-command.js.map +1 -0
  14. package/dist/esm/delete-command.js +14 -2
  15. package/dist/esm/delete-command.js.map +1 -1
  16. package/dist/esm/delete-command.options.js +7 -31
  17. package/dist/esm/delete-command.options.js.map +1 -1
  18. package/dist/esm/delete-index-command.js +94 -0
  19. package/dist/esm/delete-index-command.js.map +1 -0
  20. package/dist/esm/embed-command.js +30 -0
  21. package/dist/esm/embed-command.js.map +1 -0
  22. package/dist/esm/embeddings-command.js +14 -17
  23. package/dist/esm/embeddings-command.js.map +1 -1
  24. package/dist/esm/embeddings-command.options.js +12 -43
  25. package/dist/esm/embeddings-command.options.js.map +1 -1
  26. package/dist/esm/index.js +12 -3
  27. package/dist/esm/index.js.map +1 -1
  28. package/dist/esm/schema.js +41 -0
  29. package/dist/esm/schema.js.map +1 -0
  30. package/dist/esm/search-command.js +17 -5
  31. package/dist/esm/search-command.js.map +1 -1
  32. package/dist/esm/utils/embedding-dimensions.js +37 -0
  33. package/dist/esm/utils/embedding-dimensions.js.map +1 -0
  34. package/dist/esm/utils/zod-to-azure-fields.js +120 -0
  35. package/dist/esm/utils/zod-to-azure-fields.js.map +1 -0
  36. package/dist/esm/utils/zod-to-azure-fields.test.js +112 -0
  37. package/dist/esm/utils/zod-to-azure-fields.test.js.map +1 -0
  38. package/dist/esm/version.js +1 -1
  39. package/dist/tsconfig.tsbuildinfo +1 -1
  40. package/dist/types/bin/apply-metadata.d.ts +2 -1
  41. package/dist/types/bin/apply-schema.d.ts +22 -0
  42. package/dist/types/bin/apply-schema.test.d.ts +1 -0
  43. package/dist/types/config.d.ts +14 -0
  44. package/dist/types/create-command.d.ts +6 -0
  45. package/dist/types/delete-command.options.d.ts +9 -23
  46. package/dist/types/delete-index-command.d.ts +6 -0
  47. package/dist/types/embed-command.d.ts +12 -0
  48. package/dist/types/embeddings-command.options.d.ts +9 -28
  49. package/dist/types/index.d.ts +1 -0
  50. package/dist/types/schema.d.ts +137 -0
  51. package/dist/types/utils/embedding-dimensions.d.ts +13 -0
  52. package/dist/types/utils/zod-to-azure-fields.d.ts +61 -0
  53. package/dist/types/utils/zod-to-azure-fields.test.d.ts +1 -0
  54. package/dist/types/version.d.ts +1 -1
  55. package/package.json +6 -6
  56. package/src/bin/apply-metadata.ts +20 -4
  57. package/src/bin/apply-schema.test.ts +170 -0
  58. package/src/bin/apply-schema.ts +86 -0
  59. package/src/bin/delete-removed-files.ts +1 -1
  60. package/src/bin/embed.ts +248 -76
  61. package/src/config.ts +15 -0
  62. package/src/create-command.ts +218 -0
  63. package/src/delete-command.options.ts +7 -37
  64. package/src/delete-command.ts +19 -2
  65. package/src/delete-index-command.ts +121 -0
  66. package/src/embed-command.ts +44 -0
  67. package/src/embeddings-command.options.ts +12 -50
  68. package/src/embeddings-command.ts +18 -18
  69. package/src/index.ts +12 -3
  70. package/src/schema.ts +149 -0
  71. package/src/search-command.ts +22 -5
  72. package/src/utils/embedding-dimensions.ts +39 -0
  73. package/src/utils/zod-to-azure-fields.test.ts +136 -0
  74. package/src/utils/zod-to-azure-fields.ts +177 -0
  75. package/src/version.ts +1 -1
@@ -0,0 +1,86 @@
1
+ import { map } from 'rxjs';
2
+ import type { Observable } from 'rxjs';
3
+ import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
4
+ import type { IndexSchemaConfig } from '../schema.js';
5
+
6
+ /**
7
+ * Creates an RxJS operator that resolves promoted schema fields for each
8
+ * document and separates them from the generic `attributes` bag.
9
+ *
10
+ * For each document in the batch:
11
+ * 1. Runs the optional `prepareAttributes` callback to enrich attributes
12
+ * with type-safe access to schema-declared fields
13
+ * 2. Calls the schema resolver to compute promoted field values
14
+ * 3. Validates the resolved values against the Zod shape
15
+ * 4. Stores promoted fields on `metadata.schemaFields`
16
+ * 5. Removes promoted keys from `metadata.attributes` to avoid duplication
17
+ *
18
+ * When no schema is configured, the stream passes through unchanged.
19
+ *
20
+ * @param document$ - Stream of document batches from the metadata enrichment step.
21
+ * @param schema - The index schema config, if defined. When `undefined`, documents pass through unchanged.
22
+ * @returns Stream of document batches with promoted fields resolved and stored.
23
+ */
24
+ export function applySchema(
25
+ document$: Observable<VectorStoreDocument[]>,
26
+ schema: IndexSchemaConfig | undefined,
27
+ ): Observable<VectorStoreDocument[]> {
28
+ // No schema configured — pass through unchanged (backward compatible)
29
+ if (!schema) {
30
+ return document$;
31
+ }
32
+
33
+ const promotedKeys = new Set(Object.keys(schema.shape.shape as Record<string, unknown>));
34
+
35
+ return document$.pipe(
36
+ map((documents) =>
37
+ documents.map((document) => {
38
+ // Run typed attribute processor before schema resolution so the
39
+ // resolver receives fully enriched attributes
40
+ let enrichedDocument = document;
41
+ if (schema.prepareAttributes) {
42
+ const enrichedAttributes = schema.prepareAttributes(
43
+ (document.metadata.attributes ?? {}) as Record<string, unknown>,
44
+ document,
45
+ );
46
+ enrichedDocument = {
47
+ ...document,
48
+ metadata: {
49
+ ...document.metadata,
50
+ attributes: enrichedAttributes as Record<string, unknown>,
51
+ },
52
+ };
53
+ }
54
+
55
+ // Resolve promoted field values from the fully enriched document
56
+ const resolved = schema.resolve(enrichedDocument);
57
+
58
+ // Validate against the Zod shape — throws on invalid data with
59
+ // a clear error message pointing to the offending field
60
+ const validated = schema.shape.parse(resolved) as Record<string, unknown>;
61
+
62
+ // Remove promoted keys from attributes to avoid storing them
63
+ // in both top-level fields and the generic attributes array
64
+ const currentAttributes = (enrichedDocument.metadata.attributes ?? {}) as Record<
65
+ string,
66
+ unknown
67
+ >;
68
+ const remainingAttributes: Record<string, unknown> = {};
69
+ for (const [key, value] of Object.entries(currentAttributes)) {
70
+ if (!promotedKeys.has(key)) {
71
+ remainingAttributes[key] = value;
72
+ }
73
+ }
74
+
75
+ return {
76
+ ...enrichedDocument,
77
+ metadata: {
78
+ ...enrichedDocument.metadata,
79
+ attributes: remainingAttributes,
80
+ schemaFields: validated,
81
+ },
82
+ };
83
+ }),
84
+ ),
85
+ );
86
+ }
@@ -33,7 +33,7 @@ export function createDeleteRemovedFilesStream(
33
33
  console.log('Removing entry from vector store', file.relativePath);
34
34
  }
35
35
  if (!options.dryRun) {
36
- const vectorStoreService = framework.ai.getService('search', options.azureSearchIndexName);
36
+ const vectorStoreService = framework.ai.useIndex(options.indexName);
37
37
  // Single batch deletion - one file can produce multiple document chunks
38
38
  await vectorStoreService.deleteDocuments({
39
39
  filter: { filterExpression: filterExpression ?? undefined },
package/src/bin/embed.ts CHANGED
@@ -1,8 +1,19 @@
1
1
  import { globbyStream } from 'globby';
2
2
  import { relative } from 'node:path';
3
3
  import multimatch from 'multimatch';
4
- import { concat, from, merge, timer } from 'rxjs';
5
- import { concatMap, filter, map, mergeMap, retry, shareReplay, toArray } from 'rxjs/operators';
4
+ import { from, merge, timer } from 'rxjs';
5
+ import {
6
+ bufferCount,
7
+ bufferTime,
8
+ concatMap,
9
+ filter,
10
+ finalize,
11
+ map,
12
+ mergeMap,
13
+ retry,
14
+ shareReplay,
15
+ tap,
16
+ } from 'rxjs/operators';
6
17
 
7
18
  import { isMarkdownFile, parseMarkdownFile } from '../utils/markdown/index.js';
8
19
  import { getFileStatus, resolveProjectRoot } from '../utils/git/index.js';
@@ -11,6 +22,7 @@ import { isTypescriptFile, parseTsDocFromFileSync } from '../utils/ts-doc/index.
11
22
  import { getDiff } from './get-diff.js';
12
23
  import { createDeleteRemovedFilesStream } from './delete-removed-files.js';
13
24
  import { applyMetadata } from './apply-metadata.js';
25
+ import { applySchema } from './apply-schema.js';
14
26
  import type {
15
27
  DocumentEntry,
16
28
  EmbeddingsBinOptions,
@@ -21,6 +33,89 @@ import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/li
21
33
  import { readFileSync } from 'node:fs';
22
34
  import { generateChunkId } from '../utils/generate-chunk-id.js';
23
35
 
36
/** Braille spinner frames (same as ora's default). */
const SPINNER_FRAMES = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];

/**
 * Manages a fixed block of sticky progress lines with per-line spinners.
 * Each line can be updated independently without overwriting the others.
 *
 * Rendering uses raw ANSI cursor-movement escapes and assumes the cursor
 * rests on the line immediately below the block between writes (the
 * position established by the placeholder newlines printed in `start()`).
 * NOTE(review): escape sequences are written unconditionally — on a
 * non-TTY stream (e.g. CI logs) they will appear verbatim; consider
 * guarding on `process.stdout.isTTY`.
 * @internal
 */
class ProgressDisplay {
  // Current message text per progress line ('' = nothing rendered yet).
  private lines: string[] = [];
  // Whether each line's spinner is animating (true) or settled (false).
  private spinning: boolean[] = [];
  // Guards update/succeed/clear against use before start().
  private started = false;
  // Index into SPINNER_FRAMES, advanced every tick.
  private frame = 0;
  // Interval handle driving the spinner animation; cleared in clear().
  private timer: ReturnType<typeof setInterval> | undefined;

  /** Register the line labels up front and print empty placeholders. */
  start(count: number): void {
    this.lines = new Array<string>(count).fill('');
    this.spinning = new Array<boolean>(count).fill(false);
    // Print placeholder lines so the cursor block exists
    for (let i = 0; i < count; i++) {
      process.stdout.write('\n');
    }
    this.started = true;
    // Tick spinner at 80ms (same cadence as ora)
    this.timer = setInterval(() => this.tick(), 80);
  }

  /** Update a specific line (0-indexed) without touching the others. */
  update(line: number, message: string): void {
    if (!this.started) return;
    this.lines[line] = message;
    this.spinning[line] = true;
    this.render(line);
  }

  /** Mark a line as completed — stops its spinner and shows a checkmark. */
  succeed(line: number, message: string): void {
    if (!this.started) return;
    this.lines[line] = `✅ ${message}`;
    this.spinning[line] = false;
    this.render(line);
  }

  /** Clear all progress lines and leave the cursor on a clean line. */
  clear(): void {
    if (!this.started) return;
    if (this.timer) clearInterval(this.timer);
    // Move up to the first progress line and clear each one.
    // Each write moves up N lines, erases that line (\x1b[2K), then moves
    // back down N lines — so the cursor returns to the baseline after
    // every erase and the loop math stays simple.
    for (let i = 0; i < this.lines.length; i++) {
      const linesUp = this.lines.length - i;
      process.stdout.write(`\x1b[${linesUp}A\x1b[2K\r\x1b[${linesUp}B\r`);
    }
    // Move cursor up past the now-empty block
    process.stdout.write(`\x1b[${this.lines.length}A\r`);
    this.started = false;
  }

  /** Advance the spinner frame and re-render all spinning lines. */
  private tick(): void {
    this.frame = (this.frame + 1) % SPINNER_FRAMES.length;
    for (let i = 0; i < this.lines.length; i++) {
      // Only lines that are actively spinning and have text need a redraw
      if (this.spinning[i] && this.lines[i]) {
        this.render(i);
      }
    }
  }

  /** Render a single line at its position. */
  private render(line: number): void {
    // Distance from the baseline (cursor home) up to this line's row
    const linesUp = this.lines.length - line;
    const prefix = this.spinning[line] ? SPINNER_FRAMES[this.frame] : '';
    const text = this.spinning[line] ? `${prefix} ${this.lines[line]}` : this.lines[line];
    // Up N rows, erase, write, back down N rows, return to column 0
    process.stdout.write(`\x1b[${linesUp}A\x1b[2K\r${text}\x1b[${linesUp}B\r`);
  }
}
112
+
113
/** Progress line indices (one sticky line per pipeline stage) */
const LINE_PARSE = 0; // file discovery + parsing stage
const LINE_META = 1; // metadata enrichment stage
const LINE_EMBED = 2; // embedding generation stage
const LINE_INDEX = 3; // vector-store upsert stage
118
+
24
119
  /**
25
120
  * Default directories to skip before expensive git operations.
26
121
  * These are common build artifacts and dependencies that should be ignored.
@@ -28,6 +123,28 @@ import { generateChunkId } from '../utils/generate-chunk-id.js';
28
123
  */
29
124
  const defaultIgnore = ['node_modules', '**/node_modules/**', 'dist', '**/dist/**', '.git'];
30
125
 
126
/**
 * Concurrency limit for git subprocess operations (status, log, etc.).
 * Passed as the `mergeMap` concurrency argument when resolving file status.
 */
const GIT_CONCURRENCY = 20;

/** Maximum parallel upsert requests to the vector store. */
const UPSERT_CONCURRENCY = 10;

/** Number of texts to embed per API request. */
const EMBED_BATCH_SIZE = 20;

/** Number of concurrent batch requests in flight. */
const EMBED_BATCH_CONCURRENCY = 4;

/**
 * Maximum time (ms) to wait before flushing a partial embedding batch.
 * Without this, `bufferCount` waits indefinitely for a full batch, which
 * starves `mergeMap` concurrency when upstream document throughput is slow.
 */
const EMBED_BUFFER_FLUSH_MS = 250;

/**
 * Maximum retry attempts for transient / rate-limit errors per chunk.
 * Used as `retry({ count: MAX_RETRIES })` around embedding batch requests.
 */
const MAX_RETRIES = 4;
147
+
31
148
  /**
32
149
  * Main entry point for the embeddings bin.
33
150
  * Orchestrates the entire embeddings generation pipeline.
@@ -36,10 +153,12 @@ const defaultIgnore = ['node_modules', '**/node_modules/**', 'dist', '**/dist/**
36
153
  export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
37
154
  const { framework, options, config, filePatterns } = binOptions;
38
155
 
39
- console.log(`📇 Index: ${options.azureSearchIndexName}`);
156
+ console.log(`📇 Index: ${options.indexName}`);
157
+
158
+ const progress = new ProgressDisplay();
40
159
 
41
160
  // Handle clean operation (destructive - deletes all existing documents)
42
- const vectorStoreService = framework.ai.getService('search', options.azureSearchIndexName);
161
+ const vectorStoreService = framework.ai.useIndex(options.indexName);
43
162
  if (options.clean && !options.dryRun) {
44
163
  console.log('🧹 Cleaning vector store: deleting all existing documents...');
45
164
  // OData filter: delete all documents with non-empty source (all indexed docs)
@@ -75,8 +194,8 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
75
194
  absolute: true,
76
195
  }),
77
196
  ).pipe(
78
- // Get git status concurrently, then flatten array results
79
- mergeMap((path) => getFileStatus(path)),
197
+ // Get git status concurrently (capped to avoid spawning too many git processes)
198
+ mergeMap((path) => getFileStatus(path), GIT_CONCURRENCY),
80
199
  concatMap((files) => from(files)),
81
200
  // Share stream for multiple subscribers (removedFiles$ and indexFiles$)
82
201
  shareReplay({ refCount: true }),
@@ -92,6 +211,7 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
92
211
  ];
93
212
 
94
213
  // Process files: enrich with metadata and filter by allowed patterns
214
+ let fileCount = 0;
95
215
  const processedFiles$ = files$.pipe(
96
216
  map((file) => {
97
217
  const { filepath, status } = file;
@@ -109,6 +229,11 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
109
229
  const matches = multimatch(file.relativePath, allowedFilePatterns);
110
230
  return matches.length > 0;
111
231
  }),
232
+ tap((file) => {
233
+ fileCount++;
234
+ const label = file.status === 'removed' ? '🗑️' : '📄';
235
+ progress.update(LINE_PARSE, `${label} Parsing [${fileCount}] ${file.relativePath}`);
236
+ }),
112
237
  // Share for multiple subscribers (removedFiles$, markdown$, typescript$)
113
238
  shareReplay({ refCount: true }),
114
239
  );
@@ -134,6 +259,7 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
134
259
  return false;
135
260
  };
136
261
 
262
+ let docCount = 0;
137
263
  const rawFiles$ = indexFiles$.pipe(
138
264
  filter(isRawFile),
139
265
  map((file): DocumentEntry => {
@@ -145,6 +271,8 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
145
271
  type: 'raw',
146
272
  },
147
273
  };
274
+ docCount++;
275
+ progress.update(LINE_PARSE, `📄 Parsing [${docCount}] ${file.relativePath}`);
148
276
  return { status: file.status, documents: [document] };
149
277
  }),
150
278
  );
@@ -154,6 +282,8 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
154
282
  filter((file) => isMarkdownFile(file.path)),
155
283
  mergeMap(async (file) => {
156
284
  const documents = await parseMarkdownFile(file);
285
+ docCount++;
286
+ progress.update(LINE_PARSE, `📄 Parsing [${docCount}] ${file.relativePath}`);
157
287
  return { status: file.status, documents };
158
288
  }),
159
289
  );
@@ -163,105 +293,134 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
163
293
  filter((file) => isTypescriptFile(file.path)),
164
294
  map((file) => {
165
295
  const documents = parseTsDocFromFileSync(file);
296
+ docCount++;
297
+ progress.update(LINE_PARSE, `📄 Parsing [${docCount}] ${file.relativePath}`);
166
298
  return { status: file.status, documents };
167
299
  }),
168
300
  );
169
301
 
302
+ // Merge parsed streams and signal when all parsing is done
303
+ const parsed$ = merge(rawFiles$, markdown$, typescript$).pipe(
304
+ finalize(() => {
305
+ progress.succeed(LINE_PARSE, `📄 Parsed ${docCount} files`);
306
+ }),
307
+ );
308
+
170
309
  // Apply metadata to documents
171
- const applyMetadata$ = applyMetadata(merge(rawFiles$, markdown$, typescript$), config.index);
172
-
173
- // Generate embeddings with concurrency limit and retry on rate-limit (429) errors
174
- const embeddingService = framework.ai.getService('embeddings', options.openaiEmbeddingDeployment);
175
-
176
- /** Maximum parallel embedding requests to avoid hitting Azure OpenAI TPM limits. */
177
- const EMBEDDING_CONCURRENCY = 5;
178
-
179
- /** Maximum retry attempts for transient / rate-limit errors per chunk. */
180
- const MAX_RETRIES = 4;
181
-
182
- const applyEmbedding$ = applyMetadata$.pipe(
183
- mergeMap((documents) =>
184
- from(documents).pipe(
185
- // Limit concurrency to avoid overwhelming the embedding API
186
- mergeMap(
187
- (document) =>
188
- from(embeddingService.embedQuery(document.pageContent)).pipe(
189
- retry({
190
- count: MAX_RETRIES,
191
- delay: (error, retryIndex) => {
192
- // Parse Retry-After header when available (Azure sends seconds)
193
- const retryAfterSec =
194
- error?.response?.headers?.get?.('retry-after') ??
195
- error?.responseHeaders?.['retry-after'];
196
- const retryAfterMs = retryAfterSec ? Number(retryAfterSec) * 1000 : 0;
197
-
198
- // Exponential backoff: 2s, 4s, 8s, 16s — or Retry-After if larger
199
- const backoffMs = 2 ** retryIndex * 1000;
200
- const delayMs = Math.max(backoffMs, retryAfterMs);
201
-
202
- console.warn(
203
- `⏳ Retry ${retryIndex}/${MAX_RETRIES} for "${document.metadata.source}" in ${delayMs}ms`,
204
- );
205
- return timer(delayMs);
206
- },
207
- }),
208
- map((embeddings) => {
209
- console.log('embedding document', document.metadata.source);
210
- const metadata = { ...document.metadata, embedding: embeddings };
211
- return { ...document, metadata };
212
- }),
213
- ),
214
- EMBEDDING_CONCURRENCY,
310
+ let metadataCount = 0;
311
+ let metadataDone = false;
312
+ const applyMetadata$ = applyMetadata(parsed$, config.index, (source) => {
313
+ metadataCount++;
314
+ progress.update(LINE_META, `🏷️ Metadata [${metadataCount}] ${source}`);
315
+ }).pipe(
316
+ finalize(() => {
317
+ metadataDone = true;
318
+ progress.succeed(LINE_META, `🏷️ Metadata ${metadataCount} documents`);
319
+ }),
320
+ );
321
+
322
+ // Resolve promoted schema fields (if schema is configured) — runs after
323
+ // metadata enrichment so the resolver has access to git, package, and
324
+ // custom attributes from attributeProcessor
325
+ const applySchema$ = applySchema(applyMetadata$, config.index?.schema);
326
+
327
+ // Generate embeddings in batches with retry on rate-limit (429) errors
328
+ const embeddingService = framework.ai.useEmbed(options.embedModel);
329
+
330
+ let embeddedCount = 0;
331
+ let embeddingDone = false;
332
+ const applyEmbedding$ = applySchema$.pipe(
333
+ // Flatten all documents from file-level batches, then re-batch for the API
334
+ concatMap((documents) => from(documents)),
335
+ // Flush when EMBED_BATCH_SIZE docs accumulate OR after EMBED_BUFFER_FLUSH_MS,
336
+ // whichever comes first — prevents upstream starvation from blocking concurrency
337
+ bufferTime(EMBED_BUFFER_FLUSH_MS, null, EMBED_BATCH_SIZE),
338
+ filter((batch) => batch.length > 0),
339
+ mergeMap(
340
+ (batch) =>
341
+ from(embeddingService.embedDocuments(batch.map((d) => d.pageContent))).pipe(
342
+ retry({
343
+ count: MAX_RETRIES,
344
+ delay: (error, retryIndex) => {
345
+ // Auth errors are terminal — abort immediately with actionable message
346
+ if (error?.name === 'NoAccountsError') {
347
+ console.error(
348
+ '\n🔒 Authentication expired. Run `ffc auth login` then retry with `--diff`.',
349
+ );
350
+ throw error;
351
+ }
352
+
353
+ const retryAfterSec =
354
+ error?.response?.headers?.get?.('retry-after') ??
355
+ error?.responseHeaders?.['retry-after'];
356
+ const retryAfterMs = retryAfterSec ? Number(retryAfterSec) * 1000 : 0;
357
+
358
+ const backoffMs = 2 ** retryIndex * 1000;
359
+ const delayMs = Math.max(backoffMs, retryAfterMs);
360
+
361
+ console.warn(
362
+ `\n⏳ Retry ${retryIndex}/${MAX_RETRIES} for batch of ${batch.length} in ${delayMs}ms`,
363
+ );
364
+ return timer(delayMs);
365
+ },
366
+ }),
367
+ map((allEmbeddings) => {
368
+ return batch.map((document, i) => {
369
+ embeddedCount++;
370
+ const total = metadataDone ? metadataCount : 0;
371
+ const pct = total > 0 ? ` ${Math.round((embeddedCount / total) * 100)}%` : '';
372
+ const denominator = total > 0 ? `/${total}` : '';
373
+ progress.update(
374
+ LINE_EMBED,
375
+ `🧠 Embedding [${embeddedCount}${denominator}]${pct} — ${document.metadata.source}`,
376
+ );
377
+ const metadata = { ...document.metadata, embedding: allEmbeddings[i] };
378
+ return { ...document, metadata };
379
+ });
380
+ }),
215
381
  ),
216
- toArray(),
217
- ),
382
+ EMBED_BATCH_CONCURRENCY,
218
383
  ),
384
+ finalize(() => {
385
+ embeddingDone = true;
386
+ progress.succeed(LINE_EMBED, `🧠 Embedded ${embeddedCount} documents`);
387
+ }),
219
388
  );
220
389
 
221
- // Update vector store
390
+ // Update vector store — batch documents and upsert concurrently
222
391
  const upsert$ = applyEmbedding$.pipe(
392
+ // Flatten file-level batches, then re-batch into groups of 20 for bulk upsert
393
+ concatMap((documents) => from(documents)),
394
+ bufferCount(20),
223
395
  mergeMap(async (documents) => {
224
- const vectorStoreService = framework.ai.getService('search', options.azureSearchIndexName);
396
+ const vectorStoreService = framework.ai.useIndex(options.indexName);
225
397
  if (documents.length === 0) {
226
398
  return undefined;
227
399
  }
228
- for (const document of documents) {
229
- console.log(`Adding entry [${document.id}] to vector store`, document.metadata.source);
230
- }
231
400
  if (!options.dryRun) {
232
- // For multiple chunks from same file, delete existing chunks first
233
- if (documents.length > 1) {
234
- const sources = documents
235
- .map((document) => document.metadata.source)
236
- .reduce((acc, source) => acc.add(source), new Set<string>());
237
-
238
- const filterExpression = Array.from(sources)
239
- .map((source) => `metadata/source eq '${source}'`)
240
- .join(' or ');
241
-
242
- // Fire-and-forget deletion (not awaited) - brief gap before new docs are indexed
243
- vectorStoreService.deleteDocuments({ filter: { filterExpression } });
244
- }
245
401
  await vectorStoreService.addDocuments(documents);
246
402
  }
247
403
  return {
248
404
  status: 'added',
249
405
  documents,
250
406
  } as UpdateVectorStoreResult;
251
- }),
407
+ }, UPSERT_CONCURRENCY),
252
408
  filter((result): result is UpdateVectorStoreResult => Boolean(result)),
253
409
  );
254
410
 
255
411
  // Execute pipeline
256
412
  // Track indexing results for reporting: deleted file paths and added document IDs
413
+ let indexedCount = 0;
257
414
  const indexingResults: { deleted: string[]; added: { source: string; id: string }[] } = {
258
415
  deleted: [],
259
416
  added: [],
260
417
  };
261
418
 
262
- // Execute pipeline: concat ensures deletions happen before additions
263
- // This subscription triggers lazy RxJS execution and tracks all results
264
- concat(delete$, upsert$).subscribe({
419
+ // Execute pipeline: merge runs deletions and additions concurrently so
420
+ // the embedding pipeline can start as soon as metadata-enriched documents
421
+ // are available, without waiting for all file discovery to complete.
422
+ progress.start(4);
423
+ merge(delete$, upsert$).subscribe({
265
424
  next: (result) => {
266
425
  // Track deleted files by relative path
267
426
  if (result.status === 'deleted') {
@@ -275,6 +434,12 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
275
434
  id: document.id,
276
435
  })),
277
436
  );
437
+ indexedCount += result.documents.length;
438
+ // Use embeddedCount as denominator — only show % once embedding is done
439
+ const total = embeddingDone ? embeddedCount : 0;
440
+ const pct = total > 0 ? ` ${Math.round((indexedCount / total) * 100)}%` : '';
441
+ const denominator = total > 0 ? `/${total}` : '';
442
+ progress.update(LINE_INDEX, `📤 Indexed [${indexedCount}${denominator}]${pct}`);
278
443
  }
279
444
  },
280
445
  error: (error) => {
@@ -282,8 +447,15 @@ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
282
447
  process.exit(1);
283
448
  },
284
449
  complete: () => {
285
- // Pipeline completed - log results and exit
286
- console.log('🗂️ Indexing results:', indexingResults);
450
+ // Clear the progress block before final output
451
+ progress.clear();
452
+ // Pipeline completed - log summary
453
+ if (indexingResults.deleted.length > 0) {
454
+ console.log(`🗑️ Deleted: ${indexingResults.deleted.length} files`);
455
+ }
456
+ if (indexingResults.added.length > 0) {
457
+ console.log(`📥 Indexed: ${indexingResults.added.length} documents`);
458
+ }
287
459
  console.log('✅ Embeddings generation completed!');
288
460
  process.exit(0);
289
461
  },
package/src/config.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
2
2
  import type { FusionAIConfig } from '@equinor/fusion-framework-cli-plugin-ai-base';
3
+ import type { IndexSchemaConfig } from './schema.js';
3
4
 
4
5
  /**
5
6
  * Index-specific configuration for Fusion AI document indexing operations.
@@ -55,7 +56,21 @@ export interface IndexConfig {
55
56
  chunkSize?: number;
56
57
  /** Number of overlapping tokens between consecutive chunks. */
57
58
  chunkOverlap?: number;
59
+ /** Explicit vector dimensions for custom embedding models not in the known model map. */
60
+ dimensions?: number;
58
61
  };
62
+
63
+ /**
64
+ * Custom index schema that promotes frequently-filtered metadata to
65
+ * top-level Azure AI Search fields.
66
+ *
67
+ * When defined, the schema resolver runs after metadata enrichment and
68
+ * places resolved values as top-level document fields in Azure Search,
69
+ * enabling direct OData filters without the `any()` operator.
70
+ *
71
+ * @see {@link IndexSchemaConfig} for details and examples.
72
+ */
73
+ schema?: IndexSchemaConfig;
59
74
  }
60
75
 
61
76
  /**