@equinor/fusion-framework-cli-plugin-ai-index 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. package/CHANGELOG.md +63 -0
  2. package/LICENSE +21 -0
  3. package/README.md +114 -0
  4. package/dist/esm/bin/apply-metadata.js +63 -0
  5. package/dist/esm/bin/apply-metadata.js.map +1 -0
  6. package/dist/esm/bin/delete-removed-files.js +36 -0
  7. package/dist/esm/bin/delete-removed-files.js.map +1 -0
  8. package/dist/esm/bin/embed.js +196 -0
  9. package/dist/esm/bin/embed.js.map +1 -0
  10. package/dist/esm/bin/execute-pipeline.js +40 -0
  11. package/dist/esm/bin/execute-pipeline.js.map +1 -0
  12. package/dist/esm/bin/file-stream.js +22 -0
  13. package/dist/esm/bin/file-stream.js.map +1 -0
  14. package/dist/esm/bin/get-diff.js +29 -0
  15. package/dist/esm/bin/get-diff.js.map +1 -0
  16. package/dist/esm/bin/index.js +2 -0
  17. package/dist/esm/bin/index.js.map +1 -0
  18. package/dist/esm/bin/types.js +2 -0
  19. package/dist/esm/bin/types.js.map +1 -0
  20. package/dist/esm/command.js +82 -0
  21. package/dist/esm/command.js.map +1 -0
  22. package/dist/esm/command.options.js +48 -0
  23. package/dist/esm/command.options.js.map +1 -0
  24. package/dist/esm/config.js +2 -0
  25. package/dist/esm/config.js.map +1 -0
  26. package/dist/esm/index.js +13 -0
  27. package/dist/esm/index.js.map +1 -0
  28. package/dist/esm/utils/generate-chunk-id.js +18 -0
  29. package/dist/esm/utils/generate-chunk-id.js.map +1 -0
  30. package/dist/esm/utils/git/file-changes.js +196 -0
  31. package/dist/esm/utils/git/file-changes.js.map +1 -0
  32. package/dist/esm/utils/git/git-client.js +39 -0
  33. package/dist/esm/utils/git/git-client.js.map +1 -0
  34. package/dist/esm/utils/git/index.js +9 -0
  35. package/dist/esm/utils/git/index.js.map +1 -0
  36. package/dist/esm/utils/git/metadata.js +41 -0
  37. package/dist/esm/utils/git/metadata.js.map +1 -0
  38. package/dist/esm/utils/git/status.js +34 -0
  39. package/dist/esm/utils/git/status.js.map +1 -0
  40. package/dist/esm/utils/git/types.js +2 -0
  41. package/dist/esm/utils/git/types.js.map +1 -0
  42. package/dist/esm/utils/markdown/index.js +3 -0
  43. package/dist/esm/utils/markdown/index.js.map +1 -0
  44. package/dist/esm/utils/markdown/parser.js +72 -0
  45. package/dist/esm/utils/markdown/parser.js.map +1 -0
  46. package/dist/esm/utils/markdown/types.js +2 -0
  47. package/dist/esm/utils/markdown/types.js.map +1 -0
  48. package/dist/esm/utils/package-resolver.js +40 -0
  49. package/dist/esm/utils/package-resolver.js.map +1 -0
  50. package/dist/esm/utils/ts-doc/constants.js +13 -0
  51. package/dist/esm/utils/ts-doc/constants.js.map +1 -0
  52. package/dist/esm/utils/ts-doc/extractors.js +175 -0
  53. package/dist/esm/utils/ts-doc/extractors.js.map +1 -0
  54. package/dist/esm/utils/ts-doc/index.js +3 -0
  55. package/dist/esm/utils/ts-doc/index.js.map +1 -0
  56. package/dist/esm/utils/ts-doc/parser.js +37 -0
  57. package/dist/esm/utils/ts-doc/parser.js.map +1 -0
  58. package/dist/esm/utils/ts-doc/types.js +2 -0
  59. package/dist/esm/utils/ts-doc/types.js.map +1 -0
  60. package/dist/esm/utils/types.js +2 -0
  61. package/dist/esm/utils/types.js.map +1 -0
  62. package/dist/esm/version.js +3 -0
  63. package/dist/esm/version.js.map +1 -0
  64. package/dist/tsconfig.tsbuildinfo +1 -0
  65. package/dist/types/bin/apply-metadata.d.ts +1 -0
  66. package/dist/types/bin/delete-removed-files.d.ts +1 -0
  67. package/dist/types/bin/embed.d.ts +1 -0
  68. package/dist/types/bin/execute-pipeline.d.ts +1 -0
  69. package/dist/types/bin/file-stream.d.ts +1 -0
  70. package/dist/types/bin/get-diff.d.ts +1 -0
  71. package/dist/types/bin/index.d.ts +1 -0
  72. package/dist/types/bin/types.d.ts +1 -0
  73. package/dist/types/command.d.ts +2 -0
  74. package/dist/types/command.options.d.ts +62 -0
  75. package/dist/types/config.d.ts +33 -0
  76. package/dist/types/index.d.ts +8 -0
  77. package/dist/types/utils/generate-chunk-id.d.ts +8 -0
  78. package/dist/types/utils/git/file-changes.d.ts +21 -0
  79. package/dist/types/utils/git/git-client.d.ts +17 -0
  80. package/dist/types/utils/git/index.d.ts +5 -0
  81. package/dist/types/utils/git/metadata.d.ts +7 -0
  82. package/dist/types/utils/git/status.d.ts +12 -0
  83. package/dist/types/utils/git/types.d.ts +33 -0
  84. package/dist/types/utils/markdown/index.d.ts +2 -0
  85. package/dist/types/utils/markdown/parser.d.ts +21 -0
  86. package/dist/types/utils/markdown/types.d.ts +11 -0
  87. package/dist/types/utils/package-resolver.d.ts +14 -0
  88. package/dist/types/utils/ts-doc/constants.d.ts +5 -0
  89. package/dist/types/utils/ts-doc/extractors.d.ts +28 -0
  90. package/dist/types/utils/ts-doc/index.d.ts +2 -0
  91. package/dist/types/utils/ts-doc/parser.d.ts +23 -0
  92. package/dist/types/utils/ts-doc/types.d.ts +20 -0
  93. package/dist/types/utils/types.d.ts +17 -0
  94. package/dist/types/version.d.ts +1 -0
  95. package/package.json +72 -0
  96. package/src/bin/apply-metadata.ts +77 -0
  97. package/src/bin/delete-removed-files.ts +49 -0
  98. package/src/bin/embed.ts +262 -0
  99. package/src/bin/execute-pipeline.ts +48 -0
  100. package/src/bin/file-stream.ts +34 -0
  101. package/src/bin/get-diff.ts +33 -0
  102. package/src/bin/index.ts +1 -0
  103. package/src/bin/types.ts +48 -0
  104. package/src/command.options.ts +58 -0
  105. package/src/command.ts +100 -0
  106. package/src/config.ts +39 -0
  107. package/src/index.ts +19 -0
  108. package/src/utils/generate-chunk-id.ts +17 -0
  109. package/src/utils/git/file-changes.ts +213 -0
  110. package/src/utils/git/git-client.ts +43 -0
  111. package/src/utils/git/index.ts +19 -0
  112. package/src/utils/git/metadata.ts +47 -0
  113. package/src/utils/git/status.ts +48 -0
  114. package/src/utils/git/types.ts +36 -0
  115. package/src/utils/markdown/index.ts +5 -0
  116. package/src/utils/markdown/parser.ts +92 -0
  117. package/src/utils/markdown/types.ts +20 -0
  118. package/src/utils/package-resolver.ts +44 -0
  119. package/src/utils/ts-doc/constants.ts +13 -0
  120. package/src/utils/ts-doc/extractors.ts +246 -0
  121. package/src/utils/ts-doc/index.ts +5 -0
  122. package/src/utils/ts-doc/parser.ts +51 -0
  123. package/src/utils/ts-doc/types.ts +26 -0
  124. package/src/utils/types.ts +18 -0
  125. package/src/version.ts +2 -0
  126. package/tsconfig.json +27 -0
  127. package/vitest.config.ts +14 -0
package/src/bin/embed.ts ADDED
@@ -0,0 +1,262 @@
+ import { globbyStream } from 'globby';
+ import { relative } from 'node:path';
+ import multimatch from 'multimatch';
+ import { concat, from, merge } from 'rxjs';
+ import { concatMap, filter, map, mergeMap, shareReplay, toArray } from 'rxjs/operators';
+
+ import { isMarkdownFile, parseMarkdownFile } from '../utils/markdown/index.js';
+ import { getFileStatus, resolveProjectRoot } from '../utils/git/index.js';
+ import { isTypescriptFile, parseTsDocFromFileSync } from '../utils/ts-doc/index.js';
+
+ import { getDiff } from './get-diff.js';
+ import { createDeleteRemovedFilesStream } from './delete-removed-files.js';
+ import { applyMetadata } from './apply-metadata.js';
+ import type {
+   DocumentEntry,
+   EmbeddingsBinOptions,
+   ProcessedFile,
+   UpdateVectorStoreResult,
+ } from './types.js';
+ import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
+ import { readFileSync } from 'node:fs';
+ import { generateChunkId } from '../utils/generate-chunk-id.js';
+
+ /**
+  * Default directories to skip before expensive git operations.
+  * These are common build artifacts and dependencies that should be ignored.
+  * @internal
+  */
+ const defaultIgnore = ['node_modules', '**/node_modules/**', 'dist', '**/dist/**', '.git'];
+
+ /**
+  * Main entry point for the embeddings bin.
+  * Orchestrates the entire embeddings generation pipeline.
+  * @internal
+  */
+ export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
+   const { framework, options, config, filePatterns } = binOptions;
+
+   // Handle clean operation (destructive - deletes all existing documents)
+   const vectorStoreService = framework.ai.getService('search', options.azureSearchIndexName);
+   if (options.clean && !options.dryRun) {
+     console.log('🧹 Cleaning vector store: deleting all existing documents...');
+     // OData filter: delete all documents with non-empty source (all indexed docs)
+     await vectorStoreService.deleteDocuments({
+       filter: { filterExpression: "metadata/source ne ''" },
+     });
+     console.log('✅ Vector store cleaned successfully');
+   }
+
+   // Handle diff-based processing (workflow mode)
+   const changedFiles = options.diff ? await getDiff(options) : [];
+
+   // Create file stream: diff mode uses git changes, normal mode uses globby
+   const files$ = (() => {
+     if (options.diff) {
+       return from(changedFiles);
+     }
+
+     // Directories to skip before expensive git operations.
+     // Note: Even with gitignore: true, globby still traverses ignored directories when .gitignore
+     // contains negation patterns (like !.yarn/releases), so we add explicit ignore patterns
+     // to prevent traversing these directories entirely.
+     const ignore = config.index?.ignore ?? defaultIgnore;
+
+     return from(
+       globbyStream(filePatterns, {
+         ignore,
+         onlyFiles: true,
+         gitignore: true,
+         absolute: true,
+       }),
+     ).pipe(
+       // Get git status concurrently, then flatten array results
+       mergeMap((path) => getFileStatus(path)),
+       concatMap((files) => from(files)),
+       // Share stream for multiple subscribers (removedFiles$ and indexFiles$)
+       shareReplay({ refCount: true }),
+     );
+   })();
+
+   // Allowed file patterns for indexing (from config, with sensible defaults)
+   const allowedFilePatterns = config.index?.patterns ?? [
+     '**/*.ts',
+     '**/*.tsx',
+     '**/*.md',
+     '**/*.mdx',
+   ];
+
+   // Process files: enrich with metadata and filter by allowed patterns
+   const processedFiles$ = files$.pipe(
+     map((file) => {
+       const { filepath, status } = file;
+       const projectRoot = resolveProjectRoot(filepath);
+       const relativePath = projectRoot ? relative(projectRoot, filepath) : filepath;
+
+       return {
+         path: filepath,
+         status,
+         projectRoot,
+         relativePath,
+       };
+     }),
+     filter((file) => {
+       const matches = multimatch(file.relativePath, allowedFilePatterns);
+       return matches.length > 0;
+     }),
+     // Share for multiple subscribers (removedFiles$, markdown$, typescript$)
+     shareReplay({ refCount: true }),
+   );
+
+   // Split stream: removed files for deletion, new/modified for indexing
+   const removedFiles$ = processedFiles$.pipe(filter((file) => file.status === 'removed'));
+
+   // Create processing streams
+   const delete$ = createDeleteRemovedFilesStream(removedFiles$, framework, options);
+
+   // New/modified files for indexing
+   const indexFiles$ = processedFiles$.pipe(
+     filter((file) => file.status === 'new' || file.status === 'modified'),
+     // Share for markdown$ and typescript$ pipelines
+     shareReplay({ refCount: true }),
+   );
+
+   const isRawFile = (file: ProcessedFile): boolean => {
+     const matches = multimatch(file.relativePath, config.index?.rawPatterns ?? []);
+     if (matches.length > 0) {
+       return true;
+     }
+     return false;
+   };
+
+   const rawFiles$ = indexFiles$.pipe(
+     filter(isRawFile),
+     map((file): DocumentEntry => {
+       const document: VectorStoreDocument = {
+         id: generateChunkId(file.relativePath),
+         pageContent: readFileSync(file.path, 'utf8'),
+         metadata: {
+           source: file.relativePath,
+           type: 'raw',
+         },
+       };
+       return { status: file.status, documents: [document] };
+     }),
+   );
+
+   const markdown$ = indexFiles$.pipe(
+     filter((x) => !isRawFile(x)),
+     filter((file) => isMarkdownFile(file.path)),
+     mergeMap(async (file) => {
+       const documents = await parseMarkdownFile(file);
+       return { status: file.status, documents };
+     }),
+   );
+
+   const typescript$ = indexFiles$.pipe(
+     filter((x) => !isRawFile(x)),
+     filter((file) => isTypescriptFile(file.path)),
+     map((file) => {
+       const documents = parseTsDocFromFileSync(file);
+       return { status: file.status, documents };
+     }),
+   );
+
+   // Apply metadata to documents
+   const applyMetadata$ = applyMetadata(merge(rawFiles$, markdown$, typescript$), config.index);
+
+   // Generate embeddings
+   const embeddingService = framework.ai.getService('embeddings', options.openaiEmbeddingDeployment);
+   const applyEmbedding$ = applyMetadata$.pipe(
+     mergeMap((documents) =>
+       from(documents).pipe(
+         mergeMap(async (document) => {
+           console.log('embedding document', document.metadata.source);
+           const embeddings = await embeddingService
+             .embedQuery(document.pageContent)
+             .catch((error) => {
+               console.error(
+                 `❌ Error: ${error instanceof Error ? error.message : 'Unknown error'}`,
+               );
+               console.error('document', document);
+               process.exit(1);
+             });
+           const metadata = { ...document.metadata, embedding: embeddings };
+           return { ...document, metadata };
+         }),
+         toArray(),
+       ),
+     ),
+   );
+
+   // Update vector store
+   const upsert$ = applyEmbedding$.pipe(
+     mergeMap(async (documents) => {
+       const vectorStoreService = framework.ai.getService('search', options.azureSearchIndexName);
+       if (documents.length === 0) {
+         return undefined;
+       }
+       for (const document of documents) {
+         console.log(`Adding entry [${document.id}] to vector store`, document.metadata.source);
+       }
+       if (!options.dryRun) {
+         // For multiple chunks from same file, delete existing chunks first
+         if (documents.length > 1) {
+           const sources = documents
+             .map((document) => document.metadata.source)
+             .reduce((acc, source) => acc.add(source), new Set<string>());
+
+           const filterExpression = Array.from(sources)
+             .map((source) => `metadata/source eq '${source}'`)
+             .join(' or ');
+
+           // Fire-and-forget deletion (not awaited) - brief gap before new docs are indexed
+           vectorStoreService.deleteDocuments({ filter: { filterExpression } });
+         }
+         await vectorStoreService.addDocuments(documents);
+       }
+       return {
+         status: 'added',
+         documents,
+       } as UpdateVectorStoreResult;
+     }),
+     filter((result): result is UpdateVectorStoreResult => Boolean(result)),
+   );
+
+   // Execute pipeline
+   // Track indexing results for reporting: deleted file paths and added document IDs
+   const indexingResults: { deleted: string[]; added: { source: string; id: string }[] } = {
+     deleted: [],
+     added: [],
+   };
+
+   // Execute pipeline: concat ensures deletions happen before additions
+   // This subscription triggers lazy RxJS execution and tracks all results
+   concat(delete$, upsert$).subscribe({
+     next: (result) => {
+       // Track deleted files by relative path
+       if (result.status === 'deleted') {
+         indexingResults.deleted.push(...result.files.map((file) => file.relativePath));
+       }
+       // Track added documents with source and ID (one file can produce multiple IDs)
+       else if (result.status === 'added') {
+         indexingResults.added.push(
+           ...result.documents.map((document) => ({
+             source: document.metadata.source,
+             id: document.id,
+           })),
+         );
+       }
+     },
+     error: (error) => {
+       console.error(`❌ Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
+       process.exit(1);
+     },
+     complete: () => {
+       // Pipeline completed - log results and exit
+       console.log('🗂️ Indexing results:', indexingResults);
+       console.log('✅ Embeddings generation completed!');
+       process.exit(0);
+     },
+   });
+ }
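Note on the upsert path above: when a batch contains multiple chunks, the distinct metadata/source values are joined into one OData filter and the stale chunks are deleted before the new documents are added. A minimal sketch of the expression being built (the source paths here are hypothetical):

    const sources = new Set(['docs/guide.md', 'src/app.ts']); // hypothetical inputs
    const filterExpression = Array.from(sources)
        .map((source) => `metadata/source eq '${source}'`)
        .join(' or ');
    // => "metadata/source eq 'docs/guide.md' or metadata/source eq 'src/app.ts'"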
package/src/bin/execute-pipeline.ts ADDED
@@ -0,0 +1,48 @@
+ import { concat } from 'rxjs';
+ import type { DeleteRemovedFilesResult, UpdateVectorStoreResult } from './types.js';
+ import type { Observable } from 'rxjs';
+
+ /**
+  * Executes the pipeline and tracks results.
+  * @internal
+  */
+ export function executePipeline(
+   deleteRemovedFiles$: Observable<DeleteRemovedFilesResult>,
+   updateVectorStore$: Observable<UpdateVectorStoreResult>,
+ ): void {
+   // Track indexing results for reporting: deleted file paths and added document IDs
+   const indexingResults: { deleted: string[]; added: { source: string; id: string }[] } = {
+     deleted: [],
+     added: [],
+   };
+
+   // Execute pipeline: concat ensures deletions happen before additions
+   // This subscription triggers lazy RxJS execution and tracks all results
+   concat(deleteRemovedFiles$, updateVectorStore$).subscribe({
+     next: (result) => {
+       // Track deleted files by relative path
+       if (result.status === 'deleted') {
+         indexingResults.deleted.push(...result.files.map((file) => file.relativePath));
+       }
+       // Track added documents with source and ID (one file can produce multiple IDs)
+       else if (result.status === 'added') {
+         indexingResults.added.push(
+           ...result.documents.map((document) => ({
+             source: document.metadata.source,
+             id: document.id,
+           })),
+         );
+       }
+     },
+     error: (error) => {
+       console.error(`❌ Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
+       process.exit(1);
+     },
+     complete: () => {
+       // Pipeline completed - log results and exit
+       console.log('🗂️ Indexing results:', indexingResults);
+       console.log('✅ Embeddings generation completed!');
+       process.exit(0);
+     },
+   });
+ }
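A minimal sketch of driving executePipeline with the result types from ./types.js; since the subscription exits the process on completion, this only makes sense at the very end of a CLI run (the file path below is invented):

    import { of } from 'rxjs';
    import type { DeleteRemovedFilesResult, UpdateVectorStoreResult } from './types.js';
    import { executePipeline } from './execute-pipeline.js';

    const delete$ = of<DeleteRemovedFilesResult>({
        status: 'deleted',
        files: [{ relativePath: 'docs/removed.md' }], // hypothetical
    });
    const upsert$ = of<UpdateVectorStoreResult>({ status: 'added', documents: [] });

    executePipeline(delete$, upsert$); // logs indexing results, then exits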
package/src/bin/file-stream.ts ADDED
@@ -0,0 +1,34 @@
+ import { globbyStream } from 'globby';
+ import { from, mergeMap, concatMap, shareReplay } from 'rxjs';
+ import type { Observable } from 'rxjs';
+ import { getFileStatus } from '../utils/git/index.js';
+ import type { ChangedFile } from '../utils/git/index.js';
+ import type { CommandOptions } from '../command.options.js';
+
+ /**
+  * Creates a file stream based on diff mode or glob patterns.
+  * @internal
+  */
+ export function createFileStream(
+   options: CommandOptions,
+   changedFiles: ChangedFile[],
+   filePatterns: string[],
+ ): Observable<ChangedFile> {
+   if (options.diff) {
+     return from(changedFiles);
+   }
+
+   return from(
+     globbyStream(filePatterns, {
+       onlyFiles: true,
+       gitignore: true,
+       absolute: true,
+     }),
+   ).pipe(
+     // Get git status concurrently, then flatten array results
+     mergeMap((path) => getFileStatus(path)),
+     concatMap((files) => from(files)),
+     // Share stream for multiple subscribers (removedFiles$ and indexFiles$)
+     shareReplay({ refCount: true }),
+   );
+ }
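Usage is a straightforward subscribe; a sketch assuming options were already validated by CommandOptionsSchema (the patterns are illustrative):

    const files$ = createFileStream(options, [], ['**/*.md', '**/*.ts']);
    files$.subscribe((file) => console.log(file.status, file.filepath));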
package/src/bin/get-diff.ts ADDED
@@ -0,0 +1,33 @@
+ import type { ChangedFile } from '../utils/git/index.js';
+ import { getChangedFiles, getGitStatus } from '../utils/git/index.js';
+ import type { CommandOptions } from '../command.options.js';
+
+ /**
+  * Handles diff-based processing to get changed files from git.
+  * @internal
+  */
+ export async function getDiff(options: CommandOptions): Promise<ChangedFile[]> {
+   try {
+     // Get current git status for informational output
+     const gitStatus = await getGitStatus();
+     console.log(`🔍 Git status: ${gitStatus.branch}@${gitStatus.commit}`);
+     console.log(`📊 Changes: ${gitStatus.stagedFiles} staged, ${gitStatus.unstagedFiles} unstaged`);
+
+     // Get changed files compared to base reference (default: HEAD~1)
+     const changedFiles = await getChangedFiles({
+       diff: options.diff,
+       baseRef: options.baseRef,
+     });
+
+     if (changedFiles.length === 0) {
+       console.log('✅ No changed files match the provided patterns. Nothing to process.');
+       process.exit(0);
+     }
+
+     console.log(`📝 Found ${changedFiles.length} changed files matching patterns`);
+     return changedFiles;
+   } catch (error) {
+     console.error(`❌ Git diff error: ${error instanceof Error ? error.message : 'Unknown error'}`);
+     process.exit(1);
+   }
+ }
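In a CI workflow this corresponds to invoking the command with --diff --base-ref origin/main; a hedged sketch of the call (remaining required CommandOptions fields supplied via spread):

    const changedFiles = await getDiff({
        ...options, // hypothetical: a fully validated CommandOptions object
        diff: true,
        baseRef: 'origin/main',
    });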
package/src/bin/index.ts ADDED
@@ -0,0 +1 @@
+ export { embed } from './embed.js';
package/src/bin/types.ts ADDED
@@ -0,0 +1,48 @@
+ import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
+ import type { ChangedFile } from '../utils/git/index.js';
+ import type { CommandOptions } from '../command.options.js';
+ import type { FrameworkInstance } from '@equinor/fusion-framework-cli-plugin-ai-base';
+ import type { FusionAIConfigWithIndex } from '../config.js';
+
+ /**
+  * Result of updating the vector store with new documents
+  * @internal
+  */
+ export type UpdateVectorStoreResult = { status: 'added'; documents: VectorStoreDocument[] };
+
+ /**
+  * Result of deleting removed files from the vector store
+  * @internal
+  */
+ export type DeleteRemovedFilesResult = { status: 'deleted'; files: { relativePath: string }[] };
+
+ /**
+  * File with enriched metadata for processing
+  * @internal
+  */
+ export type ProcessedFile = {
+   path: string;
+   status: ChangedFile['status'];
+   projectRoot: string | undefined;
+   relativePath: string;
+ };
+
+ /**
+  * Document entry with status for processing
+  * @internal
+  */
+ export type DocumentEntry = {
+   status: ChangedFile['status'];
+   documents: VectorStoreDocument[];
+ };
+
+ /**
+  * Options for the embeddings bin
+  * @internal
+  */
+ export interface EmbeddingsBinOptions {
+   framework: FrameworkInstance;
+   options: CommandOptions;
+   config: FusionAIConfigWithIndex;
+   filePatterns: string[];
+ }
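To make the shapes concrete, a hypothetical DocumentEntry for a single raw file (all field values invented; the document fields mirror what embed.ts constructs):

    const entry: DocumentEntry = {
        status: 'modified',
        documents: [
            {
                id: 'c3JjL3JlYWRtZS5tZA', // generateChunkId('src/readme.md')
                pageContent: '# Readme contents',
                metadata: { source: 'src/readme.md', type: 'raw' },
            },
        ],
    };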
package/src/command.options.ts ADDED
@@ -0,0 +1,58 @@
+ import { z } from 'zod';
+
+ import { AiOptionsSchema } from '@equinor/fusion-framework-cli-plugin-ai-base/command-options';
+
+ /**
+  * Zod schema for validating command options for the embeddings command.
+  *
+  * This schema extends the base AI options schema with embeddings-specific options,
+  * ensuring type safety and runtime validation of command arguments.
+  *
+  * Note: Some optional AI options become required for the embeddings command
+  * (openaiEmbeddingDeployment, azureSearchEndpoint, azureSearchApiKey, azureSearchIndexName)
+  * because the command uses withAiOptions with includeEmbedding and includeSearch set to true.
+  */
+ export const CommandOptionsSchema = AiOptionsSchema.extend({
+   // Override optional AI options to make them required for embeddings command
+   openaiEmbeddingDeployment: z
+     .string({ message: 'Embedding deployment name is required for embeddings command.' })
+     .min(1, 'Embedding deployment name must be a non-empty string.')
+     .describe('Azure OpenAI embedding deployment name'),
+   azureSearchEndpoint: z
+     .string({ message: 'Azure Search endpoint is required for embeddings command.' })
+     .url('Azure Search endpoint must be a valid URL.')
+     .min(1, 'Azure Search endpoint must be a non-empty string.')
+     .describe('Azure Search endpoint URL'),
+   azureSearchApiKey: z
+     .string({ message: 'Azure Search API key is required for embeddings command.' })
+     .min(1, 'Azure Search API key must be a non-empty string.')
+     .describe('Azure Search API key'),
+   azureSearchIndexName: z
+     .string({ message: 'Azure Search index name is required for embeddings command.' })
+     .min(1, 'Azure Search index name must be a non-empty string.')
+     .describe('Azure Search index name'),
+
+   // Embeddings-specific options
+   dryRun: z
+     .boolean({ message: 'dryRun must be a boolean value.' })
+     .describe('Show what would be processed without actually doing it'),
+   config: z
+     .string({ message: 'Config file path is required and must be a non-empty string.' })
+     .min(1, 'Config file path must be a non-empty string.')
+     .describe('Path to a config file'),
+   diff: z
+     .boolean({ message: 'diff must be a boolean value.' })
+     .describe('Process only changed files (workflow mode)'),
+   baseRef: z.string().min(1).optional().describe('Git reference to compare against'),
+   clean: z
+     .boolean({ message: 'clean must be a boolean value.' })
+     .describe('Delete all existing documents from the vector store before processing'),
+ }).describe('Command options for the embeddings command');
+
+ /**
+  * Type representing the validated command options.
+  *
+  * This type is inferred from the Zod schema and should be used throughout the command
+  * to ensure type safety and consistency with the schema.
+  */
+ export type CommandOptions = z.infer<typeof CommandOptionsSchema>;
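A quick sketch of the schema's runtime behavior using zod's standard safeParse (the failing input is deliberately incomplete):

    const result = CommandOptionsSchema.safeParse({
        dryRun: true,
        diff: false,
        clean: false,
        config: 'fusion-ai.config',
        // azureSearchEndpoint, azureSearchApiKey, etc. missing, so validation fails
    });
    if (!result.success) {
        console.error(result.error.issues.map((issue) => issue.message));
    }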
package/src/command.ts ADDED
@@ -0,0 +1,100 @@
+ import { createCommand, createOption } from 'commander';
+
+ import { loadFusionAIConfig, setupFramework } from '@equinor/fusion-framework-cli-plugin-ai-base';
+ import { withOptions as withAiOptions } from '@equinor/fusion-framework-cli-plugin-ai-base/command-options';
+
+ import { embed } from './bin/embed.js';
+ import { CommandOptionsSchema, type CommandOptions } from './command.options.js';
+ import type { FusionAIConfigWithIndex } from './config.js';
+
+ /**
+  * CLI command: `ai embeddings`
+  *
+  * Document embedding utilities for Large Language Model processing.
+  *
+  * Features:
+  * - Markdown/MDX document chunking with frontmatter extraction
+  * - TypeScript/TSX TSDoc extraction and chunking
+  * - Glob pattern support for file collection
+  * - Git diff-based processing for workflow integration
+  * - Dry-run mode for testing without actual processing
+  * - Configurable file patterns via fusion-ai.config.ts
+  *
+  * Usage:
+  * $ ffc ai embeddings [options] [glob-patterns...]
+  *
+  * Arguments:
+  *   glob-patterns  Glob patterns to match files (optional when using --diff)
+  *                  Defaults to patterns from fusion-ai.config.ts if not provided
+  *
+  * Options:
+  *   --dry-run          Show what would be processed without actually doing it
+  *   --config <config>  Path to a config file (default: fusion-ai.config)
+  *   --diff             Process only changed files (workflow mode)
+  *   --base-ref <ref>   Git reference to compare against (default: HEAD~1)
+  *   --clean            Delete all existing documents from the vector store before processing
+  *
+  * AI Options (required):
+  *   --openai-api-key <key>                Azure OpenAI API key (or AZURE_OPENAI_API_KEY env var)
+  *   --openai-api-version <version>        Azure OpenAI API version (default: 2024-02-15-preview)
+  *   --openai-instance <name>              Azure OpenAI instance name (or AZURE_OPENAI_INSTANCE_NAME env var)
+  *   --openai-embedding-deployment <name>  Azure OpenAI embedding deployment name (or AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME env var)
+  *   --azure-search-endpoint <url>         Azure Search endpoint URL (or AZURE_SEARCH_ENDPOINT env var)
+  *   --azure-search-api-key <key>          Azure Search API key (or AZURE_SEARCH_API_KEY env var)
+  *   --azure-search-index-name <name>      Azure Search index name (or AZURE_SEARCH_INDEX_NAME env var)
+  *
+  * Examples:
+  * $ ffc ai embeddings --dry-run ./src
+  * $ ffc ai embeddings "*.ts" "*.md" "*.mdx"
+  * $ ffc ai embeddings --diff
+  * $ ffc ai embeddings --diff --base-ref origin/main
+  * $ ffc ai embeddings --clean "*.ts"
+  */
+ const _command = createCommand('embeddings')
+   .description('Document embedding utilities for Large Language Model processing')
+   .addOption(
+     createOption('--dry-run', 'Show what would be processed without actually doing it').default(
+       false,
+     ),
+   )
+   .addOption(createOption('--config <config>', 'Path to a config file').default('fusion-ai.config'))
+   .addOption(createOption('--diff', 'Process only changed files (workflow mode)').default(false))
+   .addOption(createOption('--base-ref <ref>', 'Git reference to compare against').default('HEAD~1'))
+   .addOption(
+     createOption(
+       '--clean',
+       'Delete all existing documents from the vector store before processing',
+     ).default(false),
+   )
+   .argument('[glob-patterns...]', 'Glob patterns to match files (optional when using --diff)')
+   .action(async (patterns: string[], commandOptions: CommandOptions) => {
+     const options = await CommandOptionsSchema.parseAsync(commandOptions);
+
+     // Load configuration
+     const config = await loadFusionAIConfig<FusionAIConfigWithIndex>(options.config, {
+       baseDir: process.cwd(),
+     });
+
+     // CLI args take precedence over config patterns
+     const indexConfig = config.index ?? {};
+     const allowedFilePatterns = indexConfig.patterns ?? ['**/*.ts', '**/*.md', '**/*.mdx'];
+     const filePatterns = patterns.length ? patterns : allowedFilePatterns;
+
+     // Initialize framework
+     const framework = await setupFramework(options);
+
+     // Execute embeddings bin with framework and options
+     await embed({
+       framework,
+       options,
+       config,
+       filePatterns,
+     });
+   });
+
+ export const command = withAiOptions(_command, {
+   includeEmbedding: true,
+   includeSearch: true,
+ });
+
+ export default command;
package/src/config.ts ADDED
@@ -0,0 +1,39 @@
+ import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
+ import type { FusionAIConfig } from '@equinor/fusion-framework-cli-plugin-ai-base';
+
+ /**
+  * Index-specific configuration for Fusion AI operations
+  */
+ export interface IndexConfig {
+   patterns?: string[];
+   /** Files will be processed as-is, without any chunking or transformation */
+   rawPatterns?: string[];
+   /** Globby patterns to ignore; only used when providing paths to the command */
+   ignore?: string[];
+   /** Metadata processing configuration */
+   metadata?: {
+     /** Automatically resolve package information from source file paths */
+     resolvePackage?: boolean;
+     resolveGit?: boolean;
+     /** Custom metadata processors to transform metadata before embedding */
+     attributeProcessor?: (
+       metadata: Record<string, unknown>,
+       document: VectorStoreDocument,
+     ) => Record<string, unknown>;
+   };
+
+   /** Embedding generation configuration */
+   embedding?: {
+     /** Size of text chunks for embedding */
+     chunkSize?: number;
+     /** Overlap between chunks */
+     chunkOverlap?: number;
+   };
+ }
+
+ /**
+  * Extended Fusion AI configuration with index-specific settings
+  */
+ export interface FusionAIConfigWithIndex extends FusionAIConfig {
+   index?: IndexConfig;
+ }
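A hedged example of a fusion-ai.config.ts exercising these settings, assuming configureFusionAI (re-exported from this package) accepts a FusionAIConfigWithIndex-shaped object; the concrete values are illustrative:

    import { configureFusionAI } from '@equinor/fusion-framework-cli-plugin-ai-index';

    export default configureFusionAI({
        index: {
            patterns: ['**/*.ts', '**/*.md', '**/*.mdx'],
            rawPatterns: ['**/CHANGELOG.md'], // indexed verbatim, no chunking
            ignore: ['node_modules', '**/dist/**'],
            metadata: { resolvePackage: true, resolveGit: true },
            embedding: { chunkSize: 1000, chunkOverlap: 200 }, // hypothetical sizes
        },
    });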
package/src/index.ts ADDED
@@ -0,0 +1,19 @@
+ import type { Command } from 'commander';
+ import { registerAiPlugin as registerAiPluginBase } from '@equinor/fusion-framework-cli-plugin-ai-base';
+ import { command as embeddingsCommand } from './command.js';
+
+ /**
+  * Registers the AI index plugin command with the CLI program
+  * @param program - The Commander program instance to register commands with
+  */
+ export function registerAiPlugin(program: Command): void {
+   registerAiPluginBase(program, embeddingsCommand);
+ }
+
+ export default registerAiPlugin;
+
+ // Re-export config utilities for convenience
+ export {
+   configureFusionAI,
+   type FusionAIConfig,
+ } from '@equinor/fusion-framework-cli-plugin-ai-base';
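Wiring the plugin into a host CLI is then a single call; a minimal sketch with a plain commander program:

    import { program } from 'commander';
    import registerAiPlugin from '@equinor/fusion-framework-cli-plugin-ai-index';

    registerAiPlugin(program); // registers the embeddings command via the base ai plugin
    program.parse(process.argv);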
package/src/utils/generate-chunk-id.ts ADDED
@@ -0,0 +1,17 @@
+ /**
+  * Generates a unique identifier for a document chunk based on file path
+  * Creates a deterministic, URL-safe hash from the file path for validation and checks
+  * @param filePath - The file path to generate an ID from
+  * @param chunkIndex - Optional chunk index to append for multi-chunk documents
+  * @returns A base64-encoded hash of the file path, optionally suffixed with chunk index
+  */
+ export const generateChunkId = (filePath: string, chunkIndex?: number): string => {
+   // Convert file path to base64 and remove non-alphanumeric characters
+   // This creates a stable, URL-safe identifier from the file path
+   // The deterministic nature allows for validation and duplicate detection
+   const pathHash = Buffer.from(filePath)
+     .toString('base64')
+     .replace(/[^a-zA-Z0-9]/g, '');
+   // Append chunk index if provided to distinguish multiple chunks from the same file
+   return chunkIndex ? `${pathHash}-${chunkIndex}` : pathHash;
+ };
+ };