@equinor/fusion-framework-cli-plugin-ai-index 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +63 -0
- package/LICENSE +21 -0
- package/README.md +114 -0
- package/dist/esm/bin/apply-metadata.js +63 -0
- package/dist/esm/bin/apply-metadata.js.map +1 -0
- package/dist/esm/bin/delete-removed-files.js +36 -0
- package/dist/esm/bin/delete-removed-files.js.map +1 -0
- package/dist/esm/bin/embed.js +196 -0
- package/dist/esm/bin/embed.js.map +1 -0
- package/dist/esm/bin/execute-pipeline.js +40 -0
- package/dist/esm/bin/execute-pipeline.js.map +1 -0
- package/dist/esm/bin/file-stream.js +22 -0
- package/dist/esm/bin/file-stream.js.map +1 -0
- package/dist/esm/bin/get-diff.js +29 -0
- package/dist/esm/bin/get-diff.js.map +1 -0
- package/dist/esm/bin/index.js +2 -0
- package/dist/esm/bin/index.js.map +1 -0
- package/dist/esm/bin/types.js +2 -0
- package/dist/esm/bin/types.js.map +1 -0
- package/dist/esm/command.js +82 -0
- package/dist/esm/command.js.map +1 -0
- package/dist/esm/command.options.js +48 -0
- package/dist/esm/command.options.js.map +1 -0
- package/dist/esm/config.js +2 -0
- package/dist/esm/config.js.map +1 -0
- package/dist/esm/index.js +13 -0
- package/dist/esm/index.js.map +1 -0
- package/dist/esm/utils/generate-chunk-id.js +18 -0
- package/dist/esm/utils/generate-chunk-id.js.map +1 -0
- package/dist/esm/utils/git/file-changes.js +196 -0
- package/dist/esm/utils/git/file-changes.js.map +1 -0
- package/dist/esm/utils/git/git-client.js +39 -0
- package/dist/esm/utils/git/git-client.js.map +1 -0
- package/dist/esm/utils/git/index.js +9 -0
- package/dist/esm/utils/git/index.js.map +1 -0
- package/dist/esm/utils/git/metadata.js +41 -0
- package/dist/esm/utils/git/metadata.js.map +1 -0
- package/dist/esm/utils/git/status.js +34 -0
- package/dist/esm/utils/git/status.js.map +1 -0
- package/dist/esm/utils/git/types.js +2 -0
- package/dist/esm/utils/git/types.js.map +1 -0
- package/dist/esm/utils/markdown/index.js +3 -0
- package/dist/esm/utils/markdown/index.js.map +1 -0
- package/dist/esm/utils/markdown/parser.js +72 -0
- package/dist/esm/utils/markdown/parser.js.map +1 -0
- package/dist/esm/utils/markdown/types.js +2 -0
- package/dist/esm/utils/markdown/types.js.map +1 -0
- package/dist/esm/utils/package-resolver.js +40 -0
- package/dist/esm/utils/package-resolver.js.map +1 -0
- package/dist/esm/utils/ts-doc/constants.js +13 -0
- package/dist/esm/utils/ts-doc/constants.js.map +1 -0
- package/dist/esm/utils/ts-doc/extractors.js +175 -0
- package/dist/esm/utils/ts-doc/extractors.js.map +1 -0
- package/dist/esm/utils/ts-doc/index.js +3 -0
- package/dist/esm/utils/ts-doc/index.js.map +1 -0
- package/dist/esm/utils/ts-doc/parser.js +37 -0
- package/dist/esm/utils/ts-doc/parser.js.map +1 -0
- package/dist/esm/utils/ts-doc/types.js +2 -0
- package/dist/esm/utils/ts-doc/types.js.map +1 -0
- package/dist/esm/utils/types.js +2 -0
- package/dist/esm/utils/types.js.map +1 -0
- package/dist/esm/version.js +3 -0
- package/dist/esm/version.js.map +1 -0
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/dist/types/bin/apply-metadata.d.ts +1 -0
- package/dist/types/bin/delete-removed-files.d.ts +1 -0
- package/dist/types/bin/embed.d.ts +1 -0
- package/dist/types/bin/execute-pipeline.d.ts +1 -0
- package/dist/types/bin/file-stream.d.ts +1 -0
- package/dist/types/bin/get-diff.d.ts +1 -0
- package/dist/types/bin/index.d.ts +1 -0
- package/dist/types/bin/types.d.ts +1 -0
- package/dist/types/command.d.ts +2 -0
- package/dist/types/command.options.d.ts +62 -0
- package/dist/types/config.d.ts +33 -0
- package/dist/types/index.d.ts +8 -0
- package/dist/types/utils/generate-chunk-id.d.ts +8 -0
- package/dist/types/utils/git/file-changes.d.ts +21 -0
- package/dist/types/utils/git/git-client.d.ts +17 -0
- package/dist/types/utils/git/index.d.ts +5 -0
- package/dist/types/utils/git/metadata.d.ts +7 -0
- package/dist/types/utils/git/status.d.ts +12 -0
- package/dist/types/utils/git/types.d.ts +33 -0
- package/dist/types/utils/markdown/index.d.ts +2 -0
- package/dist/types/utils/markdown/parser.d.ts +21 -0
- package/dist/types/utils/markdown/types.d.ts +11 -0
- package/dist/types/utils/package-resolver.d.ts +14 -0
- package/dist/types/utils/ts-doc/constants.d.ts +5 -0
- package/dist/types/utils/ts-doc/extractors.d.ts +28 -0
- package/dist/types/utils/ts-doc/index.d.ts +2 -0
- package/dist/types/utils/ts-doc/parser.d.ts +23 -0
- package/dist/types/utils/ts-doc/types.d.ts +20 -0
- package/dist/types/utils/types.d.ts +17 -0
- package/dist/types/version.d.ts +1 -0
- package/package.json +72 -0
- package/src/bin/apply-metadata.ts +77 -0
- package/src/bin/delete-removed-files.ts +49 -0
- package/src/bin/embed.ts +262 -0
- package/src/bin/execute-pipeline.ts +48 -0
- package/src/bin/file-stream.ts +34 -0
- package/src/bin/get-diff.ts +33 -0
- package/src/bin/index.ts +1 -0
- package/src/bin/types.ts +48 -0
- package/src/command.options.ts +58 -0
- package/src/command.ts +100 -0
- package/src/config.ts +39 -0
- package/src/index.ts +19 -0
- package/src/utils/generate-chunk-id.ts +17 -0
- package/src/utils/git/file-changes.ts +213 -0
- package/src/utils/git/git-client.ts +43 -0
- package/src/utils/git/index.ts +19 -0
- package/src/utils/git/metadata.ts +47 -0
- package/src/utils/git/status.ts +48 -0
- package/src/utils/git/types.ts +36 -0
- package/src/utils/markdown/index.ts +5 -0
- package/src/utils/markdown/parser.ts +92 -0
- package/src/utils/markdown/types.ts +20 -0
- package/src/utils/package-resolver.ts +44 -0
- package/src/utils/ts-doc/constants.ts +13 -0
- package/src/utils/ts-doc/extractors.ts +246 -0
- package/src/utils/ts-doc/index.ts +5 -0
- package/src/utils/ts-doc/parser.ts +51 -0
- package/src/utils/ts-doc/types.ts +26 -0
- package/src/utils/types.ts +18 -0
- package/src/version.ts +2 -0
- package/tsconfig.json +27 -0
- package/vitest.config.ts +14 -0
package/src/bin/embed.ts
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
import { globbyStream } from 'globby';
|
|
2
|
+
import { relative } from 'node:path';
|
|
3
|
+
import multimatch from 'multimatch';
|
|
4
|
+
import { concat, from, merge } from 'rxjs';
|
|
5
|
+
import { concatMap, filter, map, mergeMap, shareReplay, toArray } from 'rxjs/operators';
|
|
6
|
+
|
|
7
|
+
import { isMarkdownFile, parseMarkdownFile } from '../utils/markdown/index.js';
|
|
8
|
+
import { getFileStatus, resolveProjectRoot } from '../utils/git/index.js';
|
|
9
|
+
import { isTypescriptFile, parseTsDocFromFileSync } from '../utils/ts-doc/index.js';
|
|
10
|
+
|
|
11
|
+
import { getDiff } from './get-diff.js';
|
|
12
|
+
import { createDeleteRemovedFilesStream } from './delete-removed-files.js';
|
|
13
|
+
import { applyMetadata } from './apply-metadata.js';
|
|
14
|
+
import type {
|
|
15
|
+
DocumentEntry,
|
|
16
|
+
EmbeddingsBinOptions,
|
|
17
|
+
ProcessedFile,
|
|
18
|
+
UpdateVectorStoreResult,
|
|
19
|
+
} from './types.js';
|
|
20
|
+
import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
|
|
21
|
+
import { readFileSync } from 'node:fs';
|
|
22
|
+
import { generateChunkId } from '../utils/generate-chunk-id.js';
|
|
23
|
+
|
|
24
|
+
/**
 * Default directories to skip before expensive git operations.
 * These are common build artifacts and dependencies that should be ignored.
 * Passed to globby's `ignore` option when the config does not supply its own
 * `index.ignore` list (see `embed`).
 * @internal
 */
const defaultIgnore = ['node_modules', '**/node_modules/**', 'dist', '**/dist/**', '.git'];
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* Main entry point for the embeddings bin.
|
|
33
|
+
* Orchestrates the entire embeddings generation pipeline.
|
|
34
|
+
* @internal
|
|
35
|
+
*/
|
|
36
|
+
export async function embed(binOptions: EmbeddingsBinOptions): Promise<void> {
|
|
37
|
+
const { framework, options, config, filePatterns } = binOptions;
|
|
38
|
+
|
|
39
|
+
// Handle clean operation (destructive - deletes all existing documents)
|
|
40
|
+
const vectorStoreService = framework.ai.getService('search', options.azureSearchIndexName);
|
|
41
|
+
if (options.clean && !options.dryRun) {
|
|
42
|
+
console.log('🧹 Cleaning vector store: deleting all existing documents...');
|
|
43
|
+
// OData filter: delete all documents with non-empty source (all indexed docs)
|
|
44
|
+
await vectorStoreService.deleteDocuments({
|
|
45
|
+
filter: { filterExpression: "metadata/source ne ''" },
|
|
46
|
+
});
|
|
47
|
+
console.log('✅ Vector store cleaned successfully');
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// Handle diff-based processing (workflow mode)
|
|
51
|
+
const changedFiles = options.diff ? await getDiff(options) : [];
|
|
52
|
+
|
|
53
|
+
// Create file stream: diff mode uses git changes, normal mode uses globby
|
|
54
|
+
const files$ = (() => {
|
|
55
|
+
if (options.diff) {
|
|
56
|
+
return from(changedFiles);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
// Directories to skip before expensive git operations.
|
|
60
|
+
// Note: Even with gitignore: true, globby still traverses ignored directories when .gitignore
|
|
61
|
+
// contains negation patterns (like !.yarn/releases), so we add explicit ignore patterns
|
|
62
|
+
// to prevent traversing these directories entirely.
|
|
63
|
+
const ignore = config.index?.ignore ?? defaultIgnore;
|
|
64
|
+
|
|
65
|
+
return from(
|
|
66
|
+
globbyStream(filePatterns, {
|
|
67
|
+
ignore,
|
|
68
|
+
onlyFiles: true,
|
|
69
|
+
gitignore: true,
|
|
70
|
+
absolute: true,
|
|
71
|
+
}),
|
|
72
|
+
).pipe(
|
|
73
|
+
// Get git status concurrently, then flatten array results
|
|
74
|
+
mergeMap((path) => getFileStatus(path)),
|
|
75
|
+
concatMap((files) => from(files)),
|
|
76
|
+
// Share stream for multiple subscribers (removedFiles$ and indexFiles$)
|
|
77
|
+
shareReplay({ refCount: true }),
|
|
78
|
+
);
|
|
79
|
+
})();
|
|
80
|
+
|
|
81
|
+
// Process files: enrich with metadata and filter by allowed patterns
|
|
82
|
+
const allowedFilePatterns = config.index?.patterns ?? [
|
|
83
|
+
'**/*.ts',
|
|
84
|
+
'**/*.tsx',
|
|
85
|
+
'**/*.md',
|
|
86
|
+
'**/*.mdx',
|
|
87
|
+
];
|
|
88
|
+
|
|
89
|
+
// Process files: enrich with metadata and filter by allowed patterns
|
|
90
|
+
const processedFiles$ = files$.pipe(
|
|
91
|
+
map((file) => {
|
|
92
|
+
const { filepath, status } = file;
|
|
93
|
+
const projectRoot = resolveProjectRoot(filepath);
|
|
94
|
+
const relativePath = projectRoot ? relative(projectRoot, filepath) : filepath;
|
|
95
|
+
|
|
96
|
+
return {
|
|
97
|
+
path: filepath,
|
|
98
|
+
status,
|
|
99
|
+
projectRoot,
|
|
100
|
+
relativePath,
|
|
101
|
+
};
|
|
102
|
+
}),
|
|
103
|
+
filter((file) => {
|
|
104
|
+
const matches = multimatch(file.relativePath, allowedFilePatterns);
|
|
105
|
+
return matches.length > 0;
|
|
106
|
+
}),
|
|
107
|
+
// Share for multiple subscribers (removedFiles$, markdown$, typescript$)
|
|
108
|
+
shareReplay({ refCount: true }),
|
|
109
|
+
);
|
|
110
|
+
|
|
111
|
+
// Split stream: removed files for deletion, new/modified for indexing
|
|
112
|
+
const removedFiles$ = processedFiles$.pipe(filter((file) => file.status === 'removed'));
|
|
113
|
+
|
|
114
|
+
// Create processing streams
|
|
115
|
+
const delete$ = createDeleteRemovedFilesStream(removedFiles$, framework, options);
|
|
116
|
+
|
|
117
|
+
// New/modified files for indexing
|
|
118
|
+
const indexFiles$ = processedFiles$.pipe(
|
|
119
|
+
filter((file) => file.status === 'new' || file.status === 'modified'),
|
|
120
|
+
// Share for markdown$ and typescript$ pipelines
|
|
121
|
+
shareReplay({ refCount: true }),
|
|
122
|
+
);
|
|
123
|
+
|
|
124
|
+
const isRawFile = (file: ProcessedFile): boolean => {
|
|
125
|
+
const matches = multimatch(file.relativePath, config.index?.rawPatterns ?? []);
|
|
126
|
+
if (matches.length > 0) {
|
|
127
|
+
return true;
|
|
128
|
+
}
|
|
129
|
+
return false;
|
|
130
|
+
};
|
|
131
|
+
|
|
132
|
+
const rawFiles$ = indexFiles$.pipe(
|
|
133
|
+
filter(isRawFile),
|
|
134
|
+
map((file): DocumentEntry => {
|
|
135
|
+
const document: VectorStoreDocument = {
|
|
136
|
+
id: generateChunkId(file.relativePath),
|
|
137
|
+
pageContent: readFileSync(file.path, 'utf8'),
|
|
138
|
+
metadata: {
|
|
139
|
+
source: file.relativePath,
|
|
140
|
+
type: 'raw',
|
|
141
|
+
},
|
|
142
|
+
};
|
|
143
|
+
return { status: file.status, documents: [document] };
|
|
144
|
+
}),
|
|
145
|
+
);
|
|
146
|
+
|
|
147
|
+
const markdown$ = indexFiles$.pipe(
|
|
148
|
+
filter((x) => !isRawFile(x)),
|
|
149
|
+
filter((file) => isMarkdownFile(file.path)),
|
|
150
|
+
mergeMap(async (file) => {
|
|
151
|
+
const documents = await parseMarkdownFile(file);
|
|
152
|
+
return { status: file.status, documents };
|
|
153
|
+
}),
|
|
154
|
+
);
|
|
155
|
+
|
|
156
|
+
const typescript$ = indexFiles$.pipe(
|
|
157
|
+
filter((x) => !isRawFile(x)),
|
|
158
|
+
filter((file) => isTypescriptFile(file.path)),
|
|
159
|
+
map((file) => {
|
|
160
|
+
const documents = parseTsDocFromFileSync(file);
|
|
161
|
+
return { status: file.status, documents };
|
|
162
|
+
}),
|
|
163
|
+
);
|
|
164
|
+
|
|
165
|
+
// Apply metadata to documents
|
|
166
|
+
const applyMetadata$ = applyMetadata(merge(rawFiles$, markdown$, typescript$), config.index);
|
|
167
|
+
|
|
168
|
+
// Generate embeddings
|
|
169
|
+
const embeddingService = framework.ai.getService('embeddings', options.openaiEmbeddingDeployment);
|
|
170
|
+
const applyEmbedding$ = applyMetadata$.pipe(
|
|
171
|
+
mergeMap((documents) =>
|
|
172
|
+
from(documents).pipe(
|
|
173
|
+
mergeMap(async (document) => {
|
|
174
|
+
console.log('embedding document', document.metadata.source);
|
|
175
|
+
const embeddings = await embeddingService
|
|
176
|
+
.embedQuery(document.pageContent)
|
|
177
|
+
.catch((error) => {
|
|
178
|
+
console.error(
|
|
179
|
+
`❌ Error: ${error instanceof Error ? error.message : 'Unknown error'}`,
|
|
180
|
+
);
|
|
181
|
+
console.error('document', document);
|
|
182
|
+
process.exit(1);
|
|
183
|
+
});
|
|
184
|
+
const metadata = { ...document.metadata, embedding: embeddings };
|
|
185
|
+
return { ...document, metadata };
|
|
186
|
+
}),
|
|
187
|
+
toArray(),
|
|
188
|
+
),
|
|
189
|
+
),
|
|
190
|
+
);
|
|
191
|
+
|
|
192
|
+
// Update vector store
|
|
193
|
+
const upsert$ = applyEmbedding$.pipe(
|
|
194
|
+
mergeMap(async (documents) => {
|
|
195
|
+
const vectorStoreService = framework.ai.getService('search', options.azureSearchIndexName);
|
|
196
|
+
if (documents.length === 0) {
|
|
197
|
+
return undefined;
|
|
198
|
+
}
|
|
199
|
+
for (const document of documents) {
|
|
200
|
+
console.log(`Adding entry [${document.id}] to vector store`, document.metadata.source);
|
|
201
|
+
}
|
|
202
|
+
if (!options.dryRun) {
|
|
203
|
+
// For multiple chunks from same file, delete existing chunks first
|
|
204
|
+
if (documents.length > 1) {
|
|
205
|
+
const sources = documents
|
|
206
|
+
.map((document) => document.metadata.source)
|
|
207
|
+
.reduce((acc, source) => acc.add(source), new Set<string>());
|
|
208
|
+
|
|
209
|
+
const filterExpression = Array.from(sources)
|
|
210
|
+
.map((source) => `metadata/source eq '${source}'`)
|
|
211
|
+
.join(' or ');
|
|
212
|
+
|
|
213
|
+
// Fire-and-forget deletion (not awaited) - brief gap before new docs are indexed
|
|
214
|
+
vectorStoreService.deleteDocuments({ filter: { filterExpression } });
|
|
215
|
+
}
|
|
216
|
+
await vectorStoreService.addDocuments(documents);
|
|
217
|
+
}
|
|
218
|
+
return {
|
|
219
|
+
status: 'added',
|
|
220
|
+
documents,
|
|
221
|
+
} as UpdateVectorStoreResult;
|
|
222
|
+
}),
|
|
223
|
+
filter((result): result is UpdateVectorStoreResult => Boolean(result)),
|
|
224
|
+
);
|
|
225
|
+
|
|
226
|
+
// Execute pipeline
|
|
227
|
+
// Track indexing results for reporting: deleted file paths and added document IDs
|
|
228
|
+
const indexingResults: { deleted: string[]; added: { source: string; id: string }[] } = {
|
|
229
|
+
deleted: [],
|
|
230
|
+
added: [],
|
|
231
|
+
};
|
|
232
|
+
|
|
233
|
+
// Execute pipeline: concat ensures deletions happen before additions
|
|
234
|
+
// This subscription triggers lazy RxJS execution and tracks all results
|
|
235
|
+
concat(delete$, upsert$).subscribe({
|
|
236
|
+
next: (result) => {
|
|
237
|
+
// Track deleted files by relative path
|
|
238
|
+
if (result.status === 'deleted') {
|
|
239
|
+
indexingResults.deleted.push(...result.files.map((file) => file.relativePath));
|
|
240
|
+
}
|
|
241
|
+
// Track added documents with source and ID (one file can produce multiple IDs)
|
|
242
|
+
else if (result.status === 'added') {
|
|
243
|
+
indexingResults.added.push(
|
|
244
|
+
...result.documents.map((document) => ({
|
|
245
|
+
source: document.metadata.source,
|
|
246
|
+
id: document.id,
|
|
247
|
+
})),
|
|
248
|
+
);
|
|
249
|
+
}
|
|
250
|
+
},
|
|
251
|
+
error: (error) => {
|
|
252
|
+
console.error(`❌ Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
253
|
+
process.exit(1);
|
|
254
|
+
},
|
|
255
|
+
complete: () => {
|
|
256
|
+
// Pipeline completed - log results and exit
|
|
257
|
+
console.log('🗂️ Indexing results:', indexingResults);
|
|
258
|
+
console.log('✅ Embeddings generation completed!');
|
|
259
|
+
process.exit(0);
|
|
260
|
+
},
|
|
261
|
+
});
|
|
262
|
+
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { concat } from 'rxjs';
|
|
2
|
+
import type { DeleteRemovedFilesResult, UpdateVectorStoreResult } from './types.js';
|
|
3
|
+
import type { Observable } from 'rxjs';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Executes the pipeline and tracks results.
|
|
7
|
+
* @internal
|
|
8
|
+
*/
|
|
9
|
+
export function executePipeline(
|
|
10
|
+
deleteRemovedFiles$: Observable<DeleteRemovedFilesResult>,
|
|
11
|
+
updateVectorStore$: Observable<UpdateVectorStoreResult>,
|
|
12
|
+
): void {
|
|
13
|
+
// Track indexing results for reporting: deleted file paths and added document IDs
|
|
14
|
+
const indexingResults: { deleted: string[]; added: { source: string; id: string }[] } = {
|
|
15
|
+
deleted: [],
|
|
16
|
+
added: [],
|
|
17
|
+
};
|
|
18
|
+
|
|
19
|
+
// Execute pipeline: concat ensures deletions happen before additions
|
|
20
|
+
// This subscription triggers lazy RxJS execution and tracks all results
|
|
21
|
+
concat(deleteRemovedFiles$, updateVectorStore$).subscribe({
|
|
22
|
+
next: (result) => {
|
|
23
|
+
// Track deleted files by relative path
|
|
24
|
+
if (result.status === 'deleted') {
|
|
25
|
+
indexingResults.deleted.push(...result.files.map((file) => file.relativePath));
|
|
26
|
+
}
|
|
27
|
+
// Track added documents with source and ID (one file can produce multiple IDs)
|
|
28
|
+
else if (result.status === 'added') {
|
|
29
|
+
indexingResults.added.push(
|
|
30
|
+
...result.documents.map((document) => ({
|
|
31
|
+
source: document.metadata.source,
|
|
32
|
+
id: document.id,
|
|
33
|
+
})),
|
|
34
|
+
);
|
|
35
|
+
}
|
|
36
|
+
},
|
|
37
|
+
error: (error) => {
|
|
38
|
+
console.error(`❌ Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
39
|
+
process.exit(1);
|
|
40
|
+
},
|
|
41
|
+
complete: () => {
|
|
42
|
+
// Pipeline completed - log results and exit
|
|
43
|
+
console.log('🗂️ Indexing results:', indexingResults);
|
|
44
|
+
console.log('✅ Embeddings generation completed!');
|
|
45
|
+
process.exit(0);
|
|
46
|
+
},
|
|
47
|
+
});
|
|
48
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import { globbyStream } from 'globby';
|
|
2
|
+
import { from, mergeMap, concatMap, shareReplay } from 'rxjs';
|
|
3
|
+
import type { Observable } from 'rxjs';
|
|
4
|
+
import { getFileStatus } from '../utils/git/index.js';
|
|
5
|
+
import type { ChangedFile } from '../utils/git/index.js';
|
|
6
|
+
import type { CommandOptions } from '../command.options.js';
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Creates a file stream based on diff mode or glob patterns.
|
|
10
|
+
* @internal
|
|
11
|
+
*/
|
|
12
|
+
export function createFileStream(
|
|
13
|
+
options: CommandOptions,
|
|
14
|
+
changedFiles: ChangedFile[],
|
|
15
|
+
filePatterns: string[],
|
|
16
|
+
): Observable<ChangedFile> {
|
|
17
|
+
if (options.diff) {
|
|
18
|
+
return from(changedFiles);
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
return from(
|
|
22
|
+
globbyStream(filePatterns, {
|
|
23
|
+
onlyFiles: true,
|
|
24
|
+
gitignore: true,
|
|
25
|
+
absolute: true,
|
|
26
|
+
}),
|
|
27
|
+
).pipe(
|
|
28
|
+
// Get git status concurrently, then flatten array results
|
|
29
|
+
mergeMap((path) => getFileStatus(path)),
|
|
30
|
+
concatMap((files) => from(files)),
|
|
31
|
+
// Share stream for multiple subscribers (removedFiles$ and indexFiles$)
|
|
32
|
+
shareReplay({ refCount: true }),
|
|
33
|
+
);
|
|
34
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import type { ChangedFile } from '../utils/git/index.js';
|
|
2
|
+
import { getChangedFiles, getGitStatus } from '../utils/git/index.js';
|
|
3
|
+
import type { CommandOptions } from '../command.options.js';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Handles diff-based processing to get changed files from git.
|
|
7
|
+
* @internal
|
|
8
|
+
*/
|
|
9
|
+
export async function getDiff(options: CommandOptions): Promise<ChangedFile[]> {
|
|
10
|
+
try {
|
|
11
|
+
// Get current git status for informational output
|
|
12
|
+
const gitStatus = await getGitStatus();
|
|
13
|
+
console.log(`🔍 Git status: ${gitStatus.branch}@${gitStatus.commit}`);
|
|
14
|
+
console.log(`📊 Changes: ${gitStatus.stagedFiles} staged, ${gitStatus.unstagedFiles} unstaged`);
|
|
15
|
+
|
|
16
|
+
// Get changed files compared to base reference (default: HEAD~1)
|
|
17
|
+
const changedFiles = await getChangedFiles({
|
|
18
|
+
diff: options.diff,
|
|
19
|
+
baseRef: options.baseRef,
|
|
20
|
+
});
|
|
21
|
+
|
|
22
|
+
if (changedFiles.length === 0) {
|
|
23
|
+
console.log('✅ No changed files match the provided patterns. Nothing to process.');
|
|
24
|
+
process.exit(0);
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
console.log(`📝 Found ${changedFiles.length} changed files matching patterns`);
|
|
28
|
+
return changedFiles;
|
|
29
|
+
} catch (error) {
|
|
30
|
+
console.error(`❌ Git diff error: ${error instanceof Error ? error.message : 'Unknown error'}`);
|
|
31
|
+
process.exit(1);
|
|
32
|
+
}
|
|
33
|
+
}
|
package/src/bin/index.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { embed } from './embed.js';
|
package/src/bin/types.ts
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
|
|
2
|
+
import type { ChangedFile } from '../utils/git/index.js';
|
|
3
|
+
import type { CommandOptions } from '../command.options.js';
|
|
4
|
+
import type { FrameworkInstance } from '@equinor/fusion-framework-cli-plugin-ai-base';
|
|
5
|
+
import type { FusionAIConfigWithIndex } from '../config.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Result of updating the vector store with new documents.
 * @internal
 */
export type UpdateVectorStoreResult = { status: 'added'; documents: VectorStoreDocument[] };

/**
 * Result of deleting removed files from the vector store.
 * @internal
 */
export type DeleteRemovedFilesResult = { status: 'deleted'; files: { relativePath: string }[] };

/**
 * File with enriched metadata for processing.
 * @internal
 */
export type ProcessedFile = {
  /** Absolute path to the file on disk */
  path: string;
  /** Git change status (e.g. 'new' | 'modified' | 'removed') */
  status: ChangedFile['status'];
  /** Root of the enclosing project, if one could be resolved */
  projectRoot: string | undefined;
  /** Path relative to projectRoot; falls back to the absolute path when no root is found */
  relativePath: string;
};

/**
 * Document entry with status for processing.
 * @internal
 */
export type DocumentEntry = {
  /** Change status of the originating file */
  status: ChangedFile['status'];
  /** Chunk documents produced from the file (one or more per file) */
  documents: VectorStoreDocument[];
};

/**
 * Options for the embeddings bin.
 * @internal
 */
export interface EmbeddingsBinOptions {
  /** Initialized Fusion framework instance providing the AI services */
  framework: FrameworkInstance;
  /** Validated CLI options for the embeddings command */
  options: CommandOptions;
  /** Loaded fusion-ai configuration, including the optional `index` section */
  config: FusionAIConfigWithIndex;
  /** Glob patterns used to collect files when not running in diff mode */
  filePatterns: string[];
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
|
|
3
|
+
import { AiOptionsSchema } from '@equinor/fusion-framework-cli-plugin-ai-base/command-options';
|
|
4
|
+
|
|
5
|
+
/**
 * Zod schema for validating command options for the embeddings command.
 *
 * This schema extends the base AI options schema with embeddings-specific options,
 * ensuring type safety and runtime validation of command arguments.
 *
 * Note: Some optional AI options become required for the embeddings command
 * (openaiEmbeddingDeployment, azureSearchEndpoint, azureSearchApiKey, azureSearchIndexName)
 * because the command uses withAiOptions with includeEmbedding and includeSearch set to true.
 */
export const CommandOptionsSchema = AiOptionsSchema.extend({
  // Override optional AI options to make them required for embeddings command
  openaiEmbeddingDeployment: z
    .string({ message: 'Embedding deployment name is required for embeddings command.' })
    .min(1, 'Embedding deployment name must be a non-empty string.')
    .describe('Azure OpenAI embedding deployment name'),
  azureSearchEndpoint: z
    .string({ message: 'Azure Search endpoint is required for embeddings command.' })
    .url('Azure Search endpoint must be a valid URL.')
    .min(1, 'Azure Search endpoint must be a non-empty string.')
    .describe('Azure Search endpoint URL'),
  azureSearchApiKey: z
    .string({ message: 'Azure Search API key is required for embeddings command.' })
    .min(1, 'Azure Search API key must be a non-empty string.')
    .describe('Azure Search API key'),
  azureSearchIndexName: z
    .string({ message: 'Azure Search index name is required for embeddings command.' })
    .min(1, 'Azure Search index name must be a non-empty string.')
    .describe('Azure Search index name'),

  // Embeddings-specific options
  dryRun: z
    .boolean({ message: 'dryRun must be a boolean value.' })
    .describe('Show what would be processed without actually doing it'),
  config: z
    .string({ message: 'Config file path is required and must be a non-empty string.' })
    .min(1, 'Config file path must be a non-empty string.')
    .describe('Path to a config file'),
  diff: z
    .boolean({ message: 'diff must be a boolean value.' })
    .describe('Process only changed files (workflow mode)'),
  // Optional here: the CLI option definition supplies the 'HEAD~1' default.
  baseRef: z.string().min(1).optional().describe('Git reference to compare against'),
  clean: z
    .boolean({ message: 'clean must be a boolean value.' })
    .describe('Delete all existing documents from the vector store before processing'),
}).describe('Command options for the embeddings command');

/**
 * Type representing the validated command options.
 *
 * This type is inferred from the Zod schema and should be used throughout the command
 * to ensure type safety and consistency with the schema.
 */
export type CommandOptions = z.infer<typeof CommandOptionsSchema>;
|
package/src/command.ts
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import { createCommand, createOption } from 'commander';
|
|
2
|
+
|
|
3
|
+
import { loadFusionAIConfig, setupFramework } from '@equinor/fusion-framework-cli-plugin-ai-base';
|
|
4
|
+
import { withOptions as withAiOptions } from '@equinor/fusion-framework-cli-plugin-ai-base/command-options';
|
|
5
|
+
|
|
6
|
+
import { embed } from './bin/embed.js';
|
|
7
|
+
import { CommandOptionsSchema, type CommandOptions } from './command.options.js';
|
|
8
|
+
import type { FusionAIConfigWithIndex } from './config.js';
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* CLI command: `ai embeddings`
|
|
12
|
+
*
|
|
13
|
+
* Document embedding utilities for Large Language Model processing.
|
|
14
|
+
*
|
|
15
|
+
* Features:
|
|
16
|
+
* - Markdown/MDX document chunking with frontmatter extraction
|
|
17
|
+
* - TypeScript/TSX TSDoc extraction and chunking
|
|
18
|
+
* - Glob pattern support for file collection
|
|
19
|
+
* - Git diff-based processing for workflow integration
|
|
20
|
+
* - Dry-run mode for testing without actual processing
|
|
21
|
+
* - Configurable file patterns via fusion-ai.config.ts
|
|
22
|
+
*
|
|
23
|
+
* Usage:
|
|
24
|
+
* $ ffc ai embeddings [options] [glob-patterns...]
|
|
25
|
+
*
|
|
26
|
+
* Arguments:
|
|
27
|
+
* glob-patterns Glob patterns to match files (optional when using --diff)
|
|
28
|
+
* Defaults to patterns from fusion-ai.config.ts if not provided
|
|
29
|
+
*
|
|
30
|
+
* Options:
|
|
31
|
+
* --dry-run Show what would be processed without actually doing it
|
|
32
|
+
* --config <config> Path to a config file (default: fusion-ai.config)
|
|
33
|
+
* --diff Process only changed files (workflow mode)
|
|
34
|
+
* --base-ref <ref> Git reference to compare against (default: HEAD~1)
|
|
35
|
+
* --clean Delete all existing documents from the vector store before processing
|
|
36
|
+
*
|
|
37
|
+
* AI Options (required):
|
|
38
|
+
* --openai-api-key <key> Azure OpenAI API key (or AZURE_OPENAI_API_KEY env var)
|
|
39
|
+
* --openai-api-version <version> Azure OpenAI API version (default: 2024-02-15-preview)
|
|
40
|
+
* --openai-instance <name> Azure OpenAI instance name (or AZURE_OPENAI_INSTANCE_NAME env var)
|
|
41
|
+
* --openai-embedding-deployment <name> Azure OpenAI embedding deployment name (or AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME env var)
|
|
42
|
+
* --azure-search-endpoint <url> Azure Search endpoint URL (or AZURE_SEARCH_ENDPOINT env var)
|
|
43
|
+
* --azure-search-api-key <key> Azure Search API key (or AZURE_SEARCH_API_KEY env var)
|
|
44
|
+
* --azure-search-index-name <name> Azure Search index name (or AZURE_SEARCH_INDEX_NAME env var)
|
|
45
|
+
*
|
|
46
|
+
* Examples:
|
|
47
|
+
* $ ffc ai embeddings --dry-run ./src
|
|
48
|
+
* $ ffc ai embeddings "*.ts" "*.md" "*.mdx"
|
|
49
|
+
* $ ffc ai embeddings --diff
|
|
50
|
+
* $ ffc ai embeddings --diff --base-ref origin/main
|
|
51
|
+
* $ ffc ai embeddings --clean "*.ts"
|
|
52
|
+
*/
|
|
53
|
+
const _command = createCommand('embeddings')
|
|
54
|
+
.description('Document embedding utilities for Large Language Model processing')
|
|
55
|
+
.addOption(
|
|
56
|
+
createOption('--dry-run', 'Show what would be processed without actually doing it').default(
|
|
57
|
+
false,
|
|
58
|
+
),
|
|
59
|
+
)
|
|
60
|
+
.addOption(createOption('--config <config>', 'Path to a config file').default('fusion-ai.config'))
|
|
61
|
+
.addOption(createOption('--diff', 'Process only changed files (workflow mode)').default(false))
|
|
62
|
+
.addOption(createOption('--base-ref <ref>', 'Git reference to compare against').default('HEAD~1'))
|
|
63
|
+
.addOption(
|
|
64
|
+
createOption(
|
|
65
|
+
'--clean',
|
|
66
|
+
'Delete all existing documents from the vector store before processing',
|
|
67
|
+
).default(false),
|
|
68
|
+
)
|
|
69
|
+
.argument('[glob-patterns...]', 'Glob patterns to match files (optional when using --diff)')
|
|
70
|
+
.action(async (patterns: string[], commandOptions: CommandOptions) => {
|
|
71
|
+
const options = await CommandOptionsSchema.parseAsync(commandOptions);
|
|
72
|
+
|
|
73
|
+
// Load configuration
|
|
74
|
+
const config = await loadFusionAIConfig<FusionAIConfigWithIndex>(options.config, {
|
|
75
|
+
baseDir: process.cwd(),
|
|
76
|
+
});
|
|
77
|
+
|
|
78
|
+
// CLI args take precedence over config patterns
|
|
79
|
+
const indexConfig = config.index ?? {};
|
|
80
|
+
const allowedFilePatterns = indexConfig.patterns ?? ['**/*.ts', '**/*.md', '**/*.mdx'];
|
|
81
|
+
const filePatterns = patterns.length ? patterns : allowedFilePatterns;
|
|
82
|
+
|
|
83
|
+
// Initialize framework
|
|
84
|
+
const framework = await setupFramework(options);
|
|
85
|
+
|
|
86
|
+
// Execute embeddings bin with framework and options
|
|
87
|
+
await embed({
|
|
88
|
+
framework,
|
|
89
|
+
options,
|
|
90
|
+
config,
|
|
91
|
+
filePatterns,
|
|
92
|
+
});
|
|
93
|
+
});
|
|
94
|
+
|
|
95
|
+
/**
 * The public `embeddings` command: the base command augmented with the shared
 * AI option set. Both flag groups are enabled — `includeEmbedding` (Azure
 * OpenAI embedding flags) and `includeSearch` (Azure Search flags), matching
 * the options listed in the command's header documentation.
 */
export const command = withAiOptions(_command, {
  includeEmbedding: true,
  includeSearch: true,
});

export default command;
|
package/src/config.ts
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
|
|
2
|
+
import type { FusionAIConfig } from '@equinor/fusion-framework-cli-plugin-ai-base';
|
|
3
|
+
|
|
4
|
+
/**
 * Index-specific configuration for Fusion AI operations.
 *
 * Controls which files are selected for embedding and how their metadata
 * and chunking are handled.
 */
export interface IndexConfig {
  /** Globby patterns selecting files to index; the command falls back to its own defaults when omitted */
  patterns?: string[];
  /** Files will be processed as is, without any chunking or transformation */
  rawPatterns?: string[];
  /** Globby patterns to ignore, only used when providing paths to the command */
  ignore?: string[];
  /** Metadata processing configuration */
  metadata?: {
    /** Automatically resolve package information from source file paths */
    resolvePackage?: boolean;
    /** Resolve git metadata for source files — presumably via the bundled git utilities; TODO confirm against the embed pipeline */
    resolveGit?: boolean;
    /** Custom metadata processor to transform metadata before embedding */
    attributeProcessor?: (
      metadata: Record<string, unknown>,
      document: VectorStoreDocument,
    ) => Record<string, unknown>;
  };

  /** Embedding generation configuration */
  embedding?: {
    /** Size of text chunks for embedding */
    chunkSize?: number;
    /** Overlap between chunks */
    chunkOverlap?: number;
  };
}
|
|
33
|
+
|
|
34
|
+
/**
 * Extended Fusion AI configuration with index-specific settings.
 *
 * This is the shape loaded by the `embeddings` command from the file given
 * via `--config`: the base {@link FusionAIConfig} plus an optional `index`
 * section.
 */
export interface FusionAIConfigWithIndex extends FusionAIConfig {
  /** Optional index (file selection / metadata / chunking) configuration */
  index?: IndexConfig;
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { Command } from 'commander';
|
|
2
|
+
import { registerAiPlugin as registerAiPluginBase } from '@equinor/fusion-framework-cli-plugin-ai-base';
|
|
3
|
+
import { command as embeddingsCommand } from './command.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Registers the AI index plugin command with the CLI program.
 *
 * @param program - The Commander program instance to register commands with
 */
export function registerAiPlugin(program: Command): void {
  // Delegate to the base plugin registrar, contributing the `embeddings` command.
  registerAiPluginBase(program, embeddingsCommand);
}

export default registerAiPlugin;

// Re-export config utilities from the base plugin for consumer convenience.
export {
  configureFusionAI,
  type FusionAIConfig,
} from '@equinor/fusion-framework-cli-plugin-ai-base';
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generates a unique identifier for a document chunk based on file path
|
|
3
|
+
* Creates a deterministic, URL-safe hash from the file path for validation and checks
|
|
4
|
+
* @param filePath - The file path to generate an ID from
|
|
5
|
+
* @param chunkIndex - Optional chunk index to append for multi-chunk documents
|
|
6
|
+
* @returns A base64-encoded hash of the file path, optionally suffixed with chunk index
|
|
7
|
+
*/
|
|
8
|
+
export const generateChunkId = (filePath: string, chunkIndex?: number): string => {
|
|
9
|
+
// Convert file path to base64 and remove non-alphanumeric characters
|
|
10
|
+
// This creates a stable, URL-safe identifier from the file path
|
|
11
|
+
// The deterministic nature allows for validation and duplicate detection
|
|
12
|
+
const pathHash = Buffer.from(filePath)
|
|
13
|
+
.toString('base64')
|
|
14
|
+
.replace(/[^a-zA-Z0-9]/g, '');
|
|
15
|
+
// Append chunk index if provided to distinguish multiple chunks from the same file
|
|
16
|
+
return chunkIndex ? `${pathHash}-${chunkIndex}` : pathHash;
|
|
17
|
+
};
|