@equinor/fusion-framework-cli-plugin-ai-index 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CHANGELOG.md +52 -0
  2. package/dist/esm/bin/apply-metadata.js +15 -5
  3. package/dist/esm/bin/apply-metadata.js.map +1 -1
  4. package/dist/esm/bin/apply-schema.js +64 -0
  5. package/dist/esm/bin/apply-schema.js.map +1 -0
  6. package/dist/esm/bin/apply-schema.test.js +143 -0
  7. package/dist/esm/bin/apply-schema.test.js.map +1 -0
  8. package/dist/esm/bin/delete-removed-files.js +1 -1
  9. package/dist/esm/bin/delete-removed-files.js.map +1 -1
  10. package/dist/esm/bin/embed.js +188 -47
  11. package/dist/esm/bin/embed.js.map +1 -1
  12. package/dist/esm/create-command.js +186 -0
  13. package/dist/esm/create-command.js.map +1 -0
  14. package/dist/esm/delete-command.js +14 -2
  15. package/dist/esm/delete-command.js.map +1 -1
  16. package/dist/esm/delete-command.options.js +7 -31
  17. package/dist/esm/delete-command.options.js.map +1 -1
  18. package/dist/esm/delete-index-command.js +94 -0
  19. package/dist/esm/delete-index-command.js.map +1 -0
  20. package/dist/esm/embed-command.js +30 -0
  21. package/dist/esm/embed-command.js.map +1 -0
  22. package/dist/esm/embeddings-command.js +14 -17
  23. package/dist/esm/embeddings-command.js.map +1 -1
  24. package/dist/esm/embeddings-command.options.js +12 -43
  25. package/dist/esm/embeddings-command.options.js.map +1 -1
  26. package/dist/esm/index.js +12 -3
  27. package/dist/esm/index.js.map +1 -1
  28. package/dist/esm/schema.js +41 -0
  29. package/dist/esm/schema.js.map +1 -0
  30. package/dist/esm/search-command.js +17 -5
  31. package/dist/esm/search-command.js.map +1 -1
  32. package/dist/esm/utils/embedding-dimensions.js +37 -0
  33. package/dist/esm/utils/embedding-dimensions.js.map +1 -0
  34. package/dist/esm/utils/zod-to-azure-fields.js +120 -0
  35. package/dist/esm/utils/zod-to-azure-fields.js.map +1 -0
  36. package/dist/esm/utils/zod-to-azure-fields.test.js +112 -0
  37. package/dist/esm/utils/zod-to-azure-fields.test.js.map +1 -0
  38. package/dist/esm/version.js +1 -1
  39. package/dist/tsconfig.tsbuildinfo +1 -1
  40. package/dist/types/bin/apply-metadata.d.ts +2 -1
  41. package/dist/types/bin/apply-schema.d.ts +22 -0
  42. package/dist/types/bin/apply-schema.test.d.ts +1 -0
  43. package/dist/types/config.d.ts +14 -0
  44. package/dist/types/create-command.d.ts +6 -0
  45. package/dist/types/delete-command.options.d.ts +9 -23
  46. package/dist/types/delete-index-command.d.ts +6 -0
  47. package/dist/types/embed-command.d.ts +12 -0
  48. package/dist/types/embeddings-command.options.d.ts +9 -28
  49. package/dist/types/index.d.ts +1 -0
  50. package/dist/types/schema.d.ts +137 -0
  51. package/dist/types/utils/embedding-dimensions.d.ts +13 -0
  52. package/dist/types/utils/zod-to-azure-fields.d.ts +61 -0
  53. package/dist/types/utils/zod-to-azure-fields.test.d.ts +1 -0
  54. package/dist/types/version.d.ts +1 -1
  55. package/package.json +6 -6
  56. package/src/bin/apply-metadata.ts +20 -4
  57. package/src/bin/apply-schema.test.ts +170 -0
  58. package/src/bin/apply-schema.ts +86 -0
  59. package/src/bin/delete-removed-files.ts +1 -1
  60. package/src/bin/embed.ts +248 -76
  61. package/src/config.ts +15 -0
  62. package/src/create-command.ts +218 -0
  63. package/src/delete-command.options.ts +7 -37
  64. package/src/delete-command.ts +19 -2
  65. package/src/delete-index-command.ts +121 -0
  66. package/src/embed-command.ts +44 -0
  67. package/src/embeddings-command.options.ts +12 -50
  68. package/src/embeddings-command.ts +18 -18
  69. package/src/index.ts +12 -3
  70. package/src/schema.ts +149 -0
  71. package/src/search-command.ts +22 -5
  72. package/src/utils/embedding-dimensions.ts +39 -0
  73. package/src/utils/zod-to-azure-fields.test.ts +136 -0
  74. package/src/utils/zod-to-azure-fields.ts +177 -0
  75. package/src/version.ts +1 -1
@@ -1 +1,2 @@
1
- export {};
1
+ /** Callback invoked after each document is enriched with metadata. */
2
+ export type MetadataProgressCallback = (source: string) => void;
@@ -0,0 +1,22 @@
1
+ import type { Observable } from 'rxjs';
2
+ import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
3
+ import type { IndexSchemaConfig } from '../schema.js';
4
+ /**
5
+ * Creates an RxJS operator that resolves promoted schema fields for each
6
+ * document and separates them from the generic `attributes` bag.
7
+ *
8
+ * For each document in the batch:
9
+ * 1. Runs the optional `prepareAttributes` callback to enrich attributes
10
+ * with type-safe access to schema-declared fields
11
+ * 2. Calls the schema resolver to compute promoted field values
12
+ * 3. Validates the resolved values against the Zod shape
13
+ * 4. Stores promoted fields on `metadata.schemaFields`
14
+ * 5. Removes promoted keys from `metadata.attributes` to avoid duplication
15
+ *
16
+ * When no schema is configured, the stream passes through unchanged.
17
+ *
18
+ * @param document$ - Stream of document batches from the metadata enrichment step.
19
+ * @param schema - The index schema config, if defined. When `undefined`, documents pass through unchanged.
20
+ * @returns Stream of document batches with promoted fields resolved and stored.
21
+ */
22
+ export declare function applySchema(document$: Observable<VectorStoreDocument[]>, schema: IndexSchemaConfig | undefined): Observable<VectorStoreDocument[]>;
@@ -0,0 +1 @@
1
+ export {};
@@ -1,5 +1,6 @@
1
1
  import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
2
2
  import type { FusionAIConfig } from '@equinor/fusion-framework-cli-plugin-ai-base';
3
+ import type { IndexSchemaConfig } from './schema.js';
3
4
  /**
4
5
  * Index-specific configuration for Fusion AI document indexing operations.
5
6
  *
@@ -49,7 +50,20 @@ export interface IndexConfig {
49
50
  chunkSize?: number;
50
51
  /** Number of overlapping tokens between consecutive chunks. */
51
52
  chunkOverlap?: number;
53
+ /** Explicit vector dimensions for custom embedding models not in the known model map. */
54
+ dimensions?: number;
52
55
  };
56
+ /**
57
+ * Custom index schema that promotes frequently-filtered metadata to
58
+ * top-level Azure AI Search fields.
59
+ *
60
+ * When defined, the schema resolver runs after metadata enrichment and
61
+ * places resolved values as top-level document fields in Azure Search,
62
+ * enabling direct OData filters without the `any()` operator.
63
+ *
64
+ * @see {@link IndexSchemaConfig} for details and examples.
65
+ */
66
+ schema?: IndexSchemaConfig;
53
67
  }
54
68
  /**
55
69
  * Fusion AI configuration extended with {@link IndexConfig | index-specific settings}.
@@ -0,0 +1,6 @@
1
+ import { type Command } from 'commander';
2
+ /**
3
+ * The `ai index create` command with inherited AI base options for
4
+ * authentication and service discovery.
5
+ */
6
+ export declare const createIndexCommand: Command;
@@ -1,32 +1,18 @@
1
1
  import { z } from 'zod';
2
2
  /**
3
- * Zod schema for validating options of the `ai index remove` command.
3
+ * Zod schema for the `ai index remove` command.
4
4
  *
5
- * Extends the base AI options schema ({@link AiOptionsSchema}) to require
6
- * Azure Search credentials and the embedding deployment (needed to initialise
7
- * the vector store service for document removal).
8
- *
9
- * @example
10
- * ```ts
11
- * const validated = await DeleteOptionsSchema.parseAsync(rawOptions);
12
- * // validated.dryRun, validated.filter, validated.azureSearchEndpoint, etc.
13
- * ```
5
+ * Extends the base AI options schema making `indexName` required.
14
6
  */
15
7
  export declare const DeleteOptionsSchema: z.ZodObject<{
16
- openaiApiKey: z.ZodString;
17
- openaiApiVersion: z.ZodString;
18
- openaiInstance: z.ZodString;
19
- openaiChatDeployment: z.ZodOptional<z.ZodString>;
20
- openaiEmbeddingDeployment: z.ZodString;
21
- azureSearchEndpoint: z.ZodString;
22
- azureSearchApiKey: z.ZodString;
23
- azureSearchIndexName: z.ZodString;
8
+ env: z.ZodOptional<z.ZodString>;
9
+ token: z.ZodOptional<z.ZodString>;
10
+ tenantId: z.ZodOptional<z.ZodString>;
11
+ clientId: z.ZodOptional<z.ZodString>;
12
+ chatModel: z.ZodOptional<z.ZodString>;
13
+ embedModel: z.ZodOptional<z.ZodString>;
14
+ indexName: z.ZodString;
24
15
  dryRun: z.ZodBoolean;
25
16
  filter: z.ZodOptional<z.ZodString>;
26
17
  }, z.core.$strip>;
27
- /**
28
- * Validated options for the `ai index remove` command.
29
- *
30
- * Inferred from {@link DeleteOptionsSchema}.
31
- */
32
18
  export type DeleteOptions = z.infer<typeof DeleteOptionsSchema>;
@@ -0,0 +1,6 @@
1
+ import { type Command } from 'commander';
2
+ /**
3
+ * The `ai index delete` command with inherited AI base options for
4
+ * authentication and service discovery.
5
+ */
6
+ export declare const deleteIndexCommand: Command;
@@ -0,0 +1,12 @@
1
+ /**
2
+ * CLI command: `ai index embed <text>`
3
+ *
4
+ * Embeds a single text string and prints the resulting vector.
5
+ * Useful for verifying the embeddings endpoint and model are reachable.
6
+ *
7
+ * @example
8
+ * ```sh
9
+ * ffc ai index embed "hello world"
10
+ * ```
11
+ */
12
+ export declare const embedCommand: import("commander").Command;
@@ -1,40 +1,21 @@
1
1
  import { z } from 'zod';
2
2
  /**
3
- * Zod schema for validating command options for the `ai index add` command.
3
+ * Zod schema for the `ai index add` command.
4
4
  *
5
- * Extends the base AI options schema ({@link AiOptionsSchema}) with
6
- * add-specific options such as `--dry-run`, `--diff`, `--config`,
7
- * `--base-ref`, and `--clean`.
8
- *
9
- * Azure Search and embedding options that are optional in the base schema
10
- * become **required** because the add command always writes to a
11
- * vector store.
12
- *
13
- * @example
14
- * ```ts
15
- * const validated = await CommandOptionsSchema.parseAsync(rawOptions);
16
- * // validated.dryRun, validated.azureSearchEndpoint, etc.
17
- * ```
5
+ * Extends the base AI options schema making `embedModel` and `indexName` required.
18
6
  */
19
7
  export declare const CommandOptionsSchema: z.ZodObject<{
20
- openaiApiKey: z.ZodString;
21
- openaiApiVersion: z.ZodString;
22
- openaiInstance: z.ZodString;
23
- openaiChatDeployment: z.ZodOptional<z.ZodString>;
24
- openaiEmbeddingDeployment: z.ZodString;
25
- azureSearchEndpoint: z.ZodString;
26
- azureSearchApiKey: z.ZodString;
27
- azureSearchIndexName: z.ZodString;
8
+ env: z.ZodOptional<z.ZodString>;
9
+ token: z.ZodOptional<z.ZodString>;
10
+ tenantId: z.ZodOptional<z.ZodString>;
11
+ clientId: z.ZodOptional<z.ZodString>;
12
+ chatModel: z.ZodOptional<z.ZodString>;
13
+ embedModel: z.ZodString;
14
+ indexName: z.ZodString;
28
15
  dryRun: z.ZodBoolean;
29
16
  config: z.ZodString;
30
17
  diff: z.ZodBoolean;
31
18
  baseRef: z.ZodOptional<z.ZodString>;
32
19
  clean: z.ZodBoolean;
33
20
  }, z.core.$strip>;
34
- /**
35
- * Validated options for the `ai index add` command.
36
- *
37
- * Inferred from {@link CommandOptionsSchema} and used as the single
38
- * source of truth for option types throughout the add/embeddings pipeline.
39
- */
40
21
  export type CommandOptions = z.infer<typeof CommandOptionsSchema>;
@@ -1,5 +1,6 @@
1
1
  import type { Command } from 'commander';
2
2
  export { FusionAIConfigWithIndex, IndexConfig } from './config.js';
3
+ export { defineIndexSchema, IndexSchemaConfig } from './schema.js';
3
4
  /**
4
5
  * Registers the `ai index` command with the Fusion Framework CLI.
5
6
  *
@@ -0,0 +1,137 @@
1
+ import type { z } from 'zod';
2
+ import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
3
+ /**
4
+ * Attribute map type used by {@link IndexSchemaConfig.prepareAttributes}.
5
+ *
6
+ * Combines the schema-declared field types (all optional, since
7
+ * attributes are built up incrementally) with a `Record<string, unknown>`
8
+ * base so non-promoted attributes are still accessible.
9
+ *
10
+ * @template T - Zod object schema from which attribute types are derived.
11
+ */
12
+ export type SchemaAttributes<T extends z.ZodObject> = Partial<z.input<T>> & Record<string, unknown>;
13
+ /**
14
+ * Configuration for a custom Azure AI Search index schema defined via a Zod
15
+ * object shape.
16
+ *
17
+ * Declares which metadata fields should be promoted to top-level Azure AI
18
+ * Search fields (instead of being stored in the generic `attributes` array)
19
+ * and how their values are resolved from each document.
20
+ *
21
+ * Promoted fields become filterable/facetable at the Azure Search level,
22
+ * eliminating the need for `any()` OData operators.
23
+ *
24
+ * @template T - Zod object schema type that defines the promoted field names and types.
25
+ *
26
+ * @example
27
+ * ```ts
28
+ * import { z } from 'zod';
29
+ * import { defineIndexSchema } from '@equinor/fusion-framework-cli-plugin-ai-index';
30
+ *
31
+ * const schema = defineIndexSchema({
32
+ * shape: z.object({
33
+ * pkg_name: z.string().optional(),
34
+ * type: z.string(),
35
+ * tags: z.array(z.string()).default([]),
36
+ * source_dir: z.string(),
37
+ * }),
38
+ * prepareAttributes: (attrs, doc) => {
39
+ * // attrs.tags is typed as string[] | undefined ✅
40
+ * attrs.tags ??= [];
41
+ * if (doc.metadata.source.includes('packages/')) {
42
+ * attrs.tags.push('package');
43
+ * }
44
+ * return attrs;
45
+ * },
46
+ * resolve: (doc) => ({
47
+ * pkg_name: doc.metadata.attributes?.pkg_name as string | undefined,
48
+ * type: (doc.metadata.attributes?.type as string) ?? 'unknown',
49
+ * tags: (doc.metadata.attributes?.tags as string[]) ?? [],
50
+ * source_dir: doc.metadata.source.split('/')[0],
51
+ * }),
52
+ * });
53
+ * ```
54
+ */
55
+ export interface IndexSchemaConfig<T extends z.ZodObject = z.ZodObject> {
56
+ /**
57
+ * Zod object schema defining the promoted field names and their types.
58
+ *
59
+ * Each key becomes a top-level Azure AI Search field. The Zod type
60
+ * determines the Azure EDM field type:
61
+ * - `z.string()` → `Edm.String` (filterable, facetable)
62
+ * - `z.array(z.string())` → `Collection(Edm.String)` (filterable, facetable)
63
+ * - `z.number()` → `Edm.Double` (filterable, sortable)
64
+ * - `z.boolean()` → `Edm.Boolean` (filterable)
65
+ */
66
+ shape: T;
67
+ /**
68
+ * Type-safe attribute processor that enriches document attributes before
69
+ * the schema resolver runs.
70
+ *
71
+ * Runs in addition to the untyped `metadata.attributeProcessor` callback
72
+ * when a schema is defined. The `attributes` parameter is typed from the
73
+ * Zod shape so that schema-declared fields (e.g. `tags`, `pkg_name`)
74
+ * have proper types while non-schema attributes remain accessible via
75
+ * the `Record<string, unknown>` base.
76
+ *
77
+ * Runs after git and package metadata enrichment and after
78
+ * `metadata.attributeProcessor`, before
79
+ * {@link IndexSchemaConfig.resolve | resolve}.
80
+ *
81
+ * @param attributes - The accumulated attributes for the document, typed
82
+ * from the schema shape. All schema fields are optional since they may
83
+ * not be populated yet.
84
+ * @param document - The vector-store document being processed.
85
+ * @returns The enriched attributes map.
86
+ */
87
+ prepareAttributes?: (attributes: SchemaAttributes<T>, document: VectorStoreDocument) => SchemaAttributes<T>;
88
+ /**
89
+ * Per-document resolver that extracts or computes promoted field values.
90
+ *
91
+ * Runs after {@link IndexSchemaConfig.prepareAttributes | prepareAttributes}
92
+ * and metadata enrichment (git, package), so all enriched attributes are
93
+ * available on the document.
94
+ *
95
+ * @param document - The fully enriched vector-store document.
96
+ * @returns An object matching the Zod shape with resolved field values.
97
+ */
98
+ resolve: (document: VectorStoreDocument) => z.output<T>;
99
+ }
100
+ /**
101
+ * Type-safe factory for creating an {@link IndexSchemaConfig}.
102
+ *
103
+ * Infers `T` from the Zod shape and constrains both the
104
+ * `prepareAttributes` parameter types and the `resolve` return type,
105
+ * providing compile-time safety that attribute processing and resolution
106
+ * match the declared schema.
107
+ *
108
+ * @template T - Zod object schema type, inferred from `config.shape`.
109
+ * @param config - Schema configuration with a Zod shape, optional typed
110
+ * attribute processor, and a resolver function.
111
+ * @returns The same config object, narrowed to the inferred generic type.
112
+ *
113
+ * @example
114
+ * ```ts
115
+ * import { z } from 'zod';
116
+ * import { defineIndexSchema } from '@equinor/fusion-framework-cli-plugin-ai-index';
117
+ *
118
+ * const schema = defineIndexSchema({
119
+ * shape: z.object({
120
+ * tags: z.array(z.string()).default([]),
121
+ * type: z.string(),
122
+ * }),
123
+ * prepareAttributes: (attrs, doc) => {
124
+ * attrs.tags ??= []; // string[] | undefined — type-safe ✅
125
+ * if (doc.metadata.source.includes('cookbooks/')) {
126
+ * attrs.tags.push('cookbook');
127
+ * }
128
+ * return attrs;
129
+ * },
130
+ * resolve: (doc) => ({
131
+ * tags: (doc.metadata.attributes?.tags as string[]) ?? [],
132
+ * type: (doc.metadata.attributes?.type as string) ?? 'raw',
133
+ * }),
134
+ * });
135
+ * ```
136
+ */
137
+ export declare function defineIndexSchema<T extends z.ZodObject>(config: IndexSchemaConfig<T>): IndexSchemaConfig<T>;
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Resolve the embedding vector dimensions for a given model name.
3
+ *
4
+ * Checks the known model→dimensions map first. Falls back to an explicit
5
+ * `dimensions` override from the config. Throws if neither is available.
6
+ *
7
+ * @param model - The embedding model name (e.g. `'text-embedding-3-large'`).
8
+ * @param configDimensions - Optional explicit dimensions from config, used
9
+ * when the model is not in the known map.
10
+ * @returns The number of dimensions for the embedding vector.
11
+ * @throws {Error} When the model is unknown and no explicit dimensions are configured.
12
+ */
13
+ export declare function resolveEmbeddingDimensions(model: string, configDimensions?: number): number;
@@ -0,0 +1,61 @@
1
+ import { type z } from 'zod';
2
+ /**
3
+ * Azure AI Search EDM (Entity Data Model) type identifiers used in
4
+ * index field definitions.
5
+ */
6
+ type AzureEdmType = 'Edm.String' | 'Edm.Int32' | 'Edm.Int64' | 'Edm.Double' | 'Edm.Boolean' | 'Collection(Edm.String)';
7
+ /**
8
+ * Azure AI Search field definition matching the REST API schema for
9
+ * index creation.
10
+ *
11
+ * @see https://learn.microsoft.com/en-us/rest/api/searchservice/indexes/create
12
+ */
13
+ export interface AzureSearchField {
14
+ /** Field name as it appears in the index schema. */
15
+ name: string;
16
+ /** Azure EDM type for the field. */
17
+ type: AzureEdmType;
18
+ /** Whether the field can be used in `$filter` expressions. */
19
+ filterable: boolean;
20
+ /** Whether the field can be used in `$orderby` expressions. */
21
+ sortable: boolean;
22
+ /** Whether the field supports faceted navigation. */
23
+ facetable: boolean;
24
+ /** Whether the field is included in full-text search. */
25
+ searchable: boolean;
26
+ }
27
+ /**
28
+ * Convert a Zod object schema into an array of Azure AI Search field
29
+ * definitions.
30
+ *
31
+ * Walks the Zod shape, maps each field to its Azure EDM type, and assigns
32
+ * default capabilities (filterable, facetable, sortable). Used by the
33
+ * `ffc ai index create` command to generate the index schema.
34
+ *
35
+ * Uses public `instanceof` checks and `unwrap()` methods to avoid
36
+ * reliance on Zod's private `_zod.def` internals, ensuring compatibility
37
+ * across Zod versions.
38
+ *
39
+ * @param schema - A Zod object schema whose keys define the promoted fields.
40
+ * @returns An array of Azure AI Search field definitions.
41
+ * @throws {Error} When a field type cannot be mapped to an Azure EDM type.
42
+ *
43
+ * @example
44
+ * ```ts
45
+ * import { z } from 'zod';
46
+ * import { zodToAzureFields } from './zod-to-azure-fields.js';
47
+ *
48
+ * const fields = zodToAzureFields(
49
+ * z.object({
50
+ * pkg_name: z.string().optional(),
51
+ * tags: z.array(z.string()).default([]),
52
+ * }),
53
+ * );
54
+ * // [
55
+ * // { name: 'pkg_name', type: 'Edm.String', filterable: true, facetable: true, ... },
56
+ * // { name: 'tags', type: 'Collection(Edm.String)', filterable: true, facetable: true, ... },
57
+ * // ]
58
+ * ```
59
+ */
60
+ export declare function zodToAzureFields(schema: z.ZodObject): AzureSearchField[];
61
+ export {};
@@ -0,0 +1 @@
1
+ export {};
@@ -1 +1 @@
1
- export declare const version = "2.0.1";
1
+ export declare const version = "2.1.0";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@equinor/fusion-framework-cli-plugin-ai-index",
3
- "version": "2.0.1",
3
+ "version": "2.1.0",
4
4
  "description": "AI indexing plugin for Fusion Framework CLI providing document embedding and chunking utilities",
5
5
  "main": "dist/esm/index.js",
6
6
  "type": "module",
@@ -53,18 +53,18 @@
53
53
  "tree-sitter-typescript": "^0.23.2",
54
54
  "ts-morph": "^28.0.0",
55
55
  "zod": "^4.3.6",
56
- "@equinor/fusion-framework-cli-plugin-ai-base": "2.0.1",
57
56
  "@equinor/fusion-framework-module": "6.0.0",
58
- "@equinor/fusion-framework-module-ai": "3.0.1",
59
- "@equinor/fusion-imports": "2.0.0"
57
+ "@equinor/fusion-imports": "2.0.0",
58
+ "@equinor/fusion-framework-cli-plugin-ai-base": "3.0.0",
59
+ "@equinor/fusion-framework-module-ai": "4.0.0"
60
60
  },
61
61
  "peerDependencies": {
62
- "@equinor/fusion-framework-cli": "^14.2.5"
62
+ "@equinor/fusion-framework-cli": "^14.2.7"
63
63
  },
64
64
  "devDependencies": {
65
65
  "typescript": "^5.9.3",
66
66
  "vitest": "^4.1.0",
67
- "@equinor/fusion-framework-cli": "^14.2.5"
67
+ "@equinor/fusion-framework-cli": "^14.2.7"
68
68
  },
69
69
  "scripts": {
70
70
  "build": "tsc -b",
@@ -1,5 +1,5 @@
1
1
  import path from 'node:path';
2
- import { from, mergeMap, map, toArray } from 'rxjs';
2
+ import { from, mergeMap, map, tap, toArray } from 'rxjs';
3
3
  import type { Observable } from 'rxjs';
4
4
  import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
5
5
  import { extractGitMetadata } from '../utils/git/index.js';
@@ -7,6 +7,9 @@ import { resolvePackage } from '../utils/package-resolver.js';
7
7
  import type { DocumentEntry } from './types.js';
8
8
  import type { FusionAIConfigWithIndex } from '../config.js';
9
9
 
10
+ /** Callback invoked after each document is enriched with metadata. */
11
+ export type MetadataProgressCallback = (source: string) => void;
12
+
10
13
  /**
11
14
  * Creates a stream that applies metadata to documents.
12
15
  * @internal
@@ -14,14 +17,25 @@ import type { FusionAIConfigWithIndex } from '../config.js';
14
17
  export function applyMetadata(
15
18
  document$: Observable<DocumentEntry>,
16
19
  indexConfig: FusionAIConfigWithIndex['index'],
20
+ onProgress?: MetadataProgressCallback,
17
21
  ): Observable<VectorStoreDocument[]> {
18
22
  // Resolve packages if enabled
19
23
  const shouldResolvePackage = indexConfig?.metadata?.resolvePackage ?? false;
20
24
 
25
+ /** Cap concurrent git subprocess calls to avoid overwhelming the OS process table. */
26
+ const GIT_CONCURRENCY = 20;
27
+
28
+ /**
29
+ * Cap the number of file entries processed in parallel.
30
+ * Each entry fans out to GIT_CONCURRENCY inner git calls, so
31
+ * total concurrent git processes ≤ ENTRY_CONCURRENCY × GIT_CONCURRENCY.
32
+ */
33
+ const ENTRY_CONCURRENCY = 20;
34
+
21
35
  return document$.pipe(
22
36
  mergeMap((entry) => {
23
37
  return from(entry.documents).pipe(
24
- // Extract git metadata concurrently for all documents
38
+ // Extract git metadata concurrently (capped to limit parallel git processes)
25
39
  mergeMap(async (document): Promise<VectorStoreDocument> => {
26
40
  const rootPath = document.metadata.rootPath ?? process.cwd();
27
41
  const sourcePath = path.join(rootPath, document.metadata.source);
@@ -54,7 +68,9 @@ export function applyMetadata(
54
68
  },
55
69
  },
56
70
  };
57
- }),
71
+ }, GIT_CONCURRENCY),
72
+ // Notify caller after each document is enriched
73
+ tap((document) => onProgress?.(document.metadata.source)),
58
74
  // Apply custom attribute processor from config
59
75
  map((document: VectorStoreDocument) => {
60
76
  const attributeProcessor =
@@ -72,6 +88,6 @@ export function applyMetadata(
72
88
  // Group back by file for batch deletion in next step
73
89
  toArray(),
74
90
  );
75
- }),
91
+ }, ENTRY_CONCURRENCY),
76
92
  );
77
93
  }
@@ -0,0 +1,170 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { z } from 'zod';
3
+ import { of, lastValueFrom } from 'rxjs';
4
+ import type { VectorStoreDocument } from '@equinor/fusion-framework-module-ai/lib';
5
+
6
+ import { defineIndexSchema } from '../schema.js';
7
+ import { applySchema } from './apply-schema.js';
8
+
9
+ /** Helper to create a minimal VectorStoreDocument for testing. */
10
+ function makeDocument(
11
+ overrides: Partial<VectorStoreDocument> & { metadata: VectorStoreDocument['metadata'] },
12
+ ): VectorStoreDocument {
13
+ return {
14
+ id: 'test-id',
15
+ pageContent: 'test content',
16
+ ...overrides,
17
+ };
18
+ }
19
+
20
+ describe('defineIndexSchema', () => {
21
+ it('returns the same config object (type-narrowing only)', () => {
22
+ const shape = z.object({ type: z.string() });
23
+ const resolve = () => ({ type: 'tsdoc' });
24
+
25
+ const schema = defineIndexSchema({ shape, resolve });
26
+
27
+ expect(schema.shape).toBe(shape);
28
+ expect(schema.resolve).toBe(resolve);
29
+ });
30
+ });
31
+
32
+ describe('applySchema', () => {
33
+ const schema = defineIndexSchema({
34
+ shape: z.object({
35
+ pkg_name: z.string().optional(),
36
+ type: z.string(),
37
+ tags: z.array(z.string()).default([]),
38
+ source_dir: z.string(),
39
+ }),
40
+ resolve: (doc) => ({
41
+ pkg_name: doc.metadata.attributes?.pkg_name as string | undefined,
42
+ type: (doc.metadata.attributes?.type as string) ?? 'unknown',
43
+ tags: (doc.metadata.attributes?.tags as string[]) ?? [],
44
+ source_dir: doc.metadata.source.split('/')[0],
45
+ }),
46
+ });
47
+
48
+ it('passes through unchanged when schema is undefined', async () => {
49
+ const doc = makeDocument({
50
+ metadata: { source: 'packages/foo/src/index.ts', attributes: { type: 'tsdoc' } },
51
+ });
52
+ const docs$ = of([doc]);
53
+
54
+ const result = await lastValueFrom(applySchema(docs$, undefined));
55
+
56
+ expect(result).toEqual([doc]);
57
+ });
58
+
59
+ it('resolves promoted fields and stores them on metadata.schemaFields', async () => {
60
+ const doc = makeDocument({
61
+ metadata: {
62
+ source: 'packages/foo/src/index.ts',
63
+ attributes: {
64
+ type: 'tsdoc',
65
+ pkg_name: '@equinor/fusion-framework',
66
+ tags: ['package', 'react'],
67
+ other_attr: 'keep-me',
68
+ },
69
+ },
70
+ });
71
+ const docs$ = of([doc]);
72
+
73
+ const result = await lastValueFrom(applySchema(docs$, schema));
74
+
75
+ expect(result[0].metadata.schemaFields).toEqual({
76
+ pkg_name: '@equinor/fusion-framework',
77
+ type: 'tsdoc',
78
+ tags: ['package', 'react'],
79
+ source_dir: 'packages',
80
+ });
81
+ });
82
+
83
+ it('removes promoted keys from attributes to avoid duplication', async () => {
84
+ const doc = makeDocument({
85
+ metadata: {
86
+ source: 'packages/foo/src/index.ts',
87
+ attributes: {
88
+ type: 'tsdoc',
89
+ pkg_name: '@equinor/fusion-framework',
90
+ tags: ['package'],
91
+ git_commit_hash: 'abc123',
92
+ },
93
+ },
94
+ });
95
+ const docs$ = of([doc]);
96
+
97
+ const result = await lastValueFrom(applySchema(docs$, schema));
98
+
99
+ // Promoted keys removed, non-promoted keys preserved
100
+ expect(result[0].metadata.attributes).toEqual({ git_commit_hash: 'abc123' });
101
+ });
102
+
103
+ it('handles documents with no attributes gracefully', async () => {
104
+ const doc = makeDocument({
105
+ metadata: { source: 'cookbooks/app-react/src/App.tsx' },
106
+ });
107
+ const docs$ = of([doc]);
108
+
109
+ const result = await lastValueFrom(applySchema(docs$, schema));
110
+
111
+ expect(result[0].metadata.schemaFields).toEqual({
112
+ pkg_name: undefined,
113
+ type: 'unknown',
114
+ tags: [],
115
+ source_dir: 'cookbooks',
116
+ });
117
+ expect(result[0].metadata.attributes).toEqual({});
118
+ });
119
+
120
+ it('throws when resolved values fail Zod validation', async () => {
121
+ const badSchema = defineIndexSchema({
122
+ shape: z.object({ type: z.string().min(1) }),
123
+ resolve: () => ({ type: '' }), // Empty string fails min(1)
124
+ });
125
+
126
+ const doc = makeDocument({
127
+ metadata: { source: 'test.ts', attributes: {} },
128
+ });
129
+ const docs$ = of([doc]);
130
+
131
+ await expect(lastValueFrom(applySchema(docs$, badSchema))).rejects.toThrow();
132
+ });
133
+
134
+ it('runs prepareAttributes before resolve to enrich attributes', async () => {
135
+ const schemaWithPrepare = defineIndexSchema({
136
+ shape: z.object({
137
+ tags: z.array(z.string()).default([]),
138
+ type: z.string(),
139
+ }),
140
+ prepareAttributes: (attrs, doc) => {
141
+ // Type-safe: attrs.tags is string[] | undefined
142
+ attrs.tags ??= [];
143
+ if (doc.metadata.source.includes('packages/')) {
144
+ attrs.tags.push('package');
145
+ }
146
+ return attrs;
147
+ },
148
+ resolve: (doc) => ({
149
+ tags: (doc.metadata.attributes?.tags as string[]) ?? [],
150
+ type: (doc.metadata.attributes?.type as string) ?? 'unknown',
151
+ }),
152
+ });
153
+
154
+ const doc = makeDocument({
155
+ metadata: {
156
+ source: 'packages/framework/src/index.ts',
157
+ attributes: { type: 'tsdoc' },
158
+ },
159
+ });
160
+ const docs$ = of([doc]);
161
+
162
+ const result = await lastValueFrom(applySchema(docs$, schemaWithPrepare));
163
+
164
+ // prepareAttributes added 'package' tag before resolve consumed it
165
+ expect(result[0].metadata.schemaFields).toEqual({
166
+ tags: ['package'],
167
+ type: 'tsdoc',
168
+ });
169
+ });
170
+ });