@kaelio/ktx 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/assets/python/{kaelio_ktx-0.1.0-py3-none-any.whl → kaelio_ktx-0.2.0-py3-none-any.whl} +0 -0
- package/assets/python/manifest.json +4 -4
- package/dist/admin-reindex.d.ts +15 -0
- package/dist/admin-reindex.js +168 -0
- package/dist/admin-reindex.test.js +116 -0
- package/dist/{dev.d.ts → admin.d.ts} +1 -1
- package/dist/{dev.js → admin.js} +14 -12
- package/dist/admin.test.d.ts +1 -0
- package/dist/{dev.test.js → admin.test.js} +36 -31
- package/dist/cli-program.js +7 -7
- package/dist/cli-program.test.js +1 -1
- package/dist/cli-runtime.d.ts +2 -0
- package/dist/commands/connection-commands.js +11 -10
- package/dist/commands/connection-selection.d.ts +11 -0
- package/dist/commands/connection-selection.js +9 -0
- package/dist/commands/ingest-commands.js +32 -26
- package/dist/commands/knowledge-commands.js +17 -28
- package/dist/commands/mcp-commands.js +17 -11
- package/dist/commands/setup-commands.js +14 -26
- package/dist/commands/sl-commands.js +27 -32
- package/dist/doctor.test.js +7 -8
- package/dist/example-smoke.test.js +3 -3
- package/dist/index.test.js +102 -70
- package/dist/ingest-depth.js +0 -1
- package/dist/ingest.test-utils.js +2 -2
- package/dist/ingest.test.js +4 -4
- package/dist/io/print-list.test.js +4 -4
- package/dist/knowledge.js +1 -1
- package/dist/managed-local-embeddings.d.ts +2 -0
- package/dist/managed-local-embeddings.js +2 -0
- package/dist/managed-local-embeddings.test.js +2 -0
- package/dist/managed-mcp-daemon.js +3 -2
- package/dist/managed-mcp-daemon.test.js +25 -0
- package/dist/managed-python-command.js +2 -2
- package/dist/managed-python-command.test.js +4 -3
- package/dist/managed-python-daemon.js +3 -2
- package/dist/managed-python-daemon.test.js +20 -0
- package/dist/managed-python-runtime.d.ts +5 -1
- package/dist/managed-python-runtime.js +50 -6
- package/dist/managed-python-runtime.test.js +53 -23
- package/dist/memory-flow-tui.test.js +2 -2
- package/dist/next-steps.d.ts +6 -6
- package/dist/next-steps.js +4 -4
- package/dist/next-steps.test.js +5 -5
- package/dist/print-command-tree.test.js +1 -1
- package/dist/proxy-env.d.ts +1 -0
- package/dist/proxy-env.js +23 -0
- package/dist/proxy-env.test.d.ts +1 -0
- package/dist/proxy-env.test.js +17 -0
- package/dist/public-ingest.js +3 -5
- package/dist/public-ingest.test.js +7 -3
- package/dist/runtime.test.js +2 -1
- package/dist/scan.test.js +2 -2
- package/dist/setup-agents.js +6 -4
- package/dist/setup-agents.test.js +35 -1
- package/dist/setup-embeddings.d.ts +1 -0
- package/dist/setup-embeddings.js +29 -7
- package/dist/setup-embeddings.test.js +49 -7
- package/dist/setup-models.d.ts +0 -1
- package/dist/setup-models.js +2 -3
- package/dist/setup-models.test.js +8 -10
- package/dist/setup-project.d.ts +9 -1
- package/dist/setup-project.js +52 -25
- package/dist/setup-project.test.js +8 -8
- package/dist/setup-runtime.test.js +4 -2
- package/dist/setup.d.ts +1 -2
- package/dist/setup.js +21 -5
- package/dist/setup.test.js +160 -43
- package/dist/sl.js +1 -1
- package/dist/sl.test.js +2 -1
- package/dist/standalone-smoke.test.js +8 -5
- package/dist/status-project.js +1 -10
- package/node_modules/@ktx/context/dist/index-sync/index.d.ts +2 -0
- package/node_modules/@ktx/context/dist/index-sync/index.js +1 -0
- package/node_modules/@ktx/context/dist/index-sync/reindex.d.ts +20 -0
- package/node_modules/@ktx/context/dist/index-sync/reindex.js +141 -0
- package/node_modules/@ktx/context/dist/index-sync/reindex.test.d.ts +1 -0
- package/node_modules/@ktx/context/dist/index-sync/reindex.test.js +139 -0
- package/node_modules/@ktx/context/dist/index-sync/types.d.ts +29 -0
- package/node_modules/@ktx/context/dist/index-sync/types.js +1 -0
- package/node_modules/@ktx/context/dist/index.d.ts +1 -0
- package/node_modules/@ktx/context/dist/index.js +1 -0
- package/node_modules/@ktx/context/dist/ingest/adapters/historic-sql/local-ingest-acceptance.test.js +1 -1
- package/node_modules/@ktx/context/dist/ingest/local-bundle-ingest.test.js +8 -8
- package/node_modules/@ktx/context/dist/ingest/local-bundle-runtime.js +4 -1
- package/node_modules/@ktx/context/dist/ingest/local-bundle-runtime.test.js +3 -3
- package/node_modules/@ktx/context/dist/ingest/local-embedding-provider.integration.test.js +9 -10
- package/node_modules/@ktx/context/dist/ingest/memory-flow/schema.d.ts +2 -2
- package/node_modules/@ktx/context/dist/ingest/report-snapshot.d.ts +2 -2
- package/node_modules/@ktx/context/dist/llm/local-config.js +2 -15
- package/node_modules/@ktx/context/dist/llm/local-config.test.js +3 -7
- package/node_modules/@ktx/context/dist/memory/local-memory.js +9 -3
- package/node_modules/@ktx/context/dist/project/config.d.ts +0 -5
- package/node_modules/@ktx/context/dist/project/config.js +5 -5
- package/node_modules/@ktx/context/dist/project/config.test.js +4 -7
- package/node_modules/@ktx/context/dist/scan/enrichment-state.test.js +4 -4
- package/node_modules/@ktx/context/dist/scan/index.d.ts +1 -1
- package/node_modules/@ktx/context/dist/scan/local-enrichment.d.ts +2 -6
- package/node_modules/@ktx/context/dist/scan/local-enrichment.js +31 -47
- package/node_modules/@ktx/context/dist/scan/local-enrichment.test.js +35 -18
- package/node_modules/@ktx/context/dist/scan/local-scan.test.js +2 -3
- package/node_modules/@ktx/context/dist/sl/ports.d.ts +3 -3
- package/node_modules/@ktx/context/dist/sl/sl-search.service.d.ts +3 -2
- package/node_modules/@ktx/context/dist/sl/sl-search.service.js +47 -45
- package/node_modules/@ktx/context/dist/sl/sl-search.service.test.js +61 -0
- package/node_modules/@ktx/context/dist/sl/sqlite-sl-sources-index.d.ts +4 -3
- package/node_modules/@ktx/context/dist/sl/sqlite-sl-sources-index.js +15 -5
- package/node_modules/@ktx/context/dist/sl/sqlite-sl-sources-index.test.js +24 -0
- package/node_modules/@ktx/context/dist/wiki/knowledge-wiki.service.d.ts +3 -2
- package/node_modules/@ktx/context/dist/wiki/knowledge-wiki.service.js +62 -51
- package/node_modules/@ktx/context/dist/wiki/knowledge-wiki.service.test.js +59 -3
- package/node_modules/@ktx/context/dist/wiki/ports.d.ts +3 -3
- package/node_modules/@ktx/context/dist/wiki/sqlite-knowledge-index.d.ts +33 -0
- package/node_modules/@ktx/context/dist/wiki/sqlite-knowledge-index.js +155 -2
- package/node_modules/@ktx/context/dist/wiki/sqlite-knowledge-index.test.js +26 -0
- package/node_modules/@ktx/context/package.json +5 -0
- package/node_modules/@ktx/llm/dist/embedding-provider.d.ts +0 -7
- package/node_modules/@ktx/llm/dist/embedding-provider.js +12 -138
- package/node_modules/@ktx/llm/dist/embedding-provider.test.js +10 -25
- package/node_modules/@ktx/llm/dist/types.d.ts +1 -1
- package/package.json +1 -1
- /package/dist/{dev.test.d.ts → admin-reindex.test.d.ts} +0 -0
|
@@ -69,7 +69,6 @@ declare const llmSchema: z.ZodObject<{
|
|
|
69
69
|
}, z.core.$strict>;
|
|
70
70
|
declare const embeddingSchema: z.ZodObject<{
|
|
71
71
|
backend: z.ZodDefault<z.ZodEnum<{
|
|
72
|
-
deterministic: "deterministic";
|
|
73
72
|
none: "none";
|
|
74
73
|
openai: "openai";
|
|
75
74
|
"sentence-transformers": "sentence-transformers";
|
|
@@ -102,7 +101,6 @@ declare const scanEnrichmentSchema: z.ZodObject<{
|
|
|
102
101
|
}>>;
|
|
103
102
|
embeddings: z.ZodOptional<z.ZodObject<{
|
|
104
103
|
backend: z.ZodDefault<z.ZodEnum<{
|
|
105
|
-
deterministic: "deterministic";
|
|
106
104
|
none: "none";
|
|
107
105
|
openai: "openai";
|
|
108
106
|
"sentence-transformers": "sentence-transformers";
|
|
@@ -141,7 +139,6 @@ declare const scanSchema: z.ZodObject<{
|
|
|
141
139
|
}>>;
|
|
142
140
|
embeddings: z.ZodOptional<z.ZodObject<{
|
|
143
141
|
backend: z.ZodDefault<z.ZodEnum<{
|
|
144
|
-
deterministic: "deterministic";
|
|
145
142
|
none: "none";
|
|
146
143
|
openai: "openai";
|
|
147
144
|
"sentence-transformers": "sentence-transformers";
|
|
@@ -462,7 +459,6 @@ declare const ktxProjectConfigSchema: z.ZodObject<{
|
|
|
462
459
|
adapters: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
463
460
|
embeddings: z.ZodPrefault<z.ZodObject<{
|
|
464
461
|
backend: z.ZodDefault<z.ZodEnum<{
|
|
465
|
-
deterministic: "deterministic";
|
|
466
462
|
none: "none";
|
|
467
463
|
openai: "openai";
|
|
468
464
|
"sentence-transformers": "sentence-transformers";
|
|
@@ -507,7 +503,6 @@ declare const ktxProjectConfigSchema: z.ZodObject<{
|
|
|
507
503
|
}>>;
|
|
508
504
|
embeddings: z.ZodOptional<z.ZodObject<{
|
|
509
505
|
backend: z.ZodDefault<z.ZodEnum<{
|
|
510
|
-
deterministic: "deterministic";
|
|
511
506
|
none: "none";
|
|
512
507
|
openai: "openai";
|
|
513
508
|
"sentence-transformers": "sentence-transformers";
|
|
@@ -3,7 +3,7 @@ import YAML from 'yaml';
|
|
|
3
3
|
import * as z from 'zod';
|
|
4
4
|
import { connectionConfigSchema } from './driver-schemas.js';
|
|
5
5
|
const KTX_LLM_BACKENDS = ['none', 'anthropic', 'vertex', 'gateway', 'claude-code'];
|
|
6
|
-
const KTX_EMBEDDING_BACKENDS = ['none', '
|
|
6
|
+
const KTX_EMBEDDING_BACKENDS = ['none', 'openai', 'sentence-transformers'];
|
|
7
7
|
const KTX_PROMPT_CACHE_TTLS = ['5m', '1h'];
|
|
8
8
|
const KTX_ENRICHMENT_MODES = ['none', 'deterministic', 'llm'];
|
|
9
9
|
const KTX_WORK_UNIT_FAILURE_MODES = ['abort', 'continue'];
|
|
@@ -69,9 +69,9 @@ const embeddingSchema = z
|
|
|
69
69
|
.strictObject({
|
|
70
70
|
backend: z
|
|
71
71
|
.enum(KTX_EMBEDDING_BACKENDS)
|
|
72
|
-
.default('
|
|
73
|
-
.describe('Embedding backend. "
|
|
74
|
-
model: z.string().min(1).optional().describe('Provider-specific embedding model identifier (e.g. "text-embedding-3-small").
|
|
72
|
+
.default('none')
|
|
73
|
+
.describe('Embedding backend. "openai" and "sentence-transformers" call out to those providers; "none" disables embeddings.'),
|
|
74
|
+
model: z.string().min(1).optional().describe('Provider-specific embedding model identifier (e.g. "text-embedding-3-small").'),
|
|
75
75
|
dimensions: z.int().positive().default(8).describe('Embedding vector dimensionality. Must match the chosen model when using a real provider.'),
|
|
76
76
|
openai: apiCredentialsSchema.optional().describe('OpenAI credentials, used when backend is "openai".'),
|
|
77
77
|
sentenceTransformers: sentenceTransformersSchema.optional().describe('Sentence-transformers server config, used when backend is "sentence-transformers".'),
|
|
@@ -95,7 +95,7 @@ const ingestSchema = z
|
|
|
95
95
|
.default([])
|
|
96
96
|
.describe('Ingest adapter identifiers to run (e.g. "metabase", "looker", "historic-sql"). Empty array means no adapters are run.'),
|
|
97
97
|
embeddings: embeddingSchema
|
|
98
|
-
.prefault({ backend: '
|
|
98
|
+
.prefault({ backend: 'none' })
|
|
99
99
|
.describe('Embedding configuration used when ingest adapters need to embed documents.'),
|
|
100
100
|
workUnits: workUnitsSchema.prefault({}).describe('Concurrency and failure handling for ingest work units.'),
|
|
101
101
|
})
|
|
@@ -32,8 +32,7 @@ connections:
|
|
|
32
32
|
ingest: {
|
|
33
33
|
adapters: [],
|
|
34
34
|
embeddings: {
|
|
35
|
-
backend: '
|
|
36
|
-
model: 'deterministic',
|
|
35
|
+
backend: 'none',
|
|
37
36
|
dimensions: 8,
|
|
38
37
|
},
|
|
39
38
|
workUnits: {
|
|
@@ -75,11 +74,10 @@ connections:
|
|
|
75
74
|
const parsed = parseKtxProjectConfig(serialized);
|
|
76
75
|
expect(serialized).not.toContain('project:');
|
|
77
76
|
expect(serialized).not.toContain('live-database');
|
|
78
|
-
expect(serialized).toContain(' embeddings:\n backend:
|
|
77
|
+
expect(serialized).toContain(' embeddings:\n backend: none\n dimensions: 8');
|
|
79
78
|
expect(parsed.ingest.adapters).toEqual([]);
|
|
80
79
|
expect(parsed.ingest.embeddings).toEqual({
|
|
81
|
-
backend: '
|
|
82
|
-
model: 'deterministic',
|
|
80
|
+
backend: 'none',
|
|
83
81
|
dimensions: 8,
|
|
84
82
|
});
|
|
85
83
|
});
|
|
@@ -350,8 +348,7 @@ scan:
|
|
|
350
348
|
const config = parseKtxProjectConfig('{}\n');
|
|
351
349
|
expect(config).toEqual(buildDefaultKtxProjectConfig());
|
|
352
350
|
expect(config.ingest.embeddings).toEqual({
|
|
353
|
-
backend: '
|
|
354
|
-
model: 'deterministic',
|
|
351
|
+
backend: 'none',
|
|
355
352
|
dimensions: 8,
|
|
356
353
|
});
|
|
357
354
|
});
|
|
@@ -48,13 +48,13 @@ describe('scan enrichment state', () => {
|
|
|
48
48
|
snapshot,
|
|
49
49
|
mode: 'enriched',
|
|
50
50
|
detectRelationships: true,
|
|
51
|
-
providerIdentity: { provider: '
|
|
51
|
+
providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
|
|
52
52
|
});
|
|
53
53
|
const second = computeKtxScanEnrichmentInputHash({
|
|
54
54
|
snapshot: { ...snapshot, metadata: {} },
|
|
55
55
|
mode: 'enriched',
|
|
56
56
|
detectRelationships: true,
|
|
57
|
-
providerIdentity: { llmModel: 'a',
|
|
57
|
+
providerIdentity: { llmModel: 'a', provider: 'local-heuristic' },
|
|
58
58
|
});
|
|
59
59
|
const firstTable = snapshot.tables[0];
|
|
60
60
|
if (!firstTable) {
|
|
@@ -64,7 +64,7 @@ describe('scan enrichment state', () => {
|
|
|
64
64
|
snapshot: { ...snapshot, tables: [{ ...firstTable, name: 'orders_v2' }] },
|
|
65
65
|
mode: 'enriched',
|
|
66
66
|
detectRelationships: true,
|
|
67
|
-
providerIdentity: { provider: '
|
|
67
|
+
providerIdentity: { provider: 'local-heuristic', llmModel: 'a' },
|
|
68
68
|
});
|
|
69
69
|
expect(first).toMatch(/^[a-f0-9]{64}$/);
|
|
70
70
|
expect(second).toBe(first);
|
|
@@ -75,7 +75,7 @@ describe('scan enrichment state', () => {
|
|
|
75
75
|
snapshot,
|
|
76
76
|
mode: 'enriched',
|
|
77
77
|
detectRelationships: true,
|
|
78
|
-
providerIdentity: { provider: '
|
|
78
|
+
providerIdentity: { provider: 'local-heuristic' },
|
|
79
79
|
});
|
|
80
80
|
await store.saveCompletedStage({
|
|
81
81
|
runId: 'scan-run-1',
|
|
@@ -13,7 +13,7 @@ export { createKtxEntityDetailsService } from './entity-details.js';
|
|
|
13
13
|
export type { DisplayTargetResolution, RawSchemaHit, TableDetail, WarehouseCatalogServiceDeps, } from './warehouse-catalog.js';
|
|
14
14
|
export { WarehouseCatalogService } from './warehouse-catalog.js';
|
|
15
15
|
export type { KtxColumnSampleUpdate, KtxDescriptionSource, KtxDescriptionUpdate, KtxEmbeddingUpdate, KtxEnrichedColumn, KtxEnrichedRelationship, KtxEnrichedSchema, KtxEnrichedTable, KtxRelationshipEndpoint, KtxRelationshipSource, KtxRelationshipType, KtxRelationshipUpdate, KtxScanMetadataStore, KtxSkippedRelationship, KtxStructuralSyncPlan, } from './enrichment-types.js';
|
|
16
|
-
export type {
|
|
16
|
+
export type { KtxLocalScanEnrichmentInput, KtxLocalScanEnrichmentProviders, KtxLocalScanEnrichmentResult, } from './local-enrichment.js';
|
|
17
17
|
export { createDeterministicLocalScanEnrichmentProviders, runLocalScanEnrichment, snapshotToKtxEnrichedSchema, } from './local-enrichment.js';
|
|
18
18
|
export type { WriteLocalScanEnrichmentArtifactsInput, WriteLocalScanEnrichmentArtifactsResult, WriteLocalScanManifestShardsInput, WriteLocalScanManifestShardsResult, } from './local-enrichment-artifacts.js';
|
|
19
19
|
export { writeLocalScanEnrichmentArtifacts, writeLocalScanManifestShards, } from './local-enrichment-artifacts.js';
|
|
@@ -6,13 +6,9 @@ import type { KtxCompositeRelationshipCandidate } from './relationship-composite
|
|
|
6
6
|
import type { KtxResolvedRelationshipDiscoveryCandidate } from './relationship-graph-resolver.js';
|
|
7
7
|
import type { KtxRelationshipProfileArtifact } from './relationship-profiling.js';
|
|
8
8
|
import type { KtxEmbeddingPort, KtxScanConnector, KtxScanContext, KtxScanEnrichmentStateSummary, KtxScanEnrichmentSummary, KtxScanMode, KtxScanRelationshipSummary, KtxScanWarning, KtxSchemaSnapshot, KtxTableRef } from './types.js';
|
|
9
|
-
export interface DeterministicLocalScanEnrichmentProviderOptions {
|
|
10
|
-
embeddingDimensions?: number;
|
|
11
|
-
maxBatchSize?: number;
|
|
12
|
-
}
|
|
13
9
|
export interface KtxLocalScanEnrichmentProviders {
|
|
14
10
|
llmRuntime: KtxLlmRuntimePort;
|
|
15
|
-
embedding
|
|
11
|
+
embedding?: KtxEmbeddingPort | null;
|
|
16
12
|
}
|
|
17
13
|
export interface KtxLocalScanEnrichmentInput {
|
|
18
14
|
connectionId: string;
|
|
@@ -44,6 +40,6 @@ export interface KtxLocalScanEnrichmentResult {
|
|
|
44
40
|
resolvedRelationships: KtxResolvedRelationshipDiscoveryCandidate[] | null;
|
|
45
41
|
compositeRelationships: KtxCompositeRelationshipCandidate[] | null;
|
|
46
42
|
}
|
|
47
|
-
export declare function createDeterministicLocalScanEnrichmentProviders(
|
|
43
|
+
export declare function createDeterministicLocalScanEnrichmentProviders(): KtxLocalScanEnrichmentProviders;
|
|
48
44
|
export declare function snapshotToKtxEnrichedSchema(snapshot: KtxSchemaSnapshot, embeddingsByColumnId?: ReadonlyMap<string, number[]>): KtxEnrichedSchema;
|
|
49
45
|
export declare function runLocalScanEnrichment(input: KtxLocalScanEnrichmentInput): Promise<KtxLocalScanEnrichmentResult>;
|
|
@@ -76,28 +76,9 @@ function providerlessEnrichedWarning(relationshipDetection) {
|
|
|
76
76
|
},
|
|
77
77
|
};
|
|
78
78
|
}
|
|
79
|
-
function
|
|
80
|
-
const values = Array.from({ length: dimensions }, (_, index) => {
|
|
81
|
-
let hash = index + 17;
|
|
82
|
-
for (const char of text) {
|
|
83
|
-
hash = (hash * 31 + char.charCodeAt(0) + index) % 1009;
|
|
84
|
-
}
|
|
85
|
-
return Number(((hash % 200) / 100 - 1).toFixed(4));
|
|
86
|
-
});
|
|
87
|
-
return values;
|
|
88
|
-
}
|
|
89
|
-
export function createDeterministicLocalScanEnrichmentProviders(options = {}) {
|
|
90
|
-
const dimensions = options.embeddingDimensions ?? 8;
|
|
91
|
-
const maxBatchSize = options.maxBatchSize ?? 64;
|
|
79
|
+
export function createDeterministicLocalScanEnrichmentProviders() {
|
|
92
80
|
return {
|
|
93
81
|
llmRuntime: deterministicLlmRuntime(),
|
|
94
|
-
embedding: {
|
|
95
|
-
dimensions,
|
|
96
|
-
maxBatchSize,
|
|
97
|
-
async embedBatch(texts) {
|
|
98
|
-
return texts.map((text) => hashEmbedding(text, dimensions));
|
|
99
|
-
},
|
|
100
|
-
},
|
|
101
82
|
};
|
|
102
83
|
}
|
|
103
84
|
function deterministicLlmRuntime() {
|
|
@@ -262,7 +243,7 @@ async function buildEmbeddings(input) {
|
|
|
262
243
|
}
|
|
263
244
|
}
|
|
264
245
|
const embeddings = [];
|
|
265
|
-
const maxBatchSize = embeddingBatchSize(input.
|
|
246
|
+
const maxBatchSize = embeddingBatchSize(input.embedding.maxBatchSize);
|
|
266
247
|
const embeddingTexts = texts.map((item) => item.text);
|
|
267
248
|
const batchCount = Math.ceil(embeddingTexts.length / maxBatchSize);
|
|
268
249
|
if (batchCount === 0) {
|
|
@@ -274,7 +255,7 @@ async function buildEmbeddings(input) {
|
|
|
274
255
|
transient: true,
|
|
275
256
|
});
|
|
276
257
|
const batch = embeddingTexts.slice(offset, offset + maxBatchSize);
|
|
277
|
-
const batchEmbeddings = await input.
|
|
258
|
+
const batchEmbeddings = await input.embedding.embedBatch(batch);
|
|
278
259
|
if (batchEmbeddings.length !== batch.length) {
|
|
279
260
|
throw new Error(`expected ${batch.length} embeddings, received ${batchEmbeddings.length}`);
|
|
280
261
|
}
|
|
@@ -395,34 +376,37 @@ export async function runLocalScanEnrichment(input) {
|
|
|
395
376
|
warnings,
|
|
396
377
|
}),
|
|
397
378
|
});
|
|
398
|
-
const embeddingProgress = progress?.startPhase(0.2);
|
|
399
|
-
embeddingUpdates = await runEnrichmentStage({
|
|
400
|
-
stateStore: input.stateStore,
|
|
401
|
-
runId: input.context.runId,
|
|
402
|
-
connectionId: input.connectionId,
|
|
403
|
-
syncId,
|
|
404
|
-
mode: input.mode,
|
|
405
|
-
stage: 'embeddings',
|
|
406
|
-
inputHash,
|
|
407
|
-
now,
|
|
408
|
-
resumedStages: state.resumedStages,
|
|
409
|
-
completedStages: state.completedStages,
|
|
410
|
-
failedStages: state.failedStages,
|
|
411
|
-
compute: async () => {
|
|
412
|
-
const embeddings = await buildEmbeddings({
|
|
413
|
-
snapshot,
|
|
414
|
-
providers,
|
|
415
|
-
descriptions,
|
|
416
|
-
progress: embeddingProgress,
|
|
417
|
-
});
|
|
418
|
-
return embeddings.updates;
|
|
419
|
-
},
|
|
420
|
-
});
|
|
421
|
-
schema = snapshotToKtxEnrichedSchema(snapshot, embeddingsByColumnId(embeddingUpdates));
|
|
422
379
|
summary.dataDictionary = input.connector.sampleColumn ? 'completed' : 'skipped';
|
|
423
380
|
summary.tableDescriptions = 'completed';
|
|
424
381
|
summary.columnDescriptions = 'completed';
|
|
425
|
-
|
|
382
|
+
const embeddingProgress = progress?.startPhase(0.2);
|
|
383
|
+
const embedding = providers.embedding;
|
|
384
|
+
if (embedding) {
|
|
385
|
+
embeddingUpdates = await runEnrichmentStage({
|
|
386
|
+
stateStore: input.stateStore,
|
|
387
|
+
runId: input.context.runId,
|
|
388
|
+
connectionId: input.connectionId,
|
|
389
|
+
syncId,
|
|
390
|
+
mode: input.mode,
|
|
391
|
+
stage: 'embeddings',
|
|
392
|
+
inputHash,
|
|
393
|
+
now,
|
|
394
|
+
resumedStages: state.resumedStages,
|
|
395
|
+
completedStages: state.completedStages,
|
|
396
|
+
failedStages: state.failedStages,
|
|
397
|
+
compute: async () => {
|
|
398
|
+
const embeddings = await buildEmbeddings({
|
|
399
|
+
snapshot,
|
|
400
|
+
embedding,
|
|
401
|
+
descriptions,
|
|
402
|
+
progress: embeddingProgress,
|
|
403
|
+
});
|
|
404
|
+
return embeddings.updates;
|
|
405
|
+
},
|
|
406
|
+
});
|
|
407
|
+
schema = snapshotToKtxEnrichedSchema(snapshot, embeddingsByColumnId(embeddingUpdates));
|
|
408
|
+
summary.embeddings = 'completed';
|
|
409
|
+
}
|
|
426
410
|
}
|
|
427
411
|
let relationshipUpdate = null;
|
|
428
412
|
let relationshipProfile = null;
|
|
@@ -4,6 +4,15 @@ import { buildDefaultKtxProjectConfig } from '../project/config.js';
|
|
|
4
4
|
import { createDeterministicLocalScanEnrichmentProviders, runLocalScanEnrichment, snapshotToKtxEnrichedSchema, } from './local-enrichment.js';
|
|
5
5
|
import { createLocalScanEnrichmentProvidersFromConfig } from './local-scan.js';
|
|
6
6
|
import { createKtxConnectorCapabilities, } from './types.js';
|
|
7
|
+
function fakeScanEmbedding(options) {
|
|
8
|
+
return {
|
|
9
|
+
dimensions: options.dimensions,
|
|
10
|
+
maxBatchSize: options.maxBatchSize ?? 64,
|
|
11
|
+
async embedBatch(texts) {
|
|
12
|
+
return texts.map((_, textIndex) => Array.from({ length: options.dimensions }, (__, dimensionIndex) => textIndex + dimensionIndex));
|
|
13
|
+
},
|
|
14
|
+
};
|
|
15
|
+
}
|
|
7
16
|
const snapshot = {
|
|
8
17
|
connectionId: 'warehouse',
|
|
9
18
|
driver: 'postgres',
|
|
@@ -317,7 +326,7 @@ describe('local scan enrichment', () => {
|
|
|
317
326
|
}
|
|
318
327
|
});
|
|
319
328
|
it('honors scan relationship config when LLM proposals are disabled', async () => {
|
|
320
|
-
const providers = createDeterministicLocalScanEnrichmentProviders(
|
|
329
|
+
const providers = createDeterministicLocalScanEnrichmentProviders();
|
|
321
330
|
const generateObject = vi.fn();
|
|
322
331
|
const result = await runLocalScanEnrichment({
|
|
323
332
|
connectionId: 'warehouse',
|
|
@@ -381,7 +390,7 @@ describe('local scan enrichment', () => {
|
|
|
381
390
|
detectRelationships: false,
|
|
382
391
|
connector: failingConnector,
|
|
383
392
|
context: { runId: 'scan-run-warnings', logger },
|
|
384
|
-
providers: createDeterministicLocalScanEnrichmentProviders(
|
|
393
|
+
providers: createDeterministicLocalScanEnrichmentProviders(),
|
|
385
394
|
});
|
|
386
395
|
const codes = result.warnings.map((warning) => warning.code);
|
|
387
396
|
expect(codes).toContain('sampling_failed');
|
|
@@ -394,24 +403,23 @@ describe('local scan enrichment', () => {
|
|
|
394
403
|
// Sampling was retried 3× for each of the 2 tables = 6 calls
|
|
395
404
|
expect(failingConnector.sampleTable).toHaveBeenCalledTimes(6);
|
|
396
405
|
});
|
|
397
|
-
it('runs configured deterministic enrichment with descriptions and embeddings', async () => {
|
|
406
|
+
it('runs configured deterministic enrichment with descriptions and no embeddings', async () => {
|
|
398
407
|
const result = await runLocalScanEnrichment({
|
|
399
408
|
connectionId: 'warehouse',
|
|
400
409
|
mode: 'enriched',
|
|
401
410
|
detectRelationships: true,
|
|
402
411
|
connector: connector(),
|
|
403
412
|
context: { runId: 'scan-run-2' },
|
|
404
|
-
providers: createDeterministicLocalScanEnrichmentProviders(
|
|
413
|
+
providers: createDeterministicLocalScanEnrichmentProviders(),
|
|
405
414
|
});
|
|
406
415
|
expect(result.summary).toMatchObject({
|
|
407
416
|
dataDictionary: 'completed',
|
|
408
417
|
tableDescriptions: 'completed',
|
|
409
418
|
columnDescriptions: 'completed',
|
|
410
|
-
embeddings: '
|
|
419
|
+
embeddings: 'skipped',
|
|
411
420
|
deterministicRelationships: 'completed',
|
|
412
421
|
});
|
|
413
|
-
expect(result.embeddingUpdates).
|
|
414
|
-
expect(result.embeddingUpdates[0]?.embedding).toHaveLength(6);
|
|
422
|
+
expect(result.embeddingUpdates).toEqual([]);
|
|
415
423
|
expect(result.snapshot).toEqual(snapshot);
|
|
416
424
|
expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
|
|
417
425
|
});
|
|
@@ -470,7 +478,7 @@ describe('local scan enrichment', () => {
|
|
|
470
478
|
mode: 'enriched',
|
|
471
479
|
connector: scanConnector,
|
|
472
480
|
context: { runId: 'scan-run-concurrent-descriptions' },
|
|
473
|
-
providers: createDeterministicLocalScanEnrichmentProviders(
|
|
481
|
+
providers: createDeterministicLocalScanEnrichmentProviders(),
|
|
474
482
|
relationshipSettings: settings,
|
|
475
483
|
});
|
|
476
484
|
expect(maxActiveColumnSamples).toBe(6);
|
|
@@ -491,7 +499,10 @@ describe('local scan enrichment', () => {
|
|
|
491
499
|
detectRelationships: true,
|
|
492
500
|
connector: connector(),
|
|
493
501
|
context: { runId: 'scan-run-progress', progress },
|
|
494
|
-
providers:
|
|
502
|
+
providers: {
|
|
503
|
+
...createDeterministicLocalScanEnrichmentProviders(),
|
|
504
|
+
embedding: fakeScanEmbedding({ dimensions: 6 }),
|
|
505
|
+
},
|
|
495
506
|
});
|
|
496
507
|
expect(events).toEqual(expect.arrayContaining([
|
|
497
508
|
expect.objectContaining({ message: 'Generating descriptions 1/2 tables', transient: true }),
|
|
@@ -555,7 +566,7 @@ describe('local scan enrichment', () => {
|
|
|
555
566
|
...connector(),
|
|
556
567
|
introspect: vi.fn(async () => manyColumnSnapshot),
|
|
557
568
|
};
|
|
558
|
-
const deterministicProviders = createDeterministicLocalScanEnrichmentProviders(
|
|
569
|
+
const deterministicProviders = createDeterministicLocalScanEnrichmentProviders();
|
|
559
570
|
const embedBatch = vi.fn(async (texts) => {
|
|
560
571
|
if (texts.length > 2) {
|
|
561
572
|
throw new Error(`Embedding batch size ${texts.length} exceeds maximum 2`);
|
|
@@ -583,7 +594,10 @@ describe('local scan enrichment', () => {
|
|
|
583
594
|
it('reuses completed description and embedding stages for the same run id and snapshot hash', async () => {
|
|
584
595
|
const stateStore = memoryEnrichmentStateStore();
|
|
585
596
|
const scanConnector = connector();
|
|
586
|
-
const providers =
|
|
597
|
+
const providers = {
|
|
598
|
+
...createDeterministicLocalScanEnrichmentProviders(),
|
|
599
|
+
embedding: fakeScanEmbedding({ dimensions: 6 }),
|
|
600
|
+
};
|
|
587
601
|
const first = await runLocalScanEnrichment({
|
|
588
602
|
connectionId: 'warehouse',
|
|
589
603
|
mode: 'enriched',
|
|
@@ -593,7 +607,7 @@ describe('local scan enrichment', () => {
|
|
|
593
607
|
providers,
|
|
594
608
|
stateStore,
|
|
595
609
|
syncId: 'sync-resume-1',
|
|
596
|
-
providerIdentity: { provider: '
|
|
610
|
+
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
|
|
597
611
|
});
|
|
598
612
|
const generateText = vi.spyOn(providers.llmRuntime, 'generateText');
|
|
599
613
|
const embedBatch = vi.spyOn(providers.embedding, 'embedBatch');
|
|
@@ -606,7 +620,7 @@ describe('local scan enrichment', () => {
|
|
|
606
620
|
providers,
|
|
607
621
|
stateStore,
|
|
608
622
|
syncId: 'sync-resume-1',
|
|
609
|
-
providerIdentity: { provider: '
|
|
623
|
+
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
|
|
610
624
|
});
|
|
611
625
|
expect(first.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
|
|
612
626
|
expect(first.state.resumedStages).toEqual([]);
|
|
@@ -620,7 +634,10 @@ describe('local scan enrichment', () => {
|
|
|
620
634
|
});
|
|
621
635
|
it('does not reuse completed stages when the snapshot changes', async () => {
|
|
622
636
|
const stateStore = memoryEnrichmentStateStore();
|
|
623
|
-
const providers =
|
|
637
|
+
const providers = {
|
|
638
|
+
...createDeterministicLocalScanEnrichmentProviders(),
|
|
639
|
+
embedding: fakeScanEmbedding({ dimensions: 6 }),
|
|
640
|
+
};
|
|
624
641
|
const scanConnector = connector();
|
|
625
642
|
await runLocalScanEnrichment({
|
|
626
643
|
connectionId: 'warehouse',
|
|
@@ -631,7 +648,7 @@ describe('local scan enrichment', () => {
|
|
|
631
648
|
providers,
|
|
632
649
|
stateStore,
|
|
633
650
|
syncId: 'sync-resume-hash',
|
|
634
|
-
providerIdentity: { provider: '
|
|
651
|
+
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
|
|
635
652
|
});
|
|
636
653
|
const firstTable = snapshot.tables[0];
|
|
637
654
|
if (!firstTable) {
|
|
@@ -654,7 +671,7 @@ describe('local scan enrichment', () => {
|
|
|
654
671
|
providers,
|
|
655
672
|
stateStore,
|
|
656
673
|
syncId: 'sync-resume-hash',
|
|
657
|
-
providerIdentity: { provider: '
|
|
674
|
+
providerIdentity: { provider: 'fake', embeddingDimensions: 6 },
|
|
658
675
|
});
|
|
659
676
|
expect(result.state.resumedStages).toEqual([]);
|
|
660
677
|
expect(result.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
|
|
@@ -749,8 +766,8 @@ describe('local scan enrichment', () => {
|
|
|
749
766
|
createKtxEmbeddingProvider: createKtxEmbeddingProvider,
|
|
750
767
|
env: { OPENAI_API_KEY: 'openai-key' }, // pragma: allowlist secret
|
|
751
768
|
});
|
|
752
|
-
expect(providers?.embedding
|
|
753
|
-
expect(providers?.embedding
|
|
769
|
+
expect(providers?.embedding?.dimensions).toBe(1536);
|
|
770
|
+
expect(providers?.embedding?.maxBatchSize).toBe(8);
|
|
754
771
|
expect(createKtxLlmProvider).toHaveBeenCalledWith(expect.objectContaining({ backend: 'gateway', modelSlots: { default: 'provider/language-model' } }));
|
|
755
772
|
expect(createKtxEmbeddingProvider).toHaveBeenCalledWith(expect.objectContaining({ backend: 'openai', model: 'provider/embedding-model' }));
|
|
756
773
|
});
|
|
@@ -916,7 +916,7 @@ describe('local scan', () => {
|
|
|
916
916
|
expect(persistedReport).toContain('postgres://reader:<redacted>@example.test/db');
|
|
917
917
|
expect(persistedReport).not.toContain('postgres://reader:secret@example.test/db'); // pragma: allowlist secret
|
|
918
918
|
});
|
|
919
|
-
it('runs enriched scans when deterministic standalone enrichment is configured', async () => {
|
|
919
|
+
it('runs enriched scans when deterministic standalone enrichment is configured without embeddings', async () => {
|
|
920
920
|
await writeFile(join(project.projectDir, 'ktx.yaml'), [
|
|
921
921
|
'connections:',
|
|
922
922
|
' warehouse:',
|
|
@@ -995,10 +995,9 @@ describe('local scan', () => {
|
|
|
995
995
|
expect(result.report.mode).toBe('enriched');
|
|
996
996
|
expect(result.report.enrichment.tableDescriptions).toBe('completed');
|
|
997
997
|
expect(result.report.enrichment.columnDescriptions).toBe('completed');
|
|
998
|
-
expect(result.report.enrichment.embeddings).toBe('
|
|
998
|
+
expect(result.report.enrichment.embeddings).toBe('skipped');
|
|
999
999
|
expect(result.report.artifactPaths.enrichmentArtifacts).toEqual([
|
|
1000
1000
|
'raw-sources/warehouse/live-database/2026-04-29-091500-scan-enriched-1/enrichment/descriptions.json',
|
|
1001
|
-
'raw-sources/warehouse/live-database/2026-04-29-091500-scan-enriched-1/enrichment/embeddings.json',
|
|
1002
1001
|
'raw-sources/warehouse/live-database/2026-04-29-091500-scan-enriched-1/enrichment/relationships.json',
|
|
1003
1002
|
'raw-sources/warehouse/live-database/2026-04-29-091500-scan-enriched-1/enrichment/relationship-profile.json',
|
|
1004
1003
|
'raw-sources/warehouse/live-database/2026-04-29-091500-scan-enriched-1/enrichment/relationship-diagnostics.json',
|
|
@@ -50,9 +50,9 @@ export interface SlSourcesIndexPort {
|
|
|
50
50
|
searchText: string;
|
|
51
51
|
hasEmbedding: boolean;
|
|
52
52
|
}>>;
|
|
53
|
-
deleteStale(connectionId: string, keepNames: string[]): Promise<
|
|
54
|
-
deleteByConnection(connectionId: string): Promise<
|
|
55
|
-
deleteByConnectionAndName(connectionId: string, sourceName: string): Promise<
|
|
53
|
+
deleteStale(connectionId: string, keepNames: string[]): Promise<number>;
|
|
54
|
+
deleteByConnection(connectionId: string): Promise<number>;
|
|
55
|
+
deleteByConnectionAndName(connectionId: string, sourceName: string): Promise<number>;
|
|
56
56
|
search(connectionId: string, queryEmbedding: number[] | null, queryText: string, limit: number, minRrfScore?: number): Promise<Array<{
|
|
57
57
|
sourceName: string;
|
|
58
58
|
rrfScore: number;
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { KtxEmbeddingPort, KtxLogger } from '../core/index.js';
|
|
2
|
+
import type { ReindexWorkResult } from '../index-sync/types.js';
|
|
2
3
|
import type { SlSourcesIndexPort } from './ports.js';
|
|
3
4
|
import type { SemanticLayerSource } from './types.js';
|
|
4
5
|
export declare function buildSemanticLayerSourceSearchText(source: SemanticLayerSource, priority?: string[]): string;
|
|
@@ -6,8 +7,8 @@ export declare class SlSearchService {
|
|
|
6
7
|
private readonly embeddingService;
|
|
7
8
|
private readonly slSourcesRepository;
|
|
8
9
|
private readonly logger;
|
|
9
|
-
constructor(embeddingService: KtxEmbeddingPort, slSourcesRepository: SlSourcesIndexPort, logger?: KtxLogger);
|
|
10
|
-
indexSources(connectionId: string, sources: SemanticLayerSource[]): Promise<
|
|
10
|
+
constructor(embeddingService: KtxEmbeddingPort | null, slSourcesRepository: SlSourcesIndexPort, logger?: KtxLogger);
|
|
11
|
+
indexSources(connectionId: string, sources: SemanticLayerSource[]): Promise<ReindexWorkResult>;
|
|
11
12
|
search(connectionId: string, query: string, limit?: number, minRrfScore?: number): Promise<Array<{
|
|
12
13
|
sourceName: string;
|
|
13
14
|
score: number;
|
|
@@ -84,64 +84,66 @@ export class SlSearchService {
|
|
|
84
84
|
this.logger = logger;
|
|
85
85
|
}
|
|
86
86
|
async indexSources(connectionId, sources) {
|
|
87
|
+
const existing = await this.slSourcesRepository.getExistingSearchTexts(connectionId);
|
|
87
88
|
if (sources.length === 0) {
|
|
88
|
-
await this.slSourcesRepository.deleteByConnection(connectionId);
|
|
89
|
-
return;
|
|
89
|
+
const deleted = await this.slSourcesRepository.deleteByConnection(connectionId);
|
|
90
|
+
return { scanned: 0, updated: 0, deleted, embeddingsRecomputed: 0, embeddingsFailed: 0 };
|
|
90
91
|
}
|
|
91
|
-
// Detect which sources actually changed by comparing search_text
|
|
92
|
-
const existing = await this.slSourcesRepository.getExistingSearchTexts(connectionId);
|
|
93
92
|
const searchTexts = sources.map((s) => this.buildSearchText(s));
|
|
93
|
+
const embeddingService = this.embeddingService;
|
|
94
94
|
const changedIndices = [];
|
|
95
|
-
for (let i = 0; i < sources.length; i
|
|
96
|
-
const
|
|
97
|
-
if (!
|
|
95
|
+
for (let i = 0; i < sources.length; i += 1) {
|
|
96
|
+
const previous = existing.get(sources[i].name);
|
|
97
|
+
if (!previous ||
|
|
98
|
+
previous.searchText !== searchTexts[i] ||
|
|
99
|
+
(embeddingService !== null && !previous.hasEmbedding)) {
|
|
98
100
|
changedIndices.push(i);
|
|
99
101
|
}
|
|
100
102
|
}
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
103
|
+
let changedEmbeddings = changedIndices.map(() => null);
|
|
104
|
+
let embeddingsRecomputed = 0;
|
|
105
|
+
let embeddingsFailed = 0;
|
|
106
|
+
if (embeddingService && changedIndices.length > 0) {
|
|
107
|
+
try {
|
|
108
|
+
const changedTexts = changedIndices.map((index) => searchTexts[index]);
|
|
109
|
+
const allEmbeddings = [];
|
|
110
|
+
for (let i = 0; i < changedTexts.length; i += embeddingService.maxBatchSize) {
|
|
111
|
+
const batch = changedTexts.slice(i, i + embeddingService.maxBatchSize);
|
|
112
|
+
allEmbeddings.push(...(await embeddingService.computeEmbeddingsBulk(batch)));
|
|
113
|
+
}
|
|
114
|
+
changedEmbeddings = allEmbeddings;
|
|
115
|
+
embeddingsRecomputed = allEmbeddings.length;
|
|
116
|
+
}
|
|
117
|
+
catch (error) {
|
|
118
|
+
this.logger.warn(`Failed to compute SL source embeddings: ${error instanceof Error ? error.message : String(error)}`);
|
|
119
|
+
embeddingsFailed = changedIndices.length;
|
|
118
120
|
}
|
|
119
|
-
changedEmbeddings = allEmbeddings;
|
|
120
|
-
}
|
|
121
|
-
catch (error) {
|
|
122
|
-
this.logger.warn(`Failed to compute SL source embeddings: ${error instanceof Error ? error.message : String(error)}`);
|
|
123
|
-
changedEmbeddings = changedIndices.map(() => null);
|
|
124
121
|
}
|
|
125
|
-
const rows = changedIndices.map((
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
};
|
|
131
|
-
});
|
|
122
|
+
const rows = changedIndices.map((sourceIndex, embeddingIndex) => ({
|
|
123
|
+
sourceName: sources[sourceIndex].name,
|
|
124
|
+
searchText: searchTexts[sourceIndex],
|
|
125
|
+
embedding: changedEmbeddings[embeddingIndex] ?? null,
|
|
126
|
+
}));
|
|
132
127
|
await this.slSourcesRepository.upsertSources(connectionId, rows);
|
|
133
|
-
|
|
134
|
-
const
|
|
135
|
-
|
|
136
|
-
|
|
128
|
+
const keepNames = sources.map((source) => source.name);
|
|
129
|
+
const deleted = await this.slSourcesRepository.deleteStale(connectionId, keepNames);
|
|
130
|
+
return {
|
|
131
|
+
scanned: sources.length,
|
|
132
|
+
updated: changedIndices.length,
|
|
133
|
+
deleted,
|
|
134
|
+
embeddingsRecomputed,
|
|
135
|
+
embeddingsFailed,
|
|
136
|
+
};
|
|
137
137
|
}
|
|
138
138
|
async search(connectionId, query, limit = 15, minRrfScore = 0) {
|
|
139
139
|
let queryEmbedding = null;
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
140
|
+
if (this.embeddingService) {
|
|
141
|
+
try {
|
|
142
|
+
queryEmbedding = await this.embeddingService.computeEmbedding(query);
|
|
143
|
+
}
|
|
144
|
+
catch (error) {
|
|
145
|
+
this.logger.warn(`Failed to compute query embedding, falling back to FTS + trigram: ${error instanceof Error ? error.message : String(error)}`);
|
|
146
|
+
}
|
|
145
147
|
}
|
|
146
148
|
const results = await this.slSourcesRepository.search(connectionId, queryEmbedding, query, limit, minRrfScore);
|
|
147
149
|
return results.map((result) => ({
|