@memberjunction/db-auto-doc 5.37.0 → 5.39.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -0
- package/dist/core/AnalysisOrchestrator.d.ts.map +1 -1
- package/dist/core/AnalysisOrchestrator.js +32 -2
- package/dist/core/AnalysisOrchestrator.js.map +1 -1
- package/dist/discovery/BridgeViewSQLGenerator.d.ts +67 -0
- package/dist/discovery/BridgeViewSQLGenerator.d.ts.map +1 -0
- package/dist/discovery/BridgeViewSQLGenerator.js +99 -0
- package/dist/discovery/BridgeViewSQLGenerator.js.map +1 -0
- package/dist/discovery/ColumnClusterer.d.ts +63 -0
- package/dist/discovery/ColumnClusterer.d.ts.map +1 -0
- package/dist/discovery/ColumnClusterer.js +205 -0
- package/dist/discovery/ColumnClusterer.js.map +1 -0
- package/dist/discovery/ColumnNormalizer.d.ts +106 -0
- package/dist/discovery/ColumnNormalizer.d.ts.map +1 -0
- package/dist/discovery/ColumnNormalizer.js +376 -0
- package/dist/discovery/ColumnNormalizer.js.map +1 -0
- package/dist/discovery/Composer.d.ts +59 -0
- package/dist/discovery/Composer.d.ts.map +1 -0
- package/dist/discovery/Composer.js +95 -0
- package/dist/discovery/Composer.js.map +1 -0
- package/dist/discovery/EmbeddingProvider.d.ts +27 -0
- package/dist/discovery/EmbeddingProvider.d.ts.map +1 -0
- package/dist/discovery/EmbeddingProvider.js +87 -0
- package/dist/discovery/EmbeddingProvider.js.map +1 -0
- package/dist/discovery/FKGraphWalker.d.ts +108 -0
- package/dist/discovery/FKGraphWalker.d.ts.map +1 -0
- package/dist/discovery/FKGraphWalker.js +169 -0
- package/dist/discovery/FKGraphWalker.js.map +1 -0
- package/dist/discovery/OrganicKeyDetector.d.ts +51 -0
- package/dist/discovery/OrganicKeyDetector.d.ts.map +1 -0
- package/dist/discovery/OrganicKeyDetector.js +78 -0
- package/dist/discovery/OrganicKeyDetector.js.map +1 -0
- package/dist/discovery/OrganicKeyTranslator.d.ts +78 -0
- package/dist/discovery/OrganicKeyTranslator.d.ts.map +1 -0
- package/dist/discovery/OrganicKeyTranslator.js +166 -0
- package/dist/discovery/OrganicKeyTranslator.js.map +1 -0
- package/dist/discovery/SemanticPhase.d.ts +70 -0
- package/dist/discovery/SemanticPhase.d.ts.map +1 -0
- package/dist/discovery/SemanticPhase.js +423 -0
- package/dist/discovery/SemanticPhase.js.map +1 -0
- package/dist/discovery/StructuralPhase.d.ts +24 -0
- package/dist/discovery/StructuralPhase.d.ts.map +1 -0
- package/dist/discovery/StructuralPhase.js +23 -0
- package/dist/discovery/StructuralPhase.js.map +1 -0
- package/dist/discovery/TransitiveBridgeDetector.d.ts +65 -0
- package/dist/discovery/TransitiveBridgeDetector.d.ts.map +1 -0
- package/dist/discovery/TransitiveBridgeDetector.js +244 -0
- package/dist/discovery/TransitiveBridgeDetector.js.map +1 -0
- package/dist/generators/AdditionalSchemaInfoGenerator.d.ts +12 -0
- package/dist/generators/AdditionalSchemaInfoGenerator.d.ts.map +1 -1
- package/dist/generators/AdditionalSchemaInfoGenerator.js +31 -0
- package/dist/generators/AdditionalSchemaInfoGenerator.js.map +1 -1
- package/dist/types/config.d.ts +71 -0
- package/dist/types/config.d.ts.map +1 -1
- package/dist/types/config.js.map +1 -1
- package/dist/types/organic-keys.d.ts +141 -0
- package/dist/types/organic-keys.d.ts.map +1 -0
- package/dist/types/organic-keys.js +27 -0
- package/dist/types/organic-keys.js.map +1 -0
- package/dist/types/state.d.ts +7 -0
- package/dist/types/state.d.ts.map +1 -1
- package/dist/utils/json.d.ts +40 -0
- package/dist/utils/json.d.ts.map +1 -0
- package/dist/utils/json.js +141 -0
- package/dist/utils/json.js.map +1 -0
- package/package.json +5 -5
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ColumnClusterer — average-linkage agglomerative clustering on cosine distance.
|
|
3
|
+
*
|
|
4
|
+
* Input: N columns each with a unit-normalized embedding vector.
|
|
5
|
+
* Output: clusters of ≥2 members spanning ≥2 distinct tables, sorted biggest-first.
|
|
6
|
+
*
|
|
7
|
+
* Algorithm:
|
|
8
|
+
* 1. Compute the full N×N upper-triangular pairwise cosine-distance matrix.
|
|
9
|
+
* 2. Repeatedly find the closest two clusters; if their distance ≤ mergeThreshold,
|
|
10
|
+
* merge them and update the distance to every other cluster as the AVERAGE
|
|
11
|
+
* pairwise distance between their members. Stop when the closest pair exceeds
|
|
12
|
+
* the threshold.
|
|
13
|
+
* 3. Filter clusters by minClusterSize + minDistinctTables.
|
|
14
|
+
*
|
|
15
|
+
* Why average-linkage:
|
|
16
|
+
* Complete-linkage requires EVERY pair across two clusters to be close before
|
|
17
|
+
* merging — too tight in practice. Single-linkage chains unrelated points
|
|
18
|
+
* through bridges. Average-linkage merges when clusters are similar on average,
|
|
19
|
+
* which matches the actual "cluster = one concept" question.
|
|
20
|
+
*/
|
|
21
|
+
/**
 * Average-linkage agglomerative clusterer over column embeddings.
 *
 * Options read from `opts`:
 *   - minClusterSize           : clusters with fewer members are dropped
 *   - minDistinctTables        : clusters spanning fewer distinct `schema.table`
 *                                pairs are dropped
 *   - mergeThreshold           : optional fixed cosine-distance cutoff
 *   - mergeThresholdPercentile : percentile handed to the auto-calibration when
 *                                no fixed cutoff is supplied (defaults to 5)
 */
export class ColumnClusterer {
    /**
     * @param {object} opts Clustering options (see the class comment).
     */
    constructor(opts) {
        this.opts = opts;
        // Threshold applied by the most recent cluster() call; 0 until then.
        this.lastResolvedThreshold = 0;
    }
    /**
     * Cluster `columns` by embedding similarity.
     *
     * Each column must carry `schema`, `table`, `column` and `embedding`;
     * `participatesInFK`, `fkTarget` and `isPrimaryKey` are copied through when
     * present.
     *
     * @param {Array<object>} columns Columns to cluster.
     * @returns {Array<{memberIndexes: number[], members: object[], maxIntraDistance: number}>}
     *   Clusters that pass the size and distinct-table filters, largest first.
     */
    cluster(columns) {
        const n = columns.length;
        const fixedThreshold = this.opts.mergeThreshold;
        if (n < this.opts.minClusterSize) {
            // Too few columns to form any cluster. Still record the threshold
            // for THIS call so a reused instance never reports the value left
            // over from an earlier run (no distances examined => 0 when the
            // threshold would have been auto-calibrated).
            this.lastResolvedThreshold = fixedThreshold !== undefined ? fixedThreshold : 0;
            return [];
        }
        // Step 1 — pairwise cosine-distance matrix.
        const distance = computePairwiseDistance(columns);
        const resolvedThreshold = fixedThreshold !== undefined
            ? fixedThreshold
            : computeThresholdFromDistribution(distance, this.opts.mergeThresholdPercentile ?? 5);
        this.lastResolvedThreshold = resolvedThreshold;
        // Step 2 — agglomerative merge with average-linkage.
        // clusters: cluster id -> member column indexes (every column starts alone).
        const clusters = new Map();
        for (let i = 0; i < n; i++)
            clusters.set(i, [i]);
        // avgDist: cluster id -> (other cluster id -> average pairwise distance).
        const avgDist = new Map();
        for (let i = 0; i < n; i++) {
            const row = new Map();
            for (let j = 0; j < n; j++) {
                if (i !== j)
                    row.set(j, distanceAt(distance, n, i, j));
            }
            avgDist.set(i, row);
        }
        while (clusters.size > 1) {
            // Find the closest pair; `a < b` visits each unordered pair once.
            let bestA = -1;
            let bestB = -1;
            let bestD = Number.POSITIVE_INFINITY;
            for (const [a, aRow] of avgDist) {
                for (const [b, d] of aRow) {
                    if (a < b && d < bestD) {
                        bestD = d;
                        bestA = a;
                        bestB = b;
                    }
                }
            }
            if (bestD > resolvedThreshold || bestA < 0)
                break;
            const membersA = clusters.get(bestA);
            const membersB = clusters.get(bestB);
            const sizeA = membersA.length;
            const sizeB = membersB.length;
            const rowA = avgDist.get(bestA);
            const rowB = avgDist.get(bestB);
            // Fold B into A. The average distance from the merged cluster to any
            // other cluster C is the size-weighted mean of d(A,C) and d(B,C).
            for (const c of clusters.keys()) {
                if (c === bestA || c === bestB)
                    continue;
                const dAC = rowA.get(c);
                const dBC = rowB.get(c);
                const newD = (sizeA * dAC + sizeB * dBC) / (sizeA + sizeB);
                rowA.set(c, newD);
                avgDist.get(c).set(bestA, newD);
                avgDist.get(c).delete(bestB);
            }
            rowA.delete(bestB);
            avgDist.delete(bestB);
            clusters.set(bestA, membersA.concat(membersB));
            clusters.delete(bestB);
        }
        // Step 3 — filter + emit.
        const out = [];
        for (const memberIdxs of clusters.values()) {
            if (memberIdxs.length < this.opts.minClusterSize)
                continue;
            const tableSet = new Set(memberIdxs.map((i) => `${columns[i].schema}.${columns[i].table}`));
            if (tableSet.size < this.opts.minDistinctTables)
                continue;
            const members = memberIdxs.map((i) => ({
                schema: columns[i].schema,
                table: columns[i].table,
                column: columns[i].column,
                participatesInFK: !!columns[i].participatesInFK,
                fkTarget: columns[i].fkTarget ?? null,
                isPrimaryKey: !!columns[i].isPrimaryKey,
            }));
            out.push({
                memberIndexes: memberIdxs,
                members,
                maxIntraDistance: maxIntraDistance(memberIdxs, distance, n),
            });
        }
        // Biggest clusters first.
        out.sort((a, b) => b.members.length - a.members.length);
        return out;
    }
}
|
|
111
|
+
// ─── Helpers ────────────────────────────────────────────────────────────────
|
|
112
|
+
/**
 * Build the flattened upper-triangular matrix of pairwise cosine distances.
 *
 * The distance is `1 - dot(ei, ej)`, which equals cosine distance only when the
 * embeddings are unit-normalized (as the file header states they are).
 * Negative results (dot product slightly above 1) are clamped to 0.
 *
 * A non-finite dot product (for instance an embedding containing NaN) is stored
 * as distance 1, i.e. "no evidence of similarity". Without that guard NaN fails
 * the `> 0` test and would be recorded as 0 — "identical" — forcing bogus merges.
 *
 * @param {Array<{embedding: ArrayLike<number>}>} columns Columns to compare.
 * @returns {Float32Array} n*(n-1)/2 distances for pairs (i, j) with i < j, in
 *   row-major order.
 */
function computePairwiseDistance(columns) {
    const n = columns.length;
    const len = (n * (n - 1)) / 2;
    const out = new Float32Array(len);
    // Pairs are produced in exactly the flattened upper-triangular order
    // ((0,1), (0,2), ..., (1,2), ...), so a running cursor addresses the
    // same slot a per-pair index computation would.
    let cursor = 0;
    for (let i = 0; i < n; i++) {
        const ei = columns[i].embedding;
        for (let j = i + 1; j < n; j++) {
            const ej = columns[j].embedding;
            // If the vectors differ in length only the shared prefix is compared.
            const dim = Math.min(ei.length, ej.length);
            let dot = 0;
            for (let k = 0; k < dim; k++)
                dot += ei[k] * ej[k];
            const d = 1 - dot;
            if (!Number.isFinite(d)) {
                out[cursor] = 1;
            }
            else {
                out[cursor] = d > 0 ? d : 0;
            }
            cursor++;
        }
    }
    return out;
}
|
|
130
|
+
/**
 * Position of pair (i, j), with i < j, inside the flattened upper-triangular
 * array of an n×n matrix that omits the diagonal.
 */
function flatIndex(i, j, n) {
    // Number of entries contributed by rows 0..i-1.
    const rowStart = i * n - (i * (i + 1)) / 2;
    // Offset of column j within row i (the row begins at column i + 1).
    const offsetInRow = j - i - 1;
    return rowStart + offsetInRow;
}
|
|
133
|
+
/**
 * Auto-calibrate the merge threshold from the actual distance distribution.
 *
 * Real embedding-distance distributions for "same vs different concept" data
 * are usually BIMODAL: a dense cluster of small distances (same concept) and
 * a dense cluster of large distances (different concept), with a clear gap
 * between them. A flat percentile (p5) breaks in the degenerate cases where
 * many descriptions land exactly identically — p5 becomes 0 and the merge
 * loop refuses to merge anything that isn't byte-equal.
 *
 * Robust approach: find the largest gap between consecutive sorted distances
 * whose lower edge is at most 0.40 (the "same-concept territory"), and place
 * the threshold at the midpoint of that gap so all same-concept pairs merge
 * and no different-concept pair does.
 *
 * Fallback when no gap of at least 0.03 exists: use the pN percentile (legacy
 * behavior). Either way the result is floored at 0.05 so near-identical
 * descriptions still merge even when the geometry is degenerate.
 *
 * @param {ArrayLike<number>} distances Pairwise cosine distances (not modified).
 * @param {number} [percentile=5] Percentile (0-100, clamped) for the fallback.
 *   A value that is not a finite number falls back to 5 rather than producing
 *   a NaN threshold.
 * @returns {number} The merge threshold, or 0 when `distances` is empty.
 */
function computeThresholdFromDistribution(distances, percentile = 5) {
    if (distances.length === 0)
        return 0;
    const sorted = Array.from(distances).sort((a, b) => a - b);
    const N = sorted.length;
    // Search for the largest gap whose START is in the "same-concept territory"
    // (≤0.40 cosine distance). Anything above 0.40 is already inter-concept, and
    // the gaps up there are irrelevant noise.
    //
    // For Gemini clustering embeddings, same-concept pairs concentrate ≤0.15
    // and different-concept pairs ≥0.30, so the boundary gap lives somewhere in
    // [0.05, 0.35]. Searching that window finds it whether the intra-concept
    // pairs are 10% or 60% of all pairs.
    const GAP_START_MAX = 0.40;
    const MIN_MEANINGFUL_GAP = 0.03;
    const FLOOR = 0.05;
    const DEFAULT_PERCENTILE = 5;
    let bestGap = 0;
    let bestGapStart = 0;
    for (let i = 0; i < N - 1; i++) {
        if (sorted[i] > GAP_START_MAX)
            break;
        const gap = sorted[i + 1] - sorted[i];
        if (gap > bestGap) {
            bestGap = gap;
            bestGapStart = sorted[i];
        }
    }
    if (bestGap >= MIN_MEANINGFUL_GAP) {
        return Math.max(FLOOR, bestGapStart + bestGap * 0.5);
    }
    // No clear gap — use the pN percentile, floored to merge near-identical pairs.
    // A non-finite percentile would otherwise turn into a NaN threshold, and every
    // `distance > NaN` comparison in the merge loop is false, so everything merges.
    const numericPercentile = Number(percentile);
    const requested = Number.isFinite(numericPercentile) ? numericPercentile : DEFAULT_PERCENTILE;
    const p = Math.max(0, Math.min(100, requested));
    const idx = Math.floor((N * p) / 100);
    return Math.max(FLOOR, sorted[Math.min(idx, N - 1)]);
}
|
|
187
|
+
/**
 * Symmetric lookup into the flattened upper-triangular distance matrix.
 * The diagonal is not stored and reads as 0; (i, j) and (j, i) resolve to the
 * same entry.
 */
function distanceAt(matrix, n, i, j) {
    if (i === j) {
        return 0;
    }
    // Only pairs with row < column are stored, so order the indexes first.
    const row = i > j ? j : i;
    const col = i > j ? i : j;
    return matrix[flatIndex(row, col, n)];
}
|
|
194
|
+
/**
 * Largest pairwise distance between any two members of one cluster.
 * Returns 0 for clusters with fewer than two members.
 */
function maxIntraDistance(memberIdxs, matrix, n) {
    let widest = 0;
    for (const [pos, first] of memberIdxs.entries()) {
        // Compare only against later members so each unordered pair is seen once.
        for (let q = pos + 1; q < memberIdxs.length; q++) {
            const separation = distanceAt(matrix, n, first, memberIdxs[q]);
            widest = separation > widest ? separation : widest;
        }
    }
    return widest;
}
|
|
205
|
+
//# sourceMappingURL=ColumnClusterer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ColumnClusterer.js","sourceRoot":"","sources":["../../src/discovery/ColumnClusterer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAsCH,MAAM,OAAO,eAAe;IAGxB,YAA6B,IAAsB;QAAtB,SAAI,GAAJ,IAAI,CAAkB;QAF5C,0BAAqB,GAAG,CAAC,CAAC;IAEqB,CAAC;IAEhD,OAAO,CAAC,OAA+B;QAC1C,MAAM,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QACzB,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc;YAAE,OAAO,EAAE,CAAC;QAE5C,4CAA4C;QAC5C,MAAM,QAAQ,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;QAClD,MAAM,iBAAiB,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc,KAAK,SAAS;YAC5D,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,cAAc;YAC1B,CAAC,CAAC,gCAAgC,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,wBAAwB,IAAI,CAAC,CAAC,CAAC;QAC1F,IAAI,CAAC,qBAAqB,GAAG,iBAAiB,CAAC;QAE/C,qDAAqD;QACrD,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAoB,CAAC;QAC7C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;YAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAEjD,MAAM,OAAO,GAAG,IAAI,GAAG,EAA+B,CAAC;QACvD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YACzB,MAAM,GAAG,GAAG,IAAI,GAAG,EAAkB,CAAC;YACtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;gBACzB,IAAI,CAAC,KAAK,CAAC;oBAAE,GAAG,CAAC,GAAG,CAAC,CAAC,EAAE,UAAU,CAAC,QAAQ,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;YAC3D,CAAC;YACD,OAAO,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;QACxB,CAAC;QAED,OAAO,QAAQ,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;YACvB,IAAI,KAAK,GAAG,CAAC,CAAC,CAAC;YACf,IAAI,KAAK,GAAG,CAAC,CAAC,CAAC;YACf,IAAI,KAAK,GAAG,MAAM,CAAC,iBAAiB,CAAC;YACrC,KAAK,MAAM,CAAC,CAAC,EAAE,IAAI,CAAC,IAAI,OAAO,EAAE,CAAC;gBAC9B,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,IAAI,EAAE,CAAC;oBACxB,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,KAAK,EAAE,CAAC;wBACrB,KAAK,GAAG,CAAC,CAAC;wBACV,KAAK,GAAG,CAAC,CAAC;wBACV,KAAK,GAAG,CAAC,CAAC;oBACd,CAAC;gBACL,CAAC;YACL,CAAC;YACD,IAAI,KAAK,GAAG,iBAAiB,IAAI,KAAK,GAAG,CAAC;gBAAE,MAAM;YAElD,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC,KAAK,CAAE,CAAC;YACtC,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC,KAAK,CAAE,CAAC;YACtC,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC;YAC9B,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC;YAC9B,
MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,KAAK,CAAE,CAAC;YACjC,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,KAAK,CAAE,CAAC;YACjC,KAAK,MAAM,CAAC,IAAI,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC;gBAC9B,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,KAAK;oBAAE,SAAS;gBACzC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAE,CAAC;gBACzB,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAE,CAAC;gBACzB,MAAM,IAAI,GAAG,CAAC,KAAK,GAAG,GAAG,GAAG,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,KAAK,GAAG,KAAK,CAAC,CAAC;gBAC3D,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;gBAClB,OAAO,CAAC,GAAG,CAAC,CAAC,CAAE,CAAC,GAAG,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;gBACjC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAE,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAClC,CAAC;YACD,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACnB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAEtB,QAAQ,CAAC,GAAG,CAAC,KAAK,EAAE,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC;YAC/C,QAAQ,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAC3B,CAAC;QAED,0BAA0B;QAC1B,MAAM,GAAG,GAAiB,EAAE,CAAC;QAC7B,KAAK,MAAM,UAAU,IAAI,QAAQ,CAAC,MAAM,EAAE,EAAE,CAAC;YACzC,IAAI,UAAU,CAAC,MAAM,GAAG,IAAI,CAAC,IAAI,CAAC,cAAc;gBAAE,SAAS;YAC3D,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;YAC5F,IAAI,QAAQ,CAAC,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,iBAAiB;gBAAE,SAAS;YAC1D,MAAM,OAAO,GAA8B,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBAC9D,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,MAAM;gBACzB,KAAK,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK;gBACvB,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,MAAM;gBACzB,gBAAgB,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,gBAAgB;gBAC/C,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,IAAI,IAAI;gBACrC,YAAY,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,YAAY;aAC1C,CAAC,CAAC,CAAC;YACJ,GAAG,CAAC,IAAI,CAAC;gBACL,aAAa,EAAE,UAAU;gBACzB,OAAO;gBACP,gBAAgB,EAAE,gBAAgB,CAAC,UAAU,EAAE,QAAQ,EAAE,CAAC,CAAC;aAC9D,CAAC,CAAC;QACP,CAAC;QACD,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;QACxD,OAAO,GAAG,CAAC;IACf,CAAC;CACJ;AAED,+EAA+E;AAE/E,SAAS,uBAA
uB,CAAC,OAA+B;IAC5D,MAAM,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;IACzB,MAAM,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;IAC9B,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,GAAG,CAAC,CAAC;IAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACzB,MAAM,EAAE,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QAChC,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7B,MAAM,EAAE,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAChC,IAAI,CAAC,GAAG,CAAC,CAAC;YACV,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,CAAC;YAC3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE;gBAAE,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;YACjD,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAChB,GAAG,CAAC,SAAS,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC5C,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACf,CAAC;AAED,SAAS,SAAS,CAAC,CAAS,EAAE,CAAS,EAAE,CAAS;IAC9C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;AACnD,CAAC;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,SAAS,gCAAgC,CAAC,SAAuB,EAAE,UAAkB;IACjF,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACrC,MAAM,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC3D,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;IAExB,2EAA2E;IAC3E,2EAA2E;IAC3E,qEAAqE;IACrE,EAAE;IACF,yEAAyE;IACzE,4EAA4E;IAC5E,yEAAyE;IACzE,qCAAqC;IACrC,MAAM,aAAa,GAAG,IAAI,CAAC;IAC3B,MAAM,kBAAkB,GAAG,IAAI,CAAC;IAChC,MAAM,KAAK,GAAG,IAAI,CAAC;IAEnB,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,IAAI,YAAY,GAAG,CAAC,CAAC;IACrB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7B,IAAI,MAAM,CAAC,CAAC,CAAC,GAAG,aAAa;YAAE,MAAM;QACrC,MAAM,GAAG,GAAG,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QACtC,IAAI,GAAG,GAAG,OAAO,EAAE,CAAC;YAChB,OAAO,GAAG,GAAG,CAAC;YACd,YAAY,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QAC7B,CAAC;IACL,CA
AC;IAED,IAAI,OAAO,IAAI,kBAAkB,EAAE,CAAC;QAChC,OAAO,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,YAAY,GAAG,OAAO,GAAG,GAAG,CAAC,CAAC;IACzD,CAAC;IAED,+EAA+E;IAC/E,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC,CAAC;IACjD,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC;IACtC,OAAO,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;AACzD,CAAC;AAED,SAAS,UAAU,CAAC,MAAoB,EAAE,CAAS,EAAE,CAAS,EAAE,CAAS;IACrE,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACtB,IAAI,CAAC,GAAG,CAAC;QAAE,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAC3B,OAAO,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;AACtC,CAAC;AAED,SAAS,gBAAgB,CAAC,UAAoB,EAAE,MAAoB,EAAE,CAAS;IAC3E,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACzC,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,MAAM,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,UAAU,CAAC,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;YAC9D,IAAI,CAAC,GAAG,GAAG;gBAAE,GAAG,GAAG,CAAC,CAAC;QACzB,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACf,CAAC"}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TableNormalizer — one LLM call per TABLE (not per column), upstream of embedding.
|
|
3
|
+
*
|
|
4
|
+
* For each in-scope table, a single LLM call sees:
|
|
5
|
+
* - The table's name + description + sibling columns
|
|
6
|
+
* - Every column's identity + description + sample values + FK/PK status
|
|
7
|
+
*
|
|
8
|
+
* The call returns one normalized entry per column with:
|
|
9
|
+
* - conceptName : canonical snake_case (`email_address`, `customer_id`, ...)
|
|
10
|
+
* - normalizationStrategy : how values should be compared
|
|
11
|
+
* - normalizedDescription : business-concept-focused, system-agnostic sentence
|
|
12
|
+
* - isUsefulOrganicKey : false for audit/system/free-form (filtered out)
|
|
13
|
+
* - confidence + reasoning
|
|
14
|
+
*
|
|
15
|
+
* Why per-table instead of per-column:
|
|
16
|
+
* - Fewer calls (5K cols across 500 tables → 500 calls instead of 5K). At
|
|
17
|
+
* Gemini Flash pricing, that's ~$0.04 instead of ~$0.20 for APTIFY-scale.
|
|
18
|
+
* - System prompt amortizes across all columns in the table.
|
|
19
|
+
* - The LLM sees siblings as context — knowing the table has FirstName + LastName
|
|
20
|
+
* next to an Email column reveals it's a person email, not a server hostname.
|
|
21
|
+
* - More token-efficient: one JSON array out instead of N independent objects.
|
|
22
|
+
*
|
|
23
|
+
* The single most important constraint: same-concept columns from DIFFERENT
|
|
24
|
+
* TABLES (across systems) must produce the same conceptName and a similar
|
|
25
|
+
* normalizedDescription so the embedding step naturally clusters them. The
|
|
26
|
+
* prompt enforces this via a canonical concept-name list.
|
|
27
|
+
*/
|
|
28
|
+
import { AIConfig } from '../types/config.js';
|
|
29
|
+
import { OrganicKeyNormalizationStrategy } from '../types/organic-keys.js';
|
|
30
|
+
/** One column's input to the normalizer. */
|
|
31
|
+
export interface NormalizerInputColumn {
    /** Schema part of the column's identity. */
    schema: string;
    /** Table part of the column's identity. */
    table: string;
    /** Column name. */
    column: string;
    /** Column data type as a string (exact format not visible in this declaration). */
    dataType: string;
    /** Original LLM-generated description from DBAutoDoc's prior analysis pass. */
    originalDescription: string;
    /** Sample values from the column's actual data. */
    sampleValues: string[];
    /** Whether the column participates in any FK (declared or detected). */
    participatesInFK: boolean;
    /** FK target, when there is one — presumably the referenced column; may be omitted or null. */
    fkTarget?: {
        schema: string;
        table: string;
        column: string;
    } | null;
    /** Whether the column is a primary key. */
    isPrimaryKey: boolean;
}
|
|
49
|
+
/** All columns of one table, batched for a single LLM call. */
|
|
50
|
+
export interface TableNormalizationInput {
    /** Schema the table belongs to. */
    schema: string;
    /** Optional description of the schema. */
    schemaDescription?: string;
    /** Table name. */
    table: string;
    /** Optional description of the table. */
    tableDescription?: string;
    /** The table's columns, sent together in one LLM call. */
    columns: NormalizerInputColumn[];
}
|
|
57
|
+
/** One column's normalized output. */
|
|
58
|
+
export interface NormalizedColumn extends NormalizerInputColumn {
    /** Canonical snake_case concept name (e.g. `email_address`, `customer_id`). */
    conceptName: string;
    /** How values of this column should be compared. */
    normalizationStrategy: OrganicKeyNormalizationStrategy;
    /** Optional custom expression — presumably used with a custom strategy; confirm against the implementation. */
    customNormalizationExpression?: string;
    /** Business-concept-focused, system-agnostic sentence describing the column. */
    normalizedDescription: string;
    /** False for audit/system/free-form columns, which are filtered out. */
    isUsefulOrganicKey: boolean;
    /** Confidence reported for this entry (scale not visible in this declaration). */
    confidence: number;
    /** Reasoning reported for this entry. */
    reasoning: string;
}
|
|
67
|
+
/** Aggregate result of normalizing many tables. */
|
|
68
|
+
export interface NormalizationBatchResult {
    /** Useful organic-key columns surviving normalization (filtered to isUsefulOrganicKey=true). */
    normalized: NormalizedColumn[];
    /** Count of columns the normalizer marked isUsefulOrganicKey=false (audit, free-form, etc.). */
    rejected: number;
    /** Count of tables where the LLM call failed entirely. */
    errors: number;
    /** Token usage — presumably summed over all table calls; confirm against the implementation. */
    tokens: {
        total: number;
        input: number;
        output: number;
    };
}
|
|
81
|
+
/** Options accepted by TableNormalizer.normalizeAll. */
export interface NormalizerOptions {
    /** Concurrency for per-table LLM calls. Default 8. */
    concurrency?: number;
    /** Max retries per table on transient failures. Default 2. */
    maxRetries?: number;
    /** Progress callback (done count, total count). */
    onProgress?: (done: number, total: number) => void;
}
|
|
89
|
+
/** Normalizes columns table-by-table: one LLM call per table, not per column. */
export declare class TableNormalizer {
    private readonly aiConfig;
    private readonly llm;
    constructor(aiConfig: AIConfig);
    /**
     * Normalize one table — one LLM call returning per-column entries.
     *
     * Resolves with the normalized columns and token usage; `errorMessage` is
     * optional and presumably set when the call failed (confirm against the
     * implementation).
     */
    normalizeTable(input: TableNormalizationInput, maxRetries?: number): Promise<{
        normalized: NormalizedColumn[];
        tokens: {
            total: number;
            input: number;
            output: number;
        };
        errorMessage?: string;
    }>;
    /** Batch normalize many tables with bounded concurrency. */
    normalizeAll(tables: TableNormalizationInput[], opts?: NormalizerOptions): Promise<NormalizationBatchResult>;
}
|
|
106
|
+
//# sourceMappingURL=ColumnNormalizer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ColumnNormalizer.d.ts","sourceRoot":"","sources":["../../src/discovery/ColumnNormalizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAIH,OAAO,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAC9C,OAAO,EAAE,+BAA+B,EAAE,MAAM,0BAA0B,CAAC;AAG3E,4CAA4C;AAC5C,MAAM,WAAW,qBAAqB;IAClC,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,+EAA+E;IAC/E,mBAAmB,EAAE,MAAM,CAAC;IAC5B,mDAAmD;IACnD,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,wEAAwE;IACxE,gBAAgB,EAAE,OAAO,CAAC;IAC1B,QAAQ,CAAC,EAAE;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI,CAAC;IACpE,YAAY,EAAE,OAAO,CAAC;CACzB;AAED,+DAA+D;AAC/D,MAAM,WAAW,uBAAuB;IACpC,MAAM,EAAE,MAAM,CAAC;IACf,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,KAAK,EAAE,MAAM,CAAC;IACd,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,OAAO,EAAE,qBAAqB,EAAE,CAAC;CACpC;AAED,sCAAsC;AACtC,MAAM,WAAW,gBAAiB,SAAQ,qBAAqB;IAC3D,WAAW,EAAE,MAAM,CAAC;IACpB,qBAAqB,EAAE,+BAA+B,CAAC;IACvD,6BAA6B,CAAC,EAAE,MAAM,CAAC;IACvC,qBAAqB,EAAE,MAAM,CAAC;IAC9B,kBAAkB,EAAE,OAAO,CAAC;IAC5B,UAAU,EAAE,MAAM,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;CACrB;AAED,mDAAmD;AACnD,MAAM,WAAW,wBAAwB;IACrC,gGAAgG;IAChG,UAAU,EAAE,gBAAgB,EAAE,CAAC;IAC/B,uFAAuF;IACvF,QAAQ,EAAE,MAAM,CAAC;IACjB,iDAAiD;IACjD,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;CAC5D;AAED,MAAM,WAAW,iBAAiB;IAC9B,sDAAsD;IACtD,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,8DAA8D;IAC9D,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,mDAAmD;IACnD,UAAU,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;CACtD;AAED,qBAAa,eAAe;IAGZ,OAAO,CAAC,QAAQ,CAAC,QAAQ;IAFrC,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAU;gBAED,QAAQ,EAAE,QAAQ;IAI/C,uEAAuE;IAC1D,cAAc,CACvB,KAAK,EAAE,uBAAuB,EAC9B,UAAU,SAAI,GACf,OAAO,CAAC;QACP,UAAU,EAAE,gBAAgB,EAAE,CAAC;QAC/B,MAAM,EAAE;YAAE,KAAK,EAAE,MAAM,CAAC;YAAC,KAAK,EAAE,MAAM,CAAC;YAAC,MAAM,EAAE,MAAM,CAAA;SAAE,CAAC;QACzD,YAAY,CAAC,EAAE,MAAM,CAAC;KACzB,CAAC;IA6EF,4DAA4D;IAC/C,YAAY,CACrB,MAAM,EAAE,uBAAuB,EAAE,EACjC,IAAI,
GAAE,iBAAsB,GAC7B,OAAO,CAAC,wBAAwB,CAAC;CAqCvC"}
|