@memberjunction/db-auto-doc 5.36.0 → 5.38.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -0
- package/dist/core/AnalysisOrchestrator.d.ts.map +1 -1
- package/dist/core/AnalysisOrchestrator.js +32 -2
- package/dist/core/AnalysisOrchestrator.js.map +1 -1
- package/dist/discovery/BridgeViewSQLGenerator.d.ts +67 -0
- package/dist/discovery/BridgeViewSQLGenerator.d.ts.map +1 -0
- package/dist/discovery/BridgeViewSQLGenerator.js +99 -0
- package/dist/discovery/BridgeViewSQLGenerator.js.map +1 -0
- package/dist/discovery/ColumnClusterer.d.ts +63 -0
- package/dist/discovery/ColumnClusterer.d.ts.map +1 -0
- package/dist/discovery/ColumnClusterer.js +205 -0
- package/dist/discovery/ColumnClusterer.js.map +1 -0
- package/dist/discovery/ColumnNormalizer.d.ts +106 -0
- package/dist/discovery/ColumnNormalizer.d.ts.map +1 -0
- package/dist/discovery/ColumnNormalizer.js +376 -0
- package/dist/discovery/ColumnNormalizer.js.map +1 -0
- package/dist/discovery/Composer.d.ts +59 -0
- package/dist/discovery/Composer.d.ts.map +1 -0
- package/dist/discovery/Composer.js +95 -0
- package/dist/discovery/Composer.js.map +1 -0
- package/dist/discovery/EmbeddingProvider.d.ts +27 -0
- package/dist/discovery/EmbeddingProvider.d.ts.map +1 -0
- package/dist/discovery/EmbeddingProvider.js +87 -0
- package/dist/discovery/EmbeddingProvider.js.map +1 -0
- package/dist/discovery/FKGraphWalker.d.ts +108 -0
- package/dist/discovery/FKGraphWalker.d.ts.map +1 -0
- package/dist/discovery/FKGraphWalker.js +169 -0
- package/dist/discovery/FKGraphWalker.js.map +1 -0
- package/dist/discovery/OrganicKeyDetector.d.ts +51 -0
- package/dist/discovery/OrganicKeyDetector.d.ts.map +1 -0
- package/dist/discovery/OrganicKeyDetector.js +78 -0
- package/dist/discovery/OrganicKeyDetector.js.map +1 -0
- package/dist/discovery/OrganicKeyTranslator.d.ts +78 -0
- package/dist/discovery/OrganicKeyTranslator.d.ts.map +1 -0
- package/dist/discovery/OrganicKeyTranslator.js +166 -0
- package/dist/discovery/OrganicKeyTranslator.js.map +1 -0
- package/dist/discovery/SemanticPhase.d.ts +70 -0
- package/dist/discovery/SemanticPhase.d.ts.map +1 -0
- package/dist/discovery/SemanticPhase.js +423 -0
- package/dist/discovery/SemanticPhase.js.map +1 -0
- package/dist/discovery/StructuralPhase.d.ts +24 -0
- package/dist/discovery/StructuralPhase.d.ts.map +1 -0
- package/dist/discovery/StructuralPhase.js +23 -0
- package/dist/discovery/StructuralPhase.js.map +1 -0
- package/dist/discovery/TransitiveBridgeDetector.d.ts +65 -0
- package/dist/discovery/TransitiveBridgeDetector.d.ts.map +1 -0
- package/dist/discovery/TransitiveBridgeDetector.js +244 -0
- package/dist/discovery/TransitiveBridgeDetector.js.map +1 -0
- package/dist/generators/AdditionalSchemaInfoGenerator.d.ts +12 -0
- package/dist/generators/AdditionalSchemaInfoGenerator.d.ts.map +1 -1
- package/dist/generators/AdditionalSchemaInfoGenerator.js +31 -0
- package/dist/generators/AdditionalSchemaInfoGenerator.js.map +1 -1
- package/dist/types/config.d.ts +71 -0
- package/dist/types/config.d.ts.map +1 -1
- package/dist/types/config.js.map +1 -1
- package/dist/types/organic-keys.d.ts +141 -0
- package/dist/types/organic-keys.d.ts.map +1 -0
- package/dist/types/organic-keys.js +27 -0
- package/dist/types/organic-keys.js.map +1 -0
- package/dist/types/state.d.ts +7 -0
- package/dist/types/state.d.ts.map +1 -1
- package/dist/utils/json.d.ts +40 -0
- package/dist/utils/json.d.ts.map +1 -0
- package/dist/utils/json.js +141 -0
- package/dist/utils/json.js.map +1 -0
- package/package.json +5 -5
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OrganicKeyTranslator — PURE EMISSION.
|
|
3
|
+
*
|
|
4
|
+
* Takes a list of clusters that have ALREADY been gated by the Composer, plus
|
|
5
|
+
* the transitive spokes for those that survived, and fans them out into PR
|
|
6
|
+
* #2193's per-schema/per-table JSON.
|
|
7
|
+
*
|
|
8
|
+
* No confidence thresholds or scoring logic in here (only structural guards: at least two unique members across at least two tables). The Composer is the
|
|
9
|
+
* single emission gate; this file is its render-to-JSON helper.
|
|
10
|
+
*
|
|
11
|
+
* Same-concept consolidation: clusters that the LLM normalizer assigned the
|
|
12
|
+
* same canonical concept name (e.g. four separate geometric clusters all
|
|
13
|
+
* named `product_id`) are MERGED here into a single conceptual cluster before
|
|
14
|
+
* fan-out — deterministic equivalent of an LLM concept-merge pass.
|
|
15
|
+
*/
|
|
16
|
+
import { memberColumns, isCompoundMember, } from '../types/organic-keys.js';
|
|
17
|
+
// ─── Entry point — pure fan-out ─────────────────────────────────────────────
|
|
18
|
+
/**
 * Render Composer-gated clusters into PR #2193's per-schema / per-table JSON.
 *
 * Steps:
 *   1. Bucket the clusters by normalized concept name; clusters with fewer
 *      than two members are skipped.
 *   2. Union each bucket's members, de-duplicated on
 *      `schema.table.col1,col2,...`. A bucket is skipped when it ends up with
 *      fewer than two unique members or when all members sit in one table.
 *   3. Emit one organic-key entry per unique member (the "owner"). Its
 *      RelatedEntities are every other member of the bucket, followed by the
 *      transitive spokes whose hub schema, table and key fields (same fields,
 *      same order) match the owner.
 *
 * The bucket's display name and description come from its anchor cluster:
 * highest `confidence`, ties broken by member count.
 *
 * No confidence thresholds or scoring are applied here; apart from the
 * structural guards above, filtering is the caller's job.
 *
 * @param clusters Clusters to render.
 * @param transitiveSpokes Bridge-view spokes to attach to matching hubs (defaults to none).
 * @returns Object keyed by schema name; each value is a list of
 *          `{ TableName, OrganicKeys }` entries.
 */
export function translateClusters(clusters, transitiveSpokes = []) {
    // Identity of a member for de-duplication purposes.
    const memberKey = (member) => `${member.schema}.${member.table}.${memberColumns(member).join(',')}`;
    // Anchor ordering: confidence descending, then member count descending.
    const byAnchorRank = (a, b) => (b.confidence - a.confidence) || (b.members.length - a.members.length);

    // Step 1 — bucket clusters by normalized concept name.
    const groupsByConcept = new Map();
    for (const cluster of clusters) {
        if (cluster.members.length < 2)
            continue;
        const conceptKey = normalizeConceptName(cluster.concept);
        if (!groupsByConcept.has(conceptKey))
            groupsByConcept.set(conceptKey, []);
        groupsByConcept.get(conceptKey).push(cluster);
    }

    const result = {};
    for (const conceptGroup of groupsByConcept.values()) {
        // Step 2 — union of members across the bucket; first occurrence wins.
        const uniqueMembers = new Map();
        for (const cluster of conceptGroup) {
            for (const member of cluster.members) {
                const key = memberKey(member);
                if (!uniqueMembers.has(key))
                    uniqueMembers.set(key, member);
            }
        }
        const allMembers = [...uniqueMembers.values()];
        if (allMembers.length < 2)
            continue;
        const tableCount = new Set(allMembers.map((member) => `${member.schema}.${member.table}`)).size;
        if (tableCount < 2)
            continue;

        const anchor = [...conceptGroup].sort(byAnchorRank)[0];
        const name = prettyConceptName(anchor.concept);

        // Step 3 — fan out: one organic key per unique owner.
        for (const owner of allMembers) {
            const tableEntry = upsertTable(result, owner.schema, owner.table);
            const ownerColumns = memberColumns(owner);

            // Direct spokes — every other member. `allMembers` is already unique
            // on the member key, so no further de-duplication is needed here.
            const directSpokes = allMembers
                .filter((candidate) => candidate !== owner)
                .map((candidate) => ({
                    SchemaName: candidate.schema,
                    TableName: candidate.table,
                    RelatedFieldNames: memberColumns(candidate),
                    DisplayName: `${candidate.schema}.${candidate.table}`,
                }));

            // Transitive spokes — bridge views whose hub is exactly this owner.
            const bridgedSpokes = transitiveSpokes
                .filter((t) => t.hubSchema === owner.schema &&
                    t.hubTable === owner.table &&
                    t.hubKeyFields.length === ownerColumns.length &&
                    t.hubKeyFields.every((field, idx) => field === ownerColumns[idx]))
                .map((t) => ({
                    SchemaName: t.spokeSchema,
                    TableName: t.spokeTable,
                    TransitiveView: t.transitiveView,
                    TransitiveMatchFieldNames: t.transitiveMatchFieldNames,
                    TransitiveOutputFieldName: t.transitiveOutputFieldName,
                    RelatedEntityJoinFieldName: t.relatedEntityJoinFieldName,
                    DisplayName: `${t.spokeSchema}.${t.spokeTable} (via ${t.hubConcept ?? 'bridge'})`,
                }));

            let compoundSuffix = '';
            if (isCompoundMember(owner))
                compoundSuffix = ` (compound: ${ownerColumns.join('+')})`;

            // Each emitted row carries the normalization of ITS owner column;
            // the anchor cluster's settings are only a fallback when the member
            // has none of its own.
            tableEntry.OrganicKeys.push({
                Name: name + compoundSuffix,
                Description: anchor.reasoning,
                MatchFieldNames: ownerColumns,
                NormalizationStrategy: owner.normalizationStrategy ?? anchor.normalization,
                CustomNormalizationExpression: owner.customNormalizationExpression ?? anchor.customNormalizationExpression,
                AutoCreateRelatedViewOnForm: true,
                RelatedEntities: [...directSpokes, ...bridgedSpokes],
            });
        }
    }
    return result;
}
|
|
120
|
+
// ─── Helpers ────────────────────────────────────────────────────────────────
|
|
121
|
+
/**
 * Canonical grouping key for a concept name: lowercased, with every run of
 * whitespace, hyphens and underscores collapsed to a single `_`, and one
 * leading / trailing `_` removed. A null or undefined name yields `''`.
 */
function normalizeConceptName(name) {
    // After this replace no whitespace is left, and separators never repeat.
    const collapsed = (name ?? '').toLowerCase().replace(/[\s\-_]+/g, '_');
    const from = collapsed.startsWith('_') ? 1 : 0;
    const to = collapsed.length > from && collapsed.endsWith('_')
        ? collapsed.length - 1
        : collapsed.length;
    return collapsed.slice(from, to);
}
|
|
129
|
+
/**
 * Turn a snake_case concept name into a display name: each `_`-separated
 * part is capitalized, parts are joined with spaces, and ` Match` is appended
 * (`product_id` → `Product Id Match`).
 *
 * A null or undefined concept is treated like the empty string instead of
 * throwing, matching how `normalizeConceptName` tolerates a missing name.
 *
 * @param concept Concept name, possibly null/undefined.
 * @returns The display name.
 */
function prettyConceptName(concept) {
    const pretty = (concept ?? '')
        .split('_')
        // Empty parts come from leading, trailing or doubled underscores.
        .filter((p) => p.length > 0)
        .map((p) => p[0].toUpperCase() + p.slice(1))
        .join(' ');
    return `${pretty} Match`;
}
|
|
137
|
+
/**
 * Find or create the `{ TableName, OrganicKeys }` entry for `schema.table`
 * inside the output payload.
 *
 * Schema names are external data, so the lookup only considers own
 * properties of `out` and the insert goes through `Object.defineProperty`.
 * A schema called `constructor`, `toString` or `__proto__` then behaves like
 * any other key instead of resolving to (or replacing) something inherited
 * from `Object.prototype`.
 *
 * @param out Output payload keyed by schema name; mutated in place.
 * @param schema Schema name.
 * @param table Table name.
 * @returns The existing or newly created table entry.
 */
function upsertTable(out, schema, table) {
    let bySchema = Object.prototype.hasOwnProperty.call(out, schema) ? out[schema] : undefined;
    if (!bySchema) {
        bySchema = [];
        // Plain assignment to `__proto__` would set the prototype rather than
        // create an enumerable key, so define the property explicitly.
        Object.defineProperty(out, schema, {
            value: bySchema,
            writable: true,
            enumerable: true,
            configurable: true,
        });
    }
    let entry = bySchema.find((t) => t.TableName === table);
    if (!entry) {
        entry = { TableName: table, OrganicKeys: [] };
        bySchema.push(entry);
    }
    return entry;
}
|
|
150
|
+
/**
 * Summarize an emit payload.
 *
 * @param out Payload keyed by schema name; each value is a list of table
 *            entries carrying an `OrganicKeys` array.
 * @returns `{ schemas, tables, keys, spokes }` — number of schema keys, table
 *          entries, organic keys, and RelatedEntities across all keys.
 */
export function countOutputEntries(out) {
    const totals = { schemas: Object.keys(out).length, tables: 0, keys: 0, spokes: 0 };
    Object.values(out).forEach((tablesInSchema) => {
        totals.tables += tablesInSchema.length;
        tablesInSchema.forEach(({ OrganicKeys }) => {
            totals.keys += OrganicKeys.length;
            totals.spokes += OrganicKeys.reduce((sum, key) => sum + key.RelatedEntities.length, 0);
        });
    });
    return totals;
}
|
|
166
|
+
//# sourceMappingURL=OrganicKeyTranslator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"OrganicKeyTranslator.js","sourceRoot":"","sources":["../../src/discovery/OrganicKeyTranslator.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,EAIH,aAAa,EACb,gBAAgB,GACnB,MAAM,0BAA0B,CAAC;AA6DlC,+EAA+E;AAE/E;;;;;;GAMG;AACH,MAAM,UAAU,iBAAiB,CAC7B,QAA6B,EAC7B,mBAA2C,EAAE;IAE7C,uDAAuD;IACvD,MAAM,SAAS,GAAG,IAAI,GAAG,EAA+B,CAAC;IACzD,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC7B,IAAI,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC;YAAE,SAAS;QACzC,MAAM,GAAG,GAAG,oBAAoB,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAClD,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAClC,IAAI,MAAM;YAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;;YAC5B,SAAS,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IACvC,CAAC;IAED,MAAM,GAAG,GAA8B,EAAE,CAAC;IAE1C,KAAK,MAAM,KAAK,IAAI,SAAS,CAAC,MAAM,EAAE,EAAE,CAAC;QACrC,sEAAsE;QACtE,gDAAgD;QAChD,MAAM,SAAS,GAAG,IAAI,GAAG,EAAmC,CAAC;QAC7D,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;YACpB,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,CAAC;gBACxB,MAAM,CAAC,GAAG,GAAG,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,KAAK,IAAI,aAAa,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBACjE,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC;oBAAE,SAAS,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YAC/C,CAAC;QACL,CAAC;QACD,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC,CAAC;QAClD,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC;YAAE,SAAS;QACpC,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QAChF,IAAI,cAAc,CAAC,IAAI,GAAG,CAAC;YAAE,SAAS;QAEtC,8DAA8D;QAC9D,MAAM,MAAM,GAAG,KAAK;aACf,KAAK,EAAE;aACP,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/F,MAAM,IAAI,GAAG,iBAAiB,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QAE/C,mFAAmF;QACnF,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE,CAAC;YAC7B,MAAM,UAAU,GAAG,WAAW,CAAC,GAAG,EAAE,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,KAAK,CAAC,CAAC;YAE/D,MAAM,UAAU,GAAG,IAAI,GAAG,EAAU,CAAC;YACrC,MAAM,MAAM,GAAoC,EAAE,C
AAC;YAEnD,8CAA8C;YAC9C,KAAK,MAAM,MAAM,IAAI,UAAU,EAAE,CAAC;gBAC9B,IAAI,MAAM,KAAK,KAAK;oBAAE,SAAS;gBAC/B,MAAM,aAAa,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC;gBAC5C,MAAM,CAAC,GAAG,GAAG,MAAM,CAAC,MAAM,IAAI,MAAM,CAAC,KAAK,IAAI,aAAa,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBACxE,IAAI,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;oBAAE,SAAS;gBAChC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;gBAClB,MAAM,CAAC,IAAI,CAAC;oBACR,UAAU,EAAE,MAAM,CAAC,MAAM;oBACzB,SAAS,EAAE,MAAM,CAAC,KAAK;oBACvB,iBAAiB,EAAE,aAAa;oBAChC,WAAW,EAAE,GAAG,MAAM,CAAC,MAAM,IAAI,MAAM,CAAC,KAAK,EAAE;iBAClD,CAAC,CAAC;YACP,CAAC;YAED,8DAA8D;YAC9D,MAAM,YAAY,GAAG,aAAa,CAAC,KAAK,CAAC,CAAC;YAC1C,MAAM,kBAAkB,GAAG,gBAAgB,CAAC,MAAM,CAC9C,CAAC,CAAC,EAAE,EAAE,CACF,CAAC,CAAC,SAAS,KAAK,KAAK,CAAC,MAAM;gBAC5B,CAAC,CAAC,QAAQ,KAAK,KAAK,CAAC,KAAK;gBAC1B,CAAC,CAAC,YAAY,CAAC,MAAM,KAAK,YAAY,CAAC,MAAM;gBAC7C,CAAC,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,YAAY,CAAC,GAAG,CAAC,CAAC,CAChE,CAAC;YACF,KAAK,MAAM,CAAC,IAAI,kBAAkB,EAAE,CAAC;gBACjC,MAAM,CAAC,IAAI,CAAC;oBACR,UAAU,EAAE,CAAC,CAAC,WAAW;oBACzB,SAAS,EAAE,CAAC,CAAC,UAAU;oBACvB,cAAc,EAAE,CAAC,CAAC,cAAc;oBAChC,yBAAyB,EAAE,CAAC,CAAC,yBAAyB;oBACtD,yBAAyB,EAAE,CAAC,CAAC,yBAAyB;oBACtD,0BAA0B,EAAE,CAAC,CAAC,0BAA0B;oBACxD,WAAW,EAAE,GAAG,CAAC,CAAC,WAAW,IAAI,CAAC,CAAC,UAAU,SAAS,CAAC,CAAC,UAAU,IAAI,QAAQ,GAAG;iBACpF,CAAC,CAAC;YACP,CAAC;YAED,MAAM,cAAc,GAAG,gBAAgB,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,eAAe,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;YAC/F,0EAA0E;YAC1E,wEAAwE;YACxE,2EAA2E;YAC3E,0EAA0E;YAC1E,4EAA4E;YAC5E,MAAM,kBAAkB,GAAG,KAAK,CAAC,qBAAqB,IAAI,MAAM,CAAC,aAAa,CAAC;YAC/E,MAAM,qBAAqB,GACvB,KAAK,CAAC,6BAA6B,IAAI,MAAM,CAAC,6BAA6B,CAAC;YAChF,UAAU,CAAC,WAAW,CAAC,IAAI,CAAC;gBACxB,IAAI,EAAE,IAAI,GAAG,cAAc;gBAC3B,WAAW,EAAE,MAAM,CAAC,SAAS;gBAC7B,eAAe,EAAE,YAAY;gBAC7B,qBAAqB,EAAE,kBAAkB;gBACzC,6BAA6B,EAAE,qBAAqB;gBACpD,2BAA2B,EAAE,IAAI;gBACjC,eAAe,EAAE,MAAM;aAC1B,CAAC,CAAC;QACP,CAAC;IACL,CAAC;IAED,OAAO,GAAG,CAAC;AACf,CAAC;AAED,+EAA+E;AAE/E,SAAS,oBAAoB,CAAC,IAAY;IACtC,OAAO,CAAC,IAAI,IAAI,EAAE,CAAC;SAC
d,WAAW,EAAE;SACb,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;SACxB,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC;SACnB,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;SACrB,IAAI,EAAE,CAAC;AAChB,CAAC;AAED,SAAS,iBAAiB,CAAC,OAAe;IACtC,MAAM,MAAM,GAAG,OAAO;SACjB,KAAK,CAAC,GAAG,CAAC;SACV,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;SAC3B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;SAC3C,IAAI,CAAC,GAAG,CAAC,CAAC;IACf,OAAO,GAAG,MAAM,QAAQ,CAAC;AAC7B,CAAC;AAED,SAAS,WAAW,CAChB,GAA8B,EAC9B,MAAc,EACd,KAAa;IAEb,IAAI,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC;IAC3B,IAAI,CAAC,QAAQ,EAAE,CAAC;QACZ,QAAQ,GAAG,EAAE,CAAC;QACd,GAAG,CAAC,MAAM,CAAC,GAAG,QAAQ,CAAC;IAC3B,CAAC;IACD,IAAI,KAAK,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,KAAK,KAAK,CAAC,CAAC;IACxD,IAAI,CAAC,KAAK,EAAE,CAAC;QACT,KAAK,GAAG,EAAE,SAAS,EAAE,KAAK,EAAE,WAAW,EAAE,EAAE,EAAE,CAAC;QAC9C,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACzB,CAAC;IACD,OAAO,KAAK,CAAC;AACjB,CAAC;AAED,+FAA+F;AAC/F,MAAM,UAAU,kBAAkB,CAAC,GAA8B;IAM7D,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,MAAM,OAAO,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC;IACxC,KAAK,MAAM,SAAS,IAAI,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC;QACzC,MAAM,IAAI,SAAS,CAAC,MAAM,CAAC;QAC3B,KAAK,MAAM,CAAC,IAAI,SAAS,EAAE,CAAC;YACxB,IAAI,IAAI,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC;YAC7B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,WAAW;gBAAE,MAAM,IAAI,CAAC,CAAC,eAAe,CAAC,MAAM,CAAC;QACtE,CAAC;IACL,CAAC;IACD,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC;AAC7C,CAAC"}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Phase A — SEMANTIC.
|
|
3
|
+
*
|
|
4
|
+
* The defensible PR #2193 pipeline:
|
|
5
|
+
*
|
|
6
|
+
* 1. PREFILTER (deterministic)
|
|
7
|
+
* Drop columns that cannot be organic keys regardless of semantics:
|
|
8
|
+
* binary/blob types, audit-named columns, ultra-low-cardinality.
|
|
9
|
+
*
|
|
10
|
+
* 2. NORMALIZE TO BUSINESS SPACE (LLM, one call per table)
|
|
11
|
+
* For each surviving column, produce a structured business-focused
|
|
12
|
+
* description that encodes PR #2193's "if two rows share this value,
|
|
13
|
+
* do they refer to the same real-world entity?" test, plus a canonical
|
|
14
|
+
* snake_case conceptName, normalization strategy, and isOrganicKey gate.
|
|
15
|
+
* Columns the normalizer rejects (audit, categorical, surrogate, free-
|
|
16
|
+
* text, etc.) are dropped here.
|
|
17
|
+
*
|
|
18
|
+
* 3. EMBED the normalized descriptions
|
|
19
|
+
* Same-concept descriptions geometrically converge regardless of
|
|
20
|
+
* source-system, table, or column-name conventions. Sample values are
|
|
21
|
+
* appended to distinguish e.g. emails from phones even when both
|
|
22
|
+
* describe "identifying a person".
|
|
23
|
+
*
|
|
24
|
+
* 4. CLUSTER (agglomerative average-linkage, gap-detected threshold)
|
|
25
|
+
* Each cluster is a candidate organic-key concept. The threshold is
|
|
26
|
+
* auto-calibrated by finding the largest gap in the bottom of the
|
|
27
|
+
* pairwise-distance distribution — robust across schemas/providers.
|
|
28
|
+
*
|
|
29
|
+
* 5. SPLIT-BY-CONCEPT
|
|
30
|
+
* When the LLM has already assigned DIFFERENT conceptNames to columns
|
|
31
|
+
* the embeddings put in the same cluster (e.g. product_id /
|
|
32
|
+
* product_model_id / product_category_id all describe "an identifier
|
|
33
|
+
* for a product-related entity" and embed close together), honor the
|
|
34
|
+
* LLM's distinction by splitting the cluster along conceptName.
|
|
35
|
+
* Catches Gemini's tight description-space compression without losing
|
|
36
|
+
* cross-table merging when the LLM does converge on one name.
|
|
37
|
+
*
|
|
38
|
+
* 6. LABEL each cluster from its members' conceptName votes.
|
|
39
|
+
*
|
|
40
|
+
* No vocabulary discovery, no LLM cluster judge, no per-cluster refinement —
|
|
41
|
+
* the normalize step IS the PR #2193 judgment, the cluster + split steps ARE
|
|
42
|
+
* the cross-table identity proof.
|
|
43
|
+
*/
|
|
44
|
+
import { AIConfig, OrganicKeyDetectionConfig } from '../types/config.js';
|
|
45
|
+
import { DatabaseDocumentation } from '../types/state.js';
|
|
46
|
+
import { OrganicKeyCluster } from '../types/organic-keys.js';
|
|
47
|
+
/** Outcome of the semantic (organic-key clustering) phase. */
export interface SemanticPhaseResult {
    /** Labelled clusters that remain after the concept-name split. */
    clusters: OrganicKeyCluster[];
    /** Token usage reported by the normalize step. */
    tokens: { input: number; output: number; total: number; };
    /** Counters describing how the column set narrowed down at each step. */
    summary: {
        /** Columns left in scope by the prefilter. */
        columnsInScope: number;
        /** Columns the normalizer kept. */
        columnsNormalized: number;
        /** Columns the normalizer rejected. */
        columnsRejectedByNormalizer: number;
        /** Clusters formed by the embedding clusterer, before the concept-name split. */
        clustersBeforeSplit: number;
        /** Final clusters after splitting by concept name and dropping sub-threshold sub-clusters. */
        clustersFound: number;
        /** Sub-clusters discarded during the split for falling below minClusterSize / minDistinctTables. */
        clustersDropped: number;
    };
}
|
|
67
|
+
/** Sink for the human-readable progress messages emitted while the semantic phase runs. */
export type ProgressCallback = (message: string) => void;
/**
 * Run the semantic phase: prefilter, normalize, embed, cluster, split by
 * concept name, then label.
 *
 * @param state Documentation state whose columns are analysed.
 * @param config Organic-key detection settings.
 * @param aiConfig AI settings for the normalize and embedding steps.
 * @param progress Optional progress sink; omitted means progress is discarded.
 * @returns The clusters found, token usage, and summary counters.
 */
export declare function runSemanticPhase(
    state: DatabaseDocumentation,
    config: OrganicKeyDetectionConfig,
    aiConfig: AIConfig,
    progress?: ProgressCallback,
): Promise<SemanticPhaseResult>;
|
|
70
|
+
//# sourceMappingURL=SemanticPhase.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"SemanticPhase.d.ts","sourceRoot":"","sources":["../../src/discovery/SemanticPhase.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA0CG;AAEH,OAAO,EAAE,QAAQ,EAAE,yBAAyB,EAAE,MAAM,oBAAoB,CAAC;AACzE,OAAO,EAAE,qBAAqB,EAAE,MAAM,mBAAmB,CAAC;AAC1D,OAAO,EAEH,iBAAiB,EAGpB,MAAM,0BAA0B,CAAC;AAmDlC,6DAA6D;AAC7D,MAAM,WAAW,mBAAmB;IAChC,QAAQ,EAAE,iBAAiB,EAAE,CAAC;IAC9B,MAAM,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC;IACzD,OAAO,EAAE;QACL,cAAc,EAAE,MAAM,CAAC;QACvB,iBAAiB,EAAE,MAAM,CAAC;QAC1B,2BAA2B,EAAE,MAAM,CAAC;QACpC,iFAAiF;QACjF,mBAAmB,EAAE,MAAM,CAAC;QAC5B,8FAA8F;QAC9F,aAAa,EAAE,MAAM,CAAC;QACtB,oGAAoG;QACpG,eAAe,EAAE,MAAM,CAAC;KAC3B,CAAC;CACL;AAED,gFAAgF;AAChF,MAAM,MAAM,gBAAgB,GAAG,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;AAEzD,wBAAsB,gBAAgB,CAClC,KAAK,EAAE,qBAAqB,EAC5B,MAAM,EAAE,yBAAyB,EACjC,QAAQ,EAAE,QAAQ,EAClB,QAAQ,GAAE,gBAA2B,GACtC,OAAO,CAAC,mBAAmB,CAAC,CA2F9B"}
|
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Phase A — SEMANTIC.
|
|
3
|
+
*
|
|
4
|
+
* The defensible PR #2193 pipeline:
|
|
5
|
+
*
|
|
6
|
+
* 1. PREFILTER (deterministic)
|
|
7
|
+
* Drop columns that cannot be organic keys regardless of semantics:
|
|
8
|
+
* binary/blob types, audit-named columns, ultra-low-cardinality.
|
|
9
|
+
*
|
|
10
|
+
* 2. NORMALIZE TO BUSINESS SPACE (LLM, one call per table)
|
|
11
|
+
* For each surviving column, produce a structured business-focused
|
|
12
|
+
* description that encodes PR #2193's "if two rows share this value,
|
|
13
|
+
* do they refer to the same real-world entity?" test, plus a canonical
|
|
14
|
+
* snake_case conceptName, normalization strategy, and isOrganicKey gate.
|
|
15
|
+
* Columns the normalizer rejects (audit, categorical, surrogate, free-
|
|
16
|
+
* text, etc.) are dropped here.
|
|
17
|
+
*
|
|
18
|
+
* 3. EMBED the normalized descriptions
|
|
19
|
+
* Same-concept descriptions geometrically converge regardless of
|
|
20
|
+
* source-system, table, or column-name conventions. Sample values are
|
|
21
|
+
* appended to distinguish e.g. emails from phones even when both
|
|
22
|
+
* describe "identifying a person".
|
|
23
|
+
*
|
|
24
|
+
* 4. CLUSTER (agglomerative average-linkage, gap-detected threshold)
|
|
25
|
+
* Each cluster is a candidate organic-key concept. The threshold is
|
|
26
|
+
* auto-calibrated by finding the largest gap in the bottom of the
|
|
27
|
+
* pairwise-distance distribution — robust across schemas/providers.
|
|
28
|
+
*
|
|
29
|
+
* 5. SPLIT-BY-CONCEPT
|
|
30
|
+
* When the LLM has already assigned DIFFERENT conceptNames to columns
|
|
31
|
+
* the embeddings put in the same cluster (e.g. product_id /
|
|
32
|
+
* product_model_id / product_category_id all describe "an identifier
|
|
33
|
+
* for a product-related entity" and embed close together), honor the
|
|
34
|
+
* LLM's distinction by splitting the cluster along conceptName.
|
|
35
|
+
* Catches Gemini's tight description-space compression without losing
|
|
36
|
+
* cross-table merging when the LLM does converge on one name.
|
|
37
|
+
*
|
|
38
|
+
* 6. LABEL each cluster from its members' conceptName votes.
|
|
39
|
+
*
|
|
40
|
+
* No vocabulary discovery, no LLM cluster judge, no per-cluster refinement —
|
|
41
|
+
* the normalize step IS the PR #2193 judgment, the cluster + split steps ARE
|
|
42
|
+
* the cross-table identity proof.
|
|
43
|
+
*/
|
|
44
|
+
import { DEFAULT_DETECTOR_CONFIG, } from '../types/organic-keys.js';
|
|
45
|
+
import { TableNormalizer, } from './ColumnNormalizer.js';
|
|
46
|
+
import { ColumnClusterer } from './ColumnClusterer.js';
|
|
47
|
+
import { createEmbeddingProvider } from './EmbeddingProvider.js';
|
|
48
|
+
/**
 * Names of audit / bookkeeping columns. Matches, case-insensitively and on
 * the whole name only:
 *   - modified|created|updated|inserted|changed, optionally followed by
 *     date|at|time|by|on, with or without a `_` separator
 *     (`CreatedDate`, `created_at`, `modified_by`);
 *   - `rowguid`, `timestamp`, `rowversion` / `row_version`;
 *   - anything starting with `__mj_`.
 * The separator group is non-capturing, so the suffix stays in capture group 2.
 */
const AUDIT_COLUMN_PATTERN = /^(modified|created|updated|inserted|changed)(?:_?(date|at|time|by|on))?$|^rowguid$|^timestamp$|^row_?version$|^__mj_.*$/i;
|
|
49
|
+
/** Type-name fragments (case-insensitive, matched anywhere in the name) for binary/blob-like and other types excluded from value matching. */
const NON_VALUEMATCHABLE_TYPES = /(binary|blob|image|varbinary|xml|geography|geometry|hierarchyid|sql_variant)/i;

// ─── Prefilter cardinality cutoffs ───────────────────────────────────────────
/**
 * Low-cardinality cutoff. A column counts as a categorical (boolean/enum)
 * rather than an identity-bearing column — and is dropped before clustering —
 * when it has fewer distinct values than this AND its uniqueness ratio is
 * below {@link MIN_UNIQUENESS_RATIO}.
 */
const MIN_DISTINCT_FOR_KEY = 10;
/** Uniqueness-ratio half of the low-cardinality cutoff; see {@link MIN_DISTINCT_FOR_KEY}. */
const MIN_UNIQUENESS_RATIO = 0.001;

// ─── Embedding-input sample caps ─────────────────────────────────────────────
/** Upper bound on how many sample values get appended to a column's embedding text. */
const EMBED_SAMPLE_COUNT = 4;
/** Character limit applied to each appended sample value. */
const EMBED_SAMPLE_MAX_CHARS = 60;

// ─── Cluster-confidence tightness penalty ────────────────────────────────────
/**
 * Slope of the tightness penalty:
 * cluster confidence = mean member confidence − (maxIntraDistance × slope).
 */
const TIGHTNESS_PENALTY_SLOPE = 0.5;
/** Cap on the tightness penalty, so a loose-but-real cluster isn't over-penalized into irrelevance. */
const TIGHTNESS_PENALTY_MAX = 0.2;

/**
 * Maps each clustering sensitivity to the percentile of the pairwise
 * cosine-distance distribution used as the auto-calibrated merge threshold.
 * Lower percentile → tighter (stricter) clusters.
 */
const SENSITIVITY_PERCENTILE = {
    strict: 1,
    balanced: 5,
    permissive: 15,
};
|
|
80
|
+
/**
 * Run the semantic phase end to end: prefilter → normalize → embed →
 * cluster → split by concept name → label.
 *
 * Returns early with no clusters when fewer than `minClusterSize` columns
 * survive the prefilter, or when fewer than that many survive normalization.
 *
 * @param state Documentation state to analyse.
 * @param config Detection settings; unset values fall back to DEFAULT_DETECTOR_CONFIG
 *               (`maxRefinementRetries` falls back to 2, `clusteringSensitivity` to 'balanced').
 * @param aiConfig AI settings handed to the normalizer and the embedding provider lookup.
 * @param progress Receives progress messages; defaults to a no-op.
 * @returns Clusters, the normalizer's token usage, and summary counters.
 */
export async function runSemanticPhase(state, config, aiConfig, progress = () => { }) {
    // ─── 1. Prefilter ────────────────────────────────────────────────────────
    const candidates = prefilter(state, config);
    progress(`semantic: ${candidates.length} columns in scope`);
    const minClusterSize = config.minClusterSize ?? DEFAULT_DETECTOR_CONFIG.minClusterSize;
    if (candidates.length < minClusterSize) {
        return emptyResult(candidates.length);
    }

    // ─── 2. Normalize to business space (LLM per-table) ──────────────────────
    progress('semantic: normalizing column descriptions to business space');
    const normalizer = new TableNormalizer(aiConfig);
    const normResult = await normalizer.normalizeAll(groupColumnsByTable(state, candidates), {
        concurrency: config.refinementConcurrency ?? DEFAULT_DETECTOR_CONFIG.refinementConcurrency,
        maxRetries: config.maxRefinementRetries ?? 2,
        onProgress: () => { },
    });
    const normalizedColumns = normResult.normalized;
    progress(`semantic: ${normalizedColumns.length} columns kept (${normResult.rejected} rejected by PR-#2193 axes)`);

    // Counters shared by the early-exit result and the final result.
    const columnStats = {
        columnsInScope: candidates.length,
        columnsNormalized: normalizedColumns.length,
        columnsRejectedByNormalizer: normResult.rejected,
    };
    if (normalizedColumns.length < minClusterSize) {
        return {
            clusters: [],
            tokens: normResult.tokens,
            summary: { ...columnStats, clustersBeforeSplit: 0, clustersFound: 0, clustersDropped: 0 },
        };
    }

    // ─── 3. Embed the normalized descriptions ────────────────────────────────
    const embedProvider = resolveEmbeddingProvider(config, aiConfig);
    progress(`semantic: embedding ${normalizedColumns.length} descriptions via ${embedProvider.name}`);
    const embeddings = await embedProvider.embed(normalizedColumns.map((n) => buildEmbeddingText(n)));

    // ─── 4. Cluster columns by embedding distance ────────────────────────────
    const percentile = sensitivityToPercentile(config.clusteringSensitivity ?? 'balanced');
    const minDistinctTables = config.minDistinctTables ?? DEFAULT_DETECTOR_CONFIG.minDistinctTables;
    const clusterer = new ColumnClusterer({
        mergeThreshold: config.mergeThreshold,
        mergeThresholdPercentile: percentile,
        minClusterSize,
        minDistinctTables,
    });
    // embeddings[i] is paired with normalizedColumns[i].
    const rawClusters = clusterer.cluster(normalizedColumns.map((n, i) => ({
        schema: n.schema,
        table: n.table,
        column: n.column,
        embedding: embeddings[i],
        participatesInFK: n.participatesInFK,
        fkTarget: n.fkTarget,
        isPrimaryKey: n.isPrimaryKey,
    })));
    progress(`semantic: ${rawClusters.length} clusters formed (threshold=${clusterer.lastResolvedThreshold.toFixed(3)})`);

    // ─── 5. Split clusters that contain multiple distinct conceptNames ──────
    const splitOutcome = splitClustersByConceptName(rawClusters, normalizedColumns, minClusterSize, minDistinctTables);
    const split = splitOutcome.clusters;
    if (split.length !== rawClusters.length) {
        progress(`semantic: ${split.length} clusters after concept-name split`);
    }

    // ─── 6. Label clusters from member votes ─────────────────────────────────
    const clusters = split.map((rc, idx) => labelCluster(
        rc.memberIndexes.map((i) => normalizedColumns[i]),
        rc.members,
        rc.maxIntraDistance,
        idx,
    ));

    return {
        clusters,
        tokens: normResult.tokens,
        summary: {
            ...columnStats,
            clustersBeforeSplit: rawClusters.length,
            clustersFound: clusters.length,
            clustersDropped: splitOutcome.dropped,
        },
    };
}
|
|
160
|
+
/**
 * Break each raw embedding cluster apart along its members' concept names.
 *
 * Embedding clustering over-merges when distinct concepts produce similar
 * descriptions (e.g. product_id vs product_model_id vs product_category_id).
 * The normalize step already told them apart by name, so any cluster whose
 * members carry more than one concept name is split here.
 *
 * A cluster with a single concept name passes through untouched. Sub-clusters
 * of a split cluster inherit the parent's `maxIntraDistance`; those with fewer
 * than `minClusterSize` members or fewer than `minDistinctTables` distinct
 * tables are discarded and counted.
 *
 * @param rawClusters Clusters from the embedding clusterer.
 * @param normalized Normalized columns, indexed by each cluster's `memberIndexes`.
 * @param minClusterSize Minimum members a sub-cluster needs to survive.
 * @param minDistinctTables Minimum distinct `schema.table` values a sub-cluster needs.
 * @returns `{ clusters, dropped }` — survivors sorted largest-first, and the
 *          number of discarded sub-clusters.
 */
function splitClustersByConceptName(rawClusters, normalized, minClusterSize, minDistinctTables) {
    const survivors = [];
    let droppedCount = 0;
    for (const raw of rawClusters) {
        const conceptBuckets = groupMemberIndexesByConcept(raw.memberIndexes, normalized);
        if (conceptBuckets.size === 1) {
            // One concept only — nothing to split.
            survivors.push(raw);
            continue;
        }
        for (const indexes of conceptBuckets.values()) {
            const members = indexes.map((i) => toClusterMember(normalized[i]));
            const distinctTableCount = new Set(members.map((m) => `${m.schema}.${m.table}`)).size;
            const tooSmall = members.length < minClusterSize || distinctTableCount < minDistinctTables;
            if (tooSmall) {
                droppedCount += 1;
                continue;
            }
            survivors.push({
                memberIndexes: indexes,
                members,
                maxIntraDistance: raw.maxIntraDistance,
            });
        }
    }
    survivors.sort((a, b) => b.members.length - a.members.length);
    return { clusters: survivors, dropped: droppedCount };
}
|
|
195
|
+
/**
 * Group member indexes into buckets keyed by a canonical form of each column's conceptName.
 *
 * The canonical key is the concept name (or `'__unnamed__'` when it is empty/missing),
 * lowercased and stripped of every character outside `a-z0-9`, so spelling variants such
 * as `sales_person_id` and `salesPersonId` share one bucket.
 *
 * @param {number[]} memberIndexes indexes into `normalized`
 * @param {Array} normalized normalized columns, each optionally carrying `conceptName`
 * @returns {Map<string, number[]>} canonical key → member indexes, in first-seen order
 */
function groupMemberIndexesByConcept(memberIndexes, normalized) {
    const buckets = new Map();
    for (const position of memberIndexes) {
        const rawName = normalized[position].conceptName || '__unnamed__';
        const key = rawName.toLowerCase().replace(/[^a-z0-9]/g, '');
        let indexes = buckets.get(key);
        if (indexes === undefined) {
            indexes = [];
            buckets.set(key, indexes);
        }
        indexes.push(position);
    }
    return buckets;
}
|
|
212
|
+
/**
 * Reduce a normalized column to the cluster-member descriptor: location
 * (schema/table/column) plus its key flags. Per-column normalization fields
 * are intentionally not copied here.
 *
 * @param {object} n normalized column
 * @returns {object} descriptor with schema, table, column, participatesInFK, fkTarget, isPrimaryKey
 */
function toClusterMember(n) {
    const { schema, table, column, participatesInFK, fkTarget, isPrimaryKey } = n;
    return { schema, table, column, participatesInFK, fkTarget, isPrimaryKey };
}
|
|
223
|
+
// ─── Helpers ────────────────────────────────────────────────────────────────
|
|
224
|
+
/**
 * Walk every column in `state` and keep the ones worth considering as candidates.
 *
 * A column is skipped when its data type matches NON_VALUEMATCHABLE_TYPES, when its
 * name matches AUDIT_COLUMN_PATTERN, or when it has statistics showing both a distinct
 * count below MIN_DISTINCT_FOR_KEY and a uniqueness ratio below MIN_UNIQUENESS_RATIO.
 * Columns without statistics are never rejected on cardinality grounds.
 *
 * @param {object} state object exposing `schemas[].tables[].columns[]`
 * @param {object} config detector config; `sampleValueCount` falls back to the default config
 * @returns {Array<object>} one candidate descriptor per surviving column
 */
function prefilter(state, config) {
    const maxSamples = config.sampleValueCount ?? DEFAULT_DETECTOR_CONFIG.sampleValueCount;
    // Ultra-low-cardinality / near-boolean check; only meaningful when statistics exist.
    const isLowCardinality = (stats) => {
        if (!stats) {
            return false;
        }
        const distinct = stats.distinctCount ?? 0;
        const ratio = stats.uniquenessRatio ?? 0;
        return distinct < MIN_DISTINCT_FOR_KEY && ratio < MIN_UNIQUENESS_RATIO;
    };
    const candidates = [];
    for (const schema of state.schemas) {
        for (const table of schema.tables) {
            for (const col of table.columns) {
                if (NON_VALUEMATCHABLE_TYPES.test(col.dataType) || AUDIT_COLUMN_PATTERN.test(col.name)) {
                    continue;
                }
                if (isLowCardinality(col.statistics)) {
                    continue;
                }
                const reference = col.foreignKeyReferences;
                candidates.push({
                    schema: schema.name,
                    table: table.name,
                    column: col.name,
                    dataType: col.dataType,
                    originalDescription: col.userDescription ?? col.description ?? '',
                    sampleValues: extractSampleValues(col, maxSamples),
                    participatesInFK: Boolean(col.isForeignKey),
                    fkTarget: reference
                        ? { schema: reference.schema, table: reference.table, column: reference.referencedColumn }
                        : null,
                    isPrimaryKey: Boolean(col.isPrimaryKey),
                });
            }
        }
    }
    return candidates;
}
|
|
263
|
+
/**
 * Group candidate columns by their owning table and attach the schema/table
 * descriptions found in `state`.
 *
 * Groups are returned in the order their table was first seen in `columns`.
 * The schema and table names are taken directly from the columns rather than
 * being parsed back out of a joined key, so names that themselves contain a
 * `.` are reported intact and keep their descriptions.
 *
 * @param {object} state object exposing `schemas[]` with `name`, `description`, and `tables[]`
 * @param {Array<object>} columns columns carrying `schema` and `table` names
 * @returns {Array<object>} `{ schema, table, schemaDescription, tableDescription, columns }` per table
 */
function groupColumnsByTable(state, columns) {
    // JSON-encoding the pair gives a key that cannot collide across different
    // (schema, table) combinations, unlike plain "schema.table" concatenation.
    const tableKey = (schemaName, tableName) => JSON.stringify([schemaName, tableName]);
    const schemaDesc = new Map();
    const tableDesc = new Map();
    for (const s of state.schemas) {
        schemaDesc.set(s.name, s.description);
        for (const t of s.tables) {
            tableDesc.set(tableKey(s.name, t.name), t.description);
        }
    }
    const buckets = new Map();
    for (const c of columns) {
        const k = tableKey(c.schema, c.table);
        const bucket = buckets.get(k);
        if (bucket) {
            bucket.columns.push(c);
        }
        else {
            buckets.set(k, {
                schema: c.schema,
                table: c.table,
                schemaDescription: schemaDesc.get(c.schema),
                tableDescription: tableDesc.get(k),
                columns: [c],
            });
        }
    }
    return Array.from(buckets.values());
}
|
|
291
|
+
/**
 * Compose the text that is sent to the embedding model for one normalized column.
 *
 * Layout, space-separated:
 *   1. the concept name (underscores turned into spaces, `'unknown'` when absent),
 *      written three times — the original author's intent is to weight the concept
 *      name more heavily so similarly-described but distinct concepts separate;
 *   2. the column's normalized description;
 *   3. when sample values exist, a `Sample values: a | b | c.` suffix built from at most
 *      EMBED_SAMPLE_COUNT values, each cut to EMBED_SAMPLE_MAX_CHARS characters, to add
 *      value-format signal.
 *
 * @param {object} n normalized column (`conceptName`, `normalizedDescription`, `sampleValues`)
 * @returns {string} embedding input text
 */
function buildEmbeddingText(n) {
    const concept = (n.conceptName || 'unknown').split('_').join(' ');
    const segments = [`${concept}. ${concept}. ${concept}.`, n.normalizedDescription];
    const values = n.sampleValues;
    if (values && values.length > 0) {
        const clipped = values
            .slice(0, EMBED_SAMPLE_COUNT)
            .map((value) => String(value).slice(0, EMBED_SAMPLE_MAX_CHARS));
        segments.push(`Sample values: ${clipped.join(' | ')}.`);
    }
    return segments.join(' ');
}
|
|
316
|
+
/**
 * Turn a group of normalized columns into a labeled cluster record.
 *
 * - `concept`: the concept name with the most members; ties go to the larger summed
 *   confidence, then to the name that sorts first.
 * - `normalization`: the most common per-member strategy (ties: name that sorts first).
 *   Each member still keeps its own strategy/expression in `members`; the cluster-level
 *   value is a summary / fallback.
 * - `customNormalizationExpression`: expression of the highest-confidence member that has one.
 * - `confidence`: mean member confidence minus a tightness penalty
 *   (`maxIntraDistance * TIGHTNESS_PENALTY_SLOPE`, capped at TIGHTNESS_PENALTY_MAX),
 *   clamped to [0, 1].
 * - `reasoning`: taken from the highest-confidence member.
 *
 * @param {Array<object>} members normalized columns of the cluster
 * @param {Array<object>} outMembers member descriptors, index-aligned with `members`
 * @param {number} maxIntraDistance largest intra-cluster distance reported by clustering
 * @param {number} index position used to build the cluster id
 * @returns {object} labeled cluster
 */
function labelCluster(members, outMembers, maxIntraDistance, index) {
    // Concept vote: count and summed confidence per concept name.
    const conceptTally = new Map();
    for (const member of members) {
        const name = member.conceptName || 'unknown';
        const entry = conceptTally.get(name) ?? { count: 0, sumConf: 0 };
        entry.count += 1;
        entry.sumConf += member.confidence;
        conceptTally.set(name, entry);
    }
    const rankedConcepts = [...conceptTally.entries()].sort(([nameA, a], [nameB, b]) => b.count - a.count || b.sumConf - a.sumConf || nameA.localeCompare(nameB));
    const concept = rankedConcepts[0][0];

    // Strategy vote: plain occurrence count per normalization strategy.
    const strategyTally = new Map();
    for (const { normalizationStrategy } of members) {
        const seen = strategyTally.get(normalizationStrategy) ?? 0;
        strategyTally.set(normalizationStrategy, seen + 1);
    }
    const rankedStrategies = [...strategyTally.entries()].sort(([strategyA, countA], [strategyB, countB]) => countB - countA || strategyA.localeCompare(strategyB));
    const normalization = rankedStrategies[0][0];

    // One confidence-descending ordering serves both the custom-expression fallback
    // and the representative member used for the reasoning text.
    const byConfidence = [...members].sort((a, b) => b.confidence - a.confidence);
    const customFallback = byConfidence.find((m) => m.customNormalizationExpression)?.customNormalizationExpression;
    const representative = byConfidence[0];

    // Attach each column's own normalization to its descriptor (aligned by position).
    const decoratedMembers = outMembers.map((descriptor, position) => {
        const source = members[position];
        return {
            ...descriptor,
            normalizationStrategy: source?.normalizationStrategy,
            customNormalizationExpression: source?.customNormalizationExpression,
        };
    });

    let confidenceTotal = 0;
    for (const member of members) {
        confidenceTotal += member.confidence;
    }
    const meanConf = confidenceTotal / members.length;
    const tightnessPenalty = Math.min(TIGHTNESS_PENALTY_MAX, maxIntraDistance * TIGHTNESS_PENALTY_SLOPE);
    const confidence = Math.max(0, Math.min(1, meanConf - tightnessPenalty));

    return {
        id: `cluster_${index}`,
        concept,
        normalization,
        customNormalizationExpression: customFallback,
        members: decoratedMembers,
        confidence,
        reasoning: representative.reasoning,
        maxIntraDistance,
    };
}
|
|
372
|
+
/**
 * Collect up to `max` non-empty sample values for a column, as trimmed strings.
 *
 * Values come from `col.statistics.sampleValues` when present, otherwise from
 * `col.possibleValues`. Null/undefined entries and entries that are empty after
 * trimming are skipped and do not count toward the limit. A source that is not
 * an array yields an empty result.
 *
 * The limit is checked before each value is added, so `max = 0` returns no
 * values (previously one value slipped through because the check ran only after
 * the first push). When `max` is undefined the comparison is never true and all
 * usable values are returned.
 *
 * @param {object} col column metadata
 * @param {number} max maximum number of values to return
 * @returns {string[]} trimmed, non-empty sample values
 */
function extractSampleValues(col, max) {
    const raw = col.statistics?.sampleValues ?? col.possibleValues ?? [];
    if (!Array.isArray(raw))
        return [];
    const out = [];
    for (const v of raw) {
        // Stop before adding, so a zero (or negative) limit yields nothing.
        if (out.length >= max)
            break;
        if (v == null)
            continue;
        const s = String(v).trim();
        if (s.length === 0)
            continue;
        out.push(s);
    }
    return out;
}
|
|
389
|
+
/**
 * Look up the percentile configured for a sensitivity label.
 *
 * Falls back to the `balanced` entry when the label is missing, unknown, or
 * maps to a nullish value. Only the table's own entries are consulted, so a
 * label that happens to name an inherited Object.prototype member (for example
 * "toString" or "constructor") is treated as unknown instead of returning that
 * inherited member.
 *
 * @param {string} s sensitivity label
 * @returns {*} the matching SENSITIVITY_PERCENTILE entry, or the `balanced` entry
 */
function sensitivityToPercentile(s) {
    const isOwnEntry = Object.prototype.hasOwnProperty.call(SENSITIVITY_PERCENTILE, s);
    const configured = isOwnEntry ? SENSITIVITY_PERCENTILE[s] : undefined;
    return configured ?? SENSITIVITY_PERCENTILE.balanced;
}
|
|
392
|
+
/**
 * Build the embedding provider described by `config.embedding`.
 *
 * The provider name defaults to `'openai'` when none is configured, and the API key
 * is taken from `aiConfig`. The remaining embedding settings (model, dimensions,
 * batch size, endpoint) are forwarded to `createEmbeddingProvider` unchanged.
 *
 * @param {object} config detector config, optionally carrying an `embedding` section
 * @param {object} aiConfig AI config supplying `apiKey`
 * @returns {{name: string, embed: Function}} `name` is `<provider>:<model or 'default'>`;
 *   `embed(texts)` delegates to the created provider
 */
function resolveEmbeddingProvider(config, aiConfig) {
    const settings = config.embedding ?? {};
    const provider = settings.provider ?? 'openai';
    const { model, dimensions, batchSize, endpoint } = settings;
    const delegate = createEmbeddingProvider({
        provider,
        apiKey: aiConfig.apiKey,
        model,
        dimensions,
        batchSize,
        endpoint,
    });
    const name = `${provider}:${model ?? 'default'}`;
    const embed = (texts) => delegate.embed(texts);
    return { name, embed };
}
|
|
409
|
+
/**
 * Produce the result returned when there is nothing to cluster: no clusters,
 * zero token usage, and a summary whose only non-zero field is the number of
 * columns that were in scope.
 *
 * @param {number} columnsInScope number of candidate columns that were considered
 * @returns {object} empty result with `clusters`, `tokens`, and `summary`
 */
function emptyResult(columnsInScope) {
    const tokens = { input: 0, output: 0, total: 0 };
    const summary = {
        columnsInScope,
        columnsNormalized: 0,
        columnsRejectedByNormalizer: 0,
        clustersBeforeSplit: 0,
        clustersFound: 0,
        clustersDropped: 0,
    };
    return { clusters: [], tokens, summary };
}
|
|
423
|
+
//# sourceMappingURL=SemanticPhase.js.map
|