@memberjunction/db-auto-doc 5.37.0 → 5.38.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -0
- package/dist/core/AnalysisOrchestrator.d.ts.map +1 -1
- package/dist/core/AnalysisOrchestrator.js +32 -2
- package/dist/core/AnalysisOrchestrator.js.map +1 -1
- package/dist/discovery/BridgeViewSQLGenerator.d.ts +67 -0
- package/dist/discovery/BridgeViewSQLGenerator.d.ts.map +1 -0
- package/dist/discovery/BridgeViewSQLGenerator.js +99 -0
- package/dist/discovery/BridgeViewSQLGenerator.js.map +1 -0
- package/dist/discovery/ColumnClusterer.d.ts +63 -0
- package/dist/discovery/ColumnClusterer.d.ts.map +1 -0
- package/dist/discovery/ColumnClusterer.js +205 -0
- package/dist/discovery/ColumnClusterer.js.map +1 -0
- package/dist/discovery/ColumnNormalizer.d.ts +106 -0
- package/dist/discovery/ColumnNormalizer.d.ts.map +1 -0
- package/dist/discovery/ColumnNormalizer.js +376 -0
- package/dist/discovery/ColumnNormalizer.js.map +1 -0
- package/dist/discovery/Composer.d.ts +59 -0
- package/dist/discovery/Composer.d.ts.map +1 -0
- package/dist/discovery/Composer.js +95 -0
- package/dist/discovery/Composer.js.map +1 -0
- package/dist/discovery/EmbeddingProvider.d.ts +27 -0
- package/dist/discovery/EmbeddingProvider.d.ts.map +1 -0
- package/dist/discovery/EmbeddingProvider.js +87 -0
- package/dist/discovery/EmbeddingProvider.js.map +1 -0
- package/dist/discovery/FKGraphWalker.d.ts +108 -0
- package/dist/discovery/FKGraphWalker.d.ts.map +1 -0
- package/dist/discovery/FKGraphWalker.js +169 -0
- package/dist/discovery/FKGraphWalker.js.map +1 -0
- package/dist/discovery/OrganicKeyDetector.d.ts +51 -0
- package/dist/discovery/OrganicKeyDetector.d.ts.map +1 -0
- package/dist/discovery/OrganicKeyDetector.js +78 -0
- package/dist/discovery/OrganicKeyDetector.js.map +1 -0
- package/dist/discovery/OrganicKeyTranslator.d.ts +78 -0
- package/dist/discovery/OrganicKeyTranslator.d.ts.map +1 -0
- package/dist/discovery/OrganicKeyTranslator.js +166 -0
- package/dist/discovery/OrganicKeyTranslator.js.map +1 -0
- package/dist/discovery/SemanticPhase.d.ts +70 -0
- package/dist/discovery/SemanticPhase.d.ts.map +1 -0
- package/dist/discovery/SemanticPhase.js +423 -0
- package/dist/discovery/SemanticPhase.js.map +1 -0
- package/dist/discovery/StructuralPhase.d.ts +24 -0
- package/dist/discovery/StructuralPhase.d.ts.map +1 -0
- package/dist/discovery/StructuralPhase.js +23 -0
- package/dist/discovery/StructuralPhase.js.map +1 -0
- package/dist/discovery/TransitiveBridgeDetector.d.ts +65 -0
- package/dist/discovery/TransitiveBridgeDetector.d.ts.map +1 -0
- package/dist/discovery/TransitiveBridgeDetector.js +244 -0
- package/dist/discovery/TransitiveBridgeDetector.js.map +1 -0
- package/dist/generators/AdditionalSchemaInfoGenerator.d.ts +12 -0
- package/dist/generators/AdditionalSchemaInfoGenerator.d.ts.map +1 -1
- package/dist/generators/AdditionalSchemaInfoGenerator.js +31 -0
- package/dist/generators/AdditionalSchemaInfoGenerator.js.map +1 -1
- package/dist/types/config.d.ts +71 -0
- package/dist/types/config.d.ts.map +1 -1
- package/dist/types/config.js.map +1 -1
- package/dist/types/organic-keys.d.ts +141 -0
- package/dist/types/organic-keys.d.ts.map +1 -0
- package/dist/types/organic-keys.js +27 -0
- package/dist/types/organic-keys.js.map +1 -0
- package/dist/types/state.d.ts +7 -0
- package/dist/types/state.d.ts.map +1 -1
- package/dist/utils/json.d.ts +40 -0
- package/dist/utils/json.d.ts.map +1 -0
- package/dist/utils/json.js +141 -0
- package/dist/utils/json.js.map +1 -0
- package/package.json +5 -5
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TableNormalizer — one LLM call per TABLE (not per column), upstream of embedding.
|
|
3
|
+
*
|
|
4
|
+
* For each in-scope table, a single LLM call sees:
|
|
5
|
+
* - The table's name + description + sibling columns
|
|
6
|
+
* - Every column's identity + description + sample values + FK/PK status
|
|
7
|
+
*
|
|
8
|
+
* The call returns one normalized entry per column with:
|
|
9
|
+
* - conceptName : canonical snake_case (`email_address`, `customer_id`, ...)
|
|
10
|
+
* - normalizationStrategy : how values should be compared
|
|
11
|
+
* - normalizedDescription : business-concept-focused, system-agnostic sentence
|
|
12
|
+
* - isUsefulOrganicKey : false for audit/system/free-form (filtered out)
|
|
13
|
+
* - confidence + reasoning
|
|
14
|
+
*
|
|
15
|
+
* Why per-table instead of per-column:
|
|
16
|
+
* - Fewer calls (5K cols across 500 tables → 500 calls instead of 5K). At
|
|
17
|
+
* Gemini Flash pricing, that's ~$0.04 instead of ~$0.20 for APTIFY-scale.
|
|
18
|
+
* - System prompt amortizes across all columns in the table.
|
|
19
|
+
* - The LLM sees siblings as context — knowing the table has FirstName + LastName
|
|
20
|
+
* next to an Email column reveals it's a person email, not a server hostname.
|
|
21
|
+
* - More token-efficient: one JSON array out instead of N independent objects.
|
|
22
|
+
*
|
|
23
|
+
* The single most important constraint: same-concept columns from DIFFERENT
|
|
24
|
+
* TABLES (across systems) must produce the same conceptName and a similar
|
|
25
|
+
* normalizedDescription so the embedding step naturally clusters them. The
|
|
26
|
+
* prompt enforces this via a canonical concept-name list.
|
|
27
|
+
*/
|
|
28
|
+
import { createLLMInstance } from '../utils/llm-factory.js';
|
|
29
|
+
import { cleanAndParseJSON } from '../utils/json.js';
|
|
30
|
+
export class TableNormalizer {
    /**
     * @param aiConfig AI settings. This class reads `provider` and `apiKey` (to build the
     *                 LLM client) plus `model`, `temperature` and `maxTokens` (per request).
     */
    constructor(aiConfig) {
        this.aiConfig = aiConfig;
        this.llm = createLLMInstance(aiConfig.provider, aiConfig.apiKey);
    }
    /**
     * Normalize one table — one LLM call returning per-column entries.
     *
     * The request carries the system prompt plus a user prompt built from `input`, asks for
     * JSON output, and uses temperature 0 unless the config overrides it. Up to
     * `maxRetries + 1` attempts are made; an attempt is retried when the call throws,
     * reports failure, returns unparseable JSON, or returns JSON without a `columns` array.
     *
     * @param input      Table descriptor; `input.columns` is the list of columns to normalize.
     * @param maxRetries Number of additional attempts after the first one (default 2).
     * @returns `{ normalized, tokens }` on success, or `{ normalized: [], tokens, errorMessage }`
     *          once every attempt has failed. `tokens` sums the usage reported by every
     *          attempt whose call succeeded, including attempts whose JSON was rejected.
     */
    async normalizeTable(input, maxRetries = 2) {
        // Nothing to ask the model about — skip the call entirely.
        if (input.columns.length === 0) {
            return { normalized: [], tokens: { total: 0, input: 0, output: 0 } };
        }
        const userPrompt = buildUserPrompt(input);
        const params = {
            model: this.aiConfig.model,
            messages: [
                { role: 'system', content: SYSTEM_PROMPT },
                { role: 'user', content: userPrompt },
            ],
            temperature: this.aiConfig.temperature ?? 0,
            maxOutputTokens: this.aiConfig.maxTokens,
            responseFormat: 'JSON',
        };
        let lastError = '';
        let cumTokens = { total: 0, input: 0, output: 0 };
        for (let attempt = 0; attempt <= maxRetries; attempt++) {
            let result;
            try {
                result = await this.llm.ChatCompletion(params);
            }
            catch (err) {
                // Thrown values are not guaranteed to be Error instances.
                lastError = `LLM call threw: ${err?.message ?? String(err)}`;
                continue;
            }
            // A missing result is treated like a reported failure instead of crashing the loop.
            if (!result?.success) {
                lastError = `LLM call failed: ${result?.errorMessage ?? 'unknown'}`;
                continue;
            }
            const content = result.data?.choices?.[0]?.message?.content ?? '';
            const usage = result.data?.usage;
            cumTokens = {
                total: cumTokens.total + (usage?.totalTokens ?? 0),
                input: cumTokens.input + (usage?.promptTokens ?? 0),
                output: cumTokens.output + (usage?.completionTokens ?? 0),
            };
            let parsed = null;
            try {
                parsed = cleanAndParseJSON(content);
            }
            catch (err) {
                lastError = `JSON parse threw: ${err?.message ?? String(err)}. Content prefix: ${content.slice(0, 200)}`;
                if (attempt < maxRetries)
                    continue;
                return { normalized: [], tokens: cumTokens, errorMessage: lastError };
            }
            if (!parsed || !Array.isArray(parsed.columns)) {
                lastError = `JSON parse returned bad shape. Content prefix: ${content.slice(0, 200)}`;
                if (attempt < maxRetries)
                    continue;
                return { normalized: [], tokens: cumTokens, errorMessage: lastError };
            }
            // Match the LLM's response entries back to input columns by name (case-insensitive).
            const byName = new Map(input.columns.map((c) => [c.column.toLowerCase(), c]));
            // Tracks columns already emitted so a repeated entry cannot produce a duplicate.
            const seen = new Set();
            const normalized = [];
            for (const entry of parsed.columns) {
                if (!entry || typeof entry.column !== 'string')
                    continue;
                const key = entry.column.toLowerCase();
                const inputCol = byName.get(key);
                if (!inputCol)
                    continue; // LLM hallucinated a column name; skip
                if (seen.has(key))
                    continue; // LLM repeated a column; keep the first entry only
                seen.add(key);
                normalized.push({
                    ...inputCol,
                    conceptName: entry.conceptName ?? '',
                    normalizationStrategy: entry.normalizationStrategy ?? 'LowerCaseTrim',
                    customNormalizationExpression: sanitizePlaceholder(entry.customNormalizationExpression),
                    normalizedDescription: entry.normalizedDescription ?? '',
                    isUsefulOrganicKey: !!entry.isUsefulOrganicKey,
                    confidence: clamp01(entry.confidence),
                    reasoning: entry.reasoning ?? '',
                });
            }
            return { normalized, tokens: cumTokens };
        }
        // Only reached when every attempt ended in a thrown or failed LLM call.
        return { normalized: [], tokens: cumTokens, errorMessage: lastError || 'unknown failure after retries' };
    }
    /**
     * Batch normalize many tables with bounded concurrency.
     *
     * Workers pull table indexes from a shared cursor and each runs `normalizeTable`.
     * Entries flagged `isUsefulOrganicKey` are collected; the rest are counted as rejected.
     * A table whose result carries an `errorMessage` is counted in `errors` and contributes
     * no entries.
     *
     * @param tables Table descriptors to normalize.
     * @param opts   Optional `concurrency` (default 8, minimum 1), `maxRetries` (default 2,
     *               minimum 0) and `onProgress(completed, total)` invoked after each table.
     * @returns `{ normalized, rejected, errors, tokens }` aggregated over all tables.
     */
    async normalizeAll(tables, opts = {}) {
        // NaN would otherwise propagate through Math.max and yield zero workers / zero attempts.
        let requestedConcurrency = Number(opts.concurrency ?? 8);
        if (Number.isNaN(requestedConcurrency))
            requestedConcurrency = 8;
        let requestedRetries = Number(opts.maxRetries ?? 2);
        if (Number.isNaN(requestedRetries))
            requestedRetries = 2;
        // Never start more workers than there are tables (also keeps Infinity from allocating).
        const concurrency = Math.min(tables.length, Math.max(1, Math.floor(requestedConcurrency)));
        const maxRetries = Math.max(0, requestedRetries);
        const allNormalized = [];
        let rejected = 0;
        let errors = 0;
        let total = 0;
        let input = 0;
        let output = 0;
        let completed = 0;
        let cursor = 0;
        const runners = Array.from({ length: concurrency }, async () => {
            while (true) {
                // Read-and-increment happens synchronously, so each index goes to one worker.
                const idx = cursor++;
                if (idx >= tables.length)
                    return;
                const r = await this.normalizeTable(tables[idx], maxRetries);
                total += r.tokens.total;
                input += r.tokens.input;
                output += r.tokens.output;
                if (r.errorMessage) {
                    errors++;
                }
                else {
                    for (const n of r.normalized) {
                        if (n.isUsefulOrganicKey)
                            allNormalized.push(n);
                        else
                            rejected++;
                    }
                }
                completed++;
                opts.onProgress?.(completed, tables.length);
            }
        });
        await Promise.all(runners);
        return { normalized: allNormalized, rejected, errors, tokens: { total, input, output } };
    }
}
|
|
153
|
+
// ─── Prompt ─────────────────────────────────────────────────────────────────
|
|
154
|
+
const SYSTEM_PROMPT = `You are translating database columns into a NORMALIZED BUSINESS-CONCEPT REPRESENTATION
|
|
155
|
+
for organic-key detection per MemberJunction PR #2193.
|
|
156
|
+
|
|
157
|
+
═══ PR #2193 — WHAT AN ORGANIC KEY IS ═══
|
|
158
|
+
|
|
159
|
+
An organic key is a column whose value can be used to MATCH two rows that
|
|
160
|
+
refer to the SAME real-world entity, WITHOUT going through a declared foreign
|
|
161
|
+
key. PR #2193 lets the framework "join by value" wherever the schema lacks an
|
|
162
|
+
explicit FK link (cross-system data, late-bound integrations, denormalized
|
|
163
|
+
warehouses, partial schemas).
|
|
164
|
+
|
|
165
|
+
KEEP a column as an organic-key candidate when this test passes:
|
|
166
|
+
|
|
167
|
+
"If I take two rows that have the same value in this column, do they
|
|
168
|
+
refer to the SAME real-world entity (the same customer, the same person,
|
|
169
|
+
the same order, the same product, the same location, the same legal
|
|
170
|
+
entity, the same communication endpoint)?"
|
|
171
|
+
|
|
172
|
+
This test is SATISFIED by:
|
|
173
|
+
- Customer / member / employee / person / company / product / order IDs —
|
|
174
|
+
natural OR surrogate. Within a database an EmployeeID of 42 in one table
|
|
175
|
+
DOES refer to the same employee as EmployeeID 42 in another table. That
|
|
176
|
+
is the WHOLE POINT of PR #2193 — these are the matches it makes navigable.
|
|
177
|
+
- Email addresses, phone numbers, fax numbers, URLs.
|
|
178
|
+
- Tax IDs, social security numbers, account numbers, license numbers.
|
|
179
|
+
- ISBNs, SKUs, product codes, part numbers.
|
|
180
|
+
- Postal codes, street addresses, geocodes (they identify a delivery point
|
|
181
|
+
or location entity).
|
|
182
|
+
- Full names, first names, last names, organization names — identifiers of
|
|
183
|
+
persons or organizations even when fuzzy.
|
|
184
|
+
|
|
185
|
+
FK columns are EXPLICITLY ALLOWED. PR #2193 organic keys often overlap with
|
|
186
|
+
FK columns by design — the same EmployeeID that's a declared FK in one table
|
|
187
|
+
is the organic match key for tables where the FK isn't declared. Do not
|
|
188
|
+
disqualify a column just because it participates in a FK.
|
|
189
|
+
|
|
190
|
+
REJECT (isUsefulOrganicKey=false) ONLY in these cases:
|
|
191
|
+
|
|
192
|
+
- Categorical / enum-like values with a small fixed vocabulary (status =
|
|
193
|
+
'Active'/'Pending'/'Closed'; type = 'A'/'B'/'C'; region code = 'NA'/'EMEA';
|
|
194
|
+
country code = 'US'/'UK'/'FR'). Two rows sharing status='Active' do NOT
|
|
195
|
+
refer to the same real-world entity — they're just both active.
|
|
196
|
+
|
|
197
|
+
- Booleans / flags (IsActive, HasDiscount).
|
|
198
|
+
|
|
199
|
+
- Measurements, quantities, prices, percentages, aggregates (price, qty,
|
|
200
|
+
discount, count, score, weight).
|
|
201
|
+
|
|
202
|
+
- Audit metadata (created_at, modified_by, version, row_version, rowguid,
|
|
203
|
+
last_login_at).
|
|
204
|
+
|
|
205
|
+
- Free-form descriptive text (notes, comments, description paragraphs,
|
|
206
|
+
long-text fields).
|
|
207
|
+
|
|
208
|
+
- System paths / blob references (photo_path, file_url, attachment_uri)
|
|
209
|
+
when they're pointers to assets rather than the asset's identity.
|
|
210
|
+
|
|
211
|
+
DO NOT REJECT a column just because:
|
|
212
|
+
- It is auto-increment (auto-increment IDs are still valid organic keys
|
|
213
|
+
across tables in the same system).
|
|
214
|
+
- It is a foreign key (FKs are valid organic keys, see above).
|
|
215
|
+
- It "identifies a location, not the parent row's entity" (a postal code
|
|
216
|
+
DOES identify a delivery location entity, which is a valid organic-key
|
|
217
|
+
use; clustering will decide whether it's useful).
|
|
218
|
+
- Names "could collide" (low uniqueness lowers confidence but does NOT
|
|
219
|
+
disqualify — names are valid organic-key candidates per PR #2193).
|
|
220
|
+
|
|
221
|
+
═══ YOUR TASK ═══
|
|
222
|
+
|
|
223
|
+
For each column you receive, produce a JSON object with these fields. Look at
|
|
224
|
+
the sibling columns and the table's purpose for context — they often clarify
|
|
225
|
+
whether a value identifies or categorizes.
|
|
226
|
+
|
|
227
|
+
1. normalizedDescription — A STRUCTURED BUSINESS-FOCUSED SENTENCE that any
|
|
228
|
+
reader (or embedding model) can use to recognize this kind of value. Use
|
|
229
|
+
this exact structural template:
|
|
230
|
+
|
|
231
|
+
"<value-kind> identifying <entity-kind>; <normalization rule>."
|
|
232
|
+
|
|
233
|
+
Example shapes:
|
|
234
|
+
"RFC-5322 email address identifying a natural person; case-insensitive
|
|
235
|
+
whitespace-trimmed equality."
|
|
236
|
+
"E.164 phone number identifying a person or organization; digits-only
|
|
237
|
+
equality after stripping formatting."
|
|
238
|
+
"Customer identifier (auto-increment or business code) identifying a
|
|
239
|
+
customer entity across tables; exact equality after trimming."
|
|
240
|
+
"Employee identifier identifying an employee across HR tables; exact
|
|
241
|
+
equality."
|
|
242
|
+
"Postal / ZIP code identifying a delivery area; exact equality after
|
|
243
|
+
trimming."
|
|
244
|
+
"Family name (last name) identifying a natural person; case-insensitive
|
|
245
|
+
whitespace-trimmed equality."
|
|
246
|
+
|
|
247
|
+
For REJECTED columns, the template flips:
|
|
248
|
+
"ISO-3166 country code (categorical, 250 buckets); not an entity
|
|
249
|
+
identifier; not applicable."
|
|
250
|
+
"Order line quantity (measurement); not an entity identifier; not
|
|
251
|
+
applicable."
|
|
252
|
+
"Modification timestamp (audit metadata); not applicable."
|
|
253
|
+
|
|
254
|
+
Same-concept columns from different tables MUST produce highly similar
|
|
255
|
+
normalizedDescription strings so they cluster geometrically. Generic prose
|
|
256
|
+
("a code", "an identifier") is NOT acceptable — name the value kind
|
|
257
|
+
(email address, phone number, customer id, postal code, family name, etc.)
|
|
258
|
+
and the entity kind explicitly.
|
|
259
|
+
|
|
260
|
+
2. conceptName — A canonical snake_case label for this value kind
|
|
261
|
+
(e.g. "email_address", "phone_number", "postal_code", "customer_id",
|
|
262
|
+
"employee_id", "person_family_name"). Use the same name for the same
|
|
263
|
+
concept across tables. Used as a cluster label hint downstream.
|
|
264
|
+
|
|
265
|
+
3. normalizationStrategy — How equality should be tested at match time:
|
|
266
|
+
"LowerCaseTrim" (default for case-insensitive text)
|
|
267
|
+
"Trim" (whitespace only)
|
|
268
|
+
"ExactMatch" (codes, IDs that are case-sensitive)
|
|
269
|
+
"Custom" (provide customNormalizationExpression — a SQL expression
|
|
270
|
+
that MUST use the literal placeholder {{FieldName}} where the
|
|
271
|
+
column reference goes, e.g.
|
|
272
|
+
REPLACE(REPLACE({{FieldName}}, '-', ''), ' ', ''). Do NOT use
|
|
273
|
+
'value', 'x', or a column name — only {{FieldName}}.)
|
|
274
|
+
|
|
275
|
+
4. isUsefulOrganicKey — Apply the test at the top of the prompt:
|
|
276
|
+
"If I take two rows with the same value in this column, do they refer to
|
|
277
|
+
the SAME real-world entity?" True for IDs, emails, phones, names,
|
|
278
|
+
addresses, codes that are entity-level. False ONLY for categorical
|
|
279
|
+
enums, booleans, measurements, audit metadata, free-form text, and
|
|
280
|
+
asset paths.
|
|
281
|
+
|
|
282
|
+
5. confidence — 0.0 to 1.0. Reflect uncertainty honestly; sample values that
|
|
283
|
+
contradict the column name should lower confidence even if you commit to
|
|
284
|
+
a judgment.
|
|
285
|
+
|
|
286
|
+
6. reasoning — One short sentence stating why this kind of value satisfies
|
|
287
|
+
or fails the test in field 4.
|
|
288
|
+
|
|
289
|
+
Sample values are the strongest signal. If the column is named "Status" but
|
|
290
|
+
samples are all distinct uuid-like tokens, trust the data over the name.
|
|
291
|
+
|
|
292
|
+
Output STRICT JSON only, no markdown fences:
|
|
293
|
+
{
|
|
294
|
+
"columns": [
|
|
295
|
+
{
|
|
296
|
+
"column": "<exact column name from input>",
|
|
297
|
+
"conceptName": "snake_case_name",
|
|
298
|
+
"normalizationStrategy": "LowerCaseTrim" | "Trim" | "ExactMatch" | "Custom",
|
|
299
|
+
"customNormalizationExpression": "...",
|
|
300
|
+
"normalizedDescription": "<structured sentence per template above>",
|
|
301
|
+
"isUsefulOrganicKey": true,
|
|
302
|
+
"confidence": 0.95,
|
|
303
|
+
"reasoning": "One short sentence."
|
|
304
|
+
}
|
|
305
|
+
]
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
Include EVERY column from the input, in the same order. Match the "column" field
|
|
309
|
+
exactly to the input column name.`;
|
|
310
|
+
/**
 * Normalize whatever column placeholder the LLM used in a Custom expression to the
 * canonical {{FieldName}} token that CodeGen + the PR #2193 runtime substitute. The
 * prompt asks for {{FieldName}}, but models frequently emit `value`, `x`, `col`, or
 * `column` instead — this guards against the resulting silent runtime breakage where
 * the literal placeholder would survive into the executed SQL.
 *
 * Rules:
 *  - Falsy or non-string input is returned unchanged.
 *  - If a (possibly space-padded) {{FieldName}} is already present, only its spacing is
 *    normalized and nothing else is touched.
 *  - Otherwise the first convention found among `column`, `value`, `col`, `x` is
 *    replaced, either as a bare word or wrapped in double braces (`{{value}}`).
 *    Text inside single-quoted SQL string literals is never rewritten.
 *
 * @param expr Expression returned by the model (may be undefined).
 * @returns The expression with its placeholder canonicalized, or `expr` as-is.
 */
function sanitizePlaceholder(expr) {
    if (!expr || typeof expr !== 'string')
        return expr;
    const canonical = '{{FieldName}}';
    // Already correct — only tidy the whitespace inside the braces.
    if (/\{\{\s*FieldName\s*\}\}/.test(expr)) {
        return expr.replace(/\{\{\s*FieldName\s*\}\}/g, canonical);
    }
    // Split into code and single-quoted literals ('' is an escaped quote). Because the
    // pattern is a capture group, the literals land on the odd indexes of the result.
    const segments = expr.split(/('(?:[^']|'')*')/);
    // Order matters: longer tokens first.
    for (const token of ['column', 'value', 'col', 'x']) {
        // The braced form is listed first so `{{value}}` is consumed whole and does not
        // turn into `{{{{FieldName}}}}`.
        const re = new RegExp(`\\{\\{\\s*${token}\\s*\\}\\}|\\b${token}\\b`, 'g');
        let replaced = false;
        const rewritten = segments.map((segment, index) => {
            if (index % 2 === 1)
                return segment; // quoted literal — leave untouched
            return segment.replace(re, () => {
                replaced = true;
                return canonical;
            });
        });
        if (replaced) {
            return rewritten.join(''); // only the first matching convention is the placeholder
        }
    }
    return expr;
}
|
|
336
|
+
// ─── Helpers ────────────────────────────────────────────────────────────────
|
|
337
|
+
/**
 * Render the per-table user prompt: a table header, optional schema/table purpose
 * lines, then one line per column (type, PK/FK tags) followed by its optional
 * description and up to five sample values, and finally the closing instruction.
 *
 * @param input Table descriptor with `schema`, `table`, optional descriptions and `columns`.
 * @returns The prompt text, lines joined with "\n".
 */
function buildUserPrompt(input) {
    const header = [`Table: ${input.schema}.${input.table}`];
    if (input.schemaDescription) {
        header.push(`Schema purpose: ${truncate(input.schemaDescription, 240)}`);
    }
    if (input.tableDescription) {
        header.push(`Table purpose: ${truncate(input.tableDescription, 240)}`);
    }
    header.push('', `Columns (${input.columns.length}):`);
    const columnLines = input.columns.flatMap((col) => {
        const pkTag = col.isPrimaryKey ? ' PK' : '';
        let fkTag = '';
        if (col.participatesInFK) {
            const target = col.fkTarget;
            // The arrow and target are only shown when the FK target is known.
            fkTag = target ? ` FK→${target.schema}.${target.table}.${target.column}` : ' FK';
        }
        const block = [` - ${col.column} [${col.dataType}]${pkTag}${fkTag}`];
        if (col.originalDescription) {
            block.push(` description: ${truncate(col.originalDescription, 240)}`);
        }
        if (col.sampleValues && col.sampleValues.length > 0) {
            const rendered = [];
            // At most five samples, each stringified, cut to 80 chars and JSON-quoted.
            for (const sample of col.sampleValues.slice(0, 5)) {
                rendered.push(JSON.stringify(truncate(String(sample), 80)));
            }
            block.push(` samples: [${rendered.join(', ')}]`);
        }
        return block;
    });
    const footer = ['', 'Output the normalized JSON per the system prompt — one entry per column, in order.'];
    return [...header, ...columnLines, ...footer].join('\n');
}
|
|
362
|
+
/**
 * Clamp a confidence score into the closed range [0, 1].
 *
 * Model output is untyped JSON, so the score may arrive as a numeric string
 * ("0.95"); such strings are parsed instead of being discarded. Anything that
 * is not a finite number after that (undefined, null, NaN, ±Infinity, booleans,
 * non-numeric text, empty strings) yields 0.
 *
 * @param x Raw confidence value from the model.
 * @returns A number between 0 and 1 inclusive.
 */
function clamp01(x) {
    // Only non-empty strings are coerced; Number('') would otherwise read as 0 by accident
    // and Number(null)/Number(true) must not be treated as real scores.
    const value = typeof x === 'string' && x.trim() !== '' ? Number(x) : x;
    if (!Number.isFinite(value))
        return 0;
    if (value < 0)
        return 0;
    if (value > 1)
        return 1;
    return value;
}
|
|
371
|
+
/**
 * Shorten `s` to at most `n` UTF-16 code units, ending with an ellipsis when cut.
 *
 * Falsy input yields ''. Text that already fits is returned unchanged. When a cut
 * is needed the result is the first `n - 1` code units plus '…'; if that boundary
 * would fall between the two halves of a surrogate pair (e.g. an emoji), the
 * dangling high surrogate is dropped too so the output stays well-formed.
 *
 * @param s Text to shorten.
 * @param n Maximum length of the result, in UTF-16 code units.
 * @returns The original or shortened text.
 */
function truncate(s, n) {
    if (!s)
        return '';
    if (!(s.length > n))
        return s;
    let end = n - 1;
    if (end > 0) {
        const last = s.charCodeAt(end - 1);
        const next = s.charCodeAt(end);
        const splitsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
        if (splitsPair)
            end -= 1; // keep the pair together by leaving it out entirely
    }
    return s.slice(0, end) + '…';
}
|
|
376
|
+
//# sourceMappingURL=ColumnNormalizer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ColumnNormalizer.js","sourceRoot":"","sources":["../../src/discovery/ColumnNormalizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAGH,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAG5D,OAAO,EAAE,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AA0DrD,MAAM,OAAO,eAAe;IAGxB,YAA6B,QAAkB;QAAlB,aAAQ,GAAR,QAAQ,CAAU;QAC3C,IAAI,CAAC,GAAG,GAAG,iBAAiB,CAAC,QAAQ,CAAC,QAAQ,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;IACrE,CAAC;IAED,uEAAuE;IAChE,KAAK,CAAC,cAAc,CACvB,KAA8B,EAC9B,UAAU,GAAG,CAAC;QAMd,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7B,OAAO,EAAE,UAAU,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACzE,CAAC;QAED,MAAM,UAAU,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;QAC1C,MAAM,MAAM,GAAe;YACvB,KAAK,EAAE,IAAI,CAAC,QAAQ,CAAC,KAAK;YAC1B,QAAQ,EAAE;gBACN,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,aAAa,EAAE;gBAC1C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE;aACxC;YACD,WAAW,EAAE,IAAI,CAAC,QAAQ,CAAC,WAAW,IAAI,CAAC;YAC3C,eAAe,EAAE,IAAI,CAAC,QAAQ,CAAC,SAAS;YACxC,cAAc,EAAE,MAAM;SACzB,CAAC;QAEF,IAAI,SAAS,GAAG,EAAE,CAAC;QACnB,IAAI,SAAS,GAAG,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC;QAClD,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,UAAU,EAAE,OAAO,EAAE,EAAE,CAAC;YACrD,IAAI,MAA8B,CAAC;YACnC,IAAI,CAAC;gBACD,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC;YACnD,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACX,SAAS,GAAG,mBAAoB,GAAa,CAAC,OAAO,EAAE,CAAC;gBACxD,SAAS;YACb,CAAC;YACD,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;gBAClB,SAAS,GAAG,oBAAoB,MAAM,CAAC,YAAY,IAAI,SAAS,EAAE,CAAC;gBACnE,SAAS;YACb,CAAC;YAED,MAAM,OAAO,GAAG,MAAM,CAAC,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,IAAI,EAAE,CAAC;YAClE,MAAM,KAAK,GAAG,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC;YACjC,SAAS,GAAG;gBACR,KAAK,EAAE,SAAS,CAAC,KAAK,GAAG,CAAC,KAAK,EAAE,WAAW,IAAI,CAAC,CAAC;gBAClD,KAAK,EAAE,SAAS,CAAC,KAAK,GAAG,CAAC,KAAK,EAAE,YAAY,IAAI,CAAC,CAAC;gBACnD,MAAM,EAAE,SAAS,CAAC,MAAM,GAAG,CAAC,KAAK,EAAE,gBAAgB,IAAI,CAAC,CAAC;aAC5D,CAAC;YAEF,IAAI,MAAM,GAA4B,IAAI,CAAC;YAC3C,IAAI,CAAC;gB
ACD,MAAM,GAAG,iBAAiB,CAAmB,OAAO,CAAC,CAAC;YAC1D,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACX,SAAS,GAAG,qBAAsB,GAAa,CAAC,OAAO,qBAAqB,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC;gBACpG,IAAI,OAAO,GAAG,UAAU;oBAAE,SAAS;gBACnC,OAAO,EAAE,UAAU,EAAE,EAAE,EAAE,MAAM,EAAE,SAAS,EAAE,YAAY,EAAE,SAAS,EAAE,CAAC;YAC1E,CAAC;YACD,IAAI,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC;gBAC5C,SAAS,GAAG,kDAAkD,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC;gBACtF,IAAI,OAAO,GAAG,UAAU;oBAAE,SAAS;gBACnC,OAAO,EAAE,UAAU,EAAE,EAAE,EAAE,MAAM,EAAE,SAAS,EAAE,YAAY,EAAE,SAAS,EAAE,CAAC;YAC1E,CAAC;YAED,kEAAkE;YAClE,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;YAC9E,MAAM,UAAU,GAAuB,EAAE,CAAC;YAC1C,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;gBACjC,IAAI,CAAC,KAAK,IAAI,OAAO,KAAK,CAAC,MAAM,KAAK,QAAQ;oBAAE,SAAS;gBACzD,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC,CAAC;gBACxD,IAAI,CAAC,QAAQ;oBAAE,SAAS,CAAC,uCAAuC;gBAChE,UAAU,CAAC,IAAI,CAAC;oBACZ,GAAG,QAAQ;oBACX,WAAW,EAAE,KAAK,CAAC,WAAW,IAAI,EAAE;oBACpC,qBAAqB,EAAE,KAAK,CAAC,qBAAqB,IAAI,eAAe;oBACrE,6BAA6B,EAAE,mBAAmB,CAAC,KAAK,CAAC,6BAA6B,CAAC;oBACvF,qBAAqB,EAAE,KAAK,CAAC,qBAAqB,IAAI,EAAE;oBACxD,kBAAkB,EAAE,CAAC,CAAC,KAAK,CAAC,kBAAkB;oBAC9C,UAAU,EAAE,OAAO,CAAC,KAAK,CAAC,UAAU,CAAC;oBACrC,SAAS,EAAE,KAAK,CAAC,SAAS,IAAI,EAAE;iBACnC,CAAC,CAAC;YACP,CAAC;YACD,OAAO,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC;QAC7C,CAAC;QACD,OAAO,EAAE,UAAU,EAAE,EAAE,EAAE,MAAM,EAAE,SAAS,EAAE,YAAY,EAAE,SAAS,IAAI,+BAA+B,EAAE,CAAC;IAC7G,CAAC;IAED,4DAA4D;IACrD,KAAK,CAAC,YAAY,CACrB,MAAiC,EACjC,OAA0B,EAAE;QAE5B,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,IAAI,CAAC,CAAC,CAAC;QACvD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,UAAU,IAAI,CAAC,CAAC,CAAC;QAErD,MAAM,aAAa,GAAuB,EAAE,CAAC;QAC7C,IAAI,QAAQ,GAAG,CAAC,CAAC;QACjB,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,MAAM,GAAG,CA
AC,CAAC;QACf,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,IAAI,MAAM,GAAG,CAAC,CAAC;QACf,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI,EAAE;YAC3D,OAAO,IAAI,EAAE,CAAC;gBACV,MAAM,GAAG,GAAG,MAAM,EAAE,CAAC;gBACrB,IAAI,GAAG,IAAI,MAAM,CAAC,MAAM;oBAAE,OAAO;gBACjC,MAAM,CAAC,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,UAAU,CAAC,CAAC;gBAC7D,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;gBACxB,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;gBACxB,MAAM,IAAI,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC;gBAE1B,IAAI,CAAC,CAAC,YAAY,EAAE,CAAC;oBACjB,MAAM,EAAE,CAAC;gBACb,CAAC;qBAAM,CAAC;oBACJ,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;wBAC3B,IAAI,CAAC,CAAC,kBAAkB;4BAAE,aAAa,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;;4BAC3C,QAAQ,EAAE,CAAC;oBACpB,CAAC;gBACL,CAAC;gBACD,SAAS,EAAE,CAAC;gBACZ,IAAI,CAAC,UAAU,EAAE,CAAC,SAAS,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;YAChD,CAAC;QACL,CAAC,CAAC,CAAC;QACH,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QAC3B,OAAO,EAAE,UAAU,EAAE,aAAa,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE,CAAC;IAC7F,CAAC;CACJ;AAED,+EAA+E;AAE/E,MAAM,aAAa,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;kCA2JY,CAAC;AAEnC;;;;;;GAMG;AACH,SAAS,mBAAmB,CAAC,IAAwB;IACjD,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAC;IACvB,IAAI,GAAG,GAAG,IAAI,CAAC;IACf,iCAAiC;IACjC,IAAI,yBAAyB,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;QACtC,OAAO,GAAG,CAAC,OAAO,CAAC,0BAA0B,EAAE,eAAe,CAAC,CAAC;IACpE,CAAC;IACD,wFAAwF;IACxF,sCAAsC;IACtC,KAAK,MAAM,KAAK,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC;QAClD,MAAM,EAAE,GAAG,IAAI,MAAM,CAAC,MAAM,KAAK,KAAK,EAAE,GAAG,CAAC,CAAC;QAC7C,IAAI,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YACf,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,EAAE,EAAE,eAAe,CAAC,CAAC;YACvC,MAAM,CAAC,wDAAwD;QACnE,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACf,CAAC;AAqBD,+EAA+E;AAE/E,SAAS,eAAe,CAAC,KAA8B;IACnD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,CAAC,IAAI,CAAC,UAAU,KAAK,CAAC,MAAM,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC;IACpD,I
AAI,KAAK,CAAC,iBAAiB;QAAE,KAAK,CAAC,IAAI,CAAC,mBAAmB,QAAQ,CAAC,KAAK,CAAC,iBAAiB,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;IACrG,IAAI,KAAK,CAAC,gBAAgB;QAAE,KAAK,CAAC,IAAI,CAAC,mBAAmB,QAAQ,CAAC,KAAK,CAAC,gBAAgB,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;IACnG,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,YAAY,KAAK,CAAC,OAAO,CAAC,MAAM,IAAI,CAAC,CAAC;IACjD,KAAK,MAAM,CAAC,IAAI,KAAK,CAAC,OAAO,EAAE,CAAC;QAC5B,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,MAAM,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,gBAAgB,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,QAAQ,CAAC,MAAM,IAAI,CAAC,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACtM,IAAI,CAAC,CAAC,mBAAmB;YAAE,KAAK,CAAC,IAAI,CAAC,sBAAsB,QAAQ,CAAC,CAAC,CAAC,mBAAmB,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;QACpG,IAAI,CAAC,CAAC,YAAY,IAAI,CAAC,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC9C,MAAM,OAAO,GAAG,CAAC,CAAC,YAAY;iBACzB,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC;iBACX,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;iBACnD,IAAI,CAAC,IAAI,CAAC,CAAC;YAChB,KAAK,CAAC,IAAI,CAAC,mBAAmB,OAAO,GAAG,CAAC,CAAC;QAC9C,CAAC;IACL,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,oFAAoF,CAAC,CAAC;IACjG,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC5B,CAAC;AAED,SAAS,OAAO,CAAC,CAAS;IACtB,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC;QAAE,OAAO,CAAC,CAAC;IAClC,IAAI,CAAC,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IACpB,IAAI,CAAC,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IACpB,OAAO,CAAC,CAAC;AACb,CAAC;AAED,SAAS,QAAQ,CAAC,CAAS,EAAE,CAAS;IAClC,IAAI,CAAC,CAAC;QAAE,OAAO,EAAE,CAAC;IAClB,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;AACtD,CAAC"}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Composer — flag-and-emit (no destructive filtering).
|
|
3
|
+
*
|
|
4
|
+
* Computes `isFKRedundant` for each cluster (PR #2193 organic keys are meant
|
|
5
|
+
* to be used "in place of a foreign-key reference" — when every non-PK member
|
|
6
|
+
* is a declared FK pointing to the PK member, the cluster adds no new
|
|
7
|
+
* navigation). Sets the flag on the cluster but does NOT drop — well-modeled
|
|
8
|
+
* OLTP schemas would lose 30-50% of valid organic-key candidates if we
|
|
9
|
+
* dropped, and the discovery value (cross-system extension, naming
|
|
10
|
+
* consistency checks) survives the redundancy.
|
|
11
|
+
*
|
|
12
|
+
* The dashboard surfaces a "hide FK-redundant" filter so users get the lookup-
|
|
13
|
+
* table-PK noise out of view without losing the underlying candidates.
|
|
14
|
+
*/
|
|
15
|
+
import { OrganicKeyCluster, OrganicKeyClusterMember } from '../types/organic-keys.js';
|
|
16
|
+
import { DetectedOrganicKeysOutput } from './OrganicKeyTranslator.js';
|
|
17
|
+
import { TransitiveBridgeFinding } from './TransitiveBridgeDetector.js';
|
|
18
|
+
/** Output of the compose step: the PR #2193 JSON, the FK-redundancy-annotated clusters, and emit counts. */
|
|
19
|
+
export interface ComposerResult {
|
|
20
|
+
output: DetectedOrganicKeysOutput;
|
|
21
|
+
/** Clusters with isFKRedundant filled in — callers that persist the cluster list
|
|
22
|
+
* (e.g. detector → state.json → dashboard) should use THIS, not the pre-compose
|
|
23
|
+
* input, otherwise the flag is silently lost. */
|
|
24
|
+
annotatedClusters: OrganicKeyCluster[];
|
|
25
|
+
/** Number of clusters emitted — equals the input cluster count, because compose flags clusters rather than dropping them. */
emitted: number;
|
|
26
|
+
/** How many of the emitted clusters were flagged `isFKRedundant`. */
flaggedFKRedundant: number;
|
|
27
|
+
/** Entry counts for `output`, as reported by `countOutputEntries` from OrganicKeyTranslator. */
summary: {
|
|
28
|
+
outputSchemas: number;
|
|
29
|
+
outputTables: number;
|
|
30
|
+
outputKeys: number;
|
|
31
|
+
outputSpokes: number;
|
|
32
|
+
};
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Compose detected clusters + transitive bridges into the PR #2193 emit JSON.
|
|
36
|
+
*
|
|
37
|
+
* Each cluster is annotated with `isFKRedundant` (true when it's already navigable via a
|
|
38
|
+
* declared foreign key — kept but flagged, not dropped). Matching transitive bridges are
|
|
39
|
+
* attached as spokes. Returns the JSON plus the annotated clusters and emit counts.
|
|
40
|
+
*/
|
|
41
|
+
export declare function compose(clusters: OrganicKeyCluster[], bridges: TransitiveBridgeFinding[]): ComposerResult;
|
|
42
|
+
/**
|
|
43
|
+
* A cluster is FK-redundant when ALL non-PK members are declared FKs pointing
|
|
44
|
+
* at the same target column (typically the PK member of the cluster). PR #2193
|
|
45
|
+
* organic keys are "used in place of a foreign-key reference" — if the FK is
|
|
46
|
+
* already declared, the cluster doesn't add navigability.
|
|
47
|
+
*
|
|
48
|
+
* Requires at least one PK and at least one FK in the cluster to apply.
|
|
49
|
+
* Returns false for clusters that are entirely PKs, entirely non-FKs, or that
|
|
50
|
+
* have mixed FK targets (the latter is a genuine value-based correlation that
|
|
51
|
+
* no single FK covers).
|
|
52
|
+
*/
|
|
53
|
+
declare function isFKRedundant(cluster: OrganicKeyCluster): boolean;
|
|
54
|
+
/** Exposes the module-private `isFKRedundant` helper for tests / observability. */
|
|
55
|
+
export declare const __test__: {
|
|
56
|
+
isFKRedundant: typeof isFKRedundant;
|
|
57
|
+
};
|
|
58
|
+
export type { OrganicKeyClusterMember };
|
|
59
|
+
//# sourceMappingURL=Composer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"Composer.d.ts","sourceRoot":"","sources":["../../src/discovery/Composer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,iBAAiB,EAAE,uBAAuB,EAAE,MAAM,0BAA0B,CAAC;AACtF,OAAO,EACH,yBAAyB,EAI5B,MAAM,2BAA2B,CAAC;AACnC,OAAO,EAAE,uBAAuB,EAAE,MAAM,+BAA+B,CAAC;AAExE,4GAA4G;AAC5G,MAAM,WAAW,cAAc;IAC3B,MAAM,EAAE,yBAAyB,CAAC;IAClC;;sDAEkD;IAClD,iBAAiB,EAAE,iBAAiB,EAAE,CAAC;IACvC,OAAO,EAAE,MAAM,CAAC;IAChB,kBAAkB,EAAE,MAAM,CAAC;IAC3B,OAAO,EAAE;QAAE,aAAa,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAA;KAAE,CAAC;CACtG;AAED;;;;;;GAMG;AACH,wBAAgB,OAAO,CACnB,QAAQ,EAAE,iBAAiB,EAAE,EAC7B,OAAO,EAAE,uBAAuB,EAAE,GACnC,cAAc,CA0ChB;AAED;;;;;;;;;;GAUG;AACH,iBAAS,aAAa,CAAC,OAAO,EAAE,iBAAiB,GAAG,OAAO,CAiB1D;AAED,2CAA2C;AAC3C,eAAO,MAAM,QAAQ;;CAAoB,CAAC;AAC1C,YAAY,EAAE,uBAAuB,EAAE,CAAC"}
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Composer — flag-and-emit (no destructive filtering).
|
|
3
|
+
*
|
|
4
|
+
* Computes `isFKRedundant` for each cluster (PR #2193 organic keys are meant
|
|
5
|
+
* to be used "in place of a foreign-key reference" — when every non-PK member
|
|
6
|
+
* is a declared FK pointing to the PK member, the cluster adds no new
|
|
7
|
+
* navigation). Sets the flag on the cluster but does NOT drop — well-modeled
|
|
8
|
+
* OLTP schemas would lose 30-50% of valid organic-key candidates if we
|
|
9
|
+
* dropped, and the discovery value (cross-system extension, naming
|
|
10
|
+
* consistency checks) survives the redundancy.
|
|
11
|
+
*
|
|
12
|
+
* The dashboard surfaces a "hide FK-redundant" filter so users get the lookup-
|
|
13
|
+
* table-PK noise out of view without losing the underlying candidates.
|
|
14
|
+
*/
|
|
15
|
+
import { translateClusters, countOutputEntries, } from './OrganicKeyTranslator.js';
|
|
16
|
+
/**
 * Builds the PR #2193 emit JSON from detected clusters and transitive bridges.
 *
 * Every input cluster is shallow-copied with an `isFKRedundant` flag attached; no
 * cluster is removed. A bridge is turned into a spoke only when its hub schema,
 * hub table and FIRST hub key field together match a member column of some
 * cluster (exact, case-sensitive string comparison).
 *
 * @param clusters Detected organic-key clusters (not mutated).
 * @param bridges Transitive bridge findings that may be attached as spokes.
 * @returns The translated output, the annotated cluster copies, and emit counts.
 */
export function compose(clusters, bridges) {
    // Annotate first: callers persist these copies so the flag is not lost.
    const annotatedClusters = clusters.map((cluster) => {
        const redundant = isFKRedundant(cluster);
        return { ...cluster, isFKRedundant: redundant };
    });
    const redundantTotal = annotatedClusters.reduce((total, cluster) => (cluster.isFKRedundant ? total + 1 : total), 0);

    // Every member column of every cluster, keyed as "schema.table.column".
    const memberColumns = new Set();
    annotatedClusters.forEach((cluster) => {
        for (const member of cluster.members) {
            memberColumns.add(`${member.schema}.${member.table}.${member.column}`);
        }
    });

    // A bridge qualifies when its hub column is one of the cluster member columns.
    const touchesCluster = (bridge) => {
        const hubColumn = `${bridge.hubSchema}.${bridge.hubTable}.${bridge.hubKeyFields[0]}`;
        return memberColumns.has(hubColumn);
    };
    // Reshape a bridge finding into the spoke record handed to the translator.
    const toSpoke = (bridge) => {
        const spoke = {
            hubSchema: bridge.hubSchema,
            hubTable: bridge.hubTable,
            hubKeyFields: bridge.hubKeyFields,
            spokeSchema: bridge.spokeSchema,
            spokeTable: bridge.spokeTable,
            transitiveView: {
                Name: bridge.view.viewName,
                SchemaName: bridge.view.schemaName,
                SQL: bridge.view.sql,
            },
            transitiveMatchFieldNames: [bridge.view.hubKeyField],
            transitiveOutputFieldName: bridge.view.spokeOutputField,
            relatedEntityJoinFieldName: bridge.view.spokeJoinField,
            hubConcept: bridge.hubConcept,
        };
        return spoke;
    };
    const spokes = bridges.filter(touchesCluster).map(toSpoke);

    const output = translateClusters(annotatedClusters, spokes);
    const { schemas, tables, keys, spokes: spokeCount } = countOutputEntries(output);
    const summary = {
        outputSchemas: schemas,
        outputTables: tables,
        outputKeys: keys,
        outputSpokes: spokeCount,
    };
    return {
        output,
        annotatedClusters,
        emitted: annotatedClusters.length,
        flaggedFKRedundant: redundantTotal,
        summary,
    };
}
|
|
65
|
+
/**
|
|
66
|
+
* A cluster is FK-redundant when ALL non-PK members are declared FKs pointing
|
|
67
|
+
* at the same target column (typically the PK member of the cluster). PR #2193
|
|
68
|
+
* organic keys are "used in place of a foreign-key reference" — if the FK is
|
|
69
|
+
* already declared, the cluster doesn't add navigability.
|
|
70
|
+
*
|
|
71
|
+
* Requires at least one PK and at least one FK in the cluster to apply.
|
|
72
|
+
* Returns false for clusters that are entirely PKs, entirely non-FKs, or that
|
|
73
|
+
* have mixed FK targets (the latter is a genuine value-based correlation that
|
|
74
|
+
* no single FK covers).
|
|
75
|
+
*/
|
|
76
|
+
function isFKRedundant(cluster) {
|
|
77
|
+
const pkMembers = cluster.members.filter((m) => m.isPrimaryKey);
|
|
78
|
+
const nonPK = cluster.members.filter((m) => !m.isPrimaryKey);
|
|
79
|
+
if (pkMembers.length === 0 || nonPK.length === 0)
|
|
80
|
+
return false;
|
|
81
|
+
// Build the set of plausible "target" identifiers from the PK members.
|
|
82
|
+
const pkTargets = new Set(pkMembers.map((m) => `${m.schema}.${m.table}.${m.column}`.toLowerCase()));
|
|
83
|
+
// Every non-PK member must be a FK pointing into one of the PK targets.
|
|
84
|
+
for (const m of nonPK) {
|
|
85
|
+
if (!m.participatesInFK || !m.fkTarget)
|
|
86
|
+
return false;
|
|
87
|
+
const key = `${m.fkTarget.schema}.${m.fkTarget.table}.${m.fkTarget.column}`.toLowerCase();
|
|
88
|
+
if (!pkTargets.has(key))
|
|
89
|
+
return false;
|
|
90
|
+
}
|
|
91
|
+
return true;
|
|
92
|
+
}
|
|
93
|
+
/** Exposes the module-private `isFKRedundant` helper for tests / observability. */
|
|
94
|
+
export const __test__ = { isFKRedundant };
|
|
95
|
+
//# sourceMappingURL=Composer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"Composer.js","sourceRoot":"","sources":["../../src/discovery/Composer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAGH,OAAO,EAGH,iBAAiB,EACjB,kBAAkB,GACrB,MAAM,2BAA2B,CAAC;AAenC;;;;;;GAMG;AACH,MAAM,UAAU,OAAO,CACnB,QAA6B,EAC7B,OAAkC;IAElC,IAAI,YAAY,GAAG,CAAC,CAAC;IACrB,MAAM,SAAS,GAAwB,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;QACtD,MAAM,SAAS,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;QACnC,IAAI,SAAS;YAAE,YAAY,IAAI,CAAC,CAAC;QACjC,OAAO,EAAE,GAAG,CAAC,EAAE,aAAa,EAAE,SAAS,EAAE,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAClC,KAAK,MAAM,CAAC,IAAI,SAAS,EAAE,CAAC;QACxB,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,OAAO;YAAE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;IACjF,CAAC;IACD,MAAM,MAAM,GAA2B,OAAO;SACzC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,SAAS,IAAI,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;SAC/E,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACT,SAAS,EAAE,CAAC,CAAC,SAAS;QACtB,QAAQ,EAAE,CAAC,CAAC,QAAQ;QACpB,YAAY,EAAE,CAAC,CAAC,YAAY;QAC5B,WAAW,EAAE,CAAC,CAAC,WAAW;QAC1B,UAAU,EAAE,CAAC,CAAC,UAAU;QACxB,cAAc,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,QAAQ,EAAE,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,UAAU,EAAE,GAAG,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE;QACzF,yBAAyB,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC;QAC/C,yBAAyB,EAAE,CAAC,CAAC,IAAI,CAAC,gBAAgB;QAClD,0BAA0B,EAAE,CAAC,CAAC,IAAI,CAAC,cAAc;QACjD,UAAU,EAAE,CAAC,CAAC,UAAU;KAC3B,CAAC,CAAC,CAAC;IAER,MAAM,MAAM,GAAG,iBAAiB,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IACpD,MAAM,MAAM,GAAG,kBAAkB,CAAC,MAAM,CAAC,CAAC;IAE1C,OAAO;QACH,MAAM;QACN,iBAAiB,EAAE,SAAS;QAC5B,OAAO,EAAE,SAAS,CAAC,MAAM;QACzB,kBAAkB,EAAE,YAAY;QAChC,OAAO,EAAE;YACL,aAAa,EAAE,MAAM,CAAC,OAAO;YAC7B,YAAY,EAAE,MAAM,CAAC,MAAM;YAC3B,UAAU,EAAE,MAAM,CAAC,IAAI;YACvB,YAAY,EAAE,MAAM,CAAC,MAAM;SAC9B;KACJ,CAAC;AACN,CAAC;AAED;;;;;;;;;;GAUG;AACH,SAAS,aAAa,CAAC,OAA0B;IAC7C,MAAM,SAAS,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC;IAChE,MAAM,KAAK,GAAG,OAAO,C
AAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC;IAC7D,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IAE/D,uEAAuE;IACvE,MAAM,SAAS,GAAG,IAAI,GAAG,CACrB,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,MAAM,EAAE,CAAC,WAAW,EAAE,CAAC,CAC3E,CAAC;IAEF,wEAAwE;IACxE,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACpB,IAAI,CAAC,CAAC,CAAC,gBAAgB,IAAI,CAAC,CAAC,CAAC,QAAQ;YAAE,OAAO,KAAK,CAAC;QACrD,MAAM,GAAG,GAAG,GAAG,CAAC,CAAC,QAAQ,CAAC,MAAM,IAAI,CAAC,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC,WAAW,EAAE,CAAC;QAC1F,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,OAAO,KAAK,CAAC;IAC1C,CAAC;IACD,OAAO,IAAI,CAAC;AAChB,CAAC;AAED,2CAA2C;AAC3C,MAAM,CAAC,MAAM,QAAQ,GAAG,EAAE,aAAa,EAAE,CAAC"}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EmbeddingProvider — thin wrapper over MemberJunction's `BaseEmbeddings` infrastructure.
|
|
3
|
+
*
|
|
4
|
+
* Embeddings are produced through the same MJ ClassFactory + driver pattern that
|
|
5
|
+
* `llm-factory` uses for LLMs (so DBAutoDoc stays coupled to MJ's AI stack rather
|
|
6
|
+
* than talking to provider REST endpoints directly). The concrete driver class is
|
|
7
|
+
* resolved from the provider name and instantiated with the supplied API key.
|
|
8
|
+
*
|
|
9
|
+
* Vectors are unit-normalized so the clustering step can use cosine distance
|
|
10
|
+
* directly regardless of whether the underlying model returns normalized output.
|
|
11
|
+
*/
|
|
12
|
+
/** Provider names that map to a registered `BaseEmbeddings` driver class. */
|
|
13
|
+
export type EmbeddingProviderName = 'openai' | 'mistral' | 'azure' | 'bedrock' | 'ollama' | 'local';
|
|
14
|
+
/**
 * Settings accepted by `createEmbeddingProvider`.
 *
 * Per the module header, `provider` is the name the driver class is resolved from
 * and `apiKey` is the key that driver is instantiated with.
 * NOTE(review): the meaning and defaults of the optional `model`, `dimensions`,
 * `batchSize` and `endpoint` fields are not visible in this declaration file —
 * confirm against the implementation before relying on them.
 */
export interface EmbeddingProviderConfig {
|
|
15
|
+
provider: EmbeddingProviderName;
|
|
16
|
+
apiKey: string;
|
|
17
|
+
model?: string;
|
|
18
|
+
dimensions?: number;
|
|
19
|
+
batchSize?: number;
|
|
20
|
+
endpoint?: string;
|
|
21
|
+
}
|
|
22
|
+
/**
 * Embedding handle returned by `createEmbeddingProvider`.
 *
 * `embed` resolves to an array of `Float32Array` vectors; the module header states
 * that vectors are unit-normalized.
 * NOTE(review): presumably one vector per input text, in input order — confirm
 * against the implementation.
 */
export interface EmbeddingProvider {
|
|
23
|
+
readonly provider: EmbeddingProviderName;
|
|
24
|
+
embed(texts: string[]): Promise<Float32Array[]>;
|
|
25
|
+
}
|
|
26
|
+
/**
 * Creates an `EmbeddingProvider` from the given configuration.
 *
 * Per the module header, the concrete driver class is resolved from the provider
 * name and instantiated with the supplied API key.
 * NOTE(review): the behavior when no driver is registered for `config.provider`
 * is not visible in this declaration file — confirm whether it throws.
 *
 * @param config Provider name, API key and optional tuning fields.
 * @returns The provider wrapper exposing `embed`.
 */
export declare function createEmbeddingProvider(config: EmbeddingProviderConfig): EmbeddingProvider;
|
|
27
|
+
//# sourceMappingURL=EmbeddingProvider.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"EmbeddingProvider.d.ts","sourceRoot":"","sources":["../../src/discovery/EmbeddingProvider.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAKH,6EAA6E;AAC7E,MAAM,MAAM,qBAAqB,GAAG,QAAQ,GAAG,SAAS,GAAG,OAAO,GAAG,SAAS,GAAG,QAAQ,GAAG,OAAO,CAAC;AAEpG,MAAM,WAAW,uBAAuB;IACpC,QAAQ,EAAE,qBAAqB,CAAC;IAChC,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,MAAM,WAAW,iBAAiB;IAC9B,QAAQ,CAAC,QAAQ,EAAE,qBAAqB,CAAC;IACzC,KAAK,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;CACnD;AAyBD,wBAAgB,uBAAuB,CAAC,MAAM,EAAE,uBAAuB,GAAG,iBAAiB,CAO1F"}
|