@berthojoris/mcp-mysql-server 1.15.0 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -0
- package/DOCUMENTATIONS.md +292 -5
- package/README.md +14 -13
- package/dist/config/featureConfig.d.ts +2 -1
- package/dist/config/featureConfig.js +20 -0
- package/dist/index.d.ts +225 -0
- package/dist/index.js +60 -0
- package/dist/mcp-server.js +273 -0
- package/dist/tools/documentationGeneratorTools.d.ts +145 -0
- package/dist/tools/documentationGeneratorTools.js +820 -0
- package/dist/tools/intelligentQueryTools.d.ts +94 -0
- package/dist/tools/intelligentQueryTools.js +713 -0
- package/dist/tools/smartDiscoveryTools.d.ts +163 -0
- package/dist/tools/smartDiscoveryTools.js +750 -0
- package/package.json +1 -1
|
@@ -0,0 +1,750 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Interop helper: reuses an existing `this.__importDefault` when one is set;
// otherwise returns `mod` unchanged if it carries a truthy `__esModule` flag,
// and wraps any other value as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
// Flag this module's exports object with `__esModule`.
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
// Placeholder; the class is assigned to this export at the end of the file.
exports.SmartDiscoveryTools = void 0;
|
|
7
|
+
// `connection_1.default.getInstance()` is called by the class constructor.
const connection_1 = __importDefault(require("../db/connection"));
|
|
8
|
+
// `config_1.dbConfig.database` is read by `validateDatabaseAccess`.
const config_1 = require("../config/config");
|
|
9
|
+
/**
 * Smart Data Discovery Agent.
 *
 * Scores table and column names against a search term, looks for the term in
 * stored text values, and reports explicit foreign keys as well as
 * relationships implied by the `<table>_id` naming convention.
 *
 * The three public async methods (`smartSearch`, `findSimilarColumns`,
 * `discoverDataPatterns`) never throw: they resolve to
 * `{ status: "success", data }` or `{ status: "error", error }`.
 */
class SmartDiscoveryTools {
    /**
     * @param {object} security - Validator object. `discoverDataPatterns` calls
     *   `security.validateIdentifier(name)` and reads `.valid` on the result.
     */
    constructor(security) {
        this.db = connection_1.default.getInstance();
        this.security = security;
    }
    /**
     * Validate database access - only the connected database may be used.
     *
     * @param {string} [requestedDatabase] - Database named by the caller, if any.
     * @returns {{valid: boolean, database: string, error?: string}} On success
     *   `database` is the configured database; otherwise it is "" and `error`
     *   explains why access was refused.
     */
    validateDatabaseAccess(requestedDatabase) {
        const connectedDatabase = config_1.dbConfig.database;
        if (!connectedDatabase) {
            return {
                valid: false,
                database: "",
                error: "No database specified in connection string. Cannot access any database.",
            };
        }
        // An omitted database means "use the connected one".
        if (requestedDatabase && requestedDatabase !== connectedDatabase) {
            return {
                valid: false,
                database: "",
                error: `Access denied. You can only access the connected database '${connectedDatabase}'. Requested database '${requestedDatabase}' is not allowed.`,
            };
        }
        return {
            valid: true,
            database: connectedDatabase,
        };
    }
    /**
     * Quote an identifier for interpolation into SQL text.
     *
     * The name is wrapped in backticks and any backtick inside it is doubled,
     * so names read back from INFORMATION_SCHEMA cannot terminate the quoting.
     *
     * @param {string} name - Raw database, table or column name.
     * @returns {string} The backtick-quoted identifier.
     */
    quoteIdentifier(name) {
        return "`" + String(name).replace(/`/g, "``") + "`";
    }
    /**
     * Escape LIKE wildcards so user text is matched literally.
     *
     * Uses `!` as the escape character; the query must therefore say
     * `LIKE ? ESCAPE '!'`. `!` is used instead of a backslash so the pattern
     * does not depend on how the server treats backslashes in strings.
     *
     * @param {string} text - Text to embed in a LIKE pattern.
     * @returns {string} `text` with `!`, `%` and `_` each prefixed by `!`.
     */
    escapeLikePattern(text) {
        return text.replace(/[!%_]/g, "!$&");
    }
    /**
     * Smart search across database objects (tables, columns, data patterns,
     * relationships).
     *
     * @param {object} params
     * @param {string} params.search_term - Required, non-blank text to look for.
     * @param {string} [params.search_type="all"] - "all", "table", "column",
     *   "data_pattern" or "relationship".
     * @param {number} [params.similarity_threshold=0.3] - Minimum name score.
     * @param {boolean} [params.include_sample_data=false] - Attach up to five
     *   distinct non-NULL values to every returned column match.
     * @param {number} [params.max_results=20] - Cap applied to each result list.
     * @param {string} [params.database] - Must equal the connected database.
     * @returns {Promise<object>} `{ status: "success", data: { search_term,
     *   search_type, results, total_matches, search_time_ms } }` or
     *   `{ status: "error", error }`.
     */
    async smartSearch(params) {
        const startTime = Date.now();
        try {
            const dbValidation = this.validateDatabaseAccess(params?.database);
            if (!dbValidation.valid) {
                return { status: "error", error: dbValidation.error };
            }
            // Default to {} so a missing argument yields the "search_term is
            // required" error instead of a destructuring TypeError message.
            const { search_term, search_type = "all", similarity_threshold = 0.3, include_sample_data = false, max_results = 20, } = params ?? {};
            const database = dbValidation.database;
            if (!search_term?.trim()) {
                return { status: "error", error: "search_term is required" };
            }
            const searchTermLower = search_term.toLowerCase().trim();
            const searchTokens = this.tokenize(searchTermLower);
            const wants = (type) => search_type === "all" || search_type === type;
            const scoreName = (name) => this.calculateRelevanceScore(searchTokens, name.toLowerCase(), searchTermLower);
            const roundScore = (score) => Math.round(score * 100) / 100;
            const databaseRef = this.quoteIdentifier(database);
            // Schema metadata: tables, columns and declared foreign keys.
            const tablesResult = await this.db.query(`SELECT TABLE_NAME, TABLE_ROWS
         FROM INFORMATION_SCHEMA.TABLES
         WHERE TABLE_SCHEMA = ? AND TABLE_TYPE = 'BASE TABLE'`, [database]);
            const columnsResult = await this.db.query(`SELECT TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_KEY
         FROM INFORMATION_SCHEMA.COLUMNS
         WHERE TABLE_SCHEMA = ?
         ORDER BY TABLE_NAME, ORDINAL_POSITION`, [database]);
            const fkResult = await this.db.query(`SELECT TABLE_NAME, COLUMN_NAME, REFERENCED_TABLE_NAME, REFERENCED_COLUMN_NAME
         FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
         WHERE TABLE_SCHEMA = ? AND REFERENCED_TABLE_NAME IS NOT NULL`, [database]);
            const results = {
                tables: [],
                columns: [],
                data_patterns: [],
                relationships: [],
            };
            // Search tables
            if (wants("table")) {
                for (const table of tablesResult) {
                    const tableName = table.TABLE_NAME.toLowerCase();
                    const score = scoreName(table.TABLE_NAME);
                    if (score >= similarity_threshold) {
                        const tableColumns = columnsResult.filter((c) => c.TABLE_NAME === table.TABLE_NAME);
                        const matchingCols = tableColumns
                            .filter((c) => scoreName(c.COLUMN_NAME) >= similarity_threshold)
                            .map((c) => c.COLUMN_NAME);
                        results.tables.push({
                            name: table.TABLE_NAME,
                            relevance_score: roundScore(score),
                            match_reason: this.getMatchReason(searchTokens, tableName),
                            column_count: tableColumns.length,
                            row_estimate: parseInt(table.TABLE_ROWS || "0", 10) || 0,
                            matching_columns: matchingCols.length > 0 ? matchingCols : undefined,
                        });
                    }
                }
            }
            // Search columns (sample values are attached later, after the
            // max_results cut, so no query is spent on discarded matches).
            if (wants("column")) {
                for (const column of columnsResult) {
                    const colName = column.COLUMN_NAME.toLowerCase();
                    const score = scoreName(column.COLUMN_NAME);
                    if (score >= similarity_threshold) {
                        results.columns.push({
                            table_name: column.TABLE_NAME,
                            column_name: column.COLUMN_NAME,
                            data_type: column.DATA_TYPE,
                            relevance_score: roundScore(score),
                            match_reason: this.getMatchReason(searchTokens, colName),
                        });
                    }
                }
            }
            // Search for data patterns (look for the search term in actual data)
            if (wants("data_pattern")) {
                const textTypes = ["varchar", "text", "char", "longtext", "mediumtext", "tinytext", "enum", "set"];
                // Only the first 20 text-like columns are probed, to bound the
                // number of queries issued.
                const columnsToSearch = columnsResult
                    .filter((c) => textTypes.includes(c.DATA_TYPE.toLowerCase()))
                    .slice(0, 20);
                // Wildcards in the user's term are escaped so it matches literally.
                const likePattern = `%${this.escapeLikePattern(searchTermLower)}%`;
                for (const column of columnsToSearch) {
                    try {
                        const columnRef = this.quoteIdentifier(column.COLUMN_NAME);
                        const tableRef = this.quoteIdentifier(column.TABLE_NAME);
                        const patternQuery = `
              SELECT DISTINCT ${columnRef}
              FROM ${databaseRef}.${tableRef}
              WHERE LOWER(${columnRef}) LIKE ? ESCAPE '!'
              LIMIT 5
            `;
                        const matches = await this.db.query(patternQuery, [likePattern]);
                        if (matches.length > 0) {
                            results.data_patterns.push({
                                table_name: column.TABLE_NAME,
                                column_name: column.COLUMN_NAME,
                                pattern_type: "CONTAINS",
                                description: `Found ${matches.length}+ values containing "${search_term}"`,
                                sample_matches: matches.map((m) => m[column.COLUMN_NAME]),
                            });
                        }
                    }
                    catch {
                        // Best effort: a column that cannot be queried is skipped.
                    }
                }
            }
            // Search relationships
            if (wants("relationship")) {
                // Explicit foreign keys: relevant if either table or the column matches.
                for (const fk of fkResult) {
                    const score = Math.max(scoreName(fk.TABLE_NAME), scoreName(fk.REFERENCED_TABLE_NAME), scoreName(fk.COLUMN_NAME));
                    if (score >= similarity_threshold) {
                        results.relationships.push({
                            from_table: fk.TABLE_NAME,
                            from_column: fk.COLUMN_NAME,
                            to_table: fk.REFERENCED_TABLE_NAME,
                            to_column: fk.REFERENCED_COLUMN_NAME,
                            relationship_type: "FOREIGN_KEY",
                            confidence: 1.0,
                        });
                    }
                }
                // Implicit relationships (naming conventions)
                const implicitRels = this.discoverImplicitRelationships(tablesResult, columnsResult, searchTokens, similarity_threshold);
                results.relationships.push(...implicitRels);
            }
            // Sort name matches by score, then cap every list.
            results.tables.sort((a, b) => b.relevance_score - a.relevance_score);
            results.columns.sort((a, b) => b.relevance_score - a.relevance_score);
            results.tables = results.tables.slice(0, max_results);
            results.columns = results.columns.slice(0, max_results);
            results.data_patterns = results.data_patterns.slice(0, max_results);
            results.relationships = results.relationships.slice(0, max_results);
            // Attach sample values to the column matches that were kept.
            if (include_sample_data) {
                for (const colResult of results.columns) {
                    try {
                        const columnRef = this.quoteIdentifier(colResult.column_name);
                        const tableRef = this.quoteIdentifier(colResult.table_name);
                        const samples = await this.db.query(`SELECT DISTINCT ${columnRef}
                   FROM ${databaseRef}.${tableRef}
                   WHERE ${columnRef} IS NOT NULL
                   LIMIT 5`);
                        colResult.sample_values = samples.map((s) => s[colResult.column_name]);
                    }
                    catch {
                        // Best effort: the match is still reported without samples.
                    }
                }
            }
            const totalMatches = results.tables.length +
                results.columns.length +
                results.data_patterns.length +
                results.relationships.length;
            return {
                status: "success",
                data: {
                    search_term,
                    search_type,
                    results,
                    total_matches: totalMatches,
                    search_time_ms: Date.now() - startTime,
                },
            };
        }
        catch (error) {
            return {
                status: "error",
                error: error.message,
            };
        }
    }
    /**
     * Find similar columns across tables (potential join candidates).
     *
     * With both `column_name` and `table_name`, every other column is scored
     * against that reference column (70% name similarity, plus 0.3 when the
     * data types are equal). Otherwise the whole schema is scanned for columns
     * sharing a normalized name and for `<table>_id` columns pointing at a
     * table that has a primary key.
     *
     * @param {object} params
     * @param {string} [params.column_name] - Reference column.
     * @param {string} [params.table_name] - Table holding the reference column.
     * @param {boolean} [params.include_data_comparison=false] - Also compute
     *   value overlap for candidates whose data type equals the reference's.
     * @param {number} [params.max_results=20] - Cap applied to each list.
     * @param {string} [params.database] - Must equal the connected database.
     * @returns {Promise<object>} `{ status: "success", data: { reference_column,
     *   similar_columns, potential_joins } }` or `{ status: "error", error }`.
     */
    async findSimilarColumns(params) {
        try {
            const dbValidation = this.validateDatabaseAccess(params?.database);
            if (!dbValidation.valid) {
                return { status: "error", error: dbValidation.error };
            }
            const { column_name, table_name, include_data_comparison = false, max_results = 20, } = params ?? {};
            const database = dbValidation.database;
            const allColumns = await this.db.query(`SELECT TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_KEY
         FROM INFORMATION_SCHEMA.COLUMNS
         WHERE TABLE_SCHEMA = ?
         ORDER BY TABLE_NAME, ORDINAL_POSITION`, [database]);
            const similarColumns = [];
            const potentialJoins = [];
            let referenceColumn;
            if (column_name && table_name) {
                // Locate the reference column (case-insensitive on both names).
                const wantedTable = table_name.toLowerCase();
                const wantedColumn = column_name.toLowerCase();
                const refCol = allColumns.find((c) => c.TABLE_NAME.toLowerCase() === wantedTable &&
                    c.COLUMN_NAME.toLowerCase() === wantedColumn);
                if (!refCol) {
                    return {
                        status: "error",
                        error: `Column '${column_name}' not found in table '${table_name}'`,
                    };
                }
                referenceColumn = {
                    table: refCol.TABLE_NAME,
                    column: refCol.COLUMN_NAME,
                    data_type: refCol.DATA_TYPE,
                };
                const refNameLower = refCol.COLUMN_NAME.toLowerCase();
                for (const col of allColumns) {
                    // Skip the reference column itself.
                    if (col.TABLE_NAME === refCol.TABLE_NAME &&
                        col.COLUMN_NAME === refCol.COLUMN_NAME) {
                        continue;
                    }
                    const sameType = col.DATA_TYPE === refCol.DATA_TYPE;
                    const nameSimilarity = this.calculateNameSimilarity(refNameLower, col.COLUMN_NAME.toLowerCase());
                    const totalScore = nameSimilarity * 0.7 + (sameType ? 0.3 : 0);
                    if (totalScore >= 0.3) {
                        const simCol = {
                            table_name: col.TABLE_NAME,
                            column_name: col.COLUMN_NAME,
                            data_type: col.DATA_TYPE,
                            similarity_score: Math.round(totalScore * 100) / 100,
                            similarity_type: this.getSimilarityType(refCol.COLUMN_NAME, col.COLUMN_NAME),
                        };
                        // Compare data if requested (only meaningful for equal types).
                        if (include_data_comparison && sameType) {
                            try {
                                simCol.data_overlap_percentage = await this.calculateDataOverlap(database, refCol.TABLE_NAME, refCol.COLUMN_NAME, col.TABLE_NAME, col.COLUMN_NAME);
                            }
                            catch {
                                // Best effort: the candidate is kept without overlap data.
                            }
                        }
                        similarColumns.push(simCol);
                        // Promote to a join candidate on a high score or high overlap.
                        const overlap = simCol.data_overlap_percentage || 0;
                        if (totalScore >= 0.6 || overlap > 50) {
                            potentialJoins.push({
                                table1: refCol.TABLE_NAME,
                                column1: refCol.COLUMN_NAME,
                                table2: col.TABLE_NAME,
                                column2: col.COLUMN_NAME,
                                confidence: Math.round(Math.max(totalScore, overlap / 100) * 100) / 100,
                                reason: overlap > 50
                                    ? "High data overlap"
                                    : "Similar column names",
                            });
                        }
                    }
                }
            }
            else {
                // Group columns by normalized name; groups spanning several
                // tables with equal data types are join candidates.
                const columnGroups = new Map();
                for (const col of allColumns) {
                    const normalized = this.normalizeColumnName(col.COLUMN_NAME);
                    if (!columnGroups.has(normalized)) {
                        columnGroups.set(normalized, []);
                    }
                    columnGroups.get(normalized).push(col);
                }
                for (const columns of columnGroups.values()) {
                    if (columns.length > 1) {
                        // Generate every unordered pair within the group.
                        for (let i = 0; i < columns.length; i++) {
                            for (let j = i + 1; j < columns.length; j++) {
                                const col1 = columns[i];
                                const col2 = columns[j];
                                if (col1.TABLE_NAME !== col2.TABLE_NAME &&
                                    col1.DATA_TYPE === col2.DATA_TYPE) {
                                    const identical = col1.COLUMN_NAME === col2.COLUMN_NAME;
                                    potentialJoins.push({
                                        table1: col1.TABLE_NAME,
                                        column1: col1.COLUMN_NAME,
                                        table2: col2.TABLE_NAME,
                                        column2: col2.COLUMN_NAME,
                                        confidence: identical ? 0.9 : 0.7,
                                        reason: identical
                                            ? "Identical column names"
                                            : `Similar names: ${col1.COLUMN_NAME} ~ ${col2.COLUMN_NAME}`,
                                    });
                                }
                            }
                        }
                    }
                }
                // Also look for id/foreign key patterns: `<table>_id` columns
                // whose prefix names a table with a primary key. The first
                // primary-key column per table (case-insensitive name) is
                // indexed once instead of being searched for per column.
                const firstPrimaryKeyByTable = new Map();
                for (const col of allColumns) {
                    const tableKey = col.TABLE_NAME.toLowerCase();
                    if (col.COLUMN_KEY === "PRI" && !firstPrimaryKeyByTable.has(tableKey)) {
                        firstPrimaryKeyByTable.set(tableKey, col);
                    }
                }
                for (const col of allColumns) {
                    const colLower = col.COLUMN_NAME.toLowerCase();
                    if (colLower.endsWith("_id")) {
                        const matchingTable = firstPrimaryKeyByTable.get(colLower.slice(0, -3));
                        if (matchingTable && matchingTable.TABLE_NAME !== col.TABLE_NAME) {
                            potentialJoins.push({
                                table1: col.TABLE_NAME,
                                column1: col.COLUMN_NAME,
                                table2: matchingTable.TABLE_NAME,
                                column2: matchingTable.COLUMN_NAME,
                                confidence: 0.85,
                                reason: "Foreign key naming convention (_id suffix)",
                            });
                        }
                    }
                }
            }
            // Sort by similarity/confidence, best first.
            similarColumns.sort((a, b) => b.similarity_score - a.similarity_score);
            potentialJoins.sort((a, b) => b.confidence - a.confidence);
            return {
                status: "success",
                data: {
                    reference_column: referenceColumn,
                    similar_columns: similarColumns.slice(0, max_results),
                    potential_joins: potentialJoins.slice(0, max_results),
                },
            };
        }
        catch (error) {
            return {
                status: "error",
                error: error.message,
            };
        }
    }
    /**
     * Discover data patterns in one table's columns.
     *
     * Starting from a quality score of 100, each analysed column is checked for
     * the requested pattern types; HIGH_NULL_RATE costs 5 points,
     * NULLABLE_MISMATCH 10 and DUPLICATE_IN_UNIQUE 20. The reported score is
     * floored at 0.
     *
     * @param {object} params
     * @param {string} params.table_name - Table to analyse; must pass
     *   `security.validateIdentifier`.
     * @param {string[]} [params.pattern_types] - Any of "unique", "null",
     *   "duplicate", "format", "range" (default: all five).
     * @param {number} [params.max_columns=20] - Maximum columns analysed.
     * @param {string} [params.database] - Must equal the connected database.
     * @returns {Promise<object>} `{ status: "success", data: { table_name,
     *   patterns, summary } }` or `{ status: "error", error }`.
     */
    async discoverDataPatterns(params) {
        try {
            const dbValidation = this.validateDatabaseAccess(params?.database);
            if (!dbValidation.valid) {
                return { status: "error", error: dbValidation.error };
            }
            const { table_name, pattern_types = ["unique", "null", "duplicate", "format", "range"], max_columns = 20, } = params ?? {};
            const database = dbValidation.database;
            // Validate table name
            if (!this.security.validateIdentifier(table_name).valid) {
                return { status: "error", error: "Invalid table name" };
            }
            // NOTE(review): LIMIT is bound as a parameter here; confirm that
            // db.query's execution path accepts a numeric placeholder for LIMIT.
            const columns = await this.db.query(`SELECT COLUMN_NAME, DATA_TYPE, IS_NULLABLE, COLUMN_KEY
         FROM INFORMATION_SCHEMA.COLUMNS
         WHERE TABLE_SCHEMA = ? AND TABLE_NAME = ?
         ORDER BY ORDINAL_POSITION
         LIMIT ?`, [database, table_name, max_columns]);
            if (columns.length === 0) {
                return {
                    status: "error",
                    error: `Table '${table_name}' not found or has no columns`,
                };
            }
            const tableRef = `${this.quoteIdentifier(database)}.${this.quoteIdentifier(table_name)}`;
            // Get row count
            const countResult = await this.db.query(`SELECT COUNT(*) as cnt FROM ${tableRef}`);
            const totalRows = countResult[0]?.cnt || 0;
            const patterns = [];
            let qualityScore = 100;
            for (const col of columns) {
                const colName = col.COLUMN_NAME;
                const columnRef = this.quoteIdentifier(colName);
                const dataType = col.DATA_TYPE.toLowerCase();
                const declaredUnique = col.COLUMN_KEY === "PRI" || col.COLUMN_KEY === "UNI";
                // Check for NULL patterns
                if (pattern_types.includes("null")) {
                    const nullResult = await this.db.query(`SELECT COUNT(*) as null_count
               FROM ${tableRef}
               WHERE ${columnRef} IS NULL`);
                    const nullCount = nullResult[0]?.null_count || 0;
                    const nullPercentage = totalRows > 0 ? (nullCount / totalRows) * 100 : 0;
                    if (nullPercentage > 50) {
                        patterns.push({
                            column_name: colName,
                            pattern_type: "HIGH_NULL_RATE",
                            description: `${nullPercentage.toFixed(1)}% of values are NULL`,
                            metrics: { null_count: nullCount, null_percentage: nullPercentage },
                            recommendations: [
                                "Review if this column is necessary",
                                "Consider setting a default value",
                            ],
                        });
                        qualityScore -= 5;
                    }
                    else if (nullPercentage > 0 && col.IS_NULLABLE === "NO") {
                        patterns.push({
                            column_name: colName,
                            pattern_type: "NULLABLE_MISMATCH",
                            description: "Column is marked NOT NULL but has NULL values",
                            metrics: { null_count: nullCount },
                            recommendations: ["Check data integrity constraints"],
                        });
                        qualityScore -= 10;
                    }
                }
                // Check for uniqueness patterns
                if (pattern_types.includes("unique")) {
                    const uniqueResult = await this.db.query(`SELECT COUNT(DISTINCT ${columnRef}) as distinct_count
               FROM ${tableRef}`);
                    const distinctCount = uniqueResult[0]?.distinct_count || 0;
                    const uniqueRatio = totalRows > 0 ? distinctCount / totalRows : 0;
                    if (uniqueRatio === 1 && !declaredUnique) {
                        patterns.push({
                            column_name: colName,
                            pattern_type: "POTENTIALLY_UNIQUE",
                            description: "All values are unique but column is not marked as UNIQUE",
                            metrics: { distinct_count: distinctCount, total_rows: totalRows },
                            recommendations: [
                                "Consider adding a UNIQUE constraint",
                                "Could be a natural key candidate",
                            ],
                        });
                    }
                    else if (uniqueRatio < 0.1 && totalRows > 100) {
                        patterns.push({
                            column_name: colName,
                            pattern_type: "LOW_CARDINALITY",
                            description: `Only ${distinctCount} distinct values across ${totalRows} rows`,
                            metrics: { distinct_count: distinctCount, cardinality_ratio: uniqueRatio },
                            recommendations: [
                                "Consider using ENUM if values are fixed",
                                "May be a good candidate for indexing",
                            ],
                        });
                    }
                }
                // Check for duplicate patterns (skipped for very small tables)
                if (pattern_types.includes("duplicate") && totalRows > 10) {
                    const duplicateResult = await this.db.query(`SELECT ${columnRef}, COUNT(*) as cnt
               FROM ${tableRef}
               WHERE ${columnRef} IS NOT NULL
               GROUP BY ${columnRef}
               HAVING cnt > 1
               ORDER BY cnt DESC
               LIMIT 5`);
                    if (duplicateResult.length > 0) {
                        const topDuplicates = duplicateResult.map((r) => ({
                            value: r[colName],
                            count: r.cnt,
                        }));
                        if (declaredUnique) {
                            patterns.push({
                                column_name: colName,
                                pattern_type: "DUPLICATE_IN_UNIQUE",
                                description: "Duplicates found in supposedly unique column",
                                metrics: { top_duplicates: topDuplicates },
                                recommendations: ["Critical: Fix data integrity issue"],
                            });
                            qualityScore -= 20;
                        }
                        else {
                            patterns.push({
                                column_name: colName,
                                pattern_type: "DUPLICATES_FOUND",
                                description: `Found ${duplicateResult.length} values with duplicates`,
                                metrics: { duplicate_groups: duplicateResult.length, top_duplicates: topDuplicates },
                            });
                        }
                    }
                }
                // Check for format patterns (for string columns)
                if (pattern_types.includes("format") &&
                    ["varchar", "char", "text"].includes(dataType)) {
                    // E-mail shape. The dot is written as [.] because a backslash
                    // inside a MySQL string literal would be consumed before the
                    // regex engine sees it, leaving a match-anything ".".
                    const emailResult = await this.db.query(`SELECT COUNT(*) as cnt
               FROM ${tableRef}
               WHERE ${columnRef} REGEXP '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+[.][A-Za-z]{2,}$'`);
                    if (emailResult[0]?.cnt > totalRows * 0.5) {
                        patterns.push({
                            column_name: colName,
                            pattern_type: "EMAIL_FORMAT",
                            description: "Contains email-formatted data",
                            metrics: { email_count: emailResult[0].cnt },
                        });
                    }
                    // Phone shape: optional "+" followed by 10-15 digits.
                    const phoneResult = await this.db.query(`SELECT COUNT(*) as cnt
               FROM ${tableRef}
               WHERE ${columnRef} REGEXP '^[+]?[0-9]{10,15}$'`);
                    if (phoneResult[0]?.cnt > totalRows * 0.3) {
                        patterns.push({
                            column_name: colName,
                            pattern_type: "PHONE_FORMAT",
                            description: "Contains phone number-formatted data",
                            metrics: { phone_count: phoneResult[0].cnt },
                        });
                    }
                }
                // Check for range patterns (for numeric columns)
                if (pattern_types.includes("range") &&
                    ["int", "bigint", "decimal", "float", "double"].includes(dataType)) {
                    const rangeResult = await this.db.query(`SELECT MIN(${columnRef}) as min_val, MAX(${columnRef}) as max_val, AVG(${columnRef}) as avg_val
               FROM ${tableRef}`);
                    const stats = rangeResult[0];
                    // The aggregates are NULL when the column has no non-NULL
                    // values; there is no range to report in that case.
                    if (stats && stats.min_val != null && stats.max_val != null) {
                        const { min_val, max_val, avg_val } = stats;
                        const range = max_val - min_val;
                        patterns.push({
                            column_name: colName,
                            pattern_type: "NUMERIC_RANGE",
                            description: `Values range from ${min_val} to ${max_val}`,
                            metrics: {
                                min: min_val,
                                max: max_val,
                                avg: avg_val,
                                range: range,
                            },
                        });
                        // Rough outlier heuristic: an extreme more than 10x away
                        // from the average.
                        if (range > 0 && (max_val > avg_val * 10 || min_val < avg_val / 10)) {
                            patterns.push({
                                column_name: colName,
                                pattern_type: "POTENTIAL_OUTLIERS",
                                description: "Large variance detected, may contain outliers",
                                recommendations: [
                                    "Review extreme values for data quality",
                                    "Consider outlier detection",
                                ],
                            });
                        }
                    }
                }
            }
            return {
                status: "success",
                data: {
                    table_name,
                    patterns,
                    summary: {
                        columns_analyzed: columns.length,
                        patterns_found: patterns.length,
                        data_quality_score: Math.max(0, qualityScore),
                    },
                },
            };
        }
        catch (error) {
            return {
                status: "error",
                error: error.message,
            };
        }
    }
    // ==================== Helper Methods ====================
    /**
     * Tokenize a string into searchable tokens.
     *
     * Lower-cases the text, treats "-" and "_" as spaces, splits on whitespace
     * and drops tokens shorter than two characters.
     *
     * @param {string} text
     * @returns {string[]}
     */
    tokenize(text) {
        return text
            .toLowerCase()
            .replace(/[-_]/g, " ")
            .split(/\s+/)
            .filter((t) => t.length > 1);
    }
    /**
     * Calculate a relevance score between a search and a target name.
     *
     * Tiers, first match wins: exact match 1.0; target contains the term 0.9;
     * term contains the target 0.85; otherwise 0.3 + 0.5 * (fraction of tokens
     * found in the target) when at least one token is found; otherwise
     * 0.6 * name similarity when that similarity exceeds 0.5; otherwise 0.
     *
     * @param {string[]} tokens - Tokens of the search term.
     * @param {string} target - Name being scored (lower-cased internally).
     * @param {string} originalTerm - Search term; callers pass it lower-cased.
     * @returns {number} Score between 0 and 1.
     */
    calculateRelevanceScore(tokens, target, originalTerm) {
        const targetLower = target.toLowerCase();
        // Exact match
        if (targetLower === originalTerm)
            return 1.0;
        // One fully contains the other
        if (targetLower.includes(originalTerm))
            return 0.9;
        if (originalTerm.includes(targetLower))
            return 0.85;
        // Token-based scoring
        if (tokens.length > 0) {
            const matchedTokens = tokens.filter((token) => targetLower.includes(token)).length;
            const tokenScore = matchedTokens / tokens.length;
            if (tokenScore > 0)
                return 0.3 + tokenScore * 0.5;
        }
        // Similarity-based scoring
        const similarity = this.calculateNameSimilarity(originalTerm, targetLower);
        if (similarity > 0.5)
            return similarity * 0.6;
        return 0;
    }
    /**
     * Get a human-readable match reason.
     *
     * @param {string[]} tokens - Tokens of the search term.
     * @param {string} target - Name that matched.
     * @returns {string} Reason based on the first token equal to, or contained
     *   in, the target; "Similar name pattern" when no token applies.
     */
    getMatchReason(tokens, target) {
        const targetLower = target.toLowerCase();
        for (const token of tokens) {
            if (targetLower === token)
                return `Exact match: "${token}"`;
            if (targetLower.includes(token))
                return `Contains: "${token}"`;
        }
        return "Similar name pattern";
    }
    /**
     * Calculate name similarity.
     *
     * This is not an edit distance: the result is 0.6 * (Jaccard overlap of the
     * two names' character sets) + 0.4 * (length of the common prefix divided
     * by the longer name's length). Character order beyond the shared prefix
     * is therefore ignored.
     *
     * @param {string} a
     * @param {string} b
     * @returns {number} 1 for identical strings, 0 if either is empty,
     *   otherwise a value between 0 and 1.
     */
    calculateNameSimilarity(a, b) {
        if (a === b)
            return 1;
        if (a.length === 0 || b.length === 0)
            return 0;
        // Character-set overlap ratio
        const setA = new Set(a.split(""));
        const setB = new Set(b.split(""));
        const intersection = [...setA].filter((x) => setB.has(x)).length;
        const union = new Set([...setA, ...setB]).size;
        const charSimilarity = union > 0 ? intersection / union : 0;
        // Prefix matching bonus
        const minLen = Math.min(a.length, b.length);
        let prefixLength = 0;
        while (prefixLength < minLen && a[prefixLength] === b[prefixLength]) {
            prefixLength++;
        }
        const prefixBonus = prefixLength / Math.max(a.length, b.length);
        return charSimilarity * 0.6 + prefixBonus * 0.4;
    }
    /**
     * Normalize a column name for comparison.
     *
     * Lower-cases the name, removes a leading "fk_", "pk_" or "idx_", removes a
     * trailing "_id", then strips remaining "_" and "-" characters.
     *
     * @param {string} name
     * @returns {string}
     */
    normalizeColumnName(name) {
        return name
            .toLowerCase()
            .replace(/^(fk_|pk_|idx_)/, "")
            .replace(/_id$/, "")
            .replace(/[_-]/g, "");
    }
    /**
     * Get similarity type description.
     *
     * @param {string} name1
     * @param {string} name2
     * @returns {string} "EXACT_MATCH" (equal ignoring case), "NORMALIZED_MATCH"
     *   (equal after normalizeColumnName), "SUBSTRING_MATCH" (one normalized
     *   name contains the other) or "SIMILAR_PATTERN".
     */
    getSimilarityType(name1, name2) {
        if (name1.toLowerCase() === name2.toLowerCase())
            return "EXACT_MATCH";
        const n1 = this.normalizeColumnName(name1);
        const n2 = this.normalizeColumnName(name2);
        if (n1 === n2)
            return "NORMALIZED_MATCH";
        if (n1.includes(n2) || n2.includes(n1))
            return "SUBSTRING_MATCH";
        return "SIMILAR_PATTERN";
    }
    /**
     * Calculate data overlap between two columns.
     *
     * @param {string} database
     * @param {string} table1
     * @param {string} col1
     * @param {string} table2
     * @param {string} col2
     * @returns {Promise<number>} Rounded percentage of the distinct non-NULL
     *   values of `table1.col1` that also occur in `table2.col2`. A zero or
     *   missing total is treated as 1 to avoid dividing by zero.
     */
    async calculateDataOverlap(database, table1, col1, table2, col2) {
        const databaseRef = this.quoteIdentifier(database);
        const leftTable = `${databaseRef}.${this.quoteIdentifier(table1)}`;
        const rightTable = `${databaseRef}.${this.quoteIdentifier(table2)}`;
        const leftColumn = this.quoteIdentifier(col1);
        const rightColumn = this.quoteIdentifier(col2);
        const overlapQuery = `
      SELECT COUNT(DISTINCT t1.${leftColumn}) as overlap_count
      FROM ${leftTable} t1
      INNER JOIN ${rightTable} t2 ON t1.${leftColumn} = t2.${rightColumn}
      WHERE t1.${leftColumn} IS NOT NULL
    `;
        const totalQuery = `
      SELECT COUNT(DISTINCT ${leftColumn}) as total
      FROM ${leftTable}
      WHERE ${leftColumn} IS NOT NULL
    `;
        const [overlapResult, totalResult] = await Promise.all([
            this.db.query(overlapQuery),
            this.db.query(totalQuery),
        ]);
        const overlap = overlapResult[0]?.overlap_count || 0;
        const total = totalResult[0]?.total || 1;
        return Math.round((overlap / total) * 100);
    }
    /**
     * Discover implicit relationships based on naming conventions.
     *
     * A column named `<x>_id` is reported as an IMPLICIT_FK to table `<x>`
     * (matched case-insensitively) when `<x>` scores at least `threshold`
     * against the search tokens, or when there are no tokens at all.
     *
     * @param {Array<{TABLE_NAME: string}>} tables
     * @param {Array<{TABLE_NAME: string, COLUMN_NAME: string, COLUMN_KEY: string}>} columns
     * @param {string[]} searchTokens
     * @param {number} threshold
     * @returns {object[]} Relationship records with confidence 0.8. `to_table`
     *   carries the table's real name; `to_column` is that table's primary-key
     *   column, or "id" when none is known.
     */
    discoverImplicitRelationships(tables, columns, searchTokens, threshold) {
        const relationships = [];
        // Lower-cased name -> real name, so results keep the table's own casing.
        const tableNameByLower = new Map();
        for (const t of tables) {
            const key = t.TABLE_NAME.toLowerCase();
            if (!tableNameByLower.has(key)) {
                tableNameByLower.set(key, t.TABLE_NAME);
            }
        }
        // Primary key per table; for composite keys the last column seen wins.
        const primaryKeys = new Map();
        for (const col of columns) {
            if (col.COLUMN_KEY === "PRI") {
                primaryKeys.set(col.TABLE_NAME.toLowerCase(), col.COLUMN_NAME);
            }
        }
        const joinedTokens = searchTokens.join(" ");
        // Look for _id columns that match table names
        for (const col of columns) {
            const colLower = col.COLUMN_NAME.toLowerCase();
            if (colLower.endsWith("_id")) {
                const potentialTable = colLower.slice(0, -3);
                const score = this.calculateRelevanceScore(searchTokens, potentialTable, joinedTokens);
                if (score >= threshold || searchTokens.length === 0) {
                    if (tableNameByLower.has(potentialTable)) {
                        relationships.push({
                            from_table: col.TABLE_NAME,
                            from_column: col.COLUMN_NAME,
                            to_table: tableNameByLower.get(potentialTable),
                            to_column: primaryKeys.get(potentialTable) || "id",
                            relationship_type: "IMPLICIT_FK",
                            confidence: 0.8,
                        });
                    }
                }
            }
        }
        return relationships;
    }
}
|
|
750
|
+
// Publish the class on this module's CommonJS exports object.
exports.SmartDiscoveryTools = SmartDiscoveryTools;
|