@berthojoris/mcp-mysql-server 1.14.1 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,750 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.SmartDiscoveryTools = void 0;
7
+ const connection_1 = __importDefault(require("../db/connection"));
8
+ const config_1 = require("../config/config");
9
/**
 * Smart Data Discovery Agent
 *
 * Finds relevant tables and columns by scoring their names against a search
 * term, looks for the term inside text columns, and proposes relationships
 * (declared foreign keys plus `<table>_id` naming conventions).
 *
 * All public entry points resolve to `{ status: "success", data }` or
 * `{ status: "error", error }`; they do not reject.
 */
class SmartDiscoveryTools {
    /**
     * @param {object} security - Security helper. Only
     *   `security.validateIdentifier(name).valid` is used by this class
     *   (in `discoverDataPatterns`).
     */
    constructor(security) {
        this.db = connection_1.default.getInstance();
        this.security = security;
    }
    /**
     * Validate database access - ensures only the connected database can be accessed.
     *
     * @param {string} [requestedDatabase] - Database named by the caller; when
     *   omitted the connected database is used.
     * @returns {{valid: boolean, database: string, error?: string}}
     */
    validateDatabaseAccess(requestedDatabase) {
        const connectedDatabase = config_1.dbConfig.database;
        if (!connectedDatabase) {
            return {
                valid: false,
                database: "",
                error: "No database specified in connection string. Cannot access any database.",
            };
        }
        if (requestedDatabase && requestedDatabase !== connectedDatabase) {
            return {
                valid: false,
                database: "",
                error: `Access denied. You can only access the connected database '${connectedDatabase}'. Requested database '${requestedDatabase}' is not allowed.`,
            };
        }
        return {
            valid: true,
            database: connectedDatabase,
        };
    }
    /**
     * Quote a MySQL identifier for interpolation into SQL text.
     * Embedded backticks are doubled so a name containing a backtick cannot
     * terminate the quoted identifier early.
     *
     * @param {string} name - Raw database, table or column name.
     * @returns {string} The name wrapped in backticks.
     */
    quoteIdentifier(name) {
        return `\`${String(name).replace(/`/g, "``")}\``;
    }
    /**
     * Smart search across database objects (tables, columns, data patterns,
     * relationships).
     *
     * @param {object} params
     * @param {string} params.search_term - Text to look for (required).
     * @param {string} [params.search_type="all"] - "all", "table", "column",
     *   "data_pattern" or "relationship".
     * @param {number} [params.similarity_threshold=0.3] - Minimum relevance score.
     * @param {boolean} [params.include_sample_data=false] - Attach up to five
     *   sample values to each matching column.
     * @param {number} [params.max_results=20] - Cap applied to every result list.
     * @param {string} [params.database] - Must equal the connected database.
     * @returns {Promise<object>} Status envelope with the search results.
     */
    async smartSearch(params) {
        const startTime = Date.now();
        try {
            const dbValidation = this.validateDatabaseAccess(params?.database);
            if (!dbValidation.valid) {
                return { status: "error", error: dbValidation.error };
            }
            // `params ?? {}` keeps a missing argument on the normal validation path.
            const { search_term, search_type = "all", similarity_threshold = 0.3, include_sample_data = false, max_results = 20, } = params ?? {};
            const database = dbValidation.database;
            if (typeof search_term !== "string" || !search_term.trim()) {
                return { status: "error", error: "search_term is required" };
            }
            const searchTermLower = search_term.toLowerCase().trim();
            const searchTokens = this.tokenize(searchTermLower);
            const wants = (type) => search_type === "all" || search_type === type;
            const databaseSql = this.quoteIdentifier(database);
            // The three catalogue lookups are independent, so run them together.
            const [tablesResult, columnsResult, fkResult] = await Promise.all([
                this.db.query(`SELECT TABLE_NAME, TABLE_ROWS
           FROM INFORMATION_SCHEMA.TABLES
           WHERE TABLE_SCHEMA = ? AND TABLE_TYPE = 'BASE TABLE'`, [database]),
                this.db.query(`SELECT TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_KEY
           FROM INFORMATION_SCHEMA.COLUMNS
           WHERE TABLE_SCHEMA = ?
           ORDER BY TABLE_NAME, ORDINAL_POSITION`, [database]),
                this.db.query(`SELECT TABLE_NAME, COLUMN_NAME, REFERENCED_TABLE_NAME, REFERENCED_COLUMN_NAME
           FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
           WHERE TABLE_SCHEMA = ? AND REFERENCED_TABLE_NAME IS NOT NULL`, [database]),
            ]);
            const results = {
                tables: [],
                columns: [],
                data_patterns: [],
                relationships: [],
            };
            // Search tables
            if (wants("table")) {
                // Group columns once instead of re-filtering the full list per table.
                const columnsByTable = new Map();
                for (const c of columnsResult) {
                    const bucket = columnsByTable.get(c.TABLE_NAME);
                    if (bucket) {
                        bucket.push(c);
                    }
                    else {
                        columnsByTable.set(c.TABLE_NAME, [c]);
                    }
                }
                for (const table of tablesResult) {
                    const tableName = table.TABLE_NAME.toLowerCase();
                    const score = this.calculateRelevanceScore(searchTokens, tableName, searchTermLower);
                    if (score < similarity_threshold) {
                        continue;
                    }
                    const tableColumns = columnsByTable.get(table.TABLE_NAME) ?? [];
                    const matchingCols = tableColumns
                        .filter((c) => this.calculateRelevanceScore(searchTokens, c.COLUMN_NAME.toLowerCase(), searchTermLower) >= similarity_threshold)
                        .map((c) => c.COLUMN_NAME);
                    results.tables.push({
                        name: table.TABLE_NAME,
                        relevance_score: Math.round(score * 100) / 100,
                        match_reason: this.getMatchReason(searchTokens, tableName),
                        column_count: tableColumns.length,
                        row_estimate: parseInt(table.TABLE_ROWS || "0", 10) || 0,
                        matching_columns: matchingCols.length > 0 ? matchingCols : undefined,
                    });
                }
            }
            // Search columns
            if (wants("column")) {
                for (const column of columnsResult) {
                    const colName = column.COLUMN_NAME.toLowerCase();
                    const score = this.calculateRelevanceScore(searchTokens, colName, searchTermLower);
                    if (score < similarity_threshold) {
                        continue;
                    }
                    const colResult = {
                        table_name: column.TABLE_NAME,
                        column_name: column.COLUMN_NAME,
                        data_type: column.DATA_TYPE,
                        relevance_score: Math.round(score * 100) / 100,
                        match_reason: this.getMatchReason(searchTokens, colName),
                    };
                    if (include_sample_data) {
                        try {
                            const columnSql = this.quoteIdentifier(column.COLUMN_NAME);
                            const tableSql = `${databaseSql}.${this.quoteIdentifier(column.TABLE_NAME)}`;
                            const samples = await this.db.query(`SELECT DISTINCT ${columnSql}
                   FROM ${tableSql}
                   WHERE ${columnSql} IS NOT NULL
                   LIMIT 5`);
                            colResult.sample_values = samples.map((s) => s[column.COLUMN_NAME]);
                        }
                        catch {
                            // Best effort: samples are optional decoration, so a failing
                            // lookup must not fail the whole search.
                        }
                    }
                    results.columns.push(colResult);
                }
            }
            // Search for data patterns (look for the search term in actual data)
            if (wants("data_pattern")) {
                const textTypes = ["varchar", "text", "char", "longtext", "mediumtext", "tinytext", "enum", "set"];
                // Only the first 20 text-like columns are probed to bound the query count.
                const columnsToSearch = columnsResult
                    .filter((c) => textTypes.includes(c.DATA_TYPE.toLowerCase()))
                    .slice(0, 20);
                // Escape LIKE wildcards so the term is matched literally. An explicit
                // escape character is used because the default (backslash) is
                // disabled under the NO_BACKSLASH_ESCAPES SQL mode.
                const likePattern = `%${searchTermLower.replace(/[!%_]/g, "!$&")}%`;
                for (const column of columnsToSearch) {
                    try {
                        const columnSql = this.quoteIdentifier(column.COLUMN_NAME);
                        const tableSql = `${databaseSql}.${this.quoteIdentifier(column.TABLE_NAME)}`;
                        const patternQuery = `
              SELECT DISTINCT ${columnSql}
              FROM ${tableSql}
              WHERE LOWER(${columnSql}) LIKE ? ESCAPE '!'
              LIMIT 5
            `;
                        const matches = await this.db.query(patternQuery, [likePattern]);
                        if (matches.length > 0) {
                            results.data_patterns.push({
                                table_name: column.TABLE_NAME,
                                column_name: column.COLUMN_NAME,
                                pattern_type: "CONTAINS",
                                description: `Found ${matches.length}+ values containing "${search_term}"`,
                                sample_matches: matches.map((m) => m[column.COLUMN_NAME]),
                            });
                        }
                    }
                    catch {
                        // Best effort: a column that cannot be probed is skipped.
                    }
                }
            }
            // Search relationships
            if (wants("relationship")) {
                // Explicit foreign keys: relevant when either table or the column matches.
                for (const fk of fkResult) {
                    const score = Math.max(this.calculateRelevanceScore(searchTokens, fk.TABLE_NAME.toLowerCase(), searchTermLower), this.calculateRelevanceScore(searchTokens, fk.REFERENCED_TABLE_NAME.toLowerCase(), searchTermLower), this.calculateRelevanceScore(searchTokens, fk.COLUMN_NAME.toLowerCase(), searchTermLower));
                    if (score >= similarity_threshold) {
                        results.relationships.push({
                            from_table: fk.TABLE_NAME,
                            from_column: fk.COLUMN_NAME,
                            to_table: fk.REFERENCED_TABLE_NAME,
                            to_column: fk.REFERENCED_COLUMN_NAME,
                            relationship_type: "FOREIGN_KEY",
                            confidence: 1.0,
                        });
                    }
                }
                // Implicit relationships (naming conventions)
                results.relationships.push(...this.discoverImplicitRelationships(tablesResult, columnsResult, searchTokens, similarity_threshold));
            }
            // Rank scored lists, then cap every list at max_results.
            results.tables.sort((a, b) => b.relevance_score - a.relevance_score);
            results.columns.sort((a, b) => b.relevance_score - a.relevance_score);
            results.tables = results.tables.slice(0, max_results);
            results.columns = results.columns.slice(0, max_results);
            results.data_patterns = results.data_patterns.slice(0, max_results);
            results.relationships = results.relationships.slice(0, max_results);
            const totalMatches = results.tables.length +
                results.columns.length +
                results.data_patterns.length +
                results.relationships.length;
            return {
                status: "success",
                data: {
                    search_term,
                    search_type,
                    results,
                    total_matches: totalMatches,
                    search_time_ms: Date.now() - startTime,
                },
            };
        }
        catch (error) {
            return {
                status: "error",
                error: error.message,
            };
        }
    }
    /**
     * Find similar columns across tables (potential join candidates).
     *
     * With both `column_name` and `table_name`, every other column is scored
     * against that reference column. Otherwise the whole schema is scanned for
     * columns whose normalized names collide and for `<table>_id` columns.
     *
     * @param {object} params
     * @param {string} [params.column_name] - Reference column.
     * @param {string} [params.table_name] - Table holding the reference column.
     * @param {boolean} [params.include_data_comparison=false] - Also measure
     *   value overlap for same-typed candidates.
     * @param {number} [params.max_results=20] - Cap applied to both result lists.
     * @param {string} [params.database] - Must equal the connected database.
     * @returns {Promise<object>} Status envelope with `reference_column`,
     *   `similar_columns` and `potential_joins`.
     */
    async findSimilarColumns(params) {
        try {
            const dbValidation = this.validateDatabaseAccess(params?.database);
            if (!dbValidation.valid) {
                return { status: "error", error: dbValidation.error };
            }
            const { column_name, table_name, include_data_comparison = false, max_results = 20, } = params ?? {};
            const database = dbValidation.database;
            const allColumns = await this.db.query(`SELECT TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_KEY
         FROM INFORMATION_SCHEMA.COLUMNS
         WHERE TABLE_SCHEMA = ?
         ORDER BY TABLE_NAME, ORDINAL_POSITION`, [database]);
            const similarColumns = [];
            const potentialJoins = [];
            let referenceColumn;
            if (column_name && table_name) {
                const wantedTable = table_name.toLowerCase();
                const wantedColumn = column_name.toLowerCase();
                const refCol = allColumns.find((c) => c.TABLE_NAME.toLowerCase() === wantedTable &&
                    c.COLUMN_NAME.toLowerCase() === wantedColumn);
                if (!refCol) {
                    return {
                        status: "error",
                        error: `Column '${column_name}' not found in table '${table_name}'`,
                    };
                }
                referenceColumn = {
                    table: refCol.TABLE_NAME,
                    column: refCol.COLUMN_NAME,
                    data_type: refCol.DATA_TYPE,
                };
                const refNameLower = refCol.COLUMN_NAME.toLowerCase();
                for (const col of allColumns) {
                    if (col.TABLE_NAME === refCol.TABLE_NAME &&
                        col.COLUMN_NAME === refCol.COLUMN_NAME) {
                        continue;
                    }
                    const sameType = col.DATA_TYPE === refCol.DATA_TYPE;
                    // 70% name similarity plus a flat 0.3 bonus for an identical data type.
                    const totalScore = this.calculateNameSimilarity(refNameLower, col.COLUMN_NAME.toLowerCase()) * 0.7 +
                        (sameType ? 0.3 : 0);
                    if (totalScore < 0.3) {
                        continue;
                    }
                    const simCol = {
                        table_name: col.TABLE_NAME,
                        column_name: col.COLUMN_NAME,
                        data_type: col.DATA_TYPE,
                        similarity_score: Math.round(totalScore * 100) / 100,
                        similarity_type: this.getSimilarityType(refCol.COLUMN_NAME, col.COLUMN_NAME),
                    };
                    if (include_data_comparison && sameType) {
                        try {
                            simCol.data_overlap_percentage = await this.calculateDataOverlap(database, refCol.TABLE_NAME, refCol.COLUMN_NAME, col.TABLE_NAME, col.COLUMN_NAME);
                        }
                        catch {
                            // Best effort: overlap is optional, keep the name-based result.
                        }
                    }
                    similarColumns.push(simCol);
                    const overlap = simCol.data_overlap_percentage || 0;
                    if (totalScore >= 0.6 || overlap > 50) {
                        potentialJoins.push({
                            table1: refCol.TABLE_NAME,
                            column1: refCol.COLUMN_NAME,
                            table2: col.TABLE_NAME,
                            column2: col.COLUMN_NAME,
                            confidence: Math.round(Math.max(totalScore, overlap / 100) * 100) / 100,
                            reason: overlap > 50
                                ? "High data overlap"
                                : "Similar column names",
                        });
                    }
                }
            }
            else {
                // Group columns by normalized name; a group spanning several tables
                // is a set of join candidates.
                const columnGroups = new Map();
                for (const col of allColumns) {
                    const normalized = this.normalizeColumnName(col.COLUMN_NAME);
                    const group = columnGroups.get(normalized);
                    if (group) {
                        group.push(col);
                    }
                    else {
                        columnGroups.set(normalized, [col]);
                    }
                }
                for (const columns of columnGroups.values()) {
                    for (let i = 0; i < columns.length; i++) {
                        for (let j = i + 1; j < columns.length; j++) {
                            const col1 = columns[i];
                            const col2 = columns[j];
                            if (col1.TABLE_NAME === col2.TABLE_NAME ||
                                col1.DATA_TYPE !== col2.DATA_TYPE) {
                                continue;
                            }
                            const identical = col1.COLUMN_NAME === col2.COLUMN_NAME;
                            potentialJoins.push({
                                table1: col1.TABLE_NAME,
                                column1: col1.COLUMN_NAME,
                                table2: col2.TABLE_NAME,
                                column2: col2.COLUMN_NAME,
                                confidence: identical ? 0.9 : 0.7,
                                reason: identical
                                    ? "Identical column names"
                                    : `Similar names: ${col1.COLUMN_NAME} ~ ${col2.COLUMN_NAME}`,
                            });
                        }
                    }
                }
                // `<table>_id` convention: index the first primary-key column of each
                // table once rather than rescanning every column per candidate.
                const primaryKeyByTable = new Map();
                for (const col of allColumns) {
                    const key = col.TABLE_NAME.toLowerCase();
                    if (col.COLUMN_KEY === "PRI" && !primaryKeyByTable.has(key)) {
                        primaryKeyByTable.set(key, col);
                    }
                }
                for (const col of allColumns) {
                    const nameLower = col.COLUMN_NAME.toLowerCase();
                    if (!nameLower.endsWith("_id")) {
                        continue;
                    }
                    const target = primaryKeyByTable.get(nameLower.slice(0, -3));
                    if (target && target.TABLE_NAME !== col.TABLE_NAME) {
                        potentialJoins.push({
                            table1: col.TABLE_NAME,
                            column1: col.COLUMN_NAME,
                            table2: target.TABLE_NAME,
                            column2: target.COLUMN_NAME,
                            confidence: 0.85,
                            reason: "Foreign key naming convention (_id suffix)",
                        });
                    }
                }
            }
            similarColumns.sort((a, b) => b.similarity_score - a.similarity_score);
            potentialJoins.sort((a, b) => b.confidence - a.confidence);
            return {
                status: "success",
                data: {
                    reference_column: referenceColumn,
                    similar_columns: similarColumns.slice(0, max_results),
                    potential_joins: potentialJoins.slice(0, max_results),
                },
            };
        }
        catch (error) {
            return {
                status: "error",
                error: error.message,
            };
        }
    }
    /**
     * Discover data patterns (NULL rate, uniqueness, duplicates, string formats,
     * numeric ranges) for the columns of one table.
     *
     * @param {object} params
     * @param {string} params.table_name - Table to analyse; must pass
     *   `security.validateIdentifier`.
     * @param {string[]} [params.pattern_types] - Any of "unique", "null",
     *   "duplicate", "format", "range"; defaults to all of them.
     * @param {number} [params.max_columns=20] - Maximum number of columns analysed.
     * @param {string} [params.database] - Must equal the connected database.
     * @returns {Promise<object>} Status envelope with `patterns` and a `summary`
     *   whose `data_quality_score` starts at 100 and is reduced per finding.
     */
    async discoverDataPatterns(params) {
        try {
            const dbValidation = this.validateDatabaseAccess(params?.database);
            if (!dbValidation.valid) {
                return { status: "error", error: dbValidation.error };
            }
            const { table_name, pattern_types = ["unique", "null", "duplicate", "format", "range"], max_columns = 20, } = params ?? {};
            const database = dbValidation.database;
            if (!this.security.validateIdentifier(table_name).valid) {
                return { status: "error", error: "Invalid table name" };
            }
            const columns = await this.db.query(`SELECT COLUMN_NAME, DATA_TYPE, IS_NULLABLE, COLUMN_KEY
         FROM INFORMATION_SCHEMA.COLUMNS
         WHERE TABLE_SCHEMA = ? AND TABLE_NAME = ?
         ORDER BY ORDINAL_POSITION
         LIMIT ?`, [database, table_name, max_columns]);
            if (columns.length === 0) {
                return {
                    status: "error",
                    error: `Table '${table_name}' not found or has no columns`,
                };
            }
            const tableSql = `${this.quoteIdentifier(database)}.${this.quoteIdentifier(table_name)}`;
            const countResult = await this.db.query(`SELECT COUNT(*) as cnt FROM ${tableSql}`);
            // Number(): some driver configurations return COUNT(*) as a string or BigInt.
            const totalRows = Number(countResult[0]?.cnt) || 0;
            const patterns = [];
            let qualityScore = 100;
            for (const col of columns) {
                const colName = col.COLUMN_NAME;
                const colSql = this.quoteIdentifier(colName);
                const dataType = col.DATA_TYPE.toLowerCase();
                const isKeyColumn = col.COLUMN_KEY === "PRI" || col.COLUMN_KEY === "UNI";
                // NULL patterns
                if (pattern_types.includes("null")) {
                    const nullResult = await this.db.query(`SELECT COUNT(*) as null_count
             FROM ${tableSql}
             WHERE ${colSql} IS NULL`);
                    const nullCount = Number(nullResult[0]?.null_count) || 0;
                    const nullPercentage = totalRows > 0 ? (nullCount / totalRows) * 100 : 0;
                    if (nullPercentage > 50) {
                        patterns.push({
                            column_name: colName,
                            pattern_type: "HIGH_NULL_RATE",
                            description: `${nullPercentage.toFixed(1)}% of values are NULL`,
                            metrics: { null_count: nullCount, null_percentage: nullPercentage },
                            recommendations: [
                                "Review if this column is necessary",
                                "Consider setting a default value",
                            ],
                        });
                        qualityScore -= 5;
                    }
                    else if (nullPercentage > 0 && col.IS_NULLABLE === "NO") {
                        patterns.push({
                            column_name: colName,
                            pattern_type: "NULLABLE_MISMATCH",
                            description: "Column is marked NOT NULL but has NULL values",
                            metrics: { null_count: nullCount },
                            recommendations: ["Check data integrity constraints"],
                        });
                        qualityScore -= 10;
                    }
                }
                // Uniqueness patterns
                if (pattern_types.includes("unique")) {
                    const uniqueResult = await this.db.query(`SELECT COUNT(DISTINCT ${colSql}) as distinct_count
             FROM ${tableSql}`);
                    const distinctCount = Number(uniqueResult[0]?.distinct_count) || 0;
                    const uniqueRatio = totalRows > 0 ? distinctCount / totalRows : 0;
                    if (uniqueRatio === 1 && !isKeyColumn) {
                        patterns.push({
                            column_name: colName,
                            pattern_type: "POTENTIALLY_UNIQUE",
                            description: "All values are unique but column is not marked as UNIQUE",
                            metrics: { distinct_count: distinctCount, total_rows: totalRows },
                            recommendations: [
                                "Consider adding a UNIQUE constraint",
                                "Could be a natural key candidate",
                            ],
                        });
                    }
                    else if (uniqueRatio < 0.1 && totalRows > 100) {
                        patterns.push({
                            column_name: colName,
                            pattern_type: "LOW_CARDINALITY",
                            description: `Only ${distinctCount} distinct values across ${totalRows} rows`,
                            metrics: { distinct_count: distinctCount, cardinality_ratio: uniqueRatio },
                            recommendations: [
                                "Consider using ENUM if values are fixed",
                                "May be a good candidate for indexing",
                            ],
                        });
                    }
                }
                // Duplicate patterns
                if (pattern_types.includes("duplicate") && totalRows > 10) {
                    const duplicateResult = await this.db.query(`SELECT ${colSql}, COUNT(*) as cnt
             FROM ${tableSql}
             WHERE ${colSql} IS NOT NULL
             GROUP BY ${colSql}
             HAVING cnt > 1
             ORDER BY cnt DESC
             LIMIT 5`);
                    if (duplicateResult.length > 0) {
                        const topDuplicates = duplicateResult.map((r) => ({
                            value: r[colName],
                            count: r.cnt,
                        }));
                        if (isKeyColumn) {
                            patterns.push({
                                column_name: colName,
                                pattern_type: "DUPLICATE_IN_UNIQUE",
                                description: "Duplicates found in supposedly unique column",
                                metrics: { top_duplicates: topDuplicates },
                                recommendations: ["Critical: Fix data integrity issue"],
                            });
                            qualityScore -= 20;
                        }
                        else {
                            patterns.push({
                                column_name: colName,
                                pattern_type: "DUPLICATES_FOUND",
                                description: `Found ${duplicateResult.length} values with duplicates`,
                                metrics: { duplicate_groups: duplicateResult.length, top_duplicates: topDuplicates },
                            });
                        }
                    }
                }
                // Format patterns (string columns)
                if (pattern_types.includes("format") &&
                    ["varchar", "char", "text"].includes(dataType)) {
                    // `[.]` is used for the literal dot: a backslash escape would be
                    // consumed by MySQL's string-literal parser, leaving a bare `.`
                    // that matches any character.
                    const emailResult = await this.db.query(`SELECT COUNT(*) as cnt
             FROM ${tableSql}
             WHERE ${colSql} REGEXP '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+[.][A-Za-z]{2,}$'`);
                    const emailCount = Number(emailResult[0]?.cnt) || 0;
                    if (emailCount > totalRows * 0.5) {
                        patterns.push({
                            column_name: colName,
                            pattern_type: "EMAIL_FORMAT",
                            description: "Contains email-formatted data",
                            metrics: { email_count: emailCount },
                        });
                    }
                    const phoneResult = await this.db.query(`SELECT COUNT(*) as cnt
             FROM ${tableSql}
             WHERE ${colSql} REGEXP '^[+]?[0-9]{10,15}$'`);
                    const phoneCount = Number(phoneResult[0]?.cnt) || 0;
                    if (phoneCount > totalRows * 0.3) {
                        patterns.push({
                            column_name: colName,
                            pattern_type: "PHONE_FORMAT",
                            description: "Contains phone number-formatted data",
                            metrics: { phone_count: phoneCount },
                        });
                    }
                }
                // Range patterns (numeric columns)
                if (pattern_types.includes("range") &&
                    ["int", "bigint", "decimal", "float", "double"].includes(dataType)) {
                    const rangeResult = await this.db.query(`SELECT MIN(${colSql}) as min_val, MAX(${colSql}) as max_val, AVG(${colSql}) as avg_val
             FROM ${tableSql}`);
                    const row = rangeResult[0];
                    // MIN/MAX are NULL for an empty or all-NULL column: nothing to report.
                    if (row && row.min_val != null && row.max_val != null) {
                        const { min_val, max_val, avg_val } = row;
                        // Aggregates may arrive as strings (e.g. DECIMAL); compare numerically.
                        const min = Number(min_val);
                        const max = Number(max_val);
                        const avg = Number(avg_val);
                        const range = max - min;
                        patterns.push({
                            column_name: colName,
                            pattern_type: "NUMERIC_RANGE",
                            description: `Values range from ${min_val} to ${max_val}`,
                            metrics: {
                                min: min_val,
                                max: max_val,
                                avg: avg_val,
                                range: range,
                            },
                        });
                        // Heuristic: an extreme more than 10x away from the mean.
                        if (range > 0 && (max > avg * 10 || min < avg / 10)) {
                            patterns.push({
                                column_name: colName,
                                pattern_type: "POTENTIAL_OUTLIERS",
                                description: "Large variance detected, may contain outliers",
                                recommendations: [
                                    "Review extreme values for data quality",
                                    "Consider outlier detection",
                                ],
                            });
                        }
                    }
                }
            }
            return {
                status: "success",
                data: {
                    table_name,
                    patterns,
                    summary: {
                        columns_analyzed: columns.length,
                        patterns_found: patterns.length,
                        data_quality_score: Math.max(0, qualityScore),
                    },
                },
            };
        }
        catch (error) {
            return {
                status: "error",
                error: error.message,
            };
        }
    }
    // ==================== Helper Methods ====================
    /**
     * Tokenize a string into searchable tokens: lower-cased, split on
     * whitespace, `-` and `_`; one-character tokens are dropped.
     *
     * @param {string} text
     * @returns {string[]}
     */
    tokenize(text) {
        return text
            .toLowerCase()
            .replace(/[-_]/g, " ")
            .split(/\s+/)
            .filter((t) => t.length > 1);
    }
    /**
     * Calculate a relevance score in [0, 1] between a search and a target name.
     *
     * Order of checks: exact match (1.0), target contains the term (0.9), term
     * contains the target (0.85), share of tokens found in the target
     * (0.3 to 0.8), then name similarity above 0.5 scaled by 0.6, else 0.
     *
     * @param {string[]} tokens - Tokens of the search term.
     * @param {string} target - Name being scored.
     * @param {string} originalTerm - Lower-cased full search term.
     * @returns {number}
     */
    calculateRelevanceScore(tokens, target, originalTerm) {
        const targetLower = target.toLowerCase();
        if (targetLower === originalTerm) {
            return 1.0;
        }
        if (targetLower.includes(originalTerm)) {
            return 0.9;
        }
        if (originalTerm.includes(targetLower)) {
            return 0.85;
        }
        if (tokens.length > 0) {
            const matchedTokens = tokens.filter((token) => targetLower.includes(token)).length;
            if (matchedTokens > 0) {
                return 0.3 + (matchedTokens / tokens.length) * 0.5;
            }
        }
        const similarity = this.calculateNameSimilarity(originalTerm, targetLower);
        return similarity > 0.5 ? similarity * 0.6 : 0;
    }
    /**
     * Get a human-readable match reason based on the first token that equals
     * or occurs in the target.
     *
     * @param {string[]} tokens
     * @param {string} target
     * @returns {string}
     */
    getMatchReason(tokens, target) {
        const targetLower = target.toLowerCase();
        for (const token of tokens) {
            if (targetLower === token) {
                return `Exact match: "${token}"`;
            }
            if (targetLower.includes(token)) {
                return `Contains: "${token}"`;
            }
        }
        return "Similar name pattern";
    }
    /**
     * Calculate name similarity in [0, 1].
     *
     * This is not an edit distance: it is 60% Jaccard overlap of the two
     * strings' character sets plus 40% shared-prefix length relative to the
     * longer string.
     *
     * @param {string} a
     * @param {string} b
     * @returns {number}
     */
    calculateNameSimilarity(a, b) {
        if (a === b) {
            return 1;
        }
        if (a.length === 0 || b.length === 0) {
            return 0;
        }
        const setA = new Set(a.split(""));
        const setB = new Set(b.split(""));
        const intersection = [...setA].filter((x) => setB.has(x)).length;
        const union = new Set([...setA, ...setB]).size;
        const charSimilarity = union > 0 ? intersection / union : 0;
        const minLen = Math.min(a.length, b.length);
        let prefixLength = 0;
        while (prefixLength < minLen && a[prefixLength] === b[prefixLength]) {
            prefixLength++;
        }
        const prefixBonus = prefixLength / Math.max(a.length, b.length);
        return charSimilarity * 0.6 + prefixBonus * 0.4;
    }
    /**
     * Normalize a column name for comparison: lower-case, drop a leading
     * `fk_`/`pk_`/`idx_`, drop a trailing `_id`, remove `_` and `-`.
     *
     * @param {string} name
     * @returns {string}
     */
    normalizeColumnName(name) {
        return name
            .toLowerCase()
            .replace(/^(fk_|pk_|idx_)/, "")
            .replace(/_id$/, "")
            .replace(/[_-]/g, "");
    }
    /**
     * Classify how two column names resemble each other.
     *
     * @param {string} name1
     * @param {string} name2
     * @returns {"EXACT_MATCH"|"NORMALIZED_MATCH"|"SUBSTRING_MATCH"|"SIMILAR_PATTERN"}
     */
    getSimilarityType(name1, name2) {
        if (name1.toLowerCase() === name2.toLowerCase()) {
            return "EXACT_MATCH";
        }
        const n1 = this.normalizeColumnName(name1);
        const n2 = this.normalizeColumnName(name2);
        if (n1 === n2) {
            return "NORMALIZED_MATCH";
        }
        if (n1.includes(n2) || n2.includes(n1)) {
            return "SUBSTRING_MATCH";
        }
        return "SIMILAR_PATTERN";
    }
    /**
     * Calculate data overlap between two columns: the percentage of distinct
     * non-NULL values of `table1.col1` that also occur in `table2.col2`.
     *
     * @param {string} database
     * @param {string} table1
     * @param {string} col1
     * @param {string} table2
     * @param {string} col2
     * @returns {Promise<number>} Rounded percentage (0 when nothing overlaps).
     */
    async calculateDataOverlap(database, table1, col1, table2, col2) {
        const databaseSql = this.quoteIdentifier(database);
        const table1Sql = `${databaseSql}.${this.quoteIdentifier(table1)}`;
        const table2Sql = `${databaseSql}.${this.quoteIdentifier(table2)}`;
        const col1Sql = this.quoteIdentifier(col1);
        const col2Sql = this.quoteIdentifier(col2);
        const overlapQuery = `
      SELECT COUNT(DISTINCT t1.${col1Sql}) as overlap_count
      FROM ${table1Sql} t1
      INNER JOIN ${table2Sql} t2 ON t1.${col1Sql} = t2.${col2Sql}
      WHERE t1.${col1Sql} IS NOT NULL
    `;
        const totalQuery = `
      SELECT COUNT(DISTINCT ${col1Sql}) as total
      FROM ${table1Sql}
      WHERE ${col1Sql} IS NOT NULL
    `;
        const [overlapResult, totalResult] = await Promise.all([
            this.db.query(overlapQuery),
            this.db.query(totalQuery),
        ]);
        const overlap = Number(overlapResult[0]?.overlap_count) || 0;
        // Falls back to 1 so an empty column divides safely and yields 0%.
        const total = Number(totalResult[0]?.total) || 1;
        return Math.round((overlap / total) * 100);
    }
    /**
     * Discover implicit relationships based on naming conventions: a column
     * named `<table>_id` is reported as pointing at table `<table>` when such a
     * table exists and the prefix is relevant to the search tokens.
     *
     * @param {Array<{TABLE_NAME: string}>} tables
     * @param {Array<{TABLE_NAME: string, COLUMN_NAME: string, COLUMN_KEY: string}>} columns
     * @param {string[]} searchTokens
     * @param {number} threshold - Minimum relevance score for the prefix.
     * @returns {object[]} Relationship records with `relationship_type: "IMPLICIT_FK"`.
     */
    discoverImplicitRelationships(tables, columns, searchTokens, threshold) {
        const relationships = [];
        // Lower-cased name -> real name, so results carry the table's actual casing.
        const tableNames = new Map();
        for (const t of tables) {
            const key = t.TABLE_NAME.toLowerCase();
            if (!tableNames.has(key)) {
                tableNames.set(key, t.TABLE_NAME);
            }
        }
        const primaryKeys = new Map();
        for (const col of columns) {
            if (col.COLUMN_KEY === "PRI") {
                primaryKeys.set(col.TABLE_NAME.toLowerCase(), col.COLUMN_NAME);
            }
        }
        const joinedTokens = searchTokens.join(" ");
        for (const col of columns) {
            const colLower = col.COLUMN_NAME.toLowerCase();
            if (!colLower.endsWith("_id")) {
                continue;
            }
            const potentialTable = colLower.slice(0, -3);
            const score = this.calculateRelevanceScore(searchTokens, potentialTable, joinedTokens);
            if (score < threshold && searchTokens.length !== 0) {
                continue;
            }
            const actualTable = tableNames.get(potentialTable);
            if (actualTable === undefined) {
                continue;
            }
            relationships.push({
                from_table: col.TABLE_NAME,
                from_column: col.COLUMN_NAME,
                to_table: actualTable,
                // Assume the conventional "id" when no primary key is recorded.
                to_column: primaryKeys.get(potentialTable) || "id",
                relationship_type: "IMPLICIT_FK",
                confidence: 0.8,
            });
        }
        return relationships;
    }
}
750
+ exports.SmartDiscoveryTools = SmartDiscoveryTools;