postgres-scout-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +190 -0
- package/README.md +234 -0
- package/bin/cli.js +67 -0
- package/dist/config/environment.js +52 -0
- package/dist/index.js +59 -0
- package/dist/server/setup.js +122 -0
- package/dist/tools/data-quality.js +442 -0
- package/dist/tools/database.js +148 -0
- package/dist/tools/export.js +223 -0
- package/dist/tools/index.js +52 -0
- package/dist/tools/live-monitoring.js +369 -0
- package/dist/tools/maintenance.js +617 -0
- package/dist/tools/monitoring.js +286 -0
- package/dist/tools/mutations.js +410 -0
- package/dist/tools/optimization.js +1094 -0
- package/dist/tools/query.js +138 -0
- package/dist/tools/relationships.js +261 -0
- package/dist/tools/schema.js +253 -0
- package/dist/tools/temporal.js +313 -0
- package/dist/types.js +2 -0
- package/dist/utils/database.js +123 -0
- package/dist/utils/logger.js +73 -0
- package/dist/utils/query-builder.js +180 -0
- package/dist/utils/rate-limiter.js +39 -0
- package/dist/utils/result-formatter.js +42 -0
- package/dist/utils/sanitize.js +525 -0
- package/dist/utils/zod-to-json-schema.js +85 -0
- package/package.json +58 -0
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
2
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
3
|
+
import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
|
|
4
|
+
import { RateLimiter } from '../utils/rate-limiter.js';
|
|
5
|
+
import { tools, executeTool } from '../tools/index.js';
|
|
6
|
+
import { sanitizeErrorMessage } from '../utils/sanitize.js';
|
|
7
|
+
import { zodToJsonSchema } from '../utils/zod-to-json-schema.js';
|
|
8
|
+
/**
 * Build and wire up the MCP server: advertises the tool catalog and routes
 * tool invocations through the rate limiter to the shared tool dispatcher.
 *
 * @param {object} connection - database connection handle passed to tools
 * @param {object} logger - project logger (debug/info/error)
 * @param {object} config - runtime config (mode, rate-limit settings)
 * @returns {Server} configured but not-yet-connected MCP server
 */
export function createMCPServer(connection, logger, config) {
    // NOTE(review): advertised version is '0.1.0' while the package is
    // published as 1.0.0 — confirm whether this mismatch is intentional.
    const mcpServer = new Server(
        { name: 'postgres-scout-mcp', version: '0.1.0' },
        { capabilities: { tools: {} } }
    );
    const limiter = new RateLimiter(config.rateLimitMaxRequests, config.rateLimitWindowMs, config.enableRateLimit);

    // tools/list: expose every registered tool with a mode-aware description
    // and a JSON-Schema rendering of its Zod input schema.
    const handleListTools = async () => {
        logger.debug('mcp', 'Listing available tools');
        const catalog = [];
        for (const [toolName, toolDef] of Object.entries(tools)) {
            catalog.push({
                name: toolName,
                description: getToolDescription(toolName, config.mode),
                inputSchema: zodToJsonSchema(toolDef.schema)
            });
        }
        return { tools: catalog };
    };

    // tools/call: rate-limit, dispatch, and serialize the result; failures
    // are sanitized before being returned to the client.
    const handleCallTool = async (request) => {
        const { name, arguments: args } = request.params;
        logger.info('mcp', `Tool called: ${name}`, { args });
        try {
            limiter.checkLimit();
            const result = await executeTool(name, connection, logger, args || {});
            return {
                content: [
                    {
                        type: 'text',
                        text: JSON.stringify(result, null, 2)
                    }
                ]
            };
        }
        catch (error) {
            const rawMessage = error instanceof Error ? error.message : String(error);
            logger.error('mcp', `Error executing tool ${name}`, { error: rawMessage });
            return {
                content: [
                    {
                        type: 'text',
                        // Sanitized message only — raw text may leak connection details.
                        text: JSON.stringify({
                            error: sanitizeErrorMessage(rawMessage),
                            tool: name
                        }, null, 2)
                    }
                ],
                isError: true
            };
        }
    };

    mcpServer.setRequestHandler(ListToolsRequestSchema, handleListTools);
    mcpServer.setRequestHandler(CallToolRequestSchema, handleCallTool);
    return mcpServer;
}
|
|
61
|
+
/**
 * Connect the given MCP server to a stdio transport and start serving.
 * Resolves once the transport handshake completes.
 *
 * @param {Server} server - server produced by createMCPServer
 * @param {object} logger - project logger
 */
export async function startServer(server, logger) {
    const stdioTransport = new StdioServerTransport();
    logger.info('server', 'Starting MCP server');
    await server.connect(stdioTransport);
    logger.info('server', 'MCP server running on stdio');
}
|
|
67
|
+
/**
 * Return the human-readable description for a tool name, adjusting wording
 * for mode-sensitive tools (executeQuery/explainQuery/optimizeQuery).
 *
 * Fix: the previous `descriptions[name] || fallback` lookup fell through to
 * the prototype chain, so names like 'toString' returned an inherited
 * function instead of a string. An own-property check restores the string
 * contract; unknown names get the generic fallback.
 *
 * @param {string} name - tool identifier
 * @param {string} mode - server mode ('read-only' or 'read-write')
 * @returns {string} description for the tool
 */
function getToolDescription(name, mode) {
    const descriptions = {
        // Database operations
        listDatabases: 'List all databases the user has access to',
        getDatabaseStats: 'Get comprehensive database statistics including size, cache hit ratio, and connection info',
        // Schema operations
        listSchemas: 'List all schemas in the current database',
        listTables: 'List all tables in a schema with size and statistics',
        describeTable: 'Get comprehensive table information including columns, constraints, and indexes',
        // Query operations (wording depends on mode)
        executeQuery: `Execute SELECT queries${mode === 'read-write' ? ' or write operations' : ' (read-only)'}`,
        explainQuery: `Analyze query performance using EXPLAIN${mode === 'read-only' ? ' (ANALYZE disabled)' : ' ANALYZE'}`,
        // Data quality tools
        findDuplicates: 'Find duplicate rows based on column combinations',
        findMissingValues: 'Find NULL values or missing data in columns',
        findOrphans: 'Find orphaned records with invalid foreign key references',
        checkConstraintViolations: 'Check for rows that would violate a constraint before adding it',
        analyzeTypeConsistency: 'Analyze if text columns contain consistent data types',
        // Temporal tools
        findRecent: 'Find rows within a time window',
        analyzeTimeSeries: 'Advanced time-series analysis with window functions and anomaly detection',
        detectSeasonality: 'Detect seasonal patterns in time-series data',
        // Monitoring tools
        getCurrentActivity: 'Get current active queries and connections',
        analyzeLocks: 'Analyze current locks and blocking queries',
        getIndexUsage: 'Analyze index usage and identify unused indexes',
        // Relationship tools
        exploreRelationships: 'Follow foreign key relationships to explore related records',
        analyzeForeignKeys: 'Analyze foreign key health and performance',
        // Export tools
        exportTable: 'Export table data to various formats (CSV, JSON, SQL)',
        generateInsertStatements: 'Generate INSERT statements for data migration',
        // Maintenance & health tools
        analyzeTableBloat: 'Detect table and index bloat for VACUUM planning',
        suggestVacuum: 'Analyze and recommend VACUUM operations based on dead tuples and bloat',
        getHealthScore: 'Calculate overall database health score with component breakdown',
        getSlowQueries: 'Analyze slow queries from pg_stat_statements extension',
        // Optimization tools
        suggestIndexes: 'Analyze query patterns and table scans to recommend missing indexes',
        suggestPartitioning: 'Analyze large tables and recommend partitioning strategies',
        detectAnomalies: 'Detect anomalies in query performance, connections, and data patterns',
        optimizeQuery: `Analyze a specific query and provide optimization recommendations${mode === 'read-only' ? ' (estimated plan only — ANALYZE unavailable)' : ''}`,
        // Mutation tools (safe write operations)
        previewUpdate: 'Preview which rows would be affected by an UPDATE without modifying data',
        previewDelete: 'Preview which rows would be deleted without actually deleting them',
        safeUpdate: 'Execute UPDATE with safety guards: dry-run mode, maxRows limit, empty WHERE protection',
        safeDelete: 'Execute DELETE with safety guards: dry-run mode, maxRows limit, empty WHERE protection',
        safeInsert: 'Execute INSERT with safety guards: dry-run mode, maxRows limit, parameterized values, row batching, ON CONFLICT support',
        // Live monitoring tools
        getLiveMetrics: 'Collect real-time database metrics over a time period with configurable intervals',
        getHottestTables: 'Identify tables with highest activity during a sample period',
        getTableMetrics: 'Get comprehensive metrics for a specific table including I/O, scans, and maintenance stats'
    };
    // Own-property check keeps inherited Object.prototype members (e.g.
    // 'toString') from leaking out as descriptions.
    return Object.hasOwn(descriptions, name) ? descriptions[name] : `Execute ${name} operation`;
}
|
|
122
|
+
//# sourceMappingURL=setup.js.map
|
|
@@ -0,0 +1,442 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { executeInternalQuery } from '../utils/database.js';
|
|
3
|
+
import { escapeIdentifier, sanitizeIdentifier, validateCondition } from '../utils/sanitize.js';
|
|
4
|
+
// Zod input schemas for the data-quality tools. These are serialized to
// JSON Schema for the MCP tools/list response, so property order here is
// the order clients see.

// findDuplicates: group rows by `columns`, report groups occurring at
// least `minCount` times; at most `limit` groups are returned.
const FindDuplicatesSchema = z.object({
    table: z.string(),
    columns: z.array(z.string()),
    schema: z.string().optional().default('public'),
    limit: z.number().optional().default(100),
    minCount: z.number().optional().default(2),
    includeRows: z.boolean().optional().default(true)
});
// findMissingValues: per-column NULL counts; `includeRows` also fetches
// up to `limit` sample rows per column with NULLs.
const FindMissingValuesSchema = z.object({
    table: z.string(),
    columns: z.array(z.string()),
    schema: z.string().optional().default('public'),
    includeRows: z.boolean().optional().default(true),
    limit: z.number().optional().default(100)
});
// findOrphans: child table/foreignKey vs. reference table/column; schemas
// default to 'public' on both sides.
const FindOrphansSchema = z.object({
    table: z.string(),
    foreignKey: z.string(),
    referenceTable: z.string(),
    referenceColumn: z.string(),
    schema: z.string().optional().default('public'),
    referenceSchema: z.string().optional().default('public'),
    limit: z.number().optional().default(100)
});
// checkConstraintViolations: `condition` is raw SQL boolean text; it is
// screened by validateCondition() before being interpolated into queries.
const CheckConstraintViolationsSchema = z.object({
    table: z.string(),
    condition: z.string().describe('SQL boolean expression to check, e.g., "email IS NOT NULL"'),
    constraintName: z.string().optional().describe('Name for the constraint'),
    schema: z.string().optional().default('public')
});
// analyzeTypeConsistency: samples up to `sampleSize` rows of one column and
// pattern-matches values; `suggestConversion` adds ALTER TABLE suggestions.
const AnalyzeTypeConsistencySchema = z.object({
    table: z.string(),
    column: z.string(),
    schema: z.string().optional().default('public'),
    suggestConversion: z.boolean().optional().default(true),
    sampleSize: z.number().optional().default(10000)
});
|
|
41
|
+
/**
 * Find groups of duplicate rows, grouped by the given columns.
 *
 * Fixes vs. previous version:
 * - removed `groupByList`, a local that was computed but never used;
 * - per-group sample-row lookups now use `IS NOT DISTINCT FROM` instead of
 *   `=`, so groups whose grouping value is NULL still match their rows
 *   (in SQL, `NULL = NULL` is not true, which made those fetches return
 *   zero rows).
 *
 * @param {object} connection - database connection handle
 * @param {object} logger - project logger
 * @param {object} args - validated FindDuplicatesSchema input
 * @returns {Promise<object>} duplicate groups, statistics, recommendations
 */
export async function findDuplicates(connection, logger, args) {
    const { table, columns, schema, limit, minCount, includeRows } = args;
    logger.info('findDuplicates', 'Finding duplicate rows', { table, columns });
    const sanitizedSchema = sanitizeIdentifier(schema);
    const sanitizedTable = sanitizeIdentifier(table);
    const sanitizedColumns = columns.map(sanitizeIdentifier);
    const columnList = sanitizedColumns.map(escapeIdentifier).join(', ');
    const countQuery = `
      SELECT COUNT(*) as total_rows
      FROM ${escapeIdentifier(sanitizedSchema)}.${escapeIdentifier(sanitizedTable)}
    `;
    const duplicatesQuery = `
      SELECT
        ${columnList},
        COUNT(*) as count
      FROM ${escapeIdentifier(sanitizedSchema)}.${escapeIdentifier(sanitizedTable)}
      GROUP BY ${columnList}
      HAVING COUNT(*) >= $1
      ORDER BY COUNT(*) DESC
      LIMIT $2
    `;
    const [totalResult, duplicatesResult] = await Promise.all([
        executeInternalQuery(connection, logger, { query: countQuery }),
        executeInternalQuery(connection, logger, {
            query: duplicatesQuery,
            params: [minCount, limit]
        })
    ]);
    const totalRows = parseInt(totalResult.rows[0]?.total_rows || '0', 10);
    const duplicateGroups = duplicatesResult.rows;
    let duplicateGroupsWithRows = duplicateGroups;
    if (includeRows && duplicateGroups.length > 0) {
        // Process in batches of 5 to avoid exhausting the connection pool
        const BATCH_SIZE = 5;
        const results = [];
        for (let i = 0; i < duplicateGroups.length; i += BATCH_SIZE) {
            const batch = duplicateGroups.slice(i, i + BATCH_SIZE);
            const batchResults = await Promise.all(batch.map(async (group) => {
                // IS NOT DISTINCT FROM treats NULL as equal to NULL, so
                // NULL-valued duplicate groups still find their member rows.
                const whereConditions = sanitizedColumns.map((col, idx) => {
                    return `${escapeIdentifier(col)} IS NOT DISTINCT FROM $${idx + 1}`;
                }).join(' AND ');
                const rowsQuery = `
      SELECT *
      FROM ${escapeIdentifier(sanitizedSchema)}.${escapeIdentifier(sanitizedTable)}
      WHERE ${whereConditions}
      LIMIT 10
    `;
                const params = sanitizedColumns.map(col => group[col]);
                const rowsResult = await executeInternalQuery(connection, logger, {
                    query: rowsQuery,
                    params
                });
                return {
                    ...group,
                    rows: rowsResult.rows
                };
            }));
            results.push(...batchResults);
        }
        duplicateGroupsWithRows = results;
    }
    const totalDuplicateRows = duplicateGroups.reduce((sum, group) => sum + parseInt(group.count, 10), 0);
    const recommendations = [];
    if (duplicateGroups.length > 0) {
        recommendations.push(`Found ${totalDuplicateRows} duplicate rows across ${duplicateGroups.length} groups`);
        recommendations.push(`Consider adding UNIQUE constraint: ALTER TABLE ${schema}.${table} ADD CONSTRAINT ${table}_${columns.join('_')}_unique UNIQUE (${columns.join(', ')})`);
        recommendations.push('Review and delete duplicates, keeping the most recent or earliest record');
    }
    else {
        recommendations.push('✓ No duplicates found');
    }
    return {
        table,
        schema,
        columns,
        totalDuplicateGroups: duplicateGroups.length,
        affectedRows: totalDuplicateRows,
        statistics: {
            totalRows,
            uniqueRows: totalRows - totalDuplicateRows,
            duplicateRows: totalDuplicateRows,
            duplicatePercentage: totalRows > 0 ? ((totalDuplicateRows / totalRows) * 100).toFixed(2) : '0'
        },
        duplicateGroups: duplicateGroupsWithRows.slice(0, limit),
        recommendations
    };
}
|
|
129
|
+
/**
 * Report NULL-value counts and severity for a set of columns.
 *
 * Fix: the previous version issued one COUNT query per column plus a
 * separate total-count query (N+1 round-trips). All counts are now
 * collected in a single SELECT using per-column FILTER aggregates; the
 * optional sample-row fetches are unchanged.
 *
 * @param {object} connection - database connection handle
 * @param {object} logger - project logger
 * @param {object} args - validated FindMissingValuesSchema input
 * @returns {Promise<object>} per-column analysis plus recommendations
 */
export async function findMissingValues(connection, logger, args) {
    const { table, columns, schema, includeRows, limit } = args;
    logger.info('findMissingValues', 'Finding NULL values', { table, columns });
    const sanitizedSchema = sanitizeIdentifier(schema);
    const sanitizedTable = sanitizeIdentifier(table);
    const sanitizedColumns = columns.map(sanitizeIdentifier);
    const qualifiedTable = `${escapeIdentifier(sanitizedSchema)}.${escapeIdentifier(sanitizedTable)}`;
    // Single round-trip: total row count plus one NULL count per column.
    // Built as an array so an empty column list still yields valid SQL.
    const selectList = [
        'COUNT(*) as total_rows',
        ...sanitizedColumns.map((col, idx) => `COUNT(*) FILTER (WHERE ${escapeIdentifier(col)} IS NULL) as null_count_${idx}`)
    ].join(',\n        ');
    const statsQuery = `
      SELECT
        ${selectList}
      FROM ${qualifiedTable}
    `;
    const statsResult = await executeInternalQuery(connection, logger, { query: statsQuery });
    const stats = statsResult.rows[0] || {};
    const totalRows = parseInt(stats.total_rows || '0', 10);
    const analysis = {};
    for (let idx = 0; idx < sanitizedColumns.length; idx++) {
        const column = sanitizedColumns[idx];
        const nullCount = parseInt(stats[`null_count_${idx}`] || '0', 10);
        const nullPercentage = totalRows > 0 ? ((nullCount / totalRows) * 100).toFixed(2) : '0';
        let recommendation = '';
        let sampleRows = [];
        if (nullCount === 0) {
            recommendation = '✓ No NULL values';
        }
        else {
            const percentage = parseFloat(nullPercentage);
            if (percentage < 1) {
                recommendation = `${nullCount} rows with NULL ${column} - minor issue`;
            }
            else if (percentage < 5) {
                recommendation = `⚠ ${percentage}% of rows missing ${column} - investigate`;
            }
            else {
                recommendation = `⚠ ${percentage}% of rows missing ${column} - set default or make required`;
            }
            // nullCount > 0 is guaranteed on this branch.
            if (includeRows) {
                const sampleQuery = `
      SELECT *
      FROM ${qualifiedTable}
      WHERE ${escapeIdentifier(column)} IS NULL
      LIMIT $1
    `;
                const sampleResult = await executeInternalQuery(connection, logger, {
                    query: sampleQuery,
                    params: [limit]
                });
                sampleRows = sampleResult.rows;
            }
        }
        analysis[column] = {
            nullCount,
            nullPercentage: parseFloat(nullPercentage),
            recommendation,
            ...(sampleRows.length > 0 && { sampleRows })
        };
    }
    const recommendations = [];
    for (const [column, data] of Object.entries(analysis)) {
        if (data.nullCount === 0) {
            recommendations.push(`Consider adding NOT NULL constraint to ${column}`);
        }
        else if (data.nullPercentage > 5) {
            recommendations.push(`High NULL rate in ${column} (${data.nullPercentage}%) - investigate data quality`);
        }
    }
    return {
        table,
        schema,
        totalRows,
        analysis,
        recommendations
    };
}
|
|
205
|
+
/**
 * Locate child rows whose foreign-key value has no matching parent row
 * (an anti-join), and suggest cleanup plus the FK constraint to add.
 *
 * @param {object} connection - database connection handle
 * @param {object} logger - project logger
 * @param {object} args - validated FindOrphansSchema input
 * @returns {Promise<object>} orphan counts, sample rows, recommendations
 */
export async function findOrphans(connection, logger, args) {
    const { table, foreignKey, referenceTable, referenceColumn, schema, referenceSchema, limit } = args;
    logger.info('findOrphans', 'Finding orphaned records', { table, foreignKey, referenceTable });
    const safeSchema = sanitizeIdentifier(schema);
    const safeTable = sanitizeIdentifier(table);
    const safeFk = sanitizeIdentifier(foreignKey);
    const safeRefSchema = sanitizeIdentifier(referenceSchema);
    const safeRefTable = sanitizeIdentifier(referenceTable);
    const safeRefColumn = sanitizeIdentifier(referenceColumn);
    const child = `${escapeIdentifier(safeSchema)}.${escapeIdentifier(safeTable)}`;
    const parent = `${escapeIdentifier(safeRefSchema)}.${escapeIdentifier(safeRefTable)}`;
    const fkCol = escapeIdentifier(safeFk);
    const refCol = escapeIdentifier(safeRefColumn);
    // Anti-join fragment shared by the sample and count queries: child rows
    // with a non-NULL FK that found no parent.
    const antiJoin = `
      FROM ${child} t
      LEFT JOIN ${parent} r
        ON t.${fkCol} = r.${refCol}
      WHERE t.${fkCol} IS NOT NULL
        AND r.${refCol} IS NULL
    `;
    const [orphansResult, countResult, totalResult] = await Promise.all([
        executeInternalQuery(connection, logger, {
            query: `SELECT t.* ${antiJoin} LIMIT $1`,
            params: [limit]
        }),
        executeInternalQuery(connection, logger, {
            query: `SELECT COUNT(*) as orphan_count ${antiJoin}`
        }),
        executeInternalQuery(connection, logger, {
            query: `SELECT COUNT(*) as total_count FROM ${child}`
        })
    ]);
    const orphanCount = parseInt(countResult.rows[0]?.orphan_count || '0', 10);
    const totalCount = parseInt(totalResult.rows[0]?.total_count || '0', 10);
    const orphanPercentage = totalCount > 0 ? ((orphanCount / totalCount) * 100).toFixed(2) : '0';
    // Same DDL suggestion appears in both branches below.
    const fkDdl = `ALTER TABLE ${schema}.${table} ADD CONSTRAINT ${table}_${foreignKey}_fkey FOREIGN KEY (${foreignKey}) REFERENCES ${referenceSchema}.${referenceTable}(${referenceColumn})`;
    const recommendations = orphanCount > 0
        ? [
            `Found ${orphanCount} orphaned records (${orphanPercentage}% of total)`,
            `Delete orphaned records: DELETE FROM ${schema}.${table} WHERE ${foreignKey} NOT IN (SELECT ${referenceColumn} FROM ${referenceSchema}.${referenceTable})`,
            `Or set to NULL: UPDATE ${schema}.${table} SET ${foreignKey} = NULL WHERE ${foreignKey} NOT IN (SELECT ${referenceColumn} FROM ${referenceSchema}.${referenceTable})`,
            `After cleanup, add FK constraint: ${fkDdl}`
        ]
        : [
            '✓ No orphaned records found',
            `Safe to add FK constraint: ${fkDdl}`
        ];
    return {
        table,
        schema,
        foreignKey,
        referenceTable,
        referenceSchema,
        referenceColumn,
        orphanCount,
        totalCount,
        orphanPercentage: parseFloat(orphanPercentage),
        orphanedRows: orphansResult.rows,
        recommendations
    };
}
|
|
268
|
+
/**
 * Dry-run a prospective CHECK constraint: count and sample the rows that
 * would violate the given boolean condition before it is added.
 *
 * @param {object} connection - database connection handle
 * @param {object} logger - project logger
 * @param {object} args - validated CheckConstraintViolationsSchema input
 * @returns {Promise<object>} violation count, sample rows, recommendations
 */
export async function checkConstraintViolations(connection, logger, args) {
    const { table, condition, constraintName, schema } = args;
    logger.info('checkConstraintViolations', 'Checking constraint violations', { table, condition });
    // Screen the raw condition text before interpolating it into SQL.
    validateCondition(condition);
    const target = `${escapeIdentifier(sanitizeIdentifier(schema))}.${escapeIdentifier(sanitizeIdentifier(table))}`;
    const name = constraintName || `${table}_check`;
    const [violationsResult, countResult] = await Promise.all([
        executeInternalQuery(connection, logger, {
            query: `
      SELECT *
      FROM ${target}
      WHERE NOT (${condition})
      LIMIT 100
    `
        }),
        executeInternalQuery(connection, logger, {
            query: `
      SELECT COUNT(*) as violation_count
      FROM ${target}
      WHERE NOT (${condition})
    `
        })
    ]);
    const violationCount = parseInt(countResult.rows[0]?.violation_count || '0', 10);
    const recommendations = [];
    if (violationCount === 0) {
        recommendations.push('✓ No violations found - safe to add constraint');
        recommendations.push(`ALTER TABLE ${schema}.${table} ADD CONSTRAINT ${name} CHECK (${condition})`);
    }
    else {
        recommendations.push(`⚠ ${violationCount} rows would violate CHECK constraint`);
        recommendations.push('Fix violations before adding constraint');
        recommendations.push(`Example: UPDATE ${schema}.${table} SET ... WHERE NOT (${condition})`);
    }
    return {
        table,
        schema,
        constraint: name,
        condition,
        violationCount,
        violations: violationsResult.rows.slice(0, 20),
        recommendations
    };
}
|
|
311
|
+
/**
 * Sample a text column and pattern-match its values to judge whether it
 * holds a consistent implicit type (numeric / date / boolean), optionally
 * suggesting an ALTER TABLE conversion.
 *
 * Fix: when the sample is empty (empty table or sampleSize 0), the
 * previous version divided by zero, producing NaN percentages and a
 * misleading "mixed types" verdict; that case is now guarded.
 *
 * @param {object} connection - database connection handle
 * @param {object} logger - project logger
 * @param {object} args - validated AnalyzeTypeConsistencySchema input
 * @returns {Promise<object>} pattern breakdown, recommendations, migration
 */
export async function analyzeTypeConsistency(connection, logger, args) {
    const { table, column, schema, suggestConversion, sampleSize } = args;
    logger.info('analyzeTypeConsistency', 'Analyzing type consistency', { table, column });
    const sanitizedSchema = sanitizeIdentifier(schema);
    const sanitizedTable = sanitizeIdentifier(table);
    const sanitizedColumn = sanitizeIdentifier(column);
    // Declared type from the catalog, for context in the result.
    const typeQuery = `
      SELECT
        data_type as current_type
      FROM information_schema.columns
      WHERE table_schema = $1
        AND table_name = $2
        AND column_name = $3
    `;
    const typeResult = await executeInternalQuery(connection, logger, {
        query: typeQuery,
        params: [sanitizedSchema, sanitizedTable, sanitizedColumn]
    });
    const currentType = typeResult.rows[0]?.current_type || 'unknown';
    // Regex-classify up to sampleSize values in one pass.
    const analysisQuery = `
      SELECT
        COUNT(*) as total_rows,
        COUNT(*) FILTER (WHERE ${escapeIdentifier(sanitizedColumn)} IS NULL) as null_count,
        COUNT(*) FILTER (WHERE ${escapeIdentifier(sanitizedColumn)} ~ '^[0-9]+$') as integer_count,
        COUNT(*) FILTER (WHERE ${escapeIdentifier(sanitizedColumn)} ~ '^[0-9]+\\.[0-9]+$') as decimal_count,
        COUNT(*) FILTER (WHERE ${escapeIdentifier(sanitizedColumn)} ~ '^[0-9]{4}-[0-9]{2}-[0-9]{2}') as date_count,
        COUNT(*) FILTER (WHERE ${escapeIdentifier(sanitizedColumn)} ~ '^(true|false|t|f|yes|no|y|n|1|0)$') as boolean_count
      FROM (
        SELECT ${escapeIdentifier(sanitizedColumn)}
        FROM ${escapeIdentifier(sanitizedSchema)}.${escapeIdentifier(sanitizedTable)}
        LIMIT $1
      ) sample
    `;
    const analysisResult = await executeInternalQuery(connection, logger, {
        query: analysisQuery,
        params: [sampleSize]
    });
    const stats = analysisResult.rows[0];
    const totalRows = parseInt(stats.total_rows, 10);
    const nullCount = parseInt(stats.null_count, 10);
    const numericCount = parseInt(stats.integer_count, 10) + parseInt(stats.decimal_count, 10);
    const dateCount = parseInt(stats.date_count, 10);
    const booleanCount = parseInt(stats.boolean_count, 10);
    const invalidCount = totalRows - nullCount - numericCount - dateCount - booleanCount;
    // Guarded percentage: an empty sample yields '0.0' rather than 'NaN'.
    const pct = (count) => (totalRows > 0 ? ((count / totalRows) * 100).toFixed(1) : '0.0');
    const patterns = {
        numeric: {
            count: numericCount,
            percentage: pct(numericCount)
        },
        date: {
            count: dateCount,
            percentage: pct(dateCount)
        },
        boolean: {
            count: booleanCount,
            percentage: pct(booleanCount)
        },
        null: {
            count: nullCount,
            percentage: pct(nullCount)
        },
        invalid: {
            count: invalidCount,
            percentage: pct(invalidCount)
        }
    };
    const recommendations = [];
    let suggestedMigration = null;
    if (totalRows === 0) {
        // Nothing sampled: avoid the NaN ratios and the false "mixed types"
        // verdict the unguarded comparisons would produce.
        recommendations.push('⚠ No rows sampled - table is empty or sample size is 0; type analysis is inconclusive');
    }
    else if (numericCount / totalRows > 0.95) {
        recommendations.push(`✓ ${patterns.numeric.percentage}% of values are numeric`);
        if (invalidCount > 0) {
            recommendations.push(`${invalidCount} rows contain non-numeric values - clean up first`);
        }
        if (suggestConversion) {
            const targetType = parseInt(stats.decimal_count, 10) > 0 ? 'numeric(10,2)' : 'integer';
            recommendations.push(`Consider converting to ${targetType}: ALTER TABLE ${schema}.${table} ALTER COLUMN ${column} TYPE ${targetType} USING ${column}::${targetType}`);
            suggestedMigration = {
                targetType,
                needsCleanup: invalidCount > 0,
                cleanupQuery: `UPDATE ${schema}.${table} SET ${column} = NULL WHERE ${column} !~ '^[0-9.]+$'`,
                conversionQuery: `ALTER TABLE ${schema}.${table} ALTER COLUMN ${column} TYPE ${targetType} USING ${column}::${targetType}`
            };
        }
    }
    else if (dateCount / totalRows > 0.95) {
        recommendations.push(`✓ ${patterns.date.percentage}% of values are date-like`);
        recommendations.push(`Consider converting to DATE or TIMESTAMP`);
    }
    else if (booleanCount / totalRows > 0.95) {
        recommendations.push(`✓ ${patterns.boolean.percentage}% of values are boolean-like`);
        recommendations.push(`Consider converting to BOOLEAN`);
    }
    else {
        recommendations.push('⚠ Data has mixed types - not suitable for type conversion');
    }
    return {
        table,
        schema,
        column,
        currentType,
        analysis: {
            totalRows,
            sampleSize,
            patterns
        },
        recommendations,
        ...(suggestedMigration && { suggestedMigration })
    };
}
|
|
420
|
+
// Registry of data-quality tools: maps each tool name to its Zod input
// schema and handler. Consumed by the central tool index (presumably
// merged into the `tools` map dispatched by the MCP server — verify
// against tools/index.js).
export const dataQualityTools = {
    findDuplicates: {
        schema: FindDuplicatesSchema,
        handler: findDuplicates
    },
    findMissingValues: {
        schema: FindMissingValuesSchema,
        handler: findMissingValues
    },
    findOrphans: {
        schema: FindOrphansSchema,
        handler: findOrphans
    },
    checkConstraintViolations: {
        schema: CheckConstraintViolationsSchema,
        handler: checkConstraintViolations
    },
    analyzeTypeConsistency: {
        schema: AnalyzeTypeConsistencySchema,
        handler: analyzeTypeConsistency
    }
};
|
|
442
|
+
//# sourceMappingURL=data-quality.js.map
|