@mseep/csv-editor 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/ISSUE_TEMPLATE/bug_report.md +53 -0
- package/.github/ISSUE_TEMPLATE/feature_request.md +38 -0
- package/.github/workflows/deploy-docs.yml +62 -0
- package/.github/workflows/publish-github.yml +52 -0
- package/.github/workflows/publish.yml +44 -0
- package/.github/workflows/test.yml +32 -0
- package/.pre-commit-config.yaml +157 -0
- package/ALTERNATIVE_PUBLISHING.md +175 -0
- package/ARCHITECTURE.md +1011 -0
- package/CHANGELOG.md +99 -0
- package/CODE_OF_CONDUCT.md +41 -0
- package/CONTRIBUTING.md +427 -0
- package/Dockerfile +22 -0
- package/LICENSE +21 -0
- package/MCP_CONFIG.md +505 -0
- package/PUBLISHING.md +210 -0
- package/README.md +400 -0
- package/SECURITY.md +61 -0
- package/docs/README.md +41 -0
- package/docs/blog/2019-05-28-first-blog-post.md +12 -0
- package/docs/blog/2019-05-29-long-blog-post.md +44 -0
- package/docs/blog/2021-08-01-mdx-blog-post.mdx +24 -0
- package/docs/blog/2021-08-26-welcome/docusaurus-plushie-banner.jpeg +0 -0
- package/docs/blog/2021-08-26-welcome/index.md +29 -0
- package/docs/blog/authors.yml +25 -0
- package/docs/blog/tags.yml +19 -0
- package/docs/docs/api/overview.md +183 -0
- package/docs/docs/installation.md +252 -0
- package/docs/docs/intro.md +87 -0
- package/docs/docs/tutorial-basics/_category_.json +8 -0
- package/docs/docs/tutorial-basics/congratulations.md +23 -0
- package/docs/docs/tutorial-basics/create-a-blog-post.md +34 -0
- package/docs/docs/tutorial-basics/create-a-document.md +57 -0
- package/docs/docs/tutorial-basics/create-a-page.md +43 -0
- package/docs/docs/tutorial-basics/deploy-your-site.md +31 -0
- package/docs/docs/tutorial-basics/markdown-features.mdx +152 -0
- package/docs/docs/tutorial-extras/_category_.json +7 -0
- package/docs/docs/tutorial-extras/img/docsVersionDropdown.png +0 -0
- package/docs/docs/tutorial-extras/img/localeDropdown.png +0 -0
- package/docs/docs/tutorial-extras/manage-docs-versions.md +55 -0
- package/docs/docs/tutorial-extras/translate-your-site.md +88 -0
- package/docs/docs/tutorials/quickstart.md +365 -0
- package/docs/docusaurus.config.ts +163 -0
- package/docs/package-lock.json +17493 -0
- package/docs/package.json +48 -0
- package/docs/sidebars.ts +33 -0
- package/docs/src/components/HomepageFeatures/index.tsx +71 -0
- package/docs/src/components/HomepageFeatures/styles.module.css +11 -0
- package/docs/src/css/custom.css +30 -0
- package/docs/src/pages/index.module.css +23 -0
- package/docs/src/pages/index.tsx +44 -0
- package/docs/src/pages/markdown-page.md +7 -0
- package/docs/static/.nojekyll +0 -0
- package/docs/static/img/docusaurus-social-card.jpg +0 -0
- package/docs/static/img/docusaurus.png +0 -0
- package/docs/static/img/favicon.ico +0 -0
- package/docs/static/img/logo.svg +1 -0
- package/docs/static/img/undraw_docusaurus_mountain.svg +171 -0
- package/docs/static/img/undraw_docusaurus_react.svg +170 -0
- package/docs/static/img/undraw_docusaurus_tree.svg +40 -0
- package/docs/tsconfig.json +8 -0
- package/examples/README.md +48 -0
- package/examples/auto_save_demo.py +206 -0
- package/examples/auto_save_overwrite.py +201 -0
- package/examples/basic_usage.py +135 -0
- package/examples/demo.py +139 -0
- package/examples/history_demo.py +317 -0
- package/examples/test_default_autosave.py +124 -0
- package/examples/update_consignee_example.py +179 -0
- package/package.json +51 -0
- package/plans/2026-04-19-fastmcp3-migration-plan.md +1045 -0
- package/pyproject.toml +331 -0
- package/requirements-dev.txt +30 -0
- package/requirements.txt +22 -0
- package/scripts/publish.py +67 -0
- package/smithery.yaml +15 -0
- package/specs/2026-04-19-fastmcp3-migration-design.md +243 -0
- package/src/csv_editor/__init__.py +8 -0
- package/src/csv_editor/models/__init__.py +39 -0
- package/src/csv_editor/models/auto_save.py +246 -0
- package/src/csv_editor/models/csv_session.py +468 -0
- package/src/csv_editor/models/data_models.py +244 -0
- package/src/csv_editor/models/history_manager.py +456 -0
- package/src/csv_editor/prompts/__init__.py +0 -0
- package/src/csv_editor/prompts/data_prompts.py +13 -0
- package/src/csv_editor/resources/__init__.py +0 -0
- package/src/csv_editor/resources/csv_resources.py +22 -0
- package/src/csv_editor/server.py +640 -0
- package/src/csv_editor/tools/__init__.py +5 -0
- package/src/csv_editor/tools/analytics.py +700 -0
- package/src/csv_editor/tools/auto_save_operations.py +235 -0
- package/src/csv_editor/tools/data_operations.py +3 -0
- package/src/csv_editor/tools/history_operations.py +315 -0
- package/src/csv_editor/tools/io_operations.py +431 -0
- package/src/csv_editor/tools/transformations.py +663 -0
- package/src/csv_editor/tools/validation.py +822 -0
- package/src/csv_editor/utils/__init__.py +0 -0
- package/src/csv_editor/utils/validators.py +205 -0
- package/tests/README.md +65 -0
- package/tests/__init__.py +7 -0
- package/tests/conftest.py +50 -0
- package/tests/test_auto_save.py +378 -0
- package/tests/test_basic.py +103 -0
- package/tests/test_integration.py +356 -0
- package/tests/test_server_boot.py +50 -0
- package/tests/test_settings.py +184 -0
|
@@ -0,0 +1,822 @@
|
|
|
1
|
+
"""Data validation tools for CSV data quality checks."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from fastmcp import Context
|
|
9
|
+
|
|
10
|
+
from ..models.csv_session import get_session_manager
|
|
11
|
+
from ..models.data_models import OperationType
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _schema_column_errors(col_data: pd.Series, rules: dict[str, Any]) -> list[dict[str, Any]]:
    """Collect every schema-rule violation for a single column.

    Args:
        col_data: The column values to validate.
        rules: Rule mapping for this column. Recognised keys are ``type``,
            ``nullable``, ``min``, ``max``, ``pattern``, ``values``,
            ``unique``, ``min_length`` and ``max_length``.

    Returns:
        A list of error dicts (empty when the column satisfies all rules).
        Each dict carries an ``error`` code and a human-readable ``message``.
    """
    errors: list[dict[str, Any]] = []

    # Object columns are treated as text, matching how pandas loads CSV strings.
    is_text = col_data.dtype == object or pd.api.types.is_string_dtype(col_data)

    # Type validation
    expected_type = rules.get("type")
    if expected_type:
        dtype_checks = {
            "int": pd.api.types.is_integer_dtype,
            "float": pd.api.types.is_float_dtype,
            "bool": pd.api.types.is_bool_dtype,
            "datetime": pd.api.types.is_datetime64_any_dtype,
        }
        if expected_type == "str":
            type_valid = bool(is_text)
        elif isinstance(expected_type, str) and expected_type in dtype_checks:
            type_valid = bool(dtype_checks[expected_type](col_data))
        else:
            # Unrecognised type names are reported as a mismatch.
            type_valid = False

        if not type_valid:
            errors.append(
                {
                    "error": "type_mismatch",
                    "message": f"Expected type '{expected_type}', got '{col_data.dtype}'",
                    "actual_type": str(col_data.dtype),
                }
            )

    # Nullable validation (columns are nullable unless the rule says otherwise)
    if not rules.get("nullable", True):
        null_mask = col_data.isna()
        null_count = int(null_mask.sum())
        if null_count > 0:
            errors.append(
                {
                    "error": "null_values",
                    "message": f"Column contains {null_count} null values",
                    "null_count": null_count,
                    # Cap the index list so huge columns do not bloat the response.
                    "null_indices": col_data[null_mask].index.tolist()[:100],
                }
            )

    # Min/Max validation for numeric columns
    if pd.api.types.is_numeric_dtype(col_data):
        if "min" in rules:
            min_val = rules["min"]
            violations = col_data[col_data < min_val]
            if len(violations) > 0:
                errors.append(
                    {
                        "error": "min_violation",
                        "message": f"{len(violations)} values below minimum {min_val}",
                        "violation_count": len(violations),
                        "min_found": float(violations.min()),
                    }
                )

        if "max" in rules:
            max_val = rules["max"]
            violations = col_data[col_data > max_val]
            if len(violations) > 0:
                errors.append(
                    {
                        "error": "max_violation",
                        "message": f"{len(violations)} values above maximum {max_val}",
                        "violation_count": len(violations),
                        "max_found": float(violations.max()),
                    }
                )

    # Pattern validation for string columns
    if "pattern" in rules and is_text:
        pattern = rules["pattern"]
        try:
            non_null = col_data.dropna()
            if len(non_null) > 0:
                matches = non_null.astype(str).str.match(pattern)
                violations = non_null[~matches]
                if len(violations) > 0:
                    errors.append(
                        {
                            "error": "pattern_violation",
                            "message": f"{len(violations)} values don't match pattern '{pattern}'",
                            "violation_count": len(violations),
                            "sample_violations": violations.head(10).tolist(),
                        }
                    )
        except Exception as e:
            # A bad pattern is a schema problem, not a reason to abort validation.
            errors.append({"error": "pattern_error", "message": f"Invalid regex pattern: {e!s}"})

    # Allowed values validation
    if "values" in rules:
        allowed = set(rules["values"])
        # Series.tolist() yields native Python scalars, so the reported values
        # stay JSON-serialisable (unique() would leak NumPy scalar types).
        actual = set(col_data.dropna().drop_duplicates().tolist())
        invalid = actual - allowed
        if invalid:
            errors.append(
                {
                    "error": "invalid_values",
                    "message": f"Found {len(invalid)} invalid values",
                    "invalid_values": list(invalid)[:50],
                }
            )

    # Uniqueness validation
    if rules.get("unique", False):
        duplicate_count = int(col_data.duplicated().sum())
        if duplicate_count > 0:
            errors.append(
                {
                    "error": "duplicate_values",
                    "message": f"Column contains {duplicate_count} duplicate values",
                    "duplicate_count": duplicate_count,
                }
            )

    # Length validation for strings
    if is_text and ("min_length" in rules or "max_length" in rules):
        lengths = col_data.dropna().astype(str).str.len()

        if "min_length" in rules:
            min_len = rules["min_length"]
            short_count = int((lengths < min_len).sum())
            if short_count > 0:
                errors.append(
                    {
                        "error": "min_length_violation",
                        "message": f"{short_count} values shorter than {min_len} characters",
                        "violation_count": short_count,
                    }
                )

        if "max_length" in rules:
            max_len = rules["max_length"]
            long_count = int((lengths > max_len).sum())
            if long_count > 0:
                errors.append(
                    {
                        "error": "max_length_violation",
                        "message": f"{long_count} values longer than {max_len} characters",
                        "violation_count": long_count,
                    }
                )

    return errors


async def validate_schema(
    session_id: str, schema: dict[str, dict[str, Any]], ctx: Context = None
) -> dict[str, Any]:
    """
    Validate data against a schema definition.

    Args:
        session_id: Session identifier
        schema: Schema definition with column rules
            Example: {
                "column_name": {
                    "type": "int",  # int, float, str, bool, datetime
                    "nullable": False,
                    "min": 0,
                    "max": 100,
                    "pattern": "^[A-Z]+$",
                    "values": ["A", "B", "C"],  # allowed values
                    "unique": True,
                    "min_length": 1,
                    "max_length": 10
                }
            }
        ctx: FastMCP context

    Returns:
        Dict with validation results. On success it contains ``success``,
        ``is_valid``, ``summary`` (column counts plus missing/extra column
        names) and ``validation_errors`` (per-column error lists). On failure
        it contains ``success: False`` and an ``error`` message.
    """
    try:
        manager = get_session_manager()
        session = manager.get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df
        df_columns = set(df.columns)

        # Ordered, de-duplicated lists keep the summary deterministic between
        # runs (plain set difference has no stable order).
        missing_columns = [name for name in schema if name not in df_columns]
        extra_columns = list(dict.fromkeys(name for name in df.columns if name not in schema))

        validation_errors: dict[str, list[dict[str, Any]]] = {}
        validation_summary = {
            "total_columns": len(schema),
            "valid_columns": 0,
            "invalid_columns": 0,
            "missing_columns": missing_columns,
            "extra_columns": extra_columns,
        }

        # Validate each column in schema
        for col_name, rules in schema.items():
            if col_name not in df.columns:
                validation_errors[col_name] = [
                    {"error": "column_missing", "message": f"Column '{col_name}' not found in data"}
                ]
                validation_summary["invalid_columns"] += 1
                continue

            col_errors = _schema_column_errors(df[col_name], rules)
            if col_errors:
                validation_errors[col_name] = col_errors
                validation_summary["invalid_columns"] += 1
            else:
                validation_summary["valid_columns"] += 1

        is_valid = not validation_errors and not missing_columns

        session.record_operation(
            OperationType.VALIDATE,
            {
                "type": "schema_validation",
                "is_valid": is_valid,
                "errors_count": len(validation_errors),
            },
        )

        return {
            "success": True,
            "is_valid": is_valid,
            "summary": validation_summary,
            "validation_errors": validation_errors,
        }

    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising,
        # but keep the traceback in the log.
        logger.exception("Error validating schema: %s", e)
        return {"success": False, "error": str(e)}
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
async def check_data_quality(
    session_id: str, rules: list[dict[str, Any]] | None = None, ctx: Context = None
) -> dict[str, Any]:
    """
    Check data quality based on predefined or custom rules.

    Supported rule types are ``completeness``, ``duplicates``, ``uniqueness``,
    ``data_types``, ``outliers`` and ``consistency``; rules with any other
    type are ignored. Each executed check contributes a 0-100 score and the
    overall score is the mean of those scores.

    Args:
        session_id: Session identifier
        rules: Custom quality rules to check. If None, uses default rules.
            Example: [
                {"type": "completeness", "threshold": 0.95},
                {"type": "uniqueness", "column": "id"},
                {"type": "consistency", "columns": ["start_date", "end_date"]}
            ]
        ctx: FastMCP context

    Returns:
        Dict with quality check results. On success it contains ``success``
        and ``quality_results`` (``overall_score``, ``quality_level``,
        ``checks``, ``issues``, ``recommendations``). On failure it contains
        ``success: False`` and an ``error`` message.
    """
    try:
        manager = get_session_manager()
        session = manager.get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df
        row_count = len(df)

        def share_of_rows(count: Any) -> float:
            """Return count / row_count as a float; 0.0 for an empty frame."""
            # An empty frame has no offending rows, so the share is zero
            # rather than a division by zero or NaN.
            return float(count) / row_count if row_count else 0.0

        quality_results: dict[str, Any] = {
            "overall_score": 100.0,
            "checks": [],
            "issues": [],
            "recommendations": [],
        }

        # Default rules if none provided
        if not rules:
            rules = [
                {"type": "completeness", "threshold": 0.95},
                {"type": "duplicates", "threshold": 0.01},
                {"type": "data_types"},
                {"type": "outliers", "threshold": 0.05},
                {"type": "consistency"},
            ]

        total_score = 0
        score_count = 0

        for rule in rules:
            rule_type = rule.get("type")

            if rule_type == "completeness":
                # Check data completeness
                threshold = rule.get("threshold", 0.95)
                columns = rule.get("columns", df.columns.tolist())

                for col in columns:
                    if col in df.columns:
                        completeness = 1.0 - share_of_rows(df[col].isna().sum())
                        passed = bool(completeness >= threshold)
                        score = completeness * 100

                        quality_results["checks"].append(
                            {
                                "type": "completeness",
                                "column": col,
                                "completeness": round(completeness, 4),
                                "threshold": threshold,
                                "passed": passed,
                                "score": round(score, 2),
                            }
                        )

                        if not passed:
                            quality_results["issues"].append(
                                {
                                    "type": "incomplete_data",
                                    "column": col,
                                    "message": f"Column '{col}' is only {round(completeness*100, 2)}% complete",
                                    "severity": "high" if completeness < 0.5 else "medium",
                                }
                            )

                        total_score += score
                        score_count += 1

            elif rule_type == "duplicates":
                # Check for duplicate rows
                threshold = rule.get("threshold", 0.01)
                subset = rule.get("columns")

                duplicate_count = int(df.duplicated(subset=subset).sum())
                duplicate_ratio = share_of_rows(duplicate_count)
                passed = bool(duplicate_ratio <= threshold)
                score = (1 - duplicate_ratio) * 100

                quality_results["checks"].append(
                    {
                        "type": "duplicates",
                        "duplicate_rows": duplicate_count,
                        "duplicate_ratio": round(duplicate_ratio, 4),
                        "threshold": threshold,
                        "passed": passed,
                        "score": round(score, 2),
                    }
                )

                if not passed:
                    quality_results["issues"].append(
                        {
                            "type": "duplicate_rows",
                            "message": f"Found {duplicate_count} duplicate rows ({round(duplicate_ratio*100, 2)}%)",
                            "severity": "high" if duplicate_ratio > 0.1 else "medium",
                        }
                    )
                    quality_results["recommendations"].append(
                        "Consider removing duplicate rows using the remove_duplicates tool"
                    )

                total_score += score
                score_count += 1

            elif rule_type == "uniqueness":
                # Check column uniqueness
                column = rule.get("column")
                if column and column in df.columns:
                    unique_count = int(df[column].nunique())
                    # An empty column is vacuously unique; this also avoids a
                    # ZeroDivisionError on an empty frame.
                    unique_ratio = unique_count / row_count if row_count else 1.0
                    expected_unique = rule.get("expected_unique", True)

                    if expected_unique:
                        passed = unique_ratio >= 0.99
                        score = unique_ratio * 100
                    else:
                        passed = True
                        score = 100

                    quality_results["checks"].append(
                        {
                            "type": "uniqueness",
                            "column": column,
                            "unique_values": unique_count,
                            "unique_ratio": round(unique_ratio, 4),
                            "passed": passed,
                            "score": round(score, 2),
                        }
                    )

                    if not passed and expected_unique:
                        quality_results["issues"].append(
                            {
                                "type": "non_unique_values",
                                "column": column,
                                "message": f"Column '{column}' expected to be unique but has duplicates",
                                "severity": "high",
                            }
                        )

                    total_score += score
                    score_count += 1

            elif rule_type == "data_types":
                # Check data type consistency
                for col in df.columns:
                    col_data = df[col].dropna()
                    if len(col_data) > 0:
                        # Check for mixed types
                        mixed_types = len(col_data.apply(type).unique()) > 1

                        # Check for numeric strings
                        if col_data.dtype == object:
                            numeric_strings = int(
                                col_data.astype(str).str.match(r"^-?\d+\.?\d*$").sum()
                            )
                            numeric_ratio = numeric_strings / len(col_data)
                        else:
                            numeric_ratio = 0
                        looks_numeric = numeric_ratio > 0.9

                        score = 100 if not mixed_types else 50

                        quality_results["checks"].append(
                            {
                                "type": "data_type_consistency",
                                "column": col,
                                "dtype": str(df[col].dtype),
                                "mixed_types": mixed_types,
                                "numeric_strings": looks_numeric,
                                "score": score,
                            }
                        )

                        if looks_numeric:
                            quality_results["recommendations"].append(
                                f"Column '{col}' appears to contain numeric data stored as strings. "
                                f"Consider converting to numeric type using change_column_type tool"
                            )

                        total_score += score
                        score_count += 1

            elif rule_type == "outliers":
                # Check for outliers in numeric columns
                threshold = rule.get("threshold", 0.05)
                numeric_cols = df.select_dtypes(include=[np.number]).columns

                for col in numeric_cols:
                    series = df[col]
                    q1 = series.quantile(0.25)
                    q3 = series.quantile(0.75)
                    iqr = q3 - q1

                    # Tukey fences: values beyond 1.5 * IQR from the quartiles.
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr

                    outliers = int(((series < lower_bound) | (series > upper_bound)).sum())
                    outlier_ratio = share_of_rows(outliers)
                    passed = bool(outlier_ratio <= threshold)
                    score = (1 - min(outlier_ratio, 1)) * 100

                    quality_results["checks"].append(
                        {
                            "type": "outliers",
                            "column": col,
                            "outlier_count": outliers,
                            "outlier_ratio": round(outlier_ratio, 4),
                            "threshold": threshold,
                            "passed": passed,
                            "score": round(score, 2),
                        }
                    )

                    if not passed:
                        quality_results["issues"].append(
                            {
                                "type": "outliers",
                                "column": col,
                                "message": f"Column '{col}' has {outliers} outliers ({round(outlier_ratio*100, 2)}%)",
                                "severity": "medium",
                            }
                        )

                    total_score += score
                    score_count += 1

            elif rule_type == "consistency":
                # Check data consistency
                columns = rule.get("columns", [])

                # Date consistency check: with no explicit columns, fall back
                # to the frame's datetime columns when there are at least two.
                date_cols = df.select_dtypes(include=["datetime64"]).columns
                if len(date_cols) >= 2 and not columns:
                    columns = date_cols.tolist()

                if len(columns) >= 2:
                    col1, col2 = columns[0], columns[1]
                    if col1 in df.columns and col2 in df.columns:
                        # Check if col1 should be before col2 (e.g., start_date < end_date)
                        if pd.api.types.is_datetime64_any_dtype(
                            df[col1]
                        ) and pd.api.types.is_datetime64_any_dtype(df[col2]):
                            inconsistent = int((df[col1] > df[col2]).sum())
                            consistency_ratio = 1 - share_of_rows(inconsistent)
                            passed = consistency_ratio >= 0.99
                            score = consistency_ratio * 100

                            quality_results["checks"].append(
                                {
                                    "type": "consistency",
                                    "columns": [col1, col2],
                                    "consistent_rows": row_count - inconsistent,
                                    "inconsistent_rows": inconsistent,
                                    "consistency_ratio": round(consistency_ratio, 4),
                                    "passed": passed,
                                    "score": round(score, 2),
                                }
                            )

                            if not passed:
                                quality_results["issues"].append(
                                    {
                                        "type": "data_inconsistency",
                                        "columns": [col1, col2],
                                        "message": f"Found {inconsistent} rows where {col1} > {col2}",
                                        "severity": "high",
                                    }
                                )

                            total_score += score
                            score_count += 1

        # Calculate overall score (stays at the 100.0 default if no check ran)
        if score_count > 0:
            quality_results["overall_score"] = round(float(total_score) / score_count, 2)

        # Determine quality level
        overall_score = quality_results["overall_score"]
        if overall_score >= 95:
            quality_results["quality_level"] = "Excellent"
        elif overall_score >= 85:
            quality_results["quality_level"] = "Good"
        elif overall_score >= 70:
            quality_results["quality_level"] = "Fair"
        else:
            quality_results["quality_level"] = "Poor"

        # Add a general recommendation only when no specific one was produced
        if not quality_results["recommendations"] and overall_score < 85:
            quality_results["recommendations"].append(
                "Consider running profile_data to get a comprehensive overview of data issues"
            )

        session.record_operation(
            OperationType.QUALITY_CHECK,
            {
                "rules_count": len(rules),
                "overall_score": overall_score,
                "issues_count": len(quality_results["issues"]),
            },
        )

        return {"success": True, "quality_results": quality_results}

    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising,
        # but keep the traceback in the log.
        logger.exception("Error checking data quality: %s", e)
        return {"success": False, "error": str(e)}
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
async def find_anomalies(
    session_id: str,
    columns: list[str] | None = None,
    sensitivity: float = 0.95,
    methods: list[str] | None = None,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Find anomalies in the data using multiple detection methods.

    Args:
        session_id: Session identifier
        columns: Columns to check (None or empty for all columns)
        sensitivity: Detection sensitivity (0.0 to 1.0). Higher values lower the
            z-score and IQR outlier thresholds, so more numeric values are flagged.
        methods: Detection methods to use; any of "statistical", "pattern" and
            "missing" (default: all three)
        ctx: FastMCP context (not used by this function)

    Returns:
        On success, a dict with "success": True, the "anomalies" report
        ("summary", "by_column", "by_method"), "columns_analyzed",
        "methods_used" and "sensitivity". On failure, a dict with
        "success": False and an "error" message.
    """
    try:
        manager = get_session_manager()
        session = manager.get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df

        if columns:
            missing_cols = [col for col in columns if col not in df.columns]
            if missing_cols:
                return {"success": False, "error": f"Columns not found: {missing_cols}"}
            target_cols = columns
        else:
            target_cols = df.columns.tolist()

        if not methods:
            methods = ["statistical", "pattern", "missing"]

        total_anomalies = 0
        affected_rows: set = set()
        affected_columns: list = []
        by_method: dict[str, dict[str, Any]] = {}

        # Statistical anomalies (outliers)
        if "statistical" in methods:
            details, flagged = _find_statistical_anomalies(df, target_cols, sensitivity)
            for col, labels in flagged.items():
                total_anomalies += len(labels)
                affected_rows.update(labels)
                affected_columns.append(col)
            if details:
                by_method["statistical"] = details

        # Pattern anomalies
        if "pattern" in methods:
            details, flagged = _find_pattern_anomalies(df, target_cols, sensitivity)
            for col, labels in flagged.items():
                total_anomalies += len(labels)
                affected_rows.update(labels)
                affected_columns.append(col)
            if details:
                by_method["pattern"] = details

        # Missing value anomalies: these mark the column as affected but do not
        # contribute to the anomaly total or to the affected-row set.
        if "missing" in methods:
            details = _find_missing_anomalies(df, target_cols)
            affected_columns.extend(details)
            if details:
                by_method["missing"] = details

        # Organize anomalies by column
        by_column: dict[str, dict[str, Any]] = {}
        for method_name, method_details in by_method.items():
            for col, col_details in method_details.items():
                by_column.setdefault(col, {})[method_name] = col_details

        # De-duplicate while keeping first-seen order so the output is deterministic.
        unique_columns = list(dict.fromkeys(affected_columns))

        # Calculate anomaly score from the full row set, before the reported
        # list is truncated; guard against an empty frame (zero cells).
        total_cells = len(df) * len(target_cols)
        anomaly_cells = len(affected_rows) * len(unique_columns)
        anomaly_score = min(anomaly_cells / total_cells, 1.0) * 100 if total_cells else 0.0

        anomalies = {
            "summary": {
                "total_anomalies": total_anomalies,
                # Convert set to list for JSON serialization (capped at 1000 entries)
                "affected_rows": list(affected_rows)[:1000],
                "affected_columns": unique_columns,
                "anomaly_score": round(anomaly_score, 2),
                "severity": (
                    "high" if anomaly_score > 10 else "medium" if anomaly_score > 5 else "low"
                ),
            },
            "by_column": by_column,
            "by_method": by_method,
        }

        session.record_operation(
            OperationType.ANOMALY_DETECTION,
            {
                "methods": methods,
                "sensitivity": sensitivity,
                "anomalies_found": anomalies["summary"]["total_anomalies"],
            },
        )

        return {
            "success": True,
            "anomalies": anomalies,
            "columns_analyzed": target_cols,
            "methods_used": methods,
            "sensitivity": sensitivity,
        }

    except Exception as e:
        logger.error(f"Error finding anomalies: {e!s}")
        return {"success": False, "error": str(e)}


def _find_statistical_anomalies(
    df: pd.DataFrame, target_cols: list[str], sensitivity: float
) -> tuple[dict[str, dict[str, Any]], dict[str, list]]:
    """Flag numeric outliers per column using the union of z-score and IQR tests.

    Returns a pair ``(details, flagged)``: ``details`` maps each column that has
    outliers to its report entry, ``flagged`` maps it to the full list of
    outlier row labels (the report entry only carries the first 100).
    """
    details: dict[str, dict[str, Any]] = {}
    flagged: dict[str, list] = {}

    # Both thresholds depend only on sensitivity, so compute them once.
    z_threshold = 3 * (1 - sensitivity + 0.5)
    iqr_factor = 1.5 * (2 - sensitivity)

    numeric_cols = df[target_cols].select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        col_data = df[col].dropna()
        if col_data.empty:
            continue

        mean = col_data.mean()
        std = col_data.std()

        # Z-score method. The scores are indexed like col_data (NaN rows
        # removed), so select labels from the scores themselves rather than
        # from df.index, whose length differs when the column has NaNs.
        # A zero or undefined std yields NaN scores, which never exceed the
        # threshold.
        z_scores = np.abs((col_data - mean) / std)
        z_anomalies = z_scores[z_scores > z_threshold].index.tolist()

        # IQR method
        q1 = col_data.quantile(0.25)
        q3 = col_data.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - iqr_factor * iqr
        upper = q3 + iqr_factor * iqr
        iqr_anomalies = df.index[(df[col] < lower) | (df[col] > upper)].tolist()

        # Combine both methods
        combined = list(set(z_anomalies) | set(iqr_anomalies))
        if not combined:
            continue

        details[col] = {
            "anomaly_count": len(combined),
            "anomaly_indices": combined[:100],
            "anomaly_values": df.loc[combined[:10], col].tolist(),
            "mean": float(mean),
            "std": float(std),
            "lower_bound": float(lower),
            "upper_bound": float(upper),
        }
        flagged[col] = combined

    return details, flagged


def _find_pattern_anomalies(
    df: pd.DataFrame, target_cols: list[str], sensitivity: float
) -> tuple[dict[str, dict[str, Any]], dict[str, list]]:
    """Flag rare values and letter-case mismatches in string/object columns.

    Returns a pair ``(details, flagged)`` shaped like the one returned by
    ``_find_statistical_anomalies``.
    """
    details: dict[str, dict[str, Any]] = {}
    flagged: dict[str, list] = {}

    # NOTE(review): this cut-off shrinks as sensitivity grows, so a higher
    # sensitivity flags fewer rare values, the opposite direction to the
    # statistical thresholds. Kept as-is; confirm the intended direction.
    rarity_threshold = (1 - sensitivity) * 0.01

    for col in target_cols:
        if not (df[col].dtype == object or pd.api.types.is_string_dtype(df[col])):
            continue

        col_data = df[col].dropna()
        if col_data.empty:
            continue

        value_counts = col_data.value_counts()
        total_count = len(col_data)

        # Find rare values (relative frequency below the threshold)
        rare_values = value_counts[value_counts / total_count < rarity_threshold]
        if rare_values.empty:
            # The case-format check below only runs for columns with rare values.
            continue

        rare_indices = df[df[col].isin(rare_values.index)].index.tolist()

        # Infer the dominant letter case from the ten most frequent values;
        # only attempted when the column has more than ten distinct values.
        common_pattern = None
        if len(value_counts) > 10:
            top_values = value_counts.head(10).index
            upper_count = sum(1 for v in top_values if str(v).isupper())
            lower_count = sum(1 for v in top_values if str(v).islower())

            if upper_count > 7:
                common_pattern = "uppercase"
            elif lower_count > 7:
                common_pattern = "lowercase"

        # Rows whose value does not follow the dominant case
        format_anomalies = []
        if common_pattern == "uppercase":
            format_anomalies = [idx for idx, val in col_data.items() if not str(val).isupper()]
        elif common_pattern == "lowercase":
            format_anomalies = [idx for idx, val in col_data.items() if not str(val).islower()]

        all_pattern_anomalies = list(set(rare_indices + format_anomalies))
        if not all_pattern_anomalies:
            continue

        details[col] = {
            "anomaly_count": len(all_pattern_anomalies),
            "rare_values": rare_values.head(10).to_dict(),
            "anomaly_indices": all_pattern_anomalies[:100],
            "common_pattern": common_pattern,
        }
        flagged[col] = all_pattern_anomalies

    return details, flagged


def _find_missing_anomalies(
    df: pd.DataFrame, target_cols: list[str]
) -> dict[str, dict[str, Any]]:
    """Flag partially-missing columns whose nulls are clustered or moderately frequent."""
    details: dict[str, dict[str, Any]] = {}
    row_count = len(df)

    for col in target_cols:
        null_mask = df[col].isna()
        null_count = int(null_mask.sum())
        if null_count == 0:
            continue

        null_ratio = null_count / row_count

        # Only partially missing columns (under half the rows) are considered.
        if not null_ratio < 0.5:
            continue

        null_indices = df.index[null_mask].tolist()

        # Count runs of adjacent missing rows by position, so the check works
        # for any index type and detects every run, not just the first.
        cluster_count = _count_null_runs(null_mask.to_numpy())

        # Flag as anomaly if there are suspicious patterns
        is_anomaly = cluster_count > 0 and cluster_count > len(null_indices) * 0.3

        if is_anomaly or 0.1 < null_ratio < 0.3:
            details[col] = {
                "missing_count": null_count,
                "missing_ratio": round(null_ratio, 4),
                "missing_indices": null_indices[:100],
                "sequential_clusters": cluster_count,
                "pattern": "clustered" if cluster_count else "random",
            }

    return details


def _count_null_runs(null_flags) -> int:
    """Count maximal runs of two or more consecutive True entries in a boolean sequence."""
    flags = np.asarray(null_flags, dtype=bool)
    # adjacent[i] is True when positions i and i + 1 are both null.
    adjacent = flags[1:] & flags[:-1]
    if adjacent.size == 0:
        return 0
    # A run starts at each adjacent pair that is not preceded by another pair.
    run_starts = adjacent.copy()
    run_starts[1:] &= ~adjacent[:-1]
    return int(run_starts.sum())
|