@mseep/csv-editor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/.github/ISSUE_TEMPLATE/bug_report.md +53 -0
  2. package/.github/ISSUE_TEMPLATE/feature_request.md +38 -0
  3. package/.github/workflows/deploy-docs.yml +62 -0
  4. package/.github/workflows/publish-github.yml +52 -0
  5. package/.github/workflows/publish.yml +44 -0
  6. package/.github/workflows/test.yml +32 -0
  7. package/.pre-commit-config.yaml +157 -0
  8. package/ALTERNATIVE_PUBLISHING.md +175 -0
  9. package/ARCHITECTURE.md +1011 -0
  10. package/CHANGELOG.md +99 -0
  11. package/CODE_OF_CONDUCT.md +41 -0
  12. package/CONTRIBUTING.md +427 -0
  13. package/Dockerfile +22 -0
  14. package/LICENSE +21 -0
  15. package/MCP_CONFIG.md +505 -0
  16. package/PUBLISHING.md +210 -0
  17. package/README.md +400 -0
  18. package/SECURITY.md +61 -0
  19. package/docs/README.md +41 -0
  20. package/docs/blog/2019-05-28-first-blog-post.md +12 -0
  21. package/docs/blog/2019-05-29-long-blog-post.md +44 -0
  22. package/docs/blog/2021-08-01-mdx-blog-post.mdx +24 -0
  23. package/docs/blog/2021-08-26-welcome/docusaurus-plushie-banner.jpeg +0 -0
  24. package/docs/blog/2021-08-26-welcome/index.md +29 -0
  25. package/docs/blog/authors.yml +25 -0
  26. package/docs/blog/tags.yml +19 -0
  27. package/docs/docs/api/overview.md +183 -0
  28. package/docs/docs/installation.md +252 -0
  29. package/docs/docs/intro.md +87 -0
  30. package/docs/docs/tutorial-basics/_category_.json +8 -0
  31. package/docs/docs/tutorial-basics/congratulations.md +23 -0
  32. package/docs/docs/tutorial-basics/create-a-blog-post.md +34 -0
  33. package/docs/docs/tutorial-basics/create-a-document.md +57 -0
  34. package/docs/docs/tutorial-basics/create-a-page.md +43 -0
  35. package/docs/docs/tutorial-basics/deploy-your-site.md +31 -0
  36. package/docs/docs/tutorial-basics/markdown-features.mdx +152 -0
  37. package/docs/docs/tutorial-extras/_category_.json +7 -0
  38. package/docs/docs/tutorial-extras/img/docsVersionDropdown.png +0 -0
  39. package/docs/docs/tutorial-extras/img/localeDropdown.png +0 -0
  40. package/docs/docs/tutorial-extras/manage-docs-versions.md +55 -0
  41. package/docs/docs/tutorial-extras/translate-your-site.md +88 -0
  42. package/docs/docs/tutorials/quickstart.md +365 -0
  43. package/docs/docusaurus.config.ts +163 -0
  44. package/docs/package-lock.json +17493 -0
  45. package/docs/package.json +48 -0
  46. package/docs/sidebars.ts +33 -0
  47. package/docs/src/components/HomepageFeatures/index.tsx +71 -0
  48. package/docs/src/components/HomepageFeatures/styles.module.css +11 -0
  49. package/docs/src/css/custom.css +30 -0
  50. package/docs/src/pages/index.module.css +23 -0
  51. package/docs/src/pages/index.tsx +44 -0
  52. package/docs/src/pages/markdown-page.md +7 -0
  53. package/docs/static/.nojekyll +0 -0
  54. package/docs/static/img/docusaurus-social-card.jpg +0 -0
  55. package/docs/static/img/docusaurus.png +0 -0
  56. package/docs/static/img/favicon.ico +0 -0
  57. package/docs/static/img/logo.svg +1 -0
  58. package/docs/static/img/undraw_docusaurus_mountain.svg +171 -0
  59. package/docs/static/img/undraw_docusaurus_react.svg +170 -0
  60. package/docs/static/img/undraw_docusaurus_tree.svg +40 -0
  61. package/docs/tsconfig.json +8 -0
  62. package/examples/README.md +48 -0
  63. package/examples/auto_save_demo.py +206 -0
  64. package/examples/auto_save_overwrite.py +201 -0
  65. package/examples/basic_usage.py +135 -0
  66. package/examples/demo.py +139 -0
  67. package/examples/history_demo.py +317 -0
  68. package/examples/test_default_autosave.py +124 -0
  69. package/examples/update_consignee_example.py +179 -0
  70. package/package.json +51 -0
  71. package/plans/2026-04-19-fastmcp3-migration-plan.md +1045 -0
  72. package/pyproject.toml +331 -0
  73. package/requirements-dev.txt +30 -0
  74. package/requirements.txt +22 -0
  75. package/scripts/publish.py +67 -0
  76. package/smithery.yaml +15 -0
  77. package/specs/2026-04-19-fastmcp3-migration-design.md +243 -0
  78. package/src/csv_editor/__init__.py +8 -0
  79. package/src/csv_editor/models/__init__.py +39 -0
  80. package/src/csv_editor/models/auto_save.py +246 -0
  81. package/src/csv_editor/models/csv_session.py +468 -0
  82. package/src/csv_editor/models/data_models.py +244 -0
  83. package/src/csv_editor/models/history_manager.py +456 -0
  84. package/src/csv_editor/prompts/__init__.py +0 -0
  85. package/src/csv_editor/prompts/data_prompts.py +13 -0
  86. package/src/csv_editor/resources/__init__.py +0 -0
  87. package/src/csv_editor/resources/csv_resources.py +22 -0
  88. package/src/csv_editor/server.py +640 -0
  89. package/src/csv_editor/tools/__init__.py +5 -0
  90. package/src/csv_editor/tools/analytics.py +700 -0
  91. package/src/csv_editor/tools/auto_save_operations.py +235 -0
  92. package/src/csv_editor/tools/data_operations.py +3 -0
  93. package/src/csv_editor/tools/history_operations.py +315 -0
  94. package/src/csv_editor/tools/io_operations.py +431 -0
  95. package/src/csv_editor/tools/transformations.py +663 -0
  96. package/src/csv_editor/tools/validation.py +822 -0
  97. package/src/csv_editor/utils/__init__.py +0 -0
  98. package/src/csv_editor/utils/validators.py +205 -0
  99. package/tests/README.md +65 -0
  100. package/tests/__init__.py +7 -0
  101. package/tests/conftest.py +50 -0
  102. package/tests/test_auto_save.py +378 -0
  103. package/tests/test_basic.py +103 -0
  104. package/tests/test_integration.py +356 -0
  105. package/tests/test_server_boot.py +50 -0
  106. package/tests/test_settings.py +184 -0
@@ -0,0 +1,822 @@
1
+ """Data validation tools for CSV data quality checks."""
2
+
3
+ import logging
4
+ from typing import Any
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from fastmcp import Context
9
+
10
+ from ..models.csv_session import get_session_manager
11
+ from ..models.data_models import OperationType
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
# Maps each schema "type" name to the pandas dtype predicate that accepts it.
_SCHEMA_TYPE_CHECKS = {
    "int": pd.api.types.is_integer_dtype,
    "float": pd.api.types.is_float_dtype,
    "str": lambda col: pd.api.types.is_string_dtype(col) or col.dtype == object,
    "bool": pd.api.types.is_bool_dtype,
    "datetime": pd.api.types.is_datetime64_any_dtype,
}


def _schema_column_errors(col_data: pd.Series, rules: dict[str, Any]) -> list[dict[str, Any]]:
    """Return every rule violation found in one column (empty list when valid)."""
    errors: list[dict[str, Any]] = []
    is_text = col_data.dtype == object or pd.api.types.is_string_dtype(col_data)

    # Type validation. A type name that is not recognised never matches, so it
    # is reported as a mismatch rather than being silently accepted.
    expected_type = rules.get("type")
    if expected_type:
        type_check = (
            _SCHEMA_TYPE_CHECKS.get(expected_type) if isinstance(expected_type, str) else None
        )
        if type_check is None or not type_check(col_data):
            errors.append(
                {
                    "error": "type_mismatch",
                    "message": f"Expected type '{expected_type}', got '{col_data.dtype}'",
                    "actual_type": str(col_data.dtype),
                }
            )

    # Nullable validation (columns are nullable unless the rule says otherwise).
    if not rules.get("nullable", True):
        null_mask = col_data.isna()
        null_count = int(null_mask.sum())
        if null_count > 0:
            errors.append(
                {
                    "error": "null_values",
                    "message": f"Column contains {null_count} null values",
                    "null_count": null_count,
                    # Cap the index list so the response stays small.
                    "null_indices": col_data.index[null_mask.to_numpy()][:100].tolist(),
                }
            )

    # Min/Max validation only applies to numeric columns.
    if pd.api.types.is_numeric_dtype(col_data):
        if "min" in rules:
            min_val = rules["min"]
            below = col_data[col_data < min_val]
            if len(below) > 0:
                errors.append(
                    {
                        "error": "min_violation",
                        "message": f"{len(below)} values below minimum {min_val}",
                        "violation_count": len(below),
                        "min_found": float(below.min()),
                    }
                )

        if "max" in rules:
            max_val = rules["max"]
            above = col_data[col_data > max_val]
            if len(above) > 0:
                errors.append(
                    {
                        "error": "max_violation",
                        "message": f"{len(above)} values above maximum {max_val}",
                        "violation_count": len(above),
                        "max_found": float(above.max()),
                    }
                )

    # Pattern validation only applies to string columns; nulls are ignored.
    if "pattern" in rules and is_text:
        pattern = rules["pattern"]
        try:
            non_null = col_data.dropna()
            if len(non_null) > 0:
                matches = non_null.astype(str).str.match(pattern)
                violations = non_null[~matches]
                if len(violations) > 0:
                    errors.append(
                        {
                            "error": "pattern_violation",
                            "message": f"{len(violations)} values don't match pattern '{pattern}'",
                            "violation_count": len(violations),
                            "sample_violations": violations.head(10).tolist(),
                        }
                    )
        except Exception as e:
            # A bad user-supplied regex is a finding about the schema, not a
            # reason to abort validation of the remaining rules and columns.
            errors.append({"error": "pattern_error", "message": f"Invalid regex pattern: {e!s}"})

    # Allowed values validation. Distinct values are taken in order of first
    # appearance and converted with tolist(), so the report is deterministic
    # and contains native Python scalars instead of numpy ones.
    if "values" in rules:
        allowed = set(rules["values"])
        observed = col_data.dropna().drop_duplicates().tolist()
        invalid = [value for value in observed if value not in allowed]
        if invalid:
            errors.append(
                {
                    "error": "invalid_values",
                    "message": f"Found {len(invalid)} invalid values",
                    "invalid_values": invalid[:50],
                }
            )

    # Uniqueness validation: every repeat after the first occurrence counts.
    if rules.get("unique", False):
        duplicate_count = int(col_data.duplicated().sum())
        if duplicate_count > 0:
            errors.append(
                {
                    "error": "duplicate_values",
                    "message": f"Column contains {duplicate_count} duplicate values",
                    "duplicate_count": duplicate_count,
                }
            )

    # Length validation for string columns; nulls are ignored.
    if is_text and ("min_length" in rules or "max_length" in rules):
        lengths = col_data.dropna().astype(str).str.len()

        if "min_length" in rules:
            min_len = rules["min_length"]
            short_count = int((lengths < min_len).sum())
            if short_count > 0:
                errors.append(
                    {
                        "error": "min_length_violation",
                        "message": f"{short_count} values shorter than {min_len} characters",
                        "violation_count": short_count,
                    }
                )

        if "max_length" in rules:
            max_len = rules["max_length"]
            long_count = int((lengths > max_len).sum())
            if long_count > 0:
                errors.append(
                    {
                        "error": "max_length_violation",
                        "message": f"{long_count} values longer than {max_len} characters",
                        "violation_count": long_count,
                    }
                )

    return errors


async def validate_schema(
    session_id: str, schema: dict[str, dict[str, Any]], ctx: Context = None
) -> dict[str, Any]:
    """
    Validate data against a schema definition.

    Args:
        session_id: Session identifier
        schema: Schema definition with column rules
            Example: {
                "column_name": {
                    "type": "int",  # int, float, str, bool, datetime
                    "nullable": False,
                    "min": 0,
                    "max": 100,
                    "pattern": "^[A-Z]+$",
                    "values": ["A", "B", "C"],  # allowed values
                    "unique": True,
                    "min_length": 1,  # string columns only
                    "max_length": 20  # string columns only
                }
            }
        ctx: FastMCP context

    Returns:
        Dict with validation results. On success it holds "success": True,
        "is_valid", a "summary" (column counts plus missing/extra column
        names, listed in schema/data order) and "validation_errors" keyed by
        column name. On failure it holds "success": False and an "error"
        message; exceptions are logged and reported this way, not raised.
    """
    try:
        manager = get_session_manager()
        session = manager.get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df
        schema_columns = set(schema)
        df_columns = set(df.columns)

        validation_errors: dict[str, list[dict[str, Any]]] = {}
        validation_summary = {
            "total_columns": len(schema),
            "valid_columns": 0,
            "invalid_columns": 0,
            # Iterate the originals (not the sets) so the order is stable.
            "missing_columns": [col for col in schema if col not in df_columns],
            "extra_columns": [col for col in df.columns if col not in schema_columns],
        }

        # Validate each column in schema
        for col_name, rules in schema.items():
            if col_name not in df_columns:
                col_errors = [
                    {"error": "column_missing", "message": f"Column '{col_name}' not found in data"}
                ]
            else:
                col_errors = _schema_column_errors(df[col_name], rules)

            if col_errors:
                validation_errors[col_name] = col_errors
                validation_summary["invalid_columns"] += 1
            else:
                validation_summary["valid_columns"] += 1

        # Every missing column already has an entry in validation_errors, so
        # an empty error map is sufficient for the data to be valid.
        is_valid = not validation_errors

        session.record_operation(
            OperationType.VALIDATE,
            {
                "type": "schema_validation",
                "is_valid": is_valid,
                "errors_count": len(validation_errors),
            },
        )

        return {
            "success": True,
            "is_valid": is_valid,
            "summary": validation_summary,
            "validation_errors": validation_errors,
        }

    except Exception as e:
        logger.error("Error validating schema: %s", e)
        return {"success": False, "error": str(e)}
247
+
248
+
249
def _quality_completeness(
    df: pd.DataFrame, rule: dict[str, Any], results: dict[str, Any]
) -> list[float]:
    """Score the share of non-null values per column; returns one score per checked column."""
    threshold = rule.get("threshold", 0.95)
    columns = rule.get("columns", df.columns.tolist())
    row_count = len(df)
    scores: list[float] = []

    for col in columns:
        if col not in df.columns:
            continue

        completeness = float(1 - df[col].isna().sum() / row_count)
        passed = completeness >= threshold
        score = completeness * 100

        results["checks"].append(
            {
                "type": "completeness",
                "column": col,
                "completeness": round(completeness, 4),
                "threshold": threshold,
                "passed": passed,
                "score": round(score, 2),
            }
        )

        if not passed:
            results["issues"].append(
                {
                    "type": "incomplete_data",
                    "column": col,
                    "message": f"Column '{col}' is only {round(completeness*100, 2)}% complete",
                    "severity": "high" if completeness < 0.5 else "medium",
                }
            )

        scores.append(score)

    return scores


def _quality_duplicates(
    df: pd.DataFrame, rule: dict[str, Any], results: dict[str, Any]
) -> list[float]:
    """Score the share of duplicated rows (optionally over a column subset)."""
    threshold = rule.get("threshold", 0.01)
    duplicate_count = int(df.duplicated(subset=rule.get("columns")).sum())
    duplicate_ratio = duplicate_count / len(df)
    passed = duplicate_ratio <= threshold
    score = (1 - duplicate_ratio) * 100

    results["checks"].append(
        {
            "type": "duplicates",
            "duplicate_rows": duplicate_count,
            "duplicate_ratio": round(duplicate_ratio, 4),
            "threshold": threshold,
            "passed": passed,
            "score": round(score, 2),
        }
    )

    if not passed:
        results["issues"].append(
            {
                "type": "duplicate_rows",
                "message": f"Found {duplicate_count} duplicate rows ({round(duplicate_ratio*100, 2)}%)",
                "severity": "high" if duplicate_ratio > 0.1 else "medium",
            }
        )
        results["recommendations"].append(
            "Consider removing duplicate rows using the remove_duplicates tool"
        )

    return [score]


def _quality_uniqueness(
    df: pd.DataFrame, rule: dict[str, Any], results: dict[str, Any]
) -> list[float]:
    """Score how close one column is to holding only distinct values."""
    column = rule.get("column")
    # Compare against None so a falsy but valid label (e.g. 0) is still checked.
    if column is None or column not in df.columns:
        return []

    unique_count = int(df[column].nunique())
    unique_ratio = unique_count / len(df)

    if rule.get("expected_unique", True):
        passed = unique_ratio >= 0.99
        score = unique_ratio * 100
    else:
        # Uniqueness is not required, so the column cannot fail this check.
        passed = True
        score = 100

    results["checks"].append(
        {
            "type": "uniqueness",
            "column": column,
            "unique_values": unique_count,
            "unique_ratio": round(unique_ratio, 4),
            "passed": passed,
            "score": round(score, 2),
        }
    )

    if not passed:
        results["issues"].append(
            {
                "type": "non_unique_values",
                "column": column,
                "message": f"Column '{column}' expected to be unique but has duplicates",
                "severity": "high",
            }
        )

    return [score]


def _quality_data_types(
    df: pd.DataFrame, rule: dict[str, Any], results: dict[str, Any]
) -> list[float]:
    """Flag columns mixing Python types or storing numbers as strings."""
    scores: list[float] = []

    for col in df.columns:
        col_data = df[col].dropna()
        if len(col_data) == 0:
            continue

        # More than one Python type among the values means mixed content.
        mixed_types = len(col_data.apply(type).unique()) > 1

        # Share of values in an object column that look like plain numbers.
        if col_data.dtype == object:
            numeric_strings = col_data.astype(str).str.match(r"^-?\d+\.?\d*$").sum()
            numeric_ratio = float(numeric_strings / len(col_data))
        else:
            numeric_ratio = 0.0
        mostly_numeric_strings = numeric_ratio > 0.9

        score = 50 if mixed_types else 100

        results["checks"].append(
            {
                "type": "data_type_consistency",
                "column": col,
                "dtype": str(df[col].dtype),
                "mixed_types": mixed_types,
                "numeric_strings": mostly_numeric_strings,
                "score": score,
            }
        )

        if mostly_numeric_strings:
            results["recommendations"].append(
                f"Column '{col}' appears to contain numeric data stored as strings. "
                f"Consider converting to numeric type using change_column_type tool"
            )

        scores.append(score)

    return scores


def _quality_outliers(
    df: pd.DataFrame, rule: dict[str, Any], results: dict[str, Any]
) -> list[float]:
    """Score each numeric column by its share of values outside the 1.5*IQR fences."""
    threshold = rule.get("threshold", 0.05)
    row_count = len(df)
    scores: list[float] = []

    for col in df.select_dtypes(include=[np.number]).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        outlier_count = int(((df[col] < lower_bound) | (df[col] > upper_bound)).sum())
        outlier_ratio = outlier_count / row_count
        passed = outlier_ratio <= threshold
        score = (1 - min(outlier_ratio, 1)) * 100

        results["checks"].append(
            {
                "type": "outliers",
                "column": col,
                "outlier_count": outlier_count,
                "outlier_ratio": round(outlier_ratio, 4),
                "threshold": threshold,
                "passed": passed,
                "score": round(score, 2),
            }
        )

        if not passed:
            results["issues"].append(
                {
                    "type": "outliers",
                    "column": col,
                    "message": f"Column '{col}' has {outlier_count} outliers ({round(outlier_ratio*100, 2)}%)",
                    "severity": "medium",
                }
            )

        scores.append(score)

    return scores


def _quality_consistency(
    df: pd.DataFrame, rule: dict[str, Any], results: dict[str, Any]
) -> list[float]:
    """Check that the first of two datetime columns never exceeds the second."""
    columns = rule.get("columns") or []

    # Without explicit columns, fall back to the frame's datetime columns.
    date_cols = df.select_dtypes(include=["datetime64"]).columns
    if len(date_cols) >= 2 and not columns:
        columns = date_cols.tolist()

    if len(columns) < 2:
        return []

    col1, col2 = columns[0], columns[1]
    if col1 not in df.columns or col2 not in df.columns:
        return []

    # Only datetime pairs are compared (e.g. start_date <= end_date).
    if not (
        pd.api.types.is_datetime64_any_dtype(df[col1])
        and pd.api.types.is_datetime64_any_dtype(df[col2])
    ):
        return []

    row_count = len(df)
    inconsistent = int((df[col1] > df[col2]).sum())
    consistency_ratio = 1 - (inconsistent / row_count)
    passed = consistency_ratio >= 0.99
    score = consistency_ratio * 100

    results["checks"].append(
        {
            "type": "consistency",
            "columns": [col1, col2],
            "consistent_rows": row_count - inconsistent,
            "inconsistent_rows": inconsistent,
            "consistency_ratio": round(consistency_ratio, 4),
            "passed": passed,
            "score": round(score, 2),
        }
    )

    if not passed:
        results["issues"].append(
            {
                "type": "data_inconsistency",
                "columns": [col1, col2],
                "message": f"Found {inconsistent} rows where {col1} > {col2}",
                "severity": "high",
            }
        )

    return [score]


# Rule "type" -> handler. Each handler appends to the shared results dict and
# returns the scores it produced; unknown rule types are ignored.
_QUALITY_RULE_HANDLERS = {
    "completeness": _quality_completeness,
    "duplicates": _quality_duplicates,
    "uniqueness": _quality_uniqueness,
    "data_types": _quality_data_types,
    "outliers": _quality_outliers,
    "consistency": _quality_consistency,
}


async def check_data_quality(
    session_id: str, rules: list[dict[str, Any]] | None = None, ctx: Context = None
) -> dict[str, Any]:
    """
    Check data quality based on predefined or custom rules.

    Args:
        session_id: Session identifier
        rules: Custom quality rules to check. If None or empty, uses the
            default rules (completeness, duplicates, data_types, outliers,
            consistency).
            Example: [
                {"type": "completeness", "threshold": 0.95},
                {"type": "uniqueness", "column": "id"},
                {"type": "consistency", "columns": ["start_date", "end_date"]}
            ]
        ctx: FastMCP context

    Returns:
        Dict with quality check results. On success it holds "success": True
        and "quality_results" (overall_score, quality_level, checks, issues,
        recommendations), all expressed in native Python types. On failure,
        including a dataset with no rows (every ratio would divide by zero),
        it holds "success": False and an "error" message; exceptions are
        logged and reported this way, not raised.
    """
    try:
        manager = get_session_manager()
        session = manager.get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df
        if len(df) == 0:
            return {"success": False, "error": "Cannot check data quality: no rows loaded"}

        quality_results: dict[str, Any] = {
            "overall_score": 100.0,
            "checks": [],
            "issues": [],
            "recommendations": [],
        }

        # Default rules if none provided
        if not rules:
            rules = [
                {"type": "completeness", "threshold": 0.95},
                {"type": "duplicates", "threshold": 0.01},
                {"type": "data_types"},
                {"type": "outliers", "threshold": 0.05},
                {"type": "consistency"},
            ]

        scores: list[float] = []
        for rule in rules:
            handler = _QUALITY_RULE_HANDLERS.get(rule.get("type"))
            if handler is not None:
                scores.extend(handler(df, rule, quality_results))

        # The overall score is the plain mean of all individual check scores;
        # with no checks run it stays at the initial 100.0.
        if scores:
            quality_results["overall_score"] = round(sum(scores) / len(scores), 2)

        # Determine quality level
        overall_score = quality_results["overall_score"]
        if overall_score >= 95:
            quality_results["quality_level"] = "Excellent"
        elif overall_score >= 85:
            quality_results["quality_level"] = "Good"
        elif overall_score >= 70:
            quality_results["quality_level"] = "Fair"
        else:
            quality_results["quality_level"] = "Poor"

        # Add a general recommendation only when no specific one was produced
        if not quality_results["recommendations"] and overall_score < 85:
            quality_results["recommendations"].append(
                "Consider running profile_data to get a comprehensive overview of data issues"
            )

        session.record_operation(
            OperationType.QUALITY_CHECK,
            {
                "rules_count": len(rules),
                "overall_score": overall_score,
                "issues_count": len(quality_results["issues"]),
            },
        )

        return {"success": True, "quality_results": quality_results}

    except Exception as e:
        logger.error("Error checking data quality: %s", e)
        return {"success": False, "error": str(e)}
571
+
572
+
573
def _count_consecutive_runs(flags: np.ndarray) -> int:
    """Count the runs of two or more consecutive True values in a boolean array."""
    flags = np.asarray(flags, dtype=bool)
    if flags.size < 2:
        return 0
    # Pad with False so runs touching either end still get a start and an end.
    padded = np.concatenate(([False], flags, [False])).astype(np.int8)
    edges = np.diff(padded)
    run_starts = np.flatnonzero(edges == 1)
    run_ends = np.flatnonzero(edges == -1)
    return int(np.count_nonzero(run_ends - run_starts >= 2))


def _statistical_anomalies(
    df: pd.DataFrame, target_cols: list[str], sensitivity: float, summary: dict[str, Any]
) -> dict[str, Any]:
    """Flag numeric outliers by z-score or IQR fence; updates summary in place."""
    found: dict[str, Any] = {}
    # Higher sensitivity lowers both thresholds, so more values are flagged.
    z_threshold = 3 * (1 - sensitivity + 0.5)
    iqr_factor = 1.5 * (2 - sensitivity)

    for col in df[target_cols].select_dtypes(include=[np.number]).columns:
        col_data = df[col].dropna()
        if len(col_data) == 0:
            continue

        # Z-score method
        z_scores = np.abs((col_data - col_data.mean()) / col_data.std())

        # IQR method
        q1 = col_data.quantile(0.25)
        q3 = col_data.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - iqr_factor * iqr
        upper = q3 + iqr_factor * iqr

        # Combine both methods. Every mask is built on the NaN-free column, so
        # it matches col_data.index in length; indexing the full frame's index
        # with it would fail whenever the column has missing values.
        flagged = (z_scores > z_threshold) | (col_data < lower) | (col_data > upper)
        flagged_indices = col_data.index[flagged.to_numpy()].tolist()
        if not flagged_indices:
            continue

        found[col] = {
            "anomaly_count": len(flagged_indices),
            "anomaly_indices": flagged_indices[:100],
            "anomaly_values": col_data[flagged].head(10).tolist(),
            "mean": float(col_data.mean()),
            "std": float(col_data.std()),
            "lower_bound": float(lower),
            "upper_bound": float(upper),
        }

        summary["total_anomalies"] += len(flagged_indices)
        summary["affected_rows"].update(flagged_indices)
        summary["affected_columns"].append(col)

    return found


def _pattern_anomalies(
    df: pd.DataFrame, target_cols: list[str], sensitivity: float, summary: dict[str, Any]
) -> dict[str, Any]:
    """Flag rare values and letter-case outliers in string columns; updates summary in place."""
    found: dict[str, Any] = {}
    # Maximum relative frequency for a value to count as rare.
    rarity_threshold = (1 - sensitivity) * 0.01

    for col in target_cols:
        if not (df[col].dtype == object or pd.api.types.is_string_dtype(df[col])):
            continue

        col_data = df[col].dropna()
        if len(col_data) == 0:
            continue

        value_counts = col_data.value_counts()
        rare_values = value_counts[value_counts / len(col_data) < rarity_threshold]
        # Format checks only run for columns that have at least one rare value.
        if len(rare_values) == 0:
            continue

        rare_indices = df.index[df[col].isin(rare_values.index).to_numpy()].tolist()

        # Infer the dominant letter case from the ten most frequent values;
        # it needs more than ten distinct values and a clear (>7/10) majority.
        common_pattern = None
        if len(value_counts) > 10:
            top_values = value_counts.head(10).index
            upper_count = sum(1 for v in top_values if str(v).isupper())
            lower_count = sum(1 for v in top_values if str(v).islower())
            if upper_count > 7:
                common_pattern = "uppercase"
            elif lower_count > 7:
                common_pattern = "lowercase"

        if common_pattern == "uppercase":
            format_anomalies = [idx for idx, val in col_data.items() if not str(val).isupper()]
        elif common_pattern == "lowercase":
            format_anomalies = [idx for idx, val in col_data.items() if not str(val).islower()]
        else:
            format_anomalies = []

        # Order-preserving de-duplication keeps the output deterministic.
        flagged_indices = list(dict.fromkeys(rare_indices + format_anomalies))
        if not flagged_indices:
            continue

        found[col] = {
            "anomaly_count": len(flagged_indices),
            "rare_values": rare_values.head(10).to_dict(),
            "anomaly_indices": flagged_indices[:100],
            "common_pattern": common_pattern,
        }

        summary["total_anomalies"] += len(flagged_indices)
        summary["affected_rows"].update(flagged_indices)
        if col not in summary["affected_columns"]:
            summary["affected_columns"].append(col)

    return found


def _missing_anomalies(
    df: pd.DataFrame, target_cols: list[str], summary: dict[str, Any]
) -> dict[str, Any]:
    """Flag partially-missing columns with suspicious gaps; updates summary in place."""
    found: dict[str, Any] = {}
    row_count = len(df)

    for col in target_cols:
        null_mask = df[col].isna()
        null_count = int(null_mask.sum())
        if null_count == 0:
            continue

        null_ratio = null_count / row_count
        # Only partially missing columns are inspected.
        if not 0 < null_ratio < 0.5:
            continue

        null_indices = df.index[null_mask.to_numpy()].tolist()

        # Clusters are runs of adjacent missing rows, counted by row position
        # so the check works for any index type.
        cluster_count = _count_consecutive_runs(null_mask.to_numpy())

        # Suspicious when the nulls are split across many separate clusters,
        # or when the missing share falls in the 10-30% band.
        is_clustered = cluster_count > null_count * 0.3
        if is_clustered or 0.1 < null_ratio < 0.3:
            found[col] = {
                "missing_count": null_count,
                "missing_ratio": round(null_ratio, 4),
                "missing_indices": null_indices[:100],
                "sequential_clusters": cluster_count,
                "pattern": "clustered" if cluster_count else "random",
            }
            summary["affected_columns"].append(col)

    return found


async def find_anomalies(
    session_id: str,
    columns: list[str] | None = None,
    sensitivity: float = 0.95,
    methods: list[str] | None = None,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Find anomalies in the data using multiple detection methods.

    Args:
        session_id: Session identifier
        columns: Columns to check (None for all)
        sensitivity: Detection sensitivity (0.0 to 1.0, higher = more sensitive)
        methods: Detection methods to use
            (default: ["statistical", "pattern", "missing"])
        ctx: FastMCP context

    Returns:
        Dict with anomaly detection results. On success it holds "success":
        True, "anomalies" (a "summary" plus "by_column" and "by_method"
        breakdowns), "columns_analyzed", "methods_used" and "sensitivity".
        The summary's "affected_rows" list is capped at 1000 entries, but the
        anomaly score uses the full count. On failure it holds "success":
        False and an "error" message; exceptions are logged and reported this
        way, not raised.
    """
    try:
        manager = get_session_manager()
        session = manager.get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df

        if columns:
            missing_cols = [col for col in columns if col not in df.columns]
            if missing_cols:
                return {"success": False, "error": f"Columns not found: {missing_cols}"}
            target_cols = columns
        else:
            target_cols = df.columns.tolist()

        if not methods:
            methods = ["statistical", "pattern", "missing"]

        summary: dict[str, Any] = {
            "total_anomalies": 0,
            "affected_rows": set(),
            "affected_columns": [],
        }
        anomalies: dict[str, Any] = {"summary": summary, "by_column": {}, "by_method": {}}

        # Run each requested detector; only non-empty results are reported.
        if "statistical" in methods:
            statistical = _statistical_anomalies(df, target_cols, sensitivity, summary)
            if statistical:
                anomalies["by_method"]["statistical"] = statistical

        if "pattern" in methods:
            pattern = _pattern_anomalies(df, target_cols, sensitivity, summary)
            if pattern:
                anomalies["by_method"]["pattern"] = pattern

        if "missing" in methods:
            missing = _missing_anomalies(df, target_cols, summary)
            if missing:
                anomalies["by_method"]["missing"] = missing

        # Organize anomalies by column
        for method_name, method_anomalies in anomalies["by_method"].items():
            for col, col_anomalies in method_anomalies.items():
                anomalies["by_column"].setdefault(col, {})[method_name] = col_anomalies

        # Take the true row count before truncating, then convert the set to a
        # list for JSON serialization.
        affected_row_count = len(summary["affected_rows"])
        summary["affected_rows"] = list(summary["affected_rows"])[:1000]
        summary["affected_columns"] = list(dict.fromkeys(summary["affected_columns"]))

        # Anomaly score: affected rows x affected columns as a share of all
        # analyzed cells, capped at 100. An empty selection scores 0.
        total_cells = len(df) * len(target_cols)
        anomaly_cells = affected_row_count * len(summary["affected_columns"])
        if total_cells > 0:
            anomaly_score = min(anomaly_cells / total_cells, 1.0) * 100
        else:
            anomaly_score = 0.0

        summary["anomaly_score"] = round(anomaly_score, 2)
        summary["severity"] = (
            "high" if anomaly_score > 10 else "medium" if anomaly_score > 5 else "low"
        )

        session.record_operation(
            OperationType.ANOMALY_DETECTION,
            {
                "methods": methods,
                "sensitivity": sensitivity,
                "anomalies_found": summary["total_anomalies"],
            },
        )

        return {
            "success": True,
            "anomalies": anomalies,
            "columns_analyzed": target_cols,
            "methods_used": methods,
            "sensitivity": sensitivity,
        }

    except Exception as e:
        logger.error("Error finding anomalies: %s", e)
        return {"success": False, "error": str(e)}