@mseep/csv-editor 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/.github/ISSUE_TEMPLATE/bug_report.md +53 -0
  2. package/.github/ISSUE_TEMPLATE/feature_request.md +38 -0
  3. package/.github/workflows/deploy-docs.yml +62 -0
  4. package/.github/workflows/publish-github.yml +52 -0
  5. package/.github/workflows/publish.yml +44 -0
  6. package/.github/workflows/test.yml +32 -0
  7. package/.pre-commit-config.yaml +157 -0
  8. package/ALTERNATIVE_PUBLISHING.md +175 -0
  9. package/ARCHITECTURE.md +1011 -0
  10. package/CHANGELOG.md +99 -0
  11. package/CODE_OF_CONDUCT.md +41 -0
  12. package/CONTRIBUTING.md +427 -0
  13. package/Dockerfile +22 -0
  14. package/LICENSE +21 -0
  15. package/MCP_CONFIG.md +505 -0
  16. package/PUBLISHING.md +210 -0
  17. package/README.md +400 -0
  18. package/SECURITY.md +61 -0
  19. package/docs/README.md +41 -0
  20. package/docs/blog/2019-05-28-first-blog-post.md +12 -0
  21. package/docs/blog/2019-05-29-long-blog-post.md +44 -0
  22. package/docs/blog/2021-08-01-mdx-blog-post.mdx +24 -0
  23. package/docs/blog/2021-08-26-welcome/docusaurus-plushie-banner.jpeg +0 -0
  24. package/docs/blog/2021-08-26-welcome/index.md +29 -0
  25. package/docs/blog/authors.yml +25 -0
  26. package/docs/blog/tags.yml +19 -0
  27. package/docs/docs/api/overview.md +183 -0
  28. package/docs/docs/installation.md +252 -0
  29. package/docs/docs/intro.md +87 -0
  30. package/docs/docs/tutorial-basics/_category_.json +8 -0
  31. package/docs/docs/tutorial-basics/congratulations.md +23 -0
  32. package/docs/docs/tutorial-basics/create-a-blog-post.md +34 -0
  33. package/docs/docs/tutorial-basics/create-a-document.md +57 -0
  34. package/docs/docs/tutorial-basics/create-a-page.md +43 -0
  35. package/docs/docs/tutorial-basics/deploy-your-site.md +31 -0
  36. package/docs/docs/tutorial-basics/markdown-features.mdx +152 -0
  37. package/docs/docs/tutorial-extras/_category_.json +7 -0
  38. package/docs/docs/tutorial-extras/img/docsVersionDropdown.png +0 -0
  39. package/docs/docs/tutorial-extras/img/localeDropdown.png +0 -0
  40. package/docs/docs/tutorial-extras/manage-docs-versions.md +55 -0
  41. package/docs/docs/tutorial-extras/translate-your-site.md +88 -0
  42. package/docs/docs/tutorials/quickstart.md +365 -0
  43. package/docs/docusaurus.config.ts +163 -0
  44. package/docs/package-lock.json +17493 -0
  45. package/docs/package.json +48 -0
  46. package/docs/sidebars.ts +33 -0
  47. package/docs/src/components/HomepageFeatures/index.tsx +71 -0
  48. package/docs/src/components/HomepageFeatures/styles.module.css +11 -0
  49. package/docs/src/css/custom.css +30 -0
  50. package/docs/src/pages/index.module.css +23 -0
  51. package/docs/src/pages/index.tsx +44 -0
  52. package/docs/src/pages/markdown-page.md +7 -0
  53. package/docs/static/.nojekyll +0 -0
  54. package/docs/static/img/docusaurus-social-card.jpg +0 -0
  55. package/docs/static/img/docusaurus.png +0 -0
  56. package/docs/static/img/favicon.ico +0 -0
  57. package/docs/static/img/logo.svg +1 -0
  58. package/docs/static/img/undraw_docusaurus_mountain.svg +171 -0
  59. package/docs/static/img/undraw_docusaurus_react.svg +170 -0
  60. package/docs/static/img/undraw_docusaurus_tree.svg +40 -0
  61. package/docs/tsconfig.json +8 -0
  62. package/examples/README.md +48 -0
  63. package/examples/auto_save_demo.py +206 -0
  64. package/examples/auto_save_overwrite.py +201 -0
  65. package/examples/basic_usage.py +135 -0
  66. package/examples/demo.py +139 -0
  67. package/examples/history_demo.py +317 -0
  68. package/examples/test_default_autosave.py +124 -0
  69. package/examples/update_consignee_example.py +179 -0
  70. package/package.json +51 -0
  71. package/plans/2026-04-19-fastmcp3-migration-plan.md +1045 -0
  72. package/pyproject.toml +331 -0
  73. package/requirements-dev.txt +30 -0
  74. package/requirements.txt +22 -0
  75. package/scripts/publish.py +67 -0
  76. package/smithery.yaml +15 -0
  77. package/specs/2026-04-19-fastmcp3-migration-design.md +243 -0
  78. package/src/csv_editor/__init__.py +8 -0
  79. package/src/csv_editor/models/__init__.py +39 -0
  80. package/src/csv_editor/models/auto_save.py +246 -0
  81. package/src/csv_editor/models/csv_session.py +468 -0
  82. package/src/csv_editor/models/data_models.py +244 -0
  83. package/src/csv_editor/models/history_manager.py +456 -0
  84. package/src/csv_editor/prompts/__init__.py +0 -0
  85. package/src/csv_editor/prompts/data_prompts.py +13 -0
  86. package/src/csv_editor/resources/__init__.py +0 -0
  87. package/src/csv_editor/resources/csv_resources.py +22 -0
  88. package/src/csv_editor/server.py +640 -0
  89. package/src/csv_editor/tools/__init__.py +5 -0
  90. package/src/csv_editor/tools/analytics.py +700 -0
  91. package/src/csv_editor/tools/auto_save_operations.py +235 -0
  92. package/src/csv_editor/tools/data_operations.py +3 -0
  93. package/src/csv_editor/tools/history_operations.py +315 -0
  94. package/src/csv_editor/tools/io_operations.py +431 -0
  95. package/src/csv_editor/tools/transformations.py +663 -0
  96. package/src/csv_editor/tools/validation.py +822 -0
  97. package/src/csv_editor/utils/__init__.py +0 -0
  98. package/src/csv_editor/utils/validators.py +205 -0
  99. package/tests/README.md +65 -0
  100. package/tests/__init__.py +7 -0
  101. package/tests/conftest.py +50 -0
  102. package/tests/test_auto_save.py +378 -0
  103. package/tests/test_basic.py +103 -0
  104. package/tests/test_integration.py +356 -0
  105. package/tests/test_server_boot.py +50 -0
  106. package/tests/test_settings.py +184 -0
@@ -0,0 +1,700 @@
1
+ """Analytics tools for CSV data analysis."""
2
+
3
+ import logging
4
+ from typing import Any
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from fastmcp import Context
9
+
10
+ from ..models.csv_session import get_session_manager
11
+ from ..models.data_models import OperationType
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
async def get_statistics(
    session_id: str,
    columns: list[str] | None = None,
    include_percentiles: bool = True,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Get statistical summary of numerical columns.

    Only numeric columns are summarized. A name in ``columns`` that does not
    exist is an error, but an existing non-numeric column is skipped silently.
    Every statistic except ``null_count`` is computed over the non-null values
    of the column; measures that are undefined for the available data come
    back as NaN, as produced by pandas.

    Args:
        session_id: Session identifier
        columns: Specific columns to analyze (None for all numeric)
        include_percentiles: Include the 25%/50%/75% quantiles and the IQR
        ctx: FastMCP context (not used by this function)

    Returns:
        On success, a dict with ``success=True``, ``statistics`` (per-column
        dict of measures), ``columns_analyzed`` and ``total_rows``.
        On failure, ``{"success": False, "error": <message>}``.
    """
    try:
        session = get_session_manager().get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df

        # Select columns to analyze
        if columns:
            missing_cols = [col for col in columns if col not in df.columns]
            if missing_cols:
                return {"success": False, "error": f"Columns not found: {missing_cols}"}
            numeric_df = df[columns].select_dtypes(include=[np.number])
        else:
            numeric_df = df.select_dtypes(include=[np.number])

        if numeric_df.empty:
            return {"success": False, "error": "No numeric columns found"}

        # Calculate statistics
        stats = {}
        for col in numeric_df.columns:
            col_data = numeric_df[col].dropna()

            col_stats = {
                "count": int(col_data.count()),
                # Nulls are counted on the original column, before dropna().
                "null_count": int(df[col].isna().sum()),
                "mean": float(col_data.mean()),
                "std": float(col_data.std()),
                "min": float(col_data.min()),
                "max": float(col_data.max()),
                "sum": float(col_data.sum()),
                "variance": float(col_data.var()),
                "skewness": float(col_data.skew()),
                "kurtosis": float(col_data.kurt()),
            }

            if include_percentiles:
                # One quantile call yields all three quartiles, in request order.
                q25, q50, q75 = (float(q) for q in col_data.quantile([0.25, 0.5, 0.75]))
                col_stats["25%"] = q25
                col_stats["50%"] = q50
                col_stats["75%"] = q75
                col_stats["iqr"] = q75 - q25

            stats[col] = col_stats

        analyzed = list(stats.keys())
        session.record_operation(
            OperationType.ANALYZE, {"type": "statistics", "columns": analyzed}
        )

        return {
            "success": True,
            "statistics": stats,
            "columns_analyzed": list(analyzed),
            "total_rows": len(df),
        }

    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising.
        logger.error(f"Error getting statistics: {e!s}")
        return {"success": False, "error": str(e)}
97
+
98
+
99
async def get_column_statistics(
    session_id: str, column: str, ctx: Context = None
) -> dict[str, Any]:
    """
    Get detailed statistics for a specific column.

    Numeric columns get distribution measures; every other dtype is treated
    as categorical and gets frequency information, plus string-length
    statistics when the dtype is ``object``. Boolean columns are reported as
    categorical: the numeric branch subtracts min from max, which NumPy
    rejects for booleans.

    Args:
        session_id: Session identifier
        column: Column name to analyze
        ctx: FastMCP context (not used by this function)

    Returns:
        On success, ``{"success": True, "statistics": {...}}``.
        On failure, ``{"success": False, "error": <message>}``.
    """
    try:
        session = get_session_manager().get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df

        if column not in df.columns:
            return {"success": False, "error": f"Column '{column}' not found"}

        col_data = df[column]
        total = len(col_data)
        null_count = int(col_data.isna().sum())
        unique_count = int(col_data.nunique())

        def _percentage(part: int) -> float:
            # A zero-row column has no meaningful ratio; report 0.0 rather
            # than dividing by zero.
            return round(part / total * 100, 2) if total else 0.0

        result = {
            "column": column,
            "dtype": str(col_data.dtype),
            "total_count": total,
            "null_count": null_count,
            "null_percentage": _percentage(null_count),
            "unique_count": unique_count,
            "unique_percentage": _percentage(unique_count),
        }

        is_numeric = pd.api.types.is_numeric_dtype(
            col_data
        ) and not pd.api.types.is_bool_dtype(col_data)

        # Numeric column statistics
        if is_numeric:
            non_null = col_data.dropna()
            modes = non_null.mode()
            q25, q50, q75 = (float(q) for q in non_null.quantile([0.25, 0.5, 0.75]))
            result.update(
                {
                    "type": "numeric",
                    "mean": float(non_null.mean()),
                    "median": float(non_null.median()),
                    # mode() may return several values; the first one is reported.
                    "mode": float(modes.iloc[0]) if len(modes) > 0 else None,
                    "std": float(non_null.std()),
                    "variance": float(non_null.var()),
                    "min": float(non_null.min()),
                    "max": float(non_null.max()),
                    "range": float(non_null.max() - non_null.min()),
                    "sum": float(non_null.sum()),
                    "skewness": float(non_null.skew()),
                    "kurtosis": float(non_null.kurt()),
                    "25%": q25,
                    "50%": q50,
                    "75%": q75,
                    "iqr": q75 - q25,
                    "zero_count": int((col_data == 0).sum()),
                    "positive_count": int((col_data > 0).sum()),
                    "negative_count": int((col_data < 0).sum()),
                }
            )

        # Categorical column statistics
        else:
            value_counts = col_data.value_counts()
            has_values = len(value_counts) > 0
            top_values = value_counts.head(10).to_dict()

            result.update(
                {
                    "type": "categorical",
                    "most_frequent": str(value_counts.index[0]) if has_values else None,
                    "most_frequent_count": int(value_counts.iloc[0]) if has_values else 0,
                    "top_10_values": {str(k): int(v) for k, v in top_values.items()},
                }
            )

            # String-specific stats
            if col_data.dtype == "object":
                str_data = col_data.dropna().astype(str)
                if len(str_data) > 0:
                    str_lengths = str_data.str.len()
                    result["string_stats"] = {
                        "min_length": int(str_lengths.min()),
                        "max_length": int(str_lengths.max()),
                        "mean_length": round(str_lengths.mean(), 2),
                        "empty_string_count": int((str_data == "").sum()),
                    }

        session.record_operation(
            OperationType.ANALYZE, {"type": "column_statistics", "column": column}
        )

        return {"success": True, "statistics": result}

    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising.
        logger.error(f"Error getting column statistics: {e!s}")
        return {"success": False, "error": str(e)}
200
+
201
+
202
async def get_correlation_matrix(
    session_id: str,
    method: str = "pearson",
    columns: list[str] | None = None,
    min_correlation: float | None = None,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Calculate correlation matrix for numeric columns.

    Args:
        session_id: Session identifier
        method: Correlation method ('pearson', 'spearman', 'kendall')
        columns: Specific columns to include (None for all numeric)
        min_correlation: When given, off-diagonal entries whose absolute
            value is below this threshold are left out of the matrix
        ctx: FastMCP context

    Returns:
        Dict with the (possibly filtered) correlation matrix and the list of
        column pairs whose absolute correlation is at least 0.7
    """
    try:
        session = get_session_manager().get_session(session_id)
        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        frame = session.df

        # Narrow to the requested columns first, then keep the numeric ones.
        if columns:
            absent = [name for name in columns if name not in frame.columns]
            if absent:
                return {"success": False, "error": f"Columns not found: {absent}"}
            numeric_frame = frame[columns].select_dtypes(include=[np.number])
        else:
            numeric_frame = frame.select_dtypes(include=[np.number])

        if numeric_frame.empty:
            return {"success": False, "error": "No numeric columns found"}

        if len(numeric_frame.columns) < 2:
            return {"success": False, "error": "Need at least 2 numeric columns for correlation"}

        if method not in ["pearson", "spearman", "kendall"]:
            return {"success": False, "error": f"Invalid method: {method}"}

        matrix = numeric_frame.corr(method=method)
        labels = list(matrix.columns)

        # Nested mapping row -> column -> coefficient. NaN cells are dropped;
        # the diagonal survives the threshold filter.
        correlations = {
            row: {
                col: round(float(matrix.loc[row, col]), 4)
                for col in labels
                if not pd.isna(matrix.loc[row, col])
                and (
                    min_correlation is None
                    or abs(matrix.loc[row, col]) >= min_correlation
                    or row == col
                )
            }
            for row in labels
        }

        # Each unordered pair once (upper triangle), strong correlations only.
        strong_pairs = [
            {
                "column1": first,
                "column2": second,
                "correlation": round(float(matrix.loc[first, second]), 4),
            }
            for position, first in enumerate(labels)
            for second in labels[position + 1 :]
            if not pd.isna(matrix.loc[first, second])
            and abs(matrix.loc[first, second]) >= 0.7
        ]
        high_correlations = sorted(
            strong_pairs, key=lambda pair: abs(pair["correlation"]), reverse=True
        )

        session.record_operation(
            OperationType.ANALYZE,
            {"type": "correlation", "method": method, "columns": list(labels)},
        )

        return {
            "success": True,
            "method": method,
            "correlation_matrix": correlations,
            "high_correlations": high_correlations,
            "columns_analyzed": list(labels),
        }

    except Exception as e:
        logger.error(f"Error calculating correlation: {e!s}")
        return {"success": False, "error": str(e)}
294
+
295
+
296
async def group_by_aggregate(
    session_id: str,
    group_by: list[str],
    aggregations: dict[str, str | list[str]],
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Group data and apply aggregation functions.

    The grouped result replaces the session's DataFrame.

    Args:
        session_id: Session identifier
        group_by: Columns to group by
        aggregations: Dict mapping column names to aggregation functions
            e.g., {"sales": ["sum", "mean"], "quantity": "sum"}
        ctx: FastMCP context

    Returns:
        Dict with grouped data
    """
    try:
        session = get_session_manager().get_session(session_id)
        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        source = session.df

        # Both the grouping keys and the aggregated columns must exist.
        unknown_keys = [name for name in group_by if name not in source.columns]
        if unknown_keys:
            return {"success": False, "error": f"Group by columns not found: {unknown_keys}"}

        unknown_targets = [name for name in aggregations.keys() if name not in source.columns]
        if unknown_targets:
            return {"success": False, "error": f"Aggregation columns not found: {unknown_targets}"}

        # Wrap single function names in a list so every column maps to a list.
        spec = {
            target: [funcs] if isinstance(funcs, str) else funcs
            for target, funcs in aggregations.items()
        }

        grouped = source.groupby(group_by).agg(spec)

        def _flat_name(label):
            # (column, function) -> "column_function"; bare column if no function part.
            return "_".join(label).strip() if label[1] else label[0]

        grouped.columns = [_flat_name(label) for label in grouped.columns.values]

        # Turn the group keys back into ordinary columns.
        result_df = grouped.reset_index()

        shape = {"rows": len(result_df), "columns": len(result_df.columns)}
        result = {
            "data": result_df.to_dict(orient="records"),
            "shape": shape,
            "columns": result_df.columns.tolist(),
        }

        # Store grouped data in session
        session.df = result_df
        session.record_operation(
            OperationType.GROUP_BY,
            {"group_by": group_by, "aggregations": aggregations, "result_shape": shape},
        )

        return {
            "success": True,
            "grouped_data": result,
            "group_by": group_by,
            "aggregations": aggregations,
        }

    except Exception as e:
        logger.error(f"Error in group by aggregate: {e!s}")
        return {"success": False, "error": str(e)}
378
+
379
+
380
async def get_value_counts(
    session_id: str,
    column: str,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    top_n: int | None = None,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Get value counts for a column.

    Missing values are counted too and appear under the key "NaN".

    Args:
        session_id: Session identifier
        column: Column name to count values
        normalize: Return proportions instead of counts
        sort: Sort by frequency
        ascending: Sort order
        top_n: Return only top N values
        ctx: FastMCP context

    Returns:
        Dict with value counts
    """
    try:
        session = get_session_manager().get_session(session_id)
        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        frame = session.df
        if column not in frame.columns:
            return {"success": False, "error": f"Column '{column}' not found"}

        series = frame[column]

        # Summary figures for the whole column, independent of top_n.
        distinct = series.nunique(dropna=False)
        missing = series.isna().sum()

        tally = series.value_counts(
            normalize=normalize, sort=sort, ascending=ascending, dropna=False
        )
        if top_n:
            tally = tally.head(top_n)

        # Keys are stringified; proportions stay floats, counts become ints.
        counts_dict = {
            ("NaN" if pd.isna(label) else str(label)): (
                float(amount) if normalize else int(amount)
            )
            for label, amount in tally.items()
        }

        session.record_operation(
            OperationType.ANALYZE,
            {"type": "value_counts", "column": column, "normalize": normalize, "top_n": top_n},
        )

        return {
            "success": True,
            "column": column,
            "value_counts": counts_dict,
            "unique_values": int(distinct),
            "null_count": int(missing),
            "total_count": len(frame),
            "normalized": normalize,
        }

    except Exception as e:
        logger.error(f"Error getting value counts: {e!s}")
        return {"success": False, "error": str(e)}
453
+
454
+
455
def _iqr_outlier_info(series: pd.Series, threshold: float) -> dict[str, Any]:
    """Summarize the values of ``series`` that fall outside its IQR fences."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - threshold * iqr
    upper_bound = q3 + threshold * iqr

    outlier_mask = (series < lower_bound) | (series > upper_bound)
    outlier_indices = series.index[outlier_mask].tolist()

    return {
        "method": "IQR",
        "lower_bound": float(lower_bound),
        "upper_bound": float(upper_bound),
        "outlier_count": len(outlier_indices),
        "outlier_percentage": round(len(outlier_indices) / len(series) * 100, 2),
        "outlier_indices": outlier_indices[:100],  # Limit to first 100
        "q1": float(q1),
        "q3": float(q3),
        "iqr": float(iqr),
    }


def _zscore_outlier_info(series: pd.Series, threshold: float) -> dict[str, Any]:
    """Summarize the values of ``series`` whose absolute z-score exceeds ``threshold``."""
    mean = series.mean()
    std = series.std()

    # With a zero or undefined std the scores are NaN/inf-free NaNs for the
    # 0/0 case, and NaN never compares greater than the threshold.
    z_scores = np.abs((series - mean) / std)
    outlier_indices = series.index[z_scores > threshold].tolist()

    return {
        "method": "Z-Score",
        "threshold": threshold,
        "outlier_count": len(outlier_indices),
        "outlier_percentage": round(len(outlier_indices) / len(series) * 100, 2),
        "outlier_indices": outlier_indices[:100],  # Limit to first 100
        "mean": float(mean),
        "std": float(std),
    }


async def detect_outliers(
    session_id: str,
    columns: list[str] | None = None,
    method: str = "iqr",
    threshold: float = 1.5,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Detect outliers in numeric columns.

    Args:
        session_id: Session identifier
        columns: Columns to check (None for all numeric); non-numeric columns
            in the list are skipped
        method: Detection method, 'iqr' or 'zscore'. Any other value is
            rejected with an "Unknown method" error.
        threshold: IQR multiplier for 'iqr', or the absolute z-score cut-off
            for 'zscore'. The 1.5 default applies to both methods, so pass a
            value such as 3 explicitly when using 'zscore'.
        ctx: FastMCP context (not used by this function)

    Returns:
        On success, a dict with ``success=True``, ``method``, ``threshold``,
        ``outliers`` (per-column summary; at most the first 100 outlier index
        labels are listed per column), ``total_outliers`` and
        ``columns_analyzed``.
        On failure, ``{"success": False, "error": <message>}``.
    """
    try:
        session = get_session_manager().get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df

        # Select numeric columns
        if columns:
            missing_cols = [col for col in columns if col not in df.columns]
            if missing_cols:
                return {"success": False, "error": f"Columns not found: {missing_cols}"}
            numeric_df = df[columns].select_dtypes(include=[np.number])
        else:
            numeric_df = df.select_dtypes(include=[np.number])

        if numeric_df.empty:
            return {"success": False, "error": "No numeric columns found"}

        if method == "iqr":
            summarize = _iqr_outlier_info
        elif method == "zscore":
            summarize = _zscore_outlier_info
        else:
            return {"success": False, "error": f"Unknown method: {method}"}

        outliers = {col: summarize(numeric_df[col], threshold) for col in numeric_df.columns}

        # Summary statistics
        total_outliers = sum(info["outlier_count"] for info in outliers.values())

        session.record_operation(
            OperationType.ANALYZE,
            {
                "type": "outlier_detection",
                "method": method,
                "threshold": threshold,
                "columns": list(outliers.keys()),
            },
        )

        return {
            "success": True,
            "method": method,
            "threshold": threshold,
            "outliers": outliers,
            "total_outliers": total_outliers,
            "columns_analyzed": list(outliers.keys()),
        }

    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising.
        logger.error(f"Error detecting outliers: {e!s}")
        return {"success": False, "error": str(e)}
568
+
569
+
570
def _profile_column(col_data: pd.Series, row_count: int) -> dict[str, Any]:
    """Build the profile entry for one column of a frame with ``row_count`` rows."""
    null_count = int(col_data.isna().sum())
    unique_count = int(col_data.nunique())

    def _percentage(part: int) -> float:
        # A zero-row frame has no meaningful ratio; report 0.0 rather than
        # dividing by zero.
        return round(part / row_count * 100, 2) if row_count else 0.0

    col_profile: dict[str, Any] = {
        "dtype": str(col_data.dtype),
        "null_count": null_count,
        "null_percentage": _percentage(null_count),
        "unique_count": unique_count,
        "unique_percentage": _percentage(unique_count),
    }

    # Booleans satisfy is_numeric_dtype but are profiled as categorical here,
    # matching the select_dtypes(np.number) selection used by the statistics,
    # correlation and outlier tools in this module.
    is_numeric = pd.api.types.is_numeric_dtype(
        col_data
    ) and not pd.api.types.is_bool_dtype(col_data)

    # Numeric column analysis
    if is_numeric:
        col_profile["type"] = "numeric"
        col_profile["statistics"] = {
            "mean": float(col_data.mean()),
            "std": float(col_data.std()),
            "min": float(col_data.min()),
            "max": float(col_data.max()),
            "25%": float(col_data.quantile(0.25)),
            "50%": float(col_data.quantile(0.50)),
            "75%": float(col_data.quantile(0.75)),
            "skewness": float(col_data.skew()),
            "kurtosis": float(col_data.kurt()),
        }
        col_profile["zeros"] = int((col_data == 0).sum())
        col_profile["negative_count"] = int((col_data < 0).sum())

    # Datetime column analysis
    elif pd.api.types.is_datetime64_any_dtype(col_data):
        col_profile["type"] = "datetime"
        non_null = col_data.dropna()
        if len(non_null) > 0:
            earliest = non_null.min()
            latest = non_null.max()
            col_profile["date_range"] = {
                "min": str(earliest),
                "max": str(latest),
                "range_days": (latest - earliest).days,
            }

    # Categorical/text column analysis
    else:
        col_profile["type"] = "categorical"
        value_counts = col_data.value_counts()
        has_values = len(value_counts) > 0
        col_profile["most_frequent"] = {
            "value": str(value_counts.index[0]) if has_values else None,
            "count": int(value_counts.iloc[0]) if has_values else 0,
        }

        # String-specific analysis
        if col_data.dtype == "object":
            str_lengths = col_data.dropna().astype(str).str.len()
            if len(str_lengths) > 0:
                col_profile["string_stats"] = {
                    "min_length": int(str_lengths.min()),
                    "max_length": int(str_lengths.max()),
                    "mean_length": round(str_lengths.mean(), 2),
                }

    return col_profile


async def profile_data(
    session_id: str,
    include_correlations: bool = True,
    include_outliers: bool = True,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Generate comprehensive data profile.

    The profile has an ``overview`` section, one entry per column, an
    optional list of highly correlated column pairs, optional per-column
    outlier counts, and a ``data_quality_score`` (percentage of non-missing
    cells). The correlation and outlier sections are produced by calling
    ``get_correlation_matrix`` and ``detect_outliers`` with their defaults;
    a section is omitted when the corresponding call does not succeed.

    Args:
        session_id: Session identifier
        include_correlations: Include correlation analysis (needs at least
            two numeric columns)
        include_outliers: Include outlier detection
        ctx: FastMCP context, forwarded to the nested analysis calls

    Returns:
        On success, ``{"success": True, "profile": {...}}``.
        On failure, ``{"success": False, "error": <message>}``.
    """
    try:
        session = get_session_manager().get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df
        row_count = len(df)

        # Computed once; cast to int so the value is a plain Python number
        # like every other count in the profile.
        duplicate_rows = int(df.duplicated().sum())

        profile: dict[str, Any] = {
            "overview": {
                "row_count": row_count,
                "column_count": len(df.columns),
                "memory_usage_mb": round(df.memory_usage(deep=True).sum() / (1024 * 1024), 2),
                "duplicate_rows": duplicate_rows,
                "duplicate_percentage": (
                    round(duplicate_rows / row_count * 100, 2) if row_count else 0.0
                ),
            },
            "columns": {},
        }

        # Analyze each column
        for col in df.columns:
            profile["columns"][col] = _profile_column(df[col], row_count)

        # Add correlations if requested
        if include_correlations:
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) >= 2:
                corr_result = await get_correlation_matrix(session_id, ctx=ctx)
                if corr_result["success"]:
                    profile["correlations"] = corr_result["high_correlations"]

        # Add outlier detection if requested
        if include_outliers:
            outlier_result = await detect_outliers(session_id, ctx=ctx)
            if outlier_result["success"]:
                profile["outliers"] = {
                    col: {"count": info["outlier_count"], "percentage": info["outlier_percentage"]}
                    for col, info in outlier_result["outliers"].items()
                }

        # Data quality score: share of cells that are not missing. A frame
        # with no cells has nothing missing, so it scores 100.
        total_cells = row_count * len(df.columns)
        missing_cells = int(df.isna().sum().sum())
        if total_cells:
            quality_score = round((1 - missing_cells / total_cells) * 100, 2)
        else:
            quality_score = 100.0
        profile["data_quality_score"] = quality_score

        session.record_operation(
            OperationType.PROFILE,
            {"include_correlations": include_correlations, "include_outliers": include_outliers},
        )

        return {"success": True, "profile": profile}

    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising.
        logger.error(f"Error profiling data: {e!s}")
        return {"success": False, "error": str(e)}