@mseep/csv-editor 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/ISSUE_TEMPLATE/bug_report.md +53 -0
- package/.github/ISSUE_TEMPLATE/feature_request.md +38 -0
- package/.github/workflows/deploy-docs.yml +62 -0
- package/.github/workflows/publish-github.yml +52 -0
- package/.github/workflows/publish.yml +44 -0
- package/.github/workflows/test.yml +32 -0
- package/.pre-commit-config.yaml +157 -0
- package/ALTERNATIVE_PUBLISHING.md +175 -0
- package/ARCHITECTURE.md +1011 -0
- package/CHANGELOG.md +99 -0
- package/CODE_OF_CONDUCT.md +41 -0
- package/CONTRIBUTING.md +427 -0
- package/Dockerfile +22 -0
- package/LICENSE +21 -0
- package/MCP_CONFIG.md +505 -0
- package/PUBLISHING.md +210 -0
- package/README.md +400 -0
- package/SECURITY.md +61 -0
- package/docs/README.md +41 -0
- package/docs/blog/2019-05-28-first-blog-post.md +12 -0
- package/docs/blog/2019-05-29-long-blog-post.md +44 -0
- package/docs/blog/2021-08-01-mdx-blog-post.mdx +24 -0
- package/docs/blog/2021-08-26-welcome/docusaurus-plushie-banner.jpeg +0 -0
- package/docs/blog/2021-08-26-welcome/index.md +29 -0
- package/docs/blog/authors.yml +25 -0
- package/docs/blog/tags.yml +19 -0
- package/docs/docs/api/overview.md +183 -0
- package/docs/docs/installation.md +252 -0
- package/docs/docs/intro.md +87 -0
- package/docs/docs/tutorial-basics/_category_.json +8 -0
- package/docs/docs/tutorial-basics/congratulations.md +23 -0
- package/docs/docs/tutorial-basics/create-a-blog-post.md +34 -0
- package/docs/docs/tutorial-basics/create-a-document.md +57 -0
- package/docs/docs/tutorial-basics/create-a-page.md +43 -0
- package/docs/docs/tutorial-basics/deploy-your-site.md +31 -0
- package/docs/docs/tutorial-basics/markdown-features.mdx +152 -0
- package/docs/docs/tutorial-extras/_category_.json +7 -0
- package/docs/docs/tutorial-extras/img/docsVersionDropdown.png +0 -0
- package/docs/docs/tutorial-extras/img/localeDropdown.png +0 -0
- package/docs/docs/tutorial-extras/manage-docs-versions.md +55 -0
- package/docs/docs/tutorial-extras/translate-your-site.md +88 -0
- package/docs/docs/tutorials/quickstart.md +365 -0
- package/docs/docusaurus.config.ts +163 -0
- package/docs/package-lock.json +17493 -0
- package/docs/package.json +48 -0
- package/docs/sidebars.ts +33 -0
- package/docs/src/components/HomepageFeatures/index.tsx +71 -0
- package/docs/src/components/HomepageFeatures/styles.module.css +11 -0
- package/docs/src/css/custom.css +30 -0
- package/docs/src/pages/index.module.css +23 -0
- package/docs/src/pages/index.tsx +44 -0
- package/docs/src/pages/markdown-page.md +7 -0
- package/docs/static/.nojekyll +0 -0
- package/docs/static/img/docusaurus-social-card.jpg +0 -0
- package/docs/static/img/docusaurus.png +0 -0
- package/docs/static/img/favicon.ico +0 -0
- package/docs/static/img/logo.svg +1 -0
- package/docs/static/img/undraw_docusaurus_mountain.svg +171 -0
- package/docs/static/img/undraw_docusaurus_react.svg +170 -0
- package/docs/static/img/undraw_docusaurus_tree.svg +40 -0
- package/docs/tsconfig.json +8 -0
- package/examples/README.md +48 -0
- package/examples/auto_save_demo.py +206 -0
- package/examples/auto_save_overwrite.py +201 -0
- package/examples/basic_usage.py +135 -0
- package/examples/demo.py +139 -0
- package/examples/history_demo.py +317 -0
- package/examples/test_default_autosave.py +124 -0
- package/examples/update_consignee_example.py +179 -0
- package/package.json +51 -0
- package/plans/2026-04-19-fastmcp3-migration-plan.md +1045 -0
- package/pyproject.toml +331 -0
- package/requirements-dev.txt +30 -0
- package/requirements.txt +22 -0
- package/scripts/publish.py +67 -0
- package/smithery.yaml +15 -0
- package/specs/2026-04-19-fastmcp3-migration-design.md +243 -0
- package/src/csv_editor/__init__.py +8 -0
- package/src/csv_editor/models/__init__.py +39 -0
- package/src/csv_editor/models/auto_save.py +246 -0
- package/src/csv_editor/models/csv_session.py +468 -0
- package/src/csv_editor/models/data_models.py +244 -0
- package/src/csv_editor/models/history_manager.py +456 -0
- package/src/csv_editor/prompts/__init__.py +0 -0
- package/src/csv_editor/prompts/data_prompts.py +13 -0
- package/src/csv_editor/resources/__init__.py +0 -0
- package/src/csv_editor/resources/csv_resources.py +22 -0
- package/src/csv_editor/server.py +640 -0
- package/src/csv_editor/tools/__init__.py +5 -0
- package/src/csv_editor/tools/analytics.py +700 -0
- package/src/csv_editor/tools/auto_save_operations.py +235 -0
- package/src/csv_editor/tools/data_operations.py +3 -0
- package/src/csv_editor/tools/history_operations.py +315 -0
- package/src/csv_editor/tools/io_operations.py +431 -0
- package/src/csv_editor/tools/transformations.py +663 -0
- package/src/csv_editor/tools/validation.py +822 -0
- package/src/csv_editor/utils/__init__.py +0 -0
- package/src/csv_editor/utils/validators.py +205 -0
- package/tests/README.md +65 -0
- package/tests/__init__.py +7 -0
- package/tests/conftest.py +50 -0
- package/tests/test_auto_save.py +378 -0
- package/tests/test_basic.py +103 -0
- package/tests/test_integration.py +356 -0
- package/tests/test_server_boot.py +50 -0
- package/tests/test_settings.py +184 -0
|
@@ -0,0 +1,700 @@
|
|
|
1
|
+
"""Analytics tools for CSV data analysis."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from fastmcp import Context
|
|
9
|
+
|
|
10
|
+
from ..models.csv_session import get_session_manager
|
|
11
|
+
from ..models.data_models import OperationType
|
|
12
|
+
|
|
13
|
+
# Module-level logger, named after this module; used by the except handlers below.
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
async def get_statistics(
    session_id: str,
    columns: list[str] | None = None,
    include_percentiles: bool = True,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Get statistical summary of numerical columns.

    Args:
        session_id: Session identifier
        columns: Specific columns to analyze (None for all numeric). Requested
            columns that exist but are not numeric are dropped from the result.
        include_percentiles: Include the 25%/50%/75% quantiles and the IQR
        ctx: FastMCP context (not referenced by this function)

    Returns:
        On success: ``{"success": True, "statistics": {column: {...}},
        "columns_analyzed": [...], "total_rows": int}``.
        On failure: ``{"success": False, "error": message}``. Exceptions are
        logged and reported through the error form rather than raised.
    """
    try:
        manager = get_session_manager()
        session = manager.get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df

        # Select columns to analyze
        if columns:
            missing_cols = [col for col in columns if col not in df.columns]
            if missing_cols:
                return {"success": False, "error": f"Columns not found: {missing_cols}"}
            numeric_df = df[columns].select_dtypes(include=[np.number])
        else:
            numeric_df = df.select_dtypes(include=[np.number])

        # DataFrame.empty is also True for a frame that has columns but no rows.
        if numeric_df.empty:
            return {"success": False, "error": "No numeric columns found"}

        # Calculate statistics
        stats = {}

        for col in numeric_df.columns:
            # All descriptive statistics are computed on the non-null values only.
            col_data = numeric_df[col].dropna()

            col_stats = {
                "count": int(col_data.count()),
                "null_count": int(df[col].isna().sum()),
                "mean": float(col_data.mean()),
                "std": float(col_data.std()),
                "min": float(col_data.min()),
                "max": float(col_data.max()),
                "sum": float(col_data.sum()),
                "variance": float(col_data.var()),
                "skewness": float(col_data.skew()),
                "kurtosis": float(col_data.kurt()),
            }

            if include_percentiles:
                # One quantile call yields all three cut points, in the order requested.
                q25, q50, q75 = (float(q) for q in col_data.quantile([0.25, 0.5, 0.75]))
                col_stats["25%"] = q25
                col_stats["50%"] = q50
                col_stats["75%"] = q75
                col_stats["iqr"] = q75 - q25

            stats[col] = col_stats

        session.record_operation(
            OperationType.ANALYZE, {"type": "statistics", "columns": list(stats.keys())}
        )

        return {
            "success": True,
            "statistics": stats,
            "columns_analyzed": list(stats.keys()),
            "total_rows": len(df),
        }

    except Exception as e:
        logger.error(f"Error getting statistics: {e!s}")
        return {"success": False, "error": str(e)}
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
async def get_column_statistics(
    session_id: str, column: str, ctx: Context = None
) -> dict[str, Any]:
    """
    Get detailed statistics for a specific column.

    Numeric columns get descriptive statistics; every other column (including
    boolean columns) gets frequency information, plus string-length statistics
    when the column has ``object`` dtype.

    Args:
        session_id: Session identifier
        column: Column name to analyze
        ctx: FastMCP context (not referenced by this function)

    Returns:
        On success: ``{"success": True, "statistics": {...}}``.
        On failure: ``{"success": False, "error": message}``. Exceptions are
        logged and reported through the error form rather than raised.
    """
    try:
        manager = get_session_manager()
        session = manager.get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df

        if column not in df.columns:
            return {"success": False, "error": f"Column '{column}' not found"}

        col_data = df[column]
        total_count = len(col_data)
        null_total = col_data.isna().sum()
        unique_total = col_data.nunique()

        def _percentage(part) -> float:
            # A zero-row column would otherwise divide by zero and fail the whole call.
            if total_count == 0:
                return 0.0
            return round(part / total_count * 100, 2)

        result = {
            "column": column,
            "dtype": str(col_data.dtype),
            "total_count": total_count,
            "null_count": int(null_total),
            "null_percentage": _percentage(null_total),
            "unique_count": int(unique_total),
            "unique_percentage": _percentage(unique_total),
        }

        # Booleans satisfy is_numeric_dtype but cannot be subtracted or
        # interpolated, so they are reported through the categorical branch.
        # This matches the other tools, which select np.number columns only.
        is_numeric = pd.api.types.is_numeric_dtype(
            col_data
        ) and not pd.api.types.is_bool_dtype(col_data)

        # Numeric column statistics
        if is_numeric:
            non_null = col_data.dropna()
            modes = non_null.mode()
            low = non_null.min()
            high = non_null.max()
            q25 = non_null.quantile(0.25)
            q75 = non_null.quantile(0.75)
            result.update(
                {
                    "type": "numeric",
                    "mean": float(non_null.mean()),
                    "median": float(non_null.median()),
                    # mode() is empty when there are no non-null values.
                    "mode": float(modes.iloc[0]) if len(modes) > 0 else None,
                    "std": float(non_null.std()),
                    "variance": float(non_null.var()),
                    "min": float(low),
                    "max": float(high),
                    "range": float(high - low),
                    "sum": float(non_null.sum()),
                    "skewness": float(non_null.skew()),
                    "kurtosis": float(non_null.kurt()),
                    "25%": float(q25),
                    "50%": float(non_null.quantile(0.50)),
                    "75%": float(q75),
                    "iqr": float(q75 - q25),
                    # Comparisons against NaN are False, so nulls are not counted here.
                    "zero_count": int((col_data == 0).sum()),
                    "positive_count": int((col_data > 0).sum()),
                    "negative_count": int((col_data < 0).sum()),
                }
            )

        # Categorical column statistics
        else:
            value_counts = col_data.value_counts()
            top_values = value_counts.head(10).to_dict()
            has_values = len(value_counts) > 0

            result.update(
                {
                    "type": "categorical",
                    "most_frequent": str(value_counts.index[0]) if has_values else None,
                    "most_frequent_count": int(value_counts.iloc[0]) if has_values else 0,
                    "top_10_values": {str(k): int(v) for k, v in top_values.items()},
                }
            )

            # String-specific stats
            if col_data.dtype == "object":
                str_data = col_data.dropna().astype(str)
                if len(str_data) > 0:
                    str_lengths = str_data.str.len()
                    result["string_stats"] = {
                        "min_length": int(str_lengths.min()),
                        "max_length": int(str_lengths.max()),
                        "mean_length": round(str_lengths.mean(), 2),
                        "empty_string_count": int((str_data == "").sum()),
                    }

        session.record_operation(
            OperationType.ANALYZE, {"type": "column_statistics", "column": column}
        )

        return {"success": True, "statistics": result}

    except Exception as e:
        logger.error(f"Error getting column statistics: {e!s}")
        return {"success": False, "error": str(e)}
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
async def get_correlation_matrix(
    session_id: str,
    method: str = "pearson",
    columns: list[str] | None = None,
    min_correlation: float | None = None,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Compute pairwise correlations between the numeric columns of a session.

    Args:
        session_id: Session identifier
        method: Correlation method ('pearson', 'spearman', 'kendall')
        columns: Columns to restrict the analysis to (None for all numeric)
        min_correlation: When given, off-diagonal entries whose absolute value
            is below this threshold are left out of the returned matrix
        ctx: FastMCP context (not referenced by this function)

    Returns:
        On success, a dict holding the nested ``correlation_matrix``, the
        ``high_correlations`` pairs (absolute value of at least 0.7, strongest
        first), the ``method`` and ``columns_analyzed``. On failure,
        ``{"success": False, "error": message}``.
    """
    try:
        session = get_session_manager().get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        frame = session.df

        # Narrow to the requested columns first, then keep only the numeric ones.
        if columns:
            absent = [name for name in columns if name not in frame.columns]
            if absent:
                return {"success": False, "error": f"Columns not found: {absent}"}
            numeric_frame = frame[columns].select_dtypes(include=[np.number])
        else:
            numeric_frame = frame.select_dtypes(include=[np.number])

        if numeric_frame.empty:
            return {"success": False, "error": "No numeric columns found"}

        if len(numeric_frame.columns) < 2:
            return {"success": False, "error": "Need at least 2 numeric columns for correlation"}

        if method not in ("pearson", "spearman", "kendall"):
            return {"success": False, "error": f"Invalid method: {method}"}

        matrix = numeric_frame.corr(method=method)
        names = list(matrix.columns)

        # Nested dict form of the matrix; undefined (NaN) cells are omitted.
        correlations = {}
        for row_name in names:
            row_entries = {}
            for col_name in names:
                coefficient = matrix.loc[row_name, col_name]
                if pd.isna(coefficient):
                    continue
                passes_filter = (
                    min_correlation is None
                    or abs(coefficient) >= min_correlation
                    or row_name == col_name  # the diagonal is always kept
                )
                if passes_filter:
                    row_entries[col_name] = round(float(coefficient), 4)
            correlations[row_name] = row_entries

        # Each unordered pair is visited once (upper triangle only).
        strong_pairs = []
        for position, left in enumerate(names):
            for right in names[position + 1 :]:
                coefficient = matrix.loc[left, right]
                if pd.isna(coefficient) or abs(coefficient) < 0.7:
                    continue
                strong_pairs.append(
                    {
                        "column1": left,
                        "column2": right,
                        "correlation": round(float(coefficient), 4),
                    }
                )

        high_correlations = sorted(
            strong_pairs, key=lambda pair: abs(pair["correlation"]), reverse=True
        )

        session.record_operation(
            OperationType.ANALYZE,
            {"type": "correlation", "method": method, "columns": list(names)},
        )

        return {
            "success": True,
            "method": method,
            "correlation_matrix": correlations,
            "high_correlations": high_correlations,
            "columns_analyzed": list(names),
        }

    except Exception as exc:
        logger.error(f"Error calculating correlation: {exc!s}")
        return {"success": False, "error": str(exc)}
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
async def group_by_aggregate(
    session_id: str,
    group_by: list[str],
    aggregations: dict[str, str | list[str]],
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Group the session data and aggregate it, replacing the session's frame.

    Args:
        session_id: Session identifier
        group_by: Columns to group by
        aggregations: Dict mapping column names to aggregation functions
            e.g., {"sales": ["sum", "mean"], "quantity": "sum"}
        ctx: FastMCP context (not referenced by this function)

    Returns:
        On success, a dict with ``grouped_data`` (records, shape and column
        names of the result) plus the ``group_by`` and ``aggregations`` inputs.
        On failure, ``{"success": False, "error": message}``.

    Note:
        The grouped result is assigned to ``session.df``, so later operations on
        the session see the aggregated frame rather than the original rows.
    """
    try:
        session = get_session_manager().get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        frame = session.df

        # Both the grouping keys and the aggregated columns must exist.
        unknown_keys = [name for name in group_by if name not in frame.columns]
        if unknown_keys:
            return {"success": False, "error": f"Group by columns not found: {unknown_keys}"}

        unknown_targets = [name for name in aggregations if name not in frame.columns]
        if unknown_targets:
            return {"success": False, "error": f"Aggregation columns not found: {unknown_targets}"}

        # Wrap bare function names in a list so the result always has two-level columns.
        agg_spec = {
            name: [funcs] if isinstance(funcs, str) else funcs
            for name, funcs in aggregations.items()
        }

        aggregated = frame.groupby(group_by).agg(agg_spec)

        # Collapse (column, function) pairs into "column_function" names.
        flat_names = []
        for parts in aggregated.columns.values:
            if parts[1]:
                flat_names.append("_".join(parts).strip())
            else:
                flat_names.append(parts[0])
        aggregated.columns = flat_names

        # Turn the group keys back into ordinary columns.
        result_df = aggregated.reset_index()

        shape = {"rows": len(result_df), "columns": len(result_df.columns)}
        result = {
            "data": result_df.to_dict(orient="records"),
            "shape": shape,
            "columns": result_df.columns.tolist(),
        }

        # Store grouped data in session
        session.df = result_df
        session.record_operation(
            OperationType.GROUP_BY,
            {"group_by": group_by, "aggregations": aggregations, "result_shape": shape},
        )

        return {
            "success": True,
            "grouped_data": result,
            "group_by": group_by,
            "aggregations": aggregations,
        }

    except Exception as exc:
        logger.error(f"Error in group by aggregate: {exc!s}")
        return {"success": False, "error": str(exc)}
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
async def get_value_counts(
    session_id: str,
    column: str,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    top_n: int | None = None,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Count how often each distinct value occurs in a column.

    Missing values are counted as well and reported under the key ``"NaN"``.

    Args:
        session_id: Session identifier
        column: Column name to count values
        normalize: Return proportions instead of counts
        sort: Sort by frequency
        ascending: Sort order
        top_n: Keep only the first N entries of the result (a falsy value,
            i.e. None or 0, keeps everything)
        ctx: FastMCP context (not referenced by this function)

    Returns:
        On success, a dict with ``value_counts`` (stringified value -> count or
        proportion), ``unique_values``, ``null_count``, ``total_count`` and the
        ``normalized`` flag. On failure, ``{"success": False, "error": message}``.
    """
    try:
        session = get_session_manager().get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        frame = session.df

        if column not in frame.columns:
            return {"success": False, "error": f"Column '{column}' not found"}

        series = frame[column]

        tallies = series.value_counts(
            normalize=normalize, sort=sort, ascending=ascending, dropna=False
        )
        if top_n:
            tallies = tallies.head(top_n)

        # Proportions are floats, raw counts are ints.
        convert = float if normalize else int
        counts_dict = {
            ("NaN" if pd.isna(value) else str(value)): convert(amount)
            for value, amount in tallies.items()
        }

        # Summary figures are taken from the whole column, not the top_n slice.
        distinct_total = series.nunique(dropna=False)
        missing_total = series.isna().sum()

        session.record_operation(
            OperationType.ANALYZE,
            {"type": "value_counts", "column": column, "normalize": normalize, "top_n": top_n},
        )

        return {
            "success": True,
            "column": column,
            "value_counts": counts_dict,
            "unique_values": int(distinct_total),
            "null_count": int(missing_total),
            "total_count": len(frame),
            "normalized": normalize,
        }

    except Exception as exc:
        logger.error(f"Error getting value counts: {exc!s}")
        return {"success": False, "error": str(exc)}
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
async def detect_outliers(
    session_id: str,
    columns: list[str] | None = None,
    method: str = "iqr",
    threshold: float = 1.5,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Flag outlying values in the numeric columns of a session.

    Args:
        session_id: Session identifier
        columns: Columns to check (None for all numeric)
        method: Detection method, 'iqr' or 'zscore'. Any other value yields an
            "Unknown method" error.
        threshold: IQR multiplier for 'iqr', or absolute z-score cut-off for
            'zscore'. The 1.5 default is applied to both methods, so pass a
            value such as 3 explicitly when using 'zscore'.
        ctx: FastMCP context (not referenced by this function)

    Returns:
        On success, a dict with per-column ``outliers`` details (at most the
        first 100 row labels each), ``total_outliers`` and
        ``columns_analyzed``. On failure,
        ``{"success": False, "error": message}``.
    """
    try:
        session = get_session_manager().get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        frame = session.df

        # Narrow to the requested columns first, then keep only the numeric ones.
        if columns:
            absent = [name for name in columns if name not in frame.columns]
            if absent:
                return {"success": False, "error": f"Columns not found: {absent}"}
            numeric_frame = frame[columns].select_dtypes(include=[np.number])
        else:
            numeric_frame = frame.select_dtypes(include=[np.number])

        if numeric_frame.empty:
            return {"success": False, "error": "No numeric columns found"}

        if method not in ("iqr", "zscore"):
            return {"success": False, "error": f"Unknown method: {method}"}

        row_total = len(frame)
        outliers = {}

        for name in numeric_frame.columns:
            values = numeric_frame[name]

            if method == "iqr":
                first_quartile = values.quantile(0.25)
                third_quartile = values.quantile(0.75)
                spread = third_quartile - first_quartile

                # Fences sit `threshold` spreads outside the quartiles.
                lower_fence = first_quartile - threshold * spread
                upper_fence = third_quartile + threshold * spread

                flagged = frame.index[(values < lower_fence) | (values > upper_fence)].tolist()

                outliers[name] = {
                    "method": "IQR",
                    "lower_bound": float(lower_fence),
                    "upper_bound": float(upper_fence),
                    "outlier_count": len(flagged),
                    "outlier_percentage": round(len(flagged) / row_total * 100, 2),
                    "outlier_indices": flagged[:100],  # Limit to first 100
                    "q1": float(first_quartile),
                    "q3": float(third_quartile),
                    "iqr": float(spread),
                }
            else:
                center = values.mean()
                scale = values.std()

                # Distance from the mean in units of standard deviation.
                distances = np.abs((values - center) / scale)
                flagged = frame.index[distances > threshold].tolist()

                outliers[name] = {
                    "method": "Z-Score",
                    "threshold": threshold,
                    "outlier_count": len(flagged),
                    "outlier_percentage": round(len(flagged) / row_total * 100, 2),
                    "outlier_indices": flagged[:100],  # Limit to first 100
                    "mean": float(center),
                    "std": float(scale),
                }

        # Sum of per-column counts; a row flagged in two columns counts twice.
        total_outliers = sum(details["outlier_count"] for details in outliers.values())

        session.record_operation(
            OperationType.ANALYZE,
            {
                "type": "outlier_detection",
                "method": method,
                "threshold": threshold,
                "columns": list(outliers.keys()),
            },
        )

        return {
            "success": True,
            "method": method,
            "threshold": threshold,
            "outliers": outliers,
            "total_outliers": total_outliers,
            "columns_analyzed": list(outliers.keys()),
        }

    except Exception as exc:
        logger.error(f"Error detecting outliers: {exc!s}")
        return {"success": False, "error": str(exc)}
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
async def profile_data(
    session_id: str,
    include_correlations: bool = True,
    include_outliers: bool = True,
    ctx: Context = None,
) -> dict[str, Any]:
    """
    Generate comprehensive data profile.

    The profile contains a dataset overview, a per-column section (numeric,
    datetime or categorical, with boolean columns treated as categorical),
    optional high-correlation pairs and outlier counts, and a data quality
    score equal to the percentage of non-missing cells.

    Args:
        session_id: Session identifier
        include_correlations: Include correlation analysis (only attempted when
            at least two numeric columns exist)
        include_outliers: Include outlier detection
        ctx: FastMCP context, forwarded to the nested analysis calls

    Returns:
        On success: ``{"success": True, "profile": {...}}``.
        On failure: ``{"success": False, "error": message}``. Exceptions are
        logged and reported through the error form rather than raised.
    """
    try:
        manager = get_session_manager()
        session = manager.get_session(session_id)

        if not session or session.df is None:
            return {"success": False, "error": "Invalid session or no data loaded"}

        df = session.df
        row_count = len(df)

        def _percentage(part) -> float:
            # A zero-row frame would otherwise divide by zero and fail the whole profile.
            if row_count == 0:
                return 0.0
            return round(part / row_count * 100, 2)

        duplicate_total = df.duplicated().sum()

        profile = {
            "overview": {
                "row_count": row_count,
                "column_count": len(df.columns),
                "memory_usage_mb": round(df.memory_usage(deep=True).sum() / (1024 * 1024), 2),
                # Cast like every other count so the value is a plain int.
                "duplicate_rows": int(duplicate_total),
                "duplicate_percentage": _percentage(duplicate_total),
            },
            "columns": {},
        }

        # Analyze each column
        for col in df.columns:
            col_data = df[col]
            null_total = col_data.isna().sum()
            unique_total = col_data.nunique()
            col_profile = {
                "dtype": str(col_data.dtype),
                "null_count": int(null_total),
                "null_percentage": _percentage(null_total),
                "unique_count": int(unique_total),
                "unique_percentage": _percentage(unique_total),
            }

            # Booleans satisfy is_numeric_dtype but are not meaningful inputs for
            # quantiles or skewness; the correlation and outlier tools already
            # exclude them by selecting np.number columns only.
            is_numeric = pd.api.types.is_numeric_dtype(
                col_data
            ) and not pd.api.types.is_bool_dtype(col_data)

            # Numeric column analysis
            if is_numeric:
                col_profile["type"] = "numeric"
                col_profile["statistics"] = {
                    "mean": float(col_data.mean()),
                    "std": float(col_data.std()),
                    "min": float(col_data.min()),
                    "max": float(col_data.max()),
                    "25%": float(col_data.quantile(0.25)),
                    "50%": float(col_data.quantile(0.50)),
                    "75%": float(col_data.quantile(0.75)),
                    "skewness": float(col_data.skew()),
                    "kurtosis": float(col_data.kurt()),
                }
                col_profile["zeros"] = int((col_data == 0).sum())
                col_profile["negative_count"] = int((col_data < 0).sum())

            # Datetime column analysis
            elif pd.api.types.is_datetime64_any_dtype(col_data):
                col_profile["type"] = "datetime"
                non_null = col_data.dropna()
                if len(non_null) > 0:
                    earliest = non_null.min()
                    latest = non_null.max()
                    col_profile["date_range"] = {
                        "min": str(earliest),
                        "max": str(latest),
                        "range_days": (latest - earliest).days,
                    }

            # Categorical/text column analysis
            else:
                col_profile["type"] = "categorical"
                value_counts = col_data.value_counts()
                has_values = len(value_counts) > 0
                col_profile["most_frequent"] = {
                    "value": str(value_counts.index[0]) if has_values else None,
                    "count": int(value_counts.iloc[0]) if has_values else 0,
                }

                # String-specific analysis
                if col_data.dtype == "object":
                    str_lengths = col_data.dropna().astype(str).str.len()
                    if len(str_lengths) > 0:
                        col_profile["string_stats"] = {
                            "min_length": int(str_lengths.min()),
                            "max_length": int(str_lengths.max()),
                            "mean_length": round(str_lengths.mean(), 2),
                        }

            profile["columns"][col] = col_profile

        # Add correlations if requested
        if include_correlations:
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) >= 2:
                corr_result = await get_correlation_matrix(session_id, ctx=ctx)
                if corr_result["success"]:
                    profile["correlations"] = corr_result["high_correlations"]

        # Add outlier detection if requested
        if include_outliers:
            outlier_result = await detect_outliers(session_id, ctx=ctx)
            if outlier_result["success"]:
                profile["outliers"] = {
                    col: {"count": info["outlier_count"], "percentage": info["outlier_percentage"]}
                    for col, info in outlier_result["outliers"].items()
                }

        # Data quality score
        total_cells = row_count * len(df.columns)
        if total_cells:
            missing_cells = df.isna().sum().sum()
            quality_score = round((1 - missing_cells / total_cells) * 100, 2)
        else:
            # A frame without cells has nothing missing; avoid dividing by zero.
            quality_score = 100.0
        profile["data_quality_score"] = quality_score

        session.record_operation(
            OperationType.PROFILE,
            {"include_correlations": include_correlations, "include_outliers": include_outliers},
        )

        return {"success": True, "profile": profile}

    except Exception as e:
        logger.error(f"Error profiling data: {e!s}")
        return {"success": False, "error": str(e)}
|