ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,268 @@
1
+ """
2
+ Advanced data type conversion tools for handling tricky type issues.
3
+ """
4
+
5
+ import polars as pl
6
+ from pathlib import Path
7
+ from typing import Dict, Any, List, Optional
8
+ import sys
9
+ import os
10
+
11
+ # Add parent directory to path for imports
12
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
13
+
14
+ from ds_agent.utils.polars_helpers import (
15
+ load_dataframe,
16
+ save_dataframe,
17
+ get_numeric_columns,
18
+ get_categorical_columns
19
+ )
20
+ from ds_agent.utils.validation import (
21
+ validate_file_exists,
22
+ validate_file_format,
23
+ validate_dataframe
24
+ )
25
+
26
+
27
def force_numeric_conversion(
    file_path: str,
    columns: List[str],
    output_path: str,
    errors: str = "coerce"
) -> Dict[str, Any]:
    """
    Force convert columns to numeric type, even if they're detected as strings/objects.

    This is crucial for datasets where numeric columns are stored as strings with
    formatting issues (commas, spaces, currency symbols, etc.).

    Args:
        file_path: Path to CSV or Parquet file
        columns: List of column names to force convert, or ["all"] for all non-ID columns
        output_path: Path to save converted dataset
        errors: How to handle conversion errors:
            - "coerce": Invalid values become null (default)
            - "raise": Cast with strict=True; the resulting polars error is caught
              below and the column is reported under "failed_conversions"

    Returns:
        Dictionary with conversion report and statistics:
        status, message, output_path, conversion_report, type_changes,
        total_successful, total_failed, total_nulls_introduced
    """
    # Validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    # Load data
    df = load_dataframe(file_path)
    validate_dataframe(df)

    original_types = {col: str(df[col].dtype) for col in df.columns}

    # Determine which columns to convert
    if columns == ["all"]:
        # Auto-detect: skip ID columns, already-numeric columns, and actual text columns
        id_keywords = ['id', 'key', 'code', 'name', 'description', 'text', 'comment', 'notes']
        target_columns = []

        for col in df.columns:
            # Skip if already numeric
            if df[col].dtype in [pl.Int64, pl.Int32, pl.Float64, pl.Float32]:
                continue

            # Skip if looks like an ID or text column (by name)
            col_lower = col.lower()
            if any(keyword in col_lower for keyword in id_keywords):
                continue

            # Only attempt conversion if column looks numeric: sample up to 100
            # non-null values and test the first 50 of them.
            sample_values = df[col].drop_nulls().head(100).to_list()
            if len(sample_values) == 0:
                continue

            sample_size = min(50, len(sample_values))
            numeric_like_count = 0
            for val in sample_values[:sample_size]:
                # Strip common formatting characters before testing
                val_str = str(val).replace(",", "").replace(" ", "").replace("$", "").replace("€", "").strip()

                # Check if it looks like a number (digits, decimal point, sign, exponent)
                if val_str.replace(".", "").replace("-", "").replace("+", "").replace("e", "").replace("E", "").isdigit():
                    numeric_like_count += 1
                # Also check for percentage-like values
                elif val_str.endswith("%") and val_str[:-1].replace(".", "").isdigit():
                    numeric_like_count += 1

            # Only include if >70% of samples look numeric
            if (numeric_like_count / sample_size) > 0.7:
                target_columns.append(col)
                print(f"🔍 '{col}': Detected as numeric-like ({numeric_like_count}/{sample_size} samples)")
            else:
                print(f"⏭️ '{col}': Skipping (appears to be text, not numeric)")
    else:
        target_columns = columns

    print(f"🔢 Force converting {len(target_columns)} columns to numeric...")

    # Track conversion results
    conversion_report = {
        "successful_conversions": [],
        "failed_conversions": [],
        "null_values_introduced": {}
    }

    # Convert each column
    for col in target_columns:
        if col not in df.columns:
            print(f"⚠️ Column '{col}' not found, skipping")
            conversion_report["failed_conversions"].append(col)
            continue

        try:
            # Null count before conversion, so we can report data loss
            original_nulls = df[col].null_count()

            # Clean string columns first (remove commas, spaces, currency signs).
            # BUG FIX: str.replace_all() interprets the pattern as a regex by
            # default, so "$" matched the end-of-string anchor and dollar signs
            # were never removed; literal=True matches the literal character.
            if df[col].dtype == pl.Utf8:
                df = df.with_columns([
                    pl.col(col)
                    .str.replace_all(",", "", literal=True)   # Remove commas
                    .str.replace_all(" ", "", literal=True)   # Remove spaces
                    .str.replace_all("$", "", literal=True)   # Remove dollar signs
                    .str.replace_all("€", "", literal=True)   # Remove euro signs
                    .str.replace_all("%", "", literal=True)   # Remove percent signs
                    .str.strip_chars()                        # Strip whitespace
                    .alias(col)
                ])

            # Cast to float: strict=False nulls invalid values ("coerce"),
            # strict=True raises on the first invalid value ("raise").
            df = df.with_columns([
                pl.col(col).cast(pl.Float64, strict=(errors != "coerce")).alias(col)
            ])

            # Check how many nulls were introduced by the cast
            new_nulls = df[col].null_count()
            nulls_introduced = new_nulls - original_nulls

            conversion_report["successful_conversions"].append(col)
            conversion_report["null_values_introduced"][col] = int(nulls_introduced)

            if nulls_introduced > 0:
                print(f"✅ '{col}': Converted to numeric ({nulls_introduced} values became null)")
            else:
                print(f"✅ '{col}': Converted to numeric (no data loss)")

        except Exception as e:
            # Record the failure and keep going with the remaining columns
            print(f"❌ '{col}': Conversion failed - {str(e)}")
            conversion_report["failed_conversions"].append(col)

    # Save converted dataset
    save_dataframe(df, output_path)

    new_types = {col: str(df[col].dtype) for col in df.columns}

    return {
        "status": "success",
        "message": f"Force converted {len(conversion_report['successful_conversions'])} columns to numeric",
        "output_path": output_path,
        "conversion_report": conversion_report,
        "type_changes": {
            col: {"from": original_types[col], "to": new_types[col]}
            for col in conversion_report["successful_conversions"]
        },
        "total_successful": len(conversion_report["successful_conversions"]),
        "total_failed": len(conversion_report["failed_conversions"]),
        "total_nulls_introduced": sum(conversion_report["null_values_introduced"].values())
    }
181
+
182
+
183
def smart_type_inference(
    file_path: str,
    output_path: str,
    aggressive: bool = True
) -> Dict[str, Any]:
    """
    Intelligently infer and fix data types for all columns.

    This tool goes beyond basic type detection and tries to understand the
    semantic meaning of each column to assign the correct type.

    Args:
        file_path: Path to CSV or Parquet file
        output_path: Path to save dataset with fixed types
        aggressive: If True, attempts aggressive type conversion (force numeric
            on ambiguous columns). NOTE(review): this flag is currently not read
            by the implementation — the same 80% sample threshold is always
            applied; kept for backward compatibility.

    Returns:
        Dictionary with type inference report: status, message, output_path,
        type_changes, original_types, new_types
    """
    # Validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    # Load data
    df = load_dataframe(file_path)
    validate_dataframe(df)

    original_types = {col: str(df[col].dtype) for col in df.columns}
    type_changes: Dict[str, Dict[str, str]] = {}

    print(f"🧠 Performing smart type inference on {len(df.columns)} columns...")

    for col in df.columns:
        current_type = df[col].dtype

        # Skip if already numeric
        if current_type in [pl.Int64, pl.Int32, pl.Float64, pl.Float32]:
            continue

        # If it's a string column, try to infer the correct type
        if current_type == pl.Utf8:
            sample_values = df[col].drop_nulls().head(100).to_list()

            if len(sample_values) == 0:
                continue

            # Count how many sampled values parse as numbers after stripping
            # common formatting characters.
            numeric_count = 0
            for val in sample_values:
                cleaned = str(val).replace(",", "").replace(" ", "").replace("$", "").strip()
                try:
                    float(cleaned)
                    numeric_count += 1
                except (ValueError, TypeError):
                    # Narrowed from a bare except: only "not a number" failures
                    # are expected here; anything else should surface.
                    pass

            # If >80% of values are numeric, convert to numeric
            if numeric_count / len(sample_values) > 0.8:
                print(f"🔢 '{col}': Detected as numeric ({numeric_count}/{len(sample_values)} samples)")

                # Clean and convert.
                # BUG FIX: str.replace_all() interprets the pattern as a regex by
                # default, so "$" matched the end-of-string anchor and dollar
                # signs were never removed; literal=True matches the literal
                # character.
                df = df.with_columns([
                    pl.col(col)
                    .str.replace_all(",", "", literal=True)
                    .str.replace_all(" ", "", literal=True)
                    .str.replace_all("$", "", literal=True)
                    .str.replace_all("€", "", literal=True)
                    .str.strip_chars()
                    .cast(pl.Float64, strict=False)  # invalid values become null
                    .alias(col)
                ])

                type_changes[col] = {"from": "Utf8", "to": "Float64", "reason": "numeric_pattern_detected"}

    # Save dataset
    save_dataframe(df, output_path)

    return {
        "status": "success",
        "message": f"Smart type inference completed, changed {len(type_changes)} columns",
        "output_path": output_path,
        "type_changes": type_changes,
        "original_types": original_types,
        "new_types": {col: str(df[col].dtype) for col in df.columns}
    }
@@ -0,0 +1,433 @@
1
+ """
2
+ Data Wrangling Tools
3
+ Tools for merging, concatenating, and manipulating multiple datasets.
4
+ """
5
+
6
+ import polars as pl
7
+ import numpy as np
8
+ from typing import Dict, Any, List, Optional, Literal
9
+ from pathlib import Path
10
+ import sys
11
+ import os
12
+
13
+ # Add parent directory to path for imports
14
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15
+
16
+ from ds_agent.utils.polars_helpers import (
17
+ load_dataframe,
18
+ save_dataframe,
19
+ )
20
+ from ds_agent.utils.validation import (
21
+ validate_file_exists,
22
+ validate_file_format,
23
+ validate_dataframe,
24
+ )
25
+
26
+
27
def merge_datasets(
    left_path: str,
    right_path: str,
    output_path: str,
    how: Literal["inner", "left", "right", "outer", "cross"] = "inner",
    on: Optional[str] = None,
    left_on: Optional[str] = None,
    right_on: Optional[str] = None,
    suffix: str = "_right"
) -> Dict[str, Any]:
    """
    Merge two datasets using various join strategies (SQL-like join operations).

    This function performs database-style joins on two datasets, similar to SQL JOIN operations.
    Supports inner, left, right, outer, and cross joins.

    Args:
        left_path: Path to left dataset (CSV or Parquet)
        right_path: Path to right dataset (CSV or Parquet)
        output_path: Path to save merged dataset
        how: Join type - "inner", "left", "right", "outer", or "cross"
            - "inner": Only rows with matching keys in both datasets
            - "left": All rows from left, matching rows from right (nulls if no match)
            - "right": All rows from right, matching rows from left (nulls if no match)
            - "outer": All rows from both (nulls where no match)
            - "cross": Cartesian product (all combinations; no join keys needed)
        on: Column name to join on (if same in both datasets); ignored for "cross"
        left_on: Column name in left dataset (if different from right)
        right_on: Column name in right dataset (if different from left)
        suffix: Suffix to add to duplicate column names from right dataset (default: "_right")

    Returns:
        Dictionary with merge report including:
        - success: bool
        - output_path: str
        - left_rows / right_rows / result_rows: int
        - merge_type: str
        - join_columns: dict (both entries None for cross joins)
        - duplicate_columns: list (columns that got suffixed)

    Examples:
        >>> # Simple join on same column name
        >>> merge_datasets(
        ...     "customers.csv",
        ...     "orders.csv",
        ...     "merged.csv",
        ...     how="left",
        ...     on="customer_id"
        ... )

        >>> # Join on different column names
        >>> merge_datasets(
        ...     "products.csv",
        ...     "sales.csv",
        ...     "product_sales.csv",
        ...     how="inner",
        ...     left_on="product_id",
        ...     right_on="prod_id"
        ... )
    """
    try:
        # Validation
        validate_file_exists(left_path)
        validate_file_exists(right_path)
        validate_file_format(left_path)
        validate_file_format(right_path)

        # Load datasets
        left_df = load_dataframe(left_path)
        right_df = load_dataframe(right_path)

        validate_dataframe(left_df)
        validate_dataframe(right_df)

        left_rows = len(left_df)
        right_rows = len(right_df)

        if how == "cross":
            # BUG FIX: a cross join has no join keys. The old code required
            # 'on'/'left_on'/'right_on' even for how="cross" and then passed the
            # keys to join(), which polars rejects for cross joins — making the
            # documented "cross" option unusable. Skip key handling entirely.
            join_left_on: Optional[str] = None
            join_right_on: Optional[str] = None
            duplicate_cols = sorted(set(left_df.columns) & set(right_df.columns))
            merged_df = left_df.join(right_df, how="cross", suffix=suffix)
        else:
            # Determine join columns
            if on:
                # Same column name in both datasets
                join_left_on = on
                join_right_on = on

                # Validate column exists in both sides
                if on not in left_df.columns:
                    return {
                        "success": False,
                        "error": f"Column '{on}' not found in left dataset. Available: {left_df.columns}"
                    }
                if on not in right_df.columns:
                    return {
                        "success": False,
                        "error": f"Column '{on}' not found in right dataset. Available: {right_df.columns}"
                    }
            elif left_on and right_on:
                # Different column names
                join_left_on = left_on
                join_right_on = right_on

                # Validate columns exist
                if left_on not in left_df.columns:
                    return {
                        "success": False,
                        "error": f"Column '{left_on}' not found in left dataset. Available: {left_df.columns}"
                    }
                if right_on not in right_df.columns:
                    return {
                        "success": False,
                        "error": f"Column '{right_on}' not found in right dataset. Available: {right_df.columns}"
                    }
            else:
                return {
                    "success": False,
                    "error": "Must specify either 'on' (same column name) or both 'left_on' and 'right_on' (different names)"
                }

            # Check for duplicate column names (excluding join columns)
            left_cols = set(left_df.columns)
            right_cols = set(right_df.columns)
            duplicate_cols = list((left_cols & right_cols) - {join_left_on, join_right_on})

            # Perform merge.
            # NOTE(review): polars deprecated how="outer" in favor of how="full"
            # in newer releases — confirm against the pinned polars version.
            merged_df = left_df.join(
                right_df,
                left_on=join_left_on,
                right_on=join_right_on,
                how=how,
                suffix=suffix
            )

        result_rows = len(merged_df)

        # Save result (create parent directories as needed)
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        save_dataframe(merged_df, output_path)

        # Build report
        report = {
            "success": True,
            "output_path": output_path,
            "left_file": Path(left_path).name,
            "right_file": Path(right_path).name,
            "left_rows": left_rows,
            "right_rows": right_rows,
            "result_rows": result_rows,
            "result_columns": len(merged_df.columns),
            "merge_type": how,
            "join_columns": {
                "left": join_left_on,
                "right": join_right_on
            },
            "duplicate_columns": duplicate_cols,
            "rows_added": result_rows - left_rows if how in ["left", "inner"] else None,
            "message": f"Successfully merged {left_rows:,} rows with {right_rows:,} rows using {how} join → {result_rows:,} rows"
        }

        # Add warnings for surprising row counts
        if how == "inner" and result_rows < min(left_rows, right_rows):
            report["warning"] = f"Inner join reduced data: only {result_rows:,} of {min(left_rows, right_rows):,} rows had matches"
        elif how == "outer" and result_rows > left_rows + right_rows:
            report["warning"] = "Outer join created duplicate rows - check for many-to-many relationships"

        if duplicate_cols:
            report["note"] = f"{len(duplicate_cols)} column(s) were suffixed with '{suffix}': {', '.join(duplicate_cols)}"

        return report

    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__
        }
202
+
203
+
204
def concat_datasets(
    file_paths: List[str],
    output_path: str,
    axis: Literal["vertical", "horizontal"] = "vertical",
    ignore_index: bool = True
) -> Dict[str, Any]:
    """
    Concatenate multiple datasets vertically (stack rows) or horizontally (add columns).

    Args:
        file_paths: List of file paths to concatenate (CSV or Parquet)
        output_path: Path to save concatenated dataset
        axis: "vertical" to stack rows (union), "horizontal" to add columns side-by-side
        ignore_index: Accepted for pandas-style compatibility; polars DataFrames
            carry no index, so this flag has no effect here

    Returns:
        Dictionary with concatenation report including:
        - success: bool
        - output_path: str
        - input_files: int
        - result_rows / result_cols: int
        - axis: str

    Examples:
        >>> # Stack multiple CSV files (union)
        >>> concat_datasets(
        ...     ["jan_sales.csv", "feb_sales.csv", "mar_sales.csv"],
        ...     "q1_sales.csv",
        ...     axis="vertical"
        ... )

        >>> # Combine datasets side-by-side (add columns)
        >>> concat_datasets(
        ...     ["features.csv", "labels.csv"],
        ...     "full_dataset.csv",
        ...     axis="horizontal"
        ... )
    """
    try:
        # Need at least two inputs for a concatenation to make sense
        if not file_paths or len(file_paths) < 2:
            return {
                "success": False,
                "error": "Must provide at least 2 files to concatenate"
            }

        for path in file_paths:
            validate_file_exists(path)
            validate_file_format(path)

        # Load every input, recording per-file stats as we go
        frames: List[pl.DataFrame] = []
        file_info: List[Dict[str, Any]] = []
        for path in file_paths:
            frame = load_dataframe(path)
            validate_dataframe(frame)
            frames.append(frame)
            file_info.append({
                "file": Path(path).name,
                "rows": len(frame),
                "columns": len(frame.columns)
            })

        if axis == "vertical":
            # Row-wise union; polars requires matching schemas here
            result = pl.concat(frames, how="vertical")
        else:
            # Column-wise: every frame must contribute the same number of rows
            row_counts = [len(frame) for frame in frames]
            if len(set(row_counts)) > 1:
                return {
                    "success": False,
                    "error": f"Horizontal concatenation requires same number of rows. Got: {row_counts}",
                    "file_info": file_info
                }
            result = pl.concat(frames, how="horizontal")

        # Persist the combined dataset, creating parent dirs if missing
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        save_dataframe(result, output_path)

        return {
            "success": True,
            "output_path": output_path,
            "input_files": len(file_paths),
            "file_info": file_info,
            "result_rows": len(result),
            "result_cols": len(result.columns),
            "axis": axis,
            "message": f"Successfully concatenated {len(file_paths)} files ({axis}) → {len(result):,} rows × {len(result.columns)} columns"
        }

    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__
        }
305
+
306
+
307
def reshape_dataset(
    file_path: str,
    output_path: str,
    operation: Literal["pivot", "melt", "transpose"],
    **kwargs
) -> Dict[str, Any]:
    """
    Reshape dataset using pivot, melt, or transpose operations.

    Args:
        file_path: Path to CSV or Parquet file
        output_path: Path to save reshaped dataset
        operation: "pivot" (wide format), "melt" (long format), or "transpose"
        **kwargs: Operation-specific parameters
            For pivot: index, columns, values, aggregate_function (optional;
                how to combine duplicate (index, column) pairs, e.g. "first",
                "sum", "mean")
            For melt: id_vars, value_vars, var_name, value_name

    Returns:
        Dictionary with reshape report

    Examples:
        >>> # Pivot: wide format
        >>> reshape_dataset(
        ...     "sales_long.csv",
        ...     "sales_wide.csv",
        ...     operation="pivot",
        ...     index="date",
        ...     columns="product",
        ...     values="sales"
        ... )

        >>> # Melt: long format
        >>> reshape_dataset(
        ...     "sales_wide.csv",
        ...     "sales_long.csv",
        ...     operation="melt",
        ...     id_vars=["date"],
        ...     value_vars=["product_a", "product_b"],
        ...     var_name="product",
        ...     value_name="sales"
        ... )
    """
    try:
        # Validation
        validate_file_exists(file_path)
        validate_file_format(file_path)

        # Load data
        df = load_dataframe(file_path)
        validate_dataframe(df)

        original_shape = (len(df), len(df.columns))

        # Perform operation
        if operation == "pivot":
            # Pivot to wide format
            index = kwargs.get("index")
            columns = kwargs.get("columns")
            values = kwargs.get("values")

            if not all([index, columns, values]):
                return {
                    "success": False,
                    "error": "Pivot requires: index, columns, values parameters"
                }

            # BUG FIX: the docstring advertises an 'aggregate_function' kwarg,
            # but it was silently ignored; forward it so callers can aggregate
            # duplicate (index, column) pairs. Omitted -> None, which keeps the
            # previous behavior.
            result = df.pivot(
                index=index,
                columns=columns,
                values=values,
                aggregate_function=kwargs.get("aggregate_function")
            )

        elif operation == "melt":
            # Melt to long format
            id_vars = kwargs.get("id_vars")
            value_vars = kwargs.get("value_vars")
            var_name = kwargs.get("var_name", "variable")
            value_name = kwargs.get("value_name", "value")

            if not id_vars:
                return {
                    "success": False,
                    "error": "Melt requires: id_vars parameter"
                }

            result = df.melt(
                id_vars=id_vars,
                value_vars=value_vars,
                variable_name=var_name,
                value_name=value_name
            )

        elif operation == "transpose":
            # Transpose rows and columns
            result = df.transpose()

        else:
            return {
                "success": False,
                "error": f"Unknown operation: {operation}. Use 'pivot', 'melt', or 'transpose'"
            }

        # Save result (create parent directories as needed)
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        save_dataframe(result, output_path)

        return {
            "success": True,
            "output_path": output_path,
            "operation": operation,
            "original_shape": {
                "rows": original_shape[0],
                "columns": original_shape[1]
            },
            "result_shape": {
                "rows": len(result),
                "columns": len(result.columns)
            },
            "message": f"Successfully {operation}ed dataset: {original_shape[0]}×{original_shape[1]} → {len(result)}×{len(result.columns)}"
        }

    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__
        }