dipencsv 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,239 @@
1
+ Metadata-Version: 2.4
2
+ Name: dipencsv
3
+ Version: 0.1.0
4
+ Summary: A beginner-friendly CSV analysis and ML data preparation toolkit
5
+ Author: Dipendra
6
+ License-Expression: MIT
7
+ Requires-Python: >=3.8
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: pandas>=1.5.0
10
+ Requires-Dist: openpyxl>=3.0.0
11
+ Requires-Dist: scikit-learn>=1.0.0
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest; extra == "dev"
14
+
15
+ # DipenCSV 🐼
16
+
17
+ > DipenCSV is to pandas what seaborn is to matplotlib — a friendlier, higher-level API.
18
+
19
+ A beginner-friendly CSV analysis and ML data preparation toolkit built on top of pandas. Helps students and small teams clean, analyze, and prepare CSV data for machine learning — without needing to know pandas.
20
+
21
+ ## Who is it for?
22
+ - 🎓 Students cleaning data for ML assignments
23
+ - 🚀 Small startups needing quick data insights
24
+ - 👨‍💻 Developers who work with CSVs occasionally
25
+ - 📊 Anyone who finds pandas too complex
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ pip install dipencsv
31
+ ```
32
+
33
+ ## Quick Start
34
+
35
+ ```python
36
+ from dipencsv import Data
37
+
38
+ data = Data("your_file.csv")
39
+ data.magic() # one click everything
40
+ ```
41
+
42
+ ## Full ML Workflow in 15 lines
43
+
44
+ ```python
45
+ from dipencsv import Data
46
+ from sklearn.ensemble import RandomForestClassifier
47
+
48
+ data = Data("titanic.csv")
49
+ data.clean()
50
+ data.drop(["passengerid", "name", "ticket", "cabin"])
51
+ data.encode("sex")
52
+ data.encode("embarked")
53
+ data.remove_outliers("fare")
54
+ data.normalize("fare")
55
+ data.normalize("age")
56
+
57
+ train, test = data.split(test_size=0.2)
58
+
59
+ X_train = train.drop("survived", axis=1)
60
+ y_train = train["survived"]
61
+ X_test = test.drop("survived", axis=1)
62
+ y_test = test["survived"]
63
+
64
+ model = RandomForestClassifier()
65
+ model.fit(X_train, y_train)
66
+ print(f"Accuracy: {model.score(X_test, y_test):.2f}")
67
+ # Accuracy: 0.76
68
+ ```
69
+
70
+ ---
71
+
72
+ ## All Commands
73
+
74
+ ### Loading
75
+ ```python
76
+ data = Data("file.csv") # auto mode
77
+ data = Data("bigfile.csv", stream=True) # force stream mode
78
+ data = Data("file.csv", auto_mode=False) # force pandas mode
79
+ ```
80
+
81
+ ### Properties
82
+ ```python
83
+ print(data.columns) # list of column names
84
+ print(data.shape) # (rows, columns)
85
+ ```
86
+
87
+ ### Understanding Data
88
+ ```python
89
+ data.summary() # rows, cols, types, missing values
90
+ data.report() # duplicates, missing %, basic stats
91
+ data.explain() # trends, issues, suggestions
92
+ data.magic() # ⭐ one click full analysis
93
+ ```
94
+
95
+ ### Cleaning
96
+ ```python
97
+ data.clean() # auto clean (safe mode)
98
+ data.clean(strategy="aggressive") # drop rows with missing values
99
+ data.drop("column") # drop one column
100
+ data.drop(["col1", "col2"]) # drop multiple columns
101
+ data.rename("old_name", "new_name") # rename column
102
+ data.fill("column", value) # fill missing values manually
103
+ ```
104
+
105
+ > ⚠️ After clean(), column names become lowercase with underscores.
106
+ > Always check: `print(data.columns)`
107
+
108
+ ### ML Preparation
109
+ ```python
110
+ data.encode("gender") # text → numbers (label encoding)
111
+ data.normalize("age") # scale to [0, 1]
112
+ data.standardize("salary") # scale to mean=0, std=1
113
+ data.remove_outliers("price") # remove extreme values (IQR method)
114
+ train, test = data.split(test_size=0.2) # train/test split
115
+ ```
116
+
117
+ ### Analytics
118
+ ```python
119
+ data.mean("salary")
120
+ data.median("age")
121
+ data.max("salary")
122
+ data.min("salary")
123
+ data.count("city")
124
+ data.correlation("age", "salary")
125
+ data.correlation_matrix()
126
+ data.distribution("salary")
127
+ data.outliers("salary")
128
+ data.describe()
129
+ data.value_counts("city")
130
+ data.group_mean("city", "salary")
131
+ ```
132
+
133
+ ### Querying
134
+ ```python
135
+ data.first() # first 5 rows
136
+ data.first(10) # first 10 rows
137
+ data.last() # last 5 rows
138
+ data.sort("age") # sort ascending
139
+ data.sort("age", asc=False) # sort descending
140
+ data.find("city", "Bangalore") # find rows by value
141
+ ```
142
+
143
+ ### Intelligence
144
+ ```python
145
+ data.ask("average salary")
146
+ data.ask("top 10 salary")
147
+ data.ask("highest salary by city")
148
+ data.ask("lowest age")
149
+ data.ask("count city")
150
+ data.explain()
151
+ data.magic()
152
+ ```
153
+
154
+ ### Export
155
+ ```python
156
+ data.export("output.csv") # CSV
157
+ data.export("output.json") # JSON
158
+ data.export("output.xlsx") # Excel (requires openpyxl)
159
+ ```
160
+
161
+ ---
162
+
163
+ ## Stream Mode (Big Files)
164
+
165
+ DipenCSV automatically detects the file size and switches to stream mode for files larger than 500 MB:
166
+
167
+ ```python
168
+ data = Data("hugefile.csv") # auto detects
169
+ data = Data("hugefile.csv", stream=True) # force stream
170
+ ```
171
+
172
+ Supported in stream mode: `mean()`, `max()`, `min()`, `count()`, `group_mean()`, `filter()`, `top_n()`
173
+
174
+ ---
175
+
176
+ ## Smart Error Handling
177
+
178
+ ```python
179
+ data.mean("salry")
180
+ # ❌ Column 'salry' not found.
181
+ # 💡 Did you mean:
182
+ # - salary
183
+ # - salary_usd
184
+ ```
185
+
186
+ ---
187
+
188
+ ## Common Issues
189
+
190
+ **KeyError after clean()**
191
+ ```python
192
+ data.clean()
193
+ print(data.columns) # check actual column names
194
+ data.mean("salary") # use lowercase
195
+ ```
196
+
197
+ **Excel export failing**
198
+ ```bash
199
+ pip install openpyxl
200
+ ```
201
+
202
+ **Big file crashes**
203
+ ```python
204
+ data = Data("bigfile.csv", stream=True)
205
+ ```
206
+
207
+ ---
208
+
209
+ ## Dependencies
210
+ - `pandas >= 1.5.0`
211
+ - `scikit-learn >= 1.0.0`
212
+ - `openpyxl >= 3.0.0` (installed automatically; used for Excel export)
213
+
214
+ ## Documentation
215
+ Full docs are available in the `docs/` folder:
216
+
217
+
218
+ | Doc | What it covers |
219
+ |-----|----------------|
220
+ | [Getting Started](docs/getting_started.md) | installation, quick start |
221
+ | [Loading Data](docs/loading_data.md) | Data(), auto mode, stream mode |
222
+ | [Properties](docs/properties.md) | columns, shape |
223
+ | [Understanding Data](docs/understanding_data.md) | summary, report, explain, magic |
224
+ | [Cleaning Data](docs/cleaning_data.md) | clean, drop, rename, fill |
225
+ | [ML Preparation](docs/ml_prep.md) | encode, normalize, split etc |
226
+ | [Analytics](docs/analytics.md) | mean, correlation, outliers etc |
227
+ | [Query](docs/query.md) | sort, find, first, last |
228
+ | [Intelligence](docs/intelligence.md) | ask, explain, magic |
229
+ | [Stream Mode](docs/stream_mode.md) | big file handling |
230
+ | [Export](docs/export.md) | csv, json, xlsx |
231
+ | [Examples](docs/examples.md) | real world examples |
232
+ | [FAQ](docs/faq.md) | common student questions |
233
+ | [ML Workflow](docs/ml_workflow.md) | full end-to-end ML example |
234
+
235
+ ## License
236
+ MIT License
237
+
238
+ ## Author
239
+ Built by Dipendra — a CS student who wanted pandas to be less painful.
@@ -0,0 +1,225 @@
1
+ # DipenCSV 🐼
2
+
3
+ > DipenCSV is to pandas what seaborn is to matplotlib — a friendlier, higher-level API.
4
+
5
+ A beginner-friendly CSV analysis and ML data preparation toolkit built on top of pandas. Helps students and small teams clean, analyze, and prepare CSV data for machine learning — without needing to know pandas.
6
+
7
+ ## Who is it for?
8
+ - 🎓 Students cleaning data for ML assignments
9
+ - 🚀 Small startups needing quick data insights
10
+ - 👨‍💻 Developers who work with CSVs occasionally
11
+ - 📊 Anyone who finds pandas too complex
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ pip install dipencsv
17
+ ```
18
+
19
+ ## Quick Start
20
+
21
+ ```python
22
+ from dipencsv import Data
23
+
24
+ data = Data("your_file.csv")
25
+ data.magic() # one click everything
26
+ ```
27
+
28
+ ## Full ML Workflow in 15 lines
29
+
30
+ ```python
31
+ from dipencsv import Data
32
+ from sklearn.ensemble import RandomForestClassifier
33
+
34
+ data = Data("titanic.csv")
35
+ data.clean()
36
+ data.drop(["passengerid", "name", "ticket", "cabin"])
37
+ data.encode("sex")
38
+ data.encode("embarked")
39
+ data.remove_outliers("fare")
40
+ data.normalize("fare")
41
+ data.normalize("age")
42
+
43
+ train, test = data.split(test_size=0.2)
44
+
45
+ X_train = train.drop("survived", axis=1)
46
+ y_train = train["survived"]
47
+ X_test = test.drop("survived", axis=1)
48
+ y_test = test["survived"]
49
+
50
+ model = RandomForestClassifier()
51
+ model.fit(X_train, y_train)
52
+ print(f"Accuracy: {model.score(X_test, y_test):.2f}")
53
+ # Accuracy: 0.76
54
+ ```
55
+
56
+ ---
57
+
58
+ ## All Commands
59
+
60
+ ### Loading
61
+ ```python
62
+ data = Data("file.csv") # auto mode
63
+ data = Data("bigfile.csv", stream=True) # force stream mode
64
+ data = Data("file.csv", auto_mode=False) # force pandas mode
65
+ ```
66
+
67
+ ### Properties
68
+ ```python
69
+ print(data.columns) # list of column names
70
+ print(data.shape) # (rows, columns)
71
+ ```
72
+
73
+ ### Understanding Data
74
+ ```python
75
+ data.summary() # rows, cols, types, missing values
76
+ data.report() # duplicates, missing %, basic stats
77
+ data.explain() # trends, issues, suggestions
78
+ data.magic() # ⭐ one click full analysis
79
+ ```
80
+
81
+ ### Cleaning
82
+ ```python
83
+ data.clean() # auto clean (safe mode)
84
+ data.clean(strategy="aggressive") # drop rows with missing values
85
+ data.drop("column") # drop one column
86
+ data.drop(["col1", "col2"]) # drop multiple columns
87
+ data.rename("old_name", "new_name") # rename column
88
+ data.fill("column", value) # fill missing values manually
89
+ ```
90
+
91
+ > ⚠️ After clean(), column names become lowercase with underscores.
92
+ > Always check: `print(data.columns)`
93
+
94
+ ### ML Preparation
95
+ ```python
96
+ data.encode("gender") # text → numbers (label encoding)
97
+ data.normalize("age") # scale to [0, 1]
98
+ data.standardize("salary") # scale to mean=0, std=1
99
+ data.remove_outliers("price") # remove extreme values (IQR method)
100
+ train, test = data.split(test_size=0.2) # train/test split
101
+ ```
102
+
103
+ ### Analytics
104
+ ```python
105
+ data.mean("salary")
106
+ data.median("age")
107
+ data.max("salary")
108
+ data.min("salary")
109
+ data.count("city")
110
+ data.correlation("age", "salary")
111
+ data.correlation_matrix()
112
+ data.distribution("salary")
113
+ data.outliers("salary")
114
+ data.describe()
115
+ data.value_counts("city")
116
+ data.group_mean("city", "salary")
117
+ ```
118
+
119
+ ### Querying
120
+ ```python
121
+ data.first() # first 5 rows
122
+ data.first(10) # first 10 rows
123
+ data.last() # last 5 rows
124
+ data.sort("age") # sort ascending
125
+ data.sort("age", asc=False) # sort descending
126
+ data.find("city", "Bangalore") # find rows by value
127
+ ```
128
+
129
+ ### Intelligence
130
+ ```python
131
+ data.ask("average salary")
132
+ data.ask("top 10 salary")
133
+ data.ask("highest salary by city")
134
+ data.ask("lowest age")
135
+ data.ask("count city")
136
+ data.explain()
137
+ data.magic()
138
+ ```
139
+
140
+ ### Export
141
+ ```python
142
+ data.export("output.csv") # CSV
143
+ data.export("output.json") # JSON
144
+ data.export("output.xlsx") # Excel (requires openpyxl)
145
+ ```
146
+
147
+ ---
148
+
149
+ ## Stream Mode (Big Files)
150
+
151
+ DipenCSV automatically detects the file size and switches to stream mode for files larger than 500 MB:
152
+
153
+ ```python
154
+ data = Data("hugefile.csv") # auto detects
155
+ data = Data("hugefile.csv", stream=True) # force stream
156
+ ```
157
+
158
+ Supported in stream mode: `mean()`, `max()`, `min()`, `count()`, `group_mean()`, `filter()`, `top_n()`
159
+
160
+ ---
161
+
162
+ ## Smart Error Handling
163
+
164
+ ```python
165
+ data.mean("salry")
166
+ # ❌ Column 'salry' not found.
167
+ # 💡 Did you mean:
168
+ # - salary
169
+ # - salary_usd
170
+ ```
171
+
172
+ ---
173
+
174
+ ## Common Issues
175
+
176
+ **KeyError after clean()**
177
+ ```python
178
+ data.clean()
179
+ print(data.columns) # check actual column names
180
+ data.mean("salary") # use lowercase
181
+ ```
182
+
183
+ **Excel export failing**
184
+ ```bash
185
+ pip install openpyxl
186
+ ```
187
+
188
+ **Big file crashes**
189
+ ```python
190
+ data = Data("bigfile.csv", stream=True)
191
+ ```
192
+
193
+ ---
194
+
195
+ ## Dependencies
196
+ - `pandas >= 1.5.0`
197
+ - `scikit-learn >= 1.0.0`
198
+ - `openpyxl >= 3.0.0` (installed automatically; used for Excel export)
199
+
200
+ ## Documentation
201
+ Full docs are available in the `docs/` folder:
202
+
203
+
204
+ | Doc | What it covers |
205
+ |-----|----------------|
206
+ | [Getting Started](docs/getting_started.md) | installation, quick start |
207
+ | [Loading Data](docs/loading_data.md) | Data(), auto mode, stream mode |
208
+ | [Properties](docs/properties.md) | columns, shape |
209
+ | [Understanding Data](docs/understanding_data.md) | summary, report, explain, magic |
210
+ | [Cleaning Data](docs/cleaning_data.md) | clean, drop, rename, fill |
211
+ | [ML Preparation](docs/ml_prep.md) | encode, normalize, split etc |
212
+ | [Analytics](docs/analytics.md) | mean, correlation, outliers etc |
213
+ | [Query](docs/query.md) | sort, find, first, last |
214
+ | [Intelligence](docs/intelligence.md) | ask, explain, magic |
215
+ | [Stream Mode](docs/stream_mode.md) | big file handling |
216
+ | [Export](docs/export.md) | csv, json, xlsx |
217
+ | [Examples](docs/examples.md) | real world examples |
218
+ | [FAQ](docs/faq.md) | common student questions |
219
+ | [ML Workflow](docs/ml_workflow.md) | full end-to-end ML example |
220
+
221
+ ## License
222
+ MIT License
223
+
224
+ ## Author
225
+ Built by Dipendra — a CS student who wanted pandas to be less painful.
@@ -0,0 +1,8 @@
1
+ # dipencsv/__init__.py
2
+
3
+ from dipencsv.core import Data
4
+
5
# Package metadata.
__version__ = "0.1.0"
__author__ = "Dipendra"

# Names exported by `from dipencsv import *`.
__all__ = ["Data"]
@@ -0,0 +1,70 @@
1
+ # dipencsv/analytics.py
2
+
3
+ import pandas as pd
4
+ from dipencsv.errors import format_error, format_success
5
+
6
def describe(df: pd.DataFrame) -> pd.DataFrame:
    """Return summary statistics, rounded to 2 decimals, for numeric columns.

    Non-numeric columns are ignored. If the numeric selection is empty, a
    warning is printed and an empty DataFrame is returned.
    """
    numbers_only = df.select_dtypes(include="number")
    if not numbers_only.empty:
        stats = numbers_only.describe()
        return stats.round(2)
    print("⚠️ No numeric columns found")
    return pd.DataFrame()
12
+
13
+
14
def value_counts(df: pd.DataFrame, col: str) -> pd.Series:
    """Return how often each distinct value occurs in column ``col``."""
    column = df[col]
    return column.value_counts()
16
+
17
+
18
def correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise correlation of numeric columns, rounded to 2 decimals.

    If the numeric selection is empty, a warning is printed and an empty
    DataFrame is returned.
    """
    numbers_only = df.select_dtypes(include="number")
    if not numbers_only.empty:
        correlations = numbers_only.corr()
        return correlations.round(2)
    print("⚠️ No numeric columns found")
    return pd.DataFrame()
24
+
25
+
26
def outliers(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return the rows whose ``col`` value lies outside the 1.5 * IQR fences.

    The number of matching rows is printed before the rows are returned.
    """
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    is_outlier = (values < low_fence) | (values > high_fence)
    flagged = df[is_outlier]
    print(f"📊 Outliers in '{col}': {len(flagged)} rows")
    return flagged
35
+
36
+
37
def distribution(df: pd.DataFrame, col: str) -> dict:
    """Return mean, median, std, min, max and skew of ``col`` as a dict.

    Every statistic is converted to ``float`` and rounded to 2 decimals.
    """
    series = df[col]
    stat_names = ("mean", "median", "std", "min", "max", "skew")
    return {
        name: round(float(getattr(series, name)()), 2)
        for name in stat_names
    }
46
def sort(df: pd.DataFrame, col: str, asc: bool = True) -> pd.DataFrame:
    """Return ``df`` sorted by ``col`` with a fresh 0..n-1 index.

    If ``col`` is not a column, an error message is printed and ``df`` is
    returned unchanged.
    """
    if col in df.columns:
        ordered = df.sort_values(by=col, ascending=asc)
        ordered = ordered.reset_index(drop=True)
        direction = "descending" if not asc else "ascending"
        print(format_success(f"Sorted by '{col}' ({direction})"))
        return ordered
    print(format_error(f"Column '{col}' not found"))
    return df
54
+
55
+
56
def first(df: pd.DataFrame, n: int = 5) -> pd.DataFrame:
    """Return the first ``n`` rows of ``df`` (5 by default)."""
    leading_rows = df.head(n)
    return leading_rows
58
+
59
+
60
def last(df: pd.DataFrame, n: int = 5) -> pd.DataFrame:
    """Return the last ``n`` rows of ``df`` (5 by default)."""
    trailing_rows = df.tail(n)
    return trailing_rows
62
+
63
+
64
def find(df: pd.DataFrame, col: str, value) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``col`` equals ``value``.

    If ``col`` is not a column, an error message is printed and an empty
    DataFrame is returned. The number of matches is printed on success.
    """
    if col in df.columns:
        matches = df[col] == value
        hits = df[matches]
        print(format_success(f"Found {len(hits)} rows where '{col}' = '{value}'"))
        return hits
    print(format_error(f"Column '{col}' not found"))
    return pd.DataFrame()
@@ -0,0 +1,154 @@
1
+ # dipencsv/cleaner.py
2
+
3
+ import pandas as pd
4
+
5
+ from dipencsv.errors import format_error, format_warning, format_success
6
+
7
+
8
def clean(df: pd.DataFrame, strategy: str = "safe") -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. normalise column names (``_fix_columns``),
      2. drop duplicate rows,
      3. convert numeric-looking text columns to numbers (``_fix_numeric``),
      4. handle missing values according to ``strategy`` (``_handle_missing``).

    Args:
        df: Frame to clean.
        strategy: ``"safe"`` fills missing values, ``"aggressive"`` drops
            rows that contain any.

    Returns:
        The cleaned DataFrame.
    """
    df = df.copy()

    # fix column names
    df = _fix_columns(df)

    # remove duplicates
    before = len(df)
    df = df.drop_duplicates()
    removed = before - len(df)
    if removed > 0:
        print(format_warning(f"Removed {removed} duplicate rows"))

    # Convert text numbers BEFORE handling missing values: the conversion
    # coerces unparseable cells to NaN, and those new gaps must already exist
    # when the missing-value pass runs, otherwise they survive cleaning.
    df = _fix_numeric(df)

    # handle missing values
    df = _handle_missing(df, strategy)

    print(format_success("Cleaning done"))
    return df
29
+
30
+
31
def _fix_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the column names of ``df`` in place and return it.

    Each name is stripped, lower-cased, has spaces replaced by underscores
    and finally loses every character that is not a word character.
    """
    # Labels may be non-strings (e.g. integer headers); the ``.str`` accessor
    # only works on string values, so convert first.
    names = df.columns.astype(str).str.strip().str.lower()
    # A literal replacement: be explicit so the result does not depend on the
    # pandas version's default for ``regex``.
    names = names.str.replace(" ", "_", regex=False)
    names = names.str.replace(r"[^\w]", "", regex=True)
    df.columns = names
    return df
40
+
41
+
42
def _handle_missing(df: pd.DataFrame, strategy: str) -> pd.DataFrame:
    """Fill or drop missing values in ``df`` according to ``strategy``.

    Args:
        df: Frame to process. In ``"safe"`` mode its columns are reassigned
            in place.
        strategy: ``"safe"`` fills float columns with their median and all
            other columns with their most frequent value; ``"aggressive"``
            drops every row containing a missing value.

    Returns:
        The processed DataFrame. For an unrecognised strategy an error
        message is printed and ``df`` is returned unchanged.
    """
    if strategy == "safe":
        for col in df.columns:
            series = df[col]
            if not series.isnull().any():
                continue
            # Any float dtype (not only float64) gets the median. Other
            # dtypes keep mode imputation so the fill value always fits the
            # column's dtype.
            if pd.api.types.is_float_dtype(series):
                df[col] = series.fillna(series.median())
            else:
                modes = series.mode()
                # mode() is empty when the column holds no values at all.
                if not modes.empty:
                    df[col] = series.fillna(modes.iloc[0])
    elif strategy == "aggressive":
        before = len(df)
        df = df.dropna()
        print(format_warning(f"Dropped {before - len(df)} rows with missing values"))
    else:
        # Surface typos instead of silently skipping missing-value handling.
        print(format_error(f"Unknown strategy '{strategy}' — use 'safe' or 'aggressive'"))
    return df
58
+
59
+
60
def _fix_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Convert text columns that mostly hold numbers to a numeric dtype.

    A text column is converted when more than 70% of the frame's rows parse
    as numbers; cells that do not parse become NaN. Columns are reassigned
    on ``df`` itself, which is also returned.
    """
    threshold = len(df) * 0.7
    for col in df.columns:
        series = df[col]
        # Check dtypes through pandas.api.types rather than
        # select_dtypes(include=[..., "str"]): older pandas releases raise
        # TypeError for string dtype selectors.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            continue
        converted = pd.to_numeric(series, errors="coerce")
        if converted.notna().sum() > threshold:
            df[col] = converted
    return df
66
+
67
def drop(df: pd.DataFrame, cols) -> pd.DataFrame:
    """Return ``df`` without the given column(s).

    ``cols`` may be a single column name or a list of names. If any name is
    not a column, an error message is printed and ``df`` is returned
    unchanged.
    """
    targets = [cols] if isinstance(cols, str) else cols
    unknown = [name for name in targets if name not in df.columns]
    if not unknown:
        trimmed = df.drop(columns=targets)
        print(format_success(f"Dropped columns: {targets}"))
        return trimmed
    print(format_error(f"Columns not found: {unknown}"))
    return df
77
+
78
+
79
def rename(df: pd.DataFrame, old: str, new: str) -> pd.DataFrame:
    """Return ``df`` with column ``old`` renamed to ``new``.

    If ``old`` is not a column, an error message is printed and ``df`` is
    returned unchanged.
    """
    if old in df.columns:
        renamed = df.rename(columns={old: new})
        print(format_success(f"Renamed '{old}' → '{new}'"))
        return renamed
    print(format_error(f"Column '{old}' not found"))
    return df
86
+
87
+
88
def fill(df: pd.DataFrame, col: str, value) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values in ``col`` set to ``value``.

    If ``col`` is not a column, an error message is printed and ``df`` is
    returned unchanged.
    """
    if col not in df.columns:
        print(format_error(f"Column '{col}' not found"))
        return df
    # Work on a copy so the caller's frame is not mutated (matching drop and
    # rename) and assigning into a filtered frame raises no
    # chained-assignment warning.
    df = df.copy()
    df[col] = df[col].fillna(value)
    print(format_success(f"Filled missing values in '{col}' with {value}"))
    return df
95
+
96
def encode(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of ``df`` with ``col`` replaced by integer codes.

    Codes are assigned in order of first appearance (0, 1, 2, ...), and the
    value-to-code mapping is printed. If ``col`` is not a column, an error
    message is printed and ``df`` is returned unchanged.
    """
    if col not in df.columns:
        print(format_error(f"Column '{col}' not found"))
        return df
    # Work on a copy so the caller's frame is not mutated (matching drop and
    # rename) and assigning into a filtered frame raises no
    # chained-assignment warning.
    df = df.copy()
    unique_vals = df[col].unique()
    mapping = {val: idx for idx, val in enumerate(unique_vals)}
    df[col] = df[col].map(mapping)
    print(format_success(f"Encoded '{col}': {mapping}"))
    return df
105
+
106
+
107
def normalize(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of ``df`` with ``col`` min-max scaled to [0, 1].

    An error message is printed and ``df`` is returned unchanged when
    ``col`` is not a column, has no non-missing values, or holds a single
    repeated value (the scale would divide by zero).
    """
    if col not in df.columns:
        print(format_error(f"Column '{col}' not found"))
        return df
    min_val = df[col].min()
    max_val = df[col].max()
    # min() is NaN when the column has no usable values; scaling would then
    # yield an all-NaN column while reporting success.
    if pd.isna(min_val) or pd.isna(max_val):
        print(format_error(f"Cannot normalize '{col}' — no non-missing values"))
        return df
    if max_val == min_val:
        print(format_error(f"Cannot normalize '{col}' — all values are the same"))
        return df
    # Work on a copy so the caller's frame is not mutated and assigning into
    # a filtered frame raises no chained-assignment warning.
    df = df.copy()
    df[col] = (df[col] - min_val) / (max_val - min_val)
    print(format_success(f"Normalized '{col}' to range [0, 1]"))
    return df
119
+
120
+
121
def standardize(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of ``df`` with ``col`` rescaled as ``(x - mean) / std``.

    An error message is printed and ``df`` is returned unchanged when
    ``col`` is not a column, when its standard deviation cannot be computed
    (fewer than two non-missing values), or when it is 0.
    """
    if col not in df.columns:
        print(format_error(f"Column '{col}' not found"))
        return df
    mean = df[col].mean()
    std = df[col].std()
    # std() is NaN with fewer than two values; dividing by it would yield an
    # all-NaN column while reporting success.
    if pd.isna(std):
        print(format_error(f"Cannot standardize '{col}' — need at least two non-missing values"))
        return df
    if std == 0:
        print(format_error(f"Cannot standardize '{col}' — std is 0"))
        return df
    # Work on a copy so the caller's frame is not mutated and assigning into
    # a filtered frame raises no chained-assignment warning.
    df = df.copy()
    df[col] = (df[col] - mean) / std
    print(format_success(f"Standardized '{col}' — mean=0, std=1"))
    return df
133
+
134
def remove_outliers(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of ``df`` without rows outside the 1.5 * IQR fences of ``col``.

    Rows whose ``col`` value is missing are kept: they are not outliers and
    must not inflate the printed count. If ``col`` is not a column, an error
    message is printed and ``df`` is returned unchanged.
    """
    if col not in df.columns:
        print(format_error(f"Column '{col}' not found"))
        return df
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    # Comparisons with NaN are False, so missing values need an explicit
    # pass-through or they would be dropped and reported as outliers.
    keep = ((values >= lower) & (values <= upper)) | values.isna()
    before = len(df)
    # copy(): hand back an independent frame so later column assignments on
    # the result raise no chained-assignment warning.
    df = df[keep].copy()
    removed = before - len(df)
    print(format_success(f"Removed {removed} outliers from '{col}'"))
    return df
148
+
149
+
150
def split(df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42):
    """Split ``df`` into a train and a test frame with scikit-learn.

    ``test_size`` and ``random_state`` are forwarded to
    ``train_test_split``. The sizes of both parts are printed and the pair
    ``(train, test)`` is returned.
    """
    # Imported here so scikit-learn is only loaded when a split is requested.
    from sklearn.model_selection import train_test_split

    parts = train_test_split(df, test_size=test_size, random_state=random_state)
    train_part, test_part = parts
    print(format_success(f"Split done — train: {len(train_part)} rows, test: {len(test_part)} rows"))
    return train_part, test_part