dipencsv 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dipencsv-0.1.0/PKG-INFO +239 -0
- dipencsv-0.1.0/README.md +225 -0
- dipencsv-0.1.0/dipencsv/__init__.py +8 -0
- dipencsv-0.1.0/dipencsv/analytics.py +70 -0
- dipencsv-0.1.0/dipencsv/cleaner.py +154 -0
- dipencsv-0.1.0/dipencsv/core.py +226 -0
- dipencsv-0.1.0/dipencsv/engine/__init__.py +6 -0
- dipencsv-0.1.0/dipencsv/engine/base.py +42 -0
- dipencsv-0.1.0/dipencsv/engine/pandas_engine.py +53 -0
- dipencsv-0.1.0/dipencsv/engine/stream_engine.py +91 -0
- dipencsv-0.1.0/dipencsv/errors.py +23 -0
- dipencsv-0.1.0/dipencsv/exporter.py +45 -0
- dipencsv-0.1.0/dipencsv/intelligence.py +194 -0
- dipencsv-0.1.0/dipencsv/loader.py +24 -0
- dipencsv-0.1.0/dipencsv/summary.py +103 -0
- dipencsv-0.1.0/dipencsv.egg-info/PKG-INFO +239 -0
- dipencsv-0.1.0/dipencsv.egg-info/SOURCES.txt +26 -0
- dipencsv-0.1.0/dipencsv.egg-info/dependency_links.txt +1 -0
- dipencsv-0.1.0/dipencsv.egg-info/requires.txt +6 -0
- dipencsv-0.1.0/dipencsv.egg-info/top_level.txt +1 -0
- dipencsv-0.1.0/pyproject.toml +28 -0
- dipencsv-0.1.0/setup.cfg +4 -0
- dipencsv-0.1.0/tests/test_analytics.py +95 -0
- dipencsv-0.1.0/tests/test_cleaner.py +111 -0
- dipencsv-0.1.0/tests/test_core.py +142 -0
- dipencsv-0.1.0/tests/test_intelligence.py +81 -0
- dipencsv-0.1.0/tests/test_loader.py +43 -0
- dipencsv-0.1.0/tests/test_summary.py +51 -0
dipencsv-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dipencsv
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A beginner-friendly CSV analysis and ML data preparation toolkit
|
|
5
|
+
Author: Dipendra
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.8
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: pandas>=1.5.0
|
|
10
|
+
Requires-Dist: openpyxl>=3.0.0
|
|
11
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest; extra == "dev"
|
|
14
|
+
|
|
15
|
+
# DipenCSV 🐼
|
|
16
|
+
|
|
17
|
+
> DipenCSV is to pandas what seaborn is to matplotlib — a friendlier, higher-level API.
|
|
18
|
+
|
|
19
|
+
A beginner-friendly CSV analysis and ML data preparation toolkit built on top of pandas. Helps students and small teams clean, analyze, and prepare CSV data for machine learning — without needing to know pandas.
|
|
20
|
+
|
|
21
|
+
## Who is it for?
|
|
22
|
+
- 🎓 Students cleaning data for ML assignments
|
|
23
|
+
- 🚀 Small startups needing quick data insights
|
|
24
|
+
- 👨‍💻 Developers who work with CSVs occasionally
|
|
25
|
+
- 📊 Anyone who finds pandas too complex
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install dipencsv
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Quick Start
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from dipencsv import Data
|
|
37
|
+
|
|
38
|
+
data = Data("your_file.csv")
|
|
39
|
+
data.magic() # one click everything
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Full ML Workflow in 15 lines
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from dipencsv import Data
|
|
46
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
47
|
+
|
|
48
|
+
data = Data("titanic.csv")
|
|
49
|
+
data.clean()
|
|
50
|
+
data.drop(["passengerid", "name", "ticket", "cabin"])
|
|
51
|
+
data.encode("sex")
|
|
52
|
+
data.encode("embarked")
|
|
53
|
+
data.remove_outliers("fare")
|
|
54
|
+
data.normalize("fare")
|
|
55
|
+
data.normalize("age")
|
|
56
|
+
|
|
57
|
+
train, test = data.split(test_size=0.2)
|
|
58
|
+
|
|
59
|
+
X_train = train.drop("survived", axis=1)
|
|
60
|
+
y_train = train["survived"]
|
|
61
|
+
X_test = test.drop("survived", axis=1)
|
|
62
|
+
y_test = test["survived"]
|
|
63
|
+
|
|
64
|
+
model = RandomForestClassifier()
|
|
65
|
+
model.fit(X_train, y_train)
|
|
66
|
+
print(f"Accuracy: {model.score(X_test, y_test):.2f}")
|
|
67
|
+
# Accuracy: 0.76
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## All Commands
|
|
73
|
+
|
|
74
|
+
### Loading
|
|
75
|
+
```python
|
|
76
|
+
data = Data("file.csv") # auto mode
|
|
77
|
+
data = Data("bigfile.csv", stream=True) # force stream mode
|
|
78
|
+
data = Data("file.csv", auto_mode=False) # force pandas mode
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Properties
|
|
82
|
+
```python
|
|
83
|
+
print(data.columns) # list of column names
|
|
84
|
+
print(data.shape) # (rows, columns)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Understanding Data
|
|
88
|
+
```python
|
|
89
|
+
data.summary() # rows, cols, types, missing values
|
|
90
|
+
data.report() # duplicates, missing %, basic stats
|
|
91
|
+
data.explain() # trends, issues, suggestions
|
|
92
|
+
data.magic() # ⭐ one click full analysis
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Cleaning
|
|
96
|
+
```python
|
|
97
|
+
data.clean() # auto clean (safe mode)
|
|
98
|
+
data.clean(strategy="aggressive") # drop rows with missing values
|
|
99
|
+
data.drop("column") # drop one column
|
|
100
|
+
data.drop(["col1", "col2"]) # drop multiple columns
|
|
101
|
+
data.rename("old_name", "new_name") # rename column
|
|
102
|
+
data.fill("column", value) # fill missing values manually
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
> ⚠️ After `clean()`, column names are trimmed and lowercased, spaces become underscores, and other special characters are removed.
|
|
106
|
+
> Always check: `print(data.columns)`
|
|
107
|
+
|
|
108
|
+
### ML Preparation
|
|
109
|
+
```python
|
|
110
|
+
data.encode("gender") # text → numbers (label encoding)
|
|
111
|
+
data.normalize("age") # scale to [0, 1]
|
|
112
|
+
data.standardize("salary") # scale to mean=0, std=1
|
|
113
|
+
data.remove_outliers("price") # remove extreme values (IQR method)
|
|
114
|
+
train, test = data.split(test_size=0.2) # train/test split
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Analytics
|
|
118
|
+
```python
|
|
119
|
+
data.mean("salary")
|
|
120
|
+
data.median("age")
|
|
121
|
+
data.max("salary")
|
|
122
|
+
data.min("salary")
|
|
123
|
+
data.count("city")
|
|
124
|
+
data.correlation("age", "salary")
|
|
125
|
+
data.correlation_matrix()
|
|
126
|
+
data.distribution("salary")
|
|
127
|
+
data.outliers("salary")
|
|
128
|
+
data.describe()
|
|
129
|
+
data.value_counts("city")
|
|
130
|
+
data.group_mean("city", "salary")
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Querying
|
|
134
|
+
```python
|
|
135
|
+
data.first() # first 5 rows
|
|
136
|
+
data.first(10) # first 10 rows
|
|
137
|
+
data.last() # last 5 rows
|
|
138
|
+
data.sort("age") # sort ascending
|
|
139
|
+
data.sort("age", asc=False) # sort descending
|
|
140
|
+
data.find("city", "Bangalore") # find rows by value
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Intelligence
|
|
144
|
+
```python
|
|
145
|
+
data.ask("average salary")
|
|
146
|
+
data.ask("top 10 salary")
|
|
147
|
+
data.ask("highest salary by city")
|
|
148
|
+
data.ask("lowest age")
|
|
149
|
+
data.ask("count city")
|
|
150
|
+
data.explain()
|
|
151
|
+
data.magic()
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Export
|
|
155
|
+
```python
|
|
156
|
+
data.export("output.csv") # CSV
|
|
157
|
+
data.export("output.json") # JSON
|
|
158
|
+
data.export("output.xlsx") # Excel (requires openpyxl)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## Stream Mode (Big Files)
|
|
164
|
+
|
|
165
|
+
DipenCSV automatically detects the file size and switches to stream mode for files larger than 500 MB:
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
data = Data("hugefile.csv") # auto detects
|
|
169
|
+
data = Data("hugefile.csv", stream=True) # force stream
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Supported in stream mode: `mean()`, `max()`, `min()`, `count()`, `group_mean()`, `filter()`, `top_n()`
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## Smart Error Handling
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
data.mean("salry")
|
|
180
|
+
# ❌ Column 'salry' not found.
|
|
181
|
+
# 💡 Did you mean:
|
|
182
|
+
# - salary
|
|
183
|
+
# - salary_usd
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## Common Issues
|
|
189
|
+
|
|
190
|
+
**KeyError after clean()**
|
|
191
|
+
```python
|
|
192
|
+
data.clean()
|
|
193
|
+
print(data.columns) # check actual column names
|
|
194
|
+
data.mean("salary") # use lowercase
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
**Excel export failing**
|
|
198
|
+
```bash
|
|
199
|
+
pip install openpyxl
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
**Big file crashes**
|
|
203
|
+
```python
|
|
204
|
+
data = Data("bigfile.csv", stream=True)
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## Dependencies
|
|
210
|
+
- `pandas >= 1.5.0`
|
|
211
|
+
- `scikit-learn >= 1.0.0`
|
|
212
|
+
- `openpyxl >= 3.0.0` (installed with the package; used for Excel export)
|
|
213
|
+
|
|
214
|
+
## Documentation
|
|
215
|
+
Full docs available in the `docs/` folder.
|
|
216
|
+
### Documentation Index
|
|
217
|
+
|
|
218
|
+
| Doc | What it covers |
|
|
219
|
+
|-----|----------------|
|
|
220
|
+
| [Getting Started](docs/getting_started.md) | installation, quick start |
|
|
221
|
+
| [Loading Data](docs/loading_data.md) | Data(), auto mode, stream mode |
|
|
222
|
+
| [Properties](docs/properties.md) | columns, shape |
|
|
223
|
+
| [Understanding Data](docs/understanding_data.md) | summary, report, explain, magic |
|
|
224
|
+
| [Cleaning Data](docs/cleaning_data.md) | clean, drop, rename, fill |
|
|
225
|
+
| [ML Preparation](docs/ml_prep.md) | encode, normalize, split etc |
|
|
226
|
+
| [Analytics](docs/analytics.md) | mean, correlation, outliers etc |
|
|
227
|
+
| [Query](docs/query.md) | sort, find, first, last |
|
|
228
|
+
| [Intelligence](docs/intelligence.md) | ask, explain, magic |
|
|
229
|
+
| [Stream Mode](docs/stream_mode.md) | big file handling |
|
|
230
|
+
| [Export](docs/export.md) | csv, json, xlsx |
|
|
231
|
+
| [Examples](docs/examples.md) | real world examples |
|
|
232
|
+
| [FAQ](docs/faq.md) | common student questions |
|
|
233
|
+
| [ML Workflow](docs/ml_workflow.md) | full end-to-end ML example |
|
|
234
|
+
|
|
235
|
+
## License
|
|
236
|
+
MIT License
|
|
237
|
+
|
|
238
|
+
## Author
|
|
239
|
+
Built by Dipendra — a CS student who wanted pandas to be less painful.
|
dipencsv-0.1.0/README.md
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# DipenCSV 🐼
|
|
2
|
+
|
|
3
|
+
> DipenCSV is to pandas what seaborn is to matplotlib — a friendlier, higher-level API.
|
|
4
|
+
|
|
5
|
+
A beginner-friendly CSV analysis and ML data preparation toolkit built on top of pandas. Helps students and small teams clean, analyze, and prepare CSV data for machine learning — without needing to know pandas.
|
|
6
|
+
|
|
7
|
+
## Who is it for?
|
|
8
|
+
- 🎓 Students cleaning data for ML assignments
|
|
9
|
+
- 🚀 Small startups needing quick data insights
|
|
10
|
+
- 👨‍💻 Developers who work with CSVs occasionally
|
|
11
|
+
- 📊 Anyone who finds pandas too complex
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install dipencsv
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Quick Start
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from dipencsv import Data
|
|
23
|
+
|
|
24
|
+
data = Data("your_file.csv")
|
|
25
|
+
data.magic() # one click everything
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Full ML Workflow in 15 lines
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
from dipencsv import Data
|
|
32
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
33
|
+
|
|
34
|
+
data = Data("titanic.csv")
|
|
35
|
+
data.clean()
|
|
36
|
+
data.drop(["passengerid", "name", "ticket", "cabin"])
|
|
37
|
+
data.encode("sex")
|
|
38
|
+
data.encode("embarked")
|
|
39
|
+
data.remove_outliers("fare")
|
|
40
|
+
data.normalize("fare")
|
|
41
|
+
data.normalize("age")
|
|
42
|
+
|
|
43
|
+
train, test = data.split(test_size=0.2)
|
|
44
|
+
|
|
45
|
+
X_train = train.drop("survived", axis=1)
|
|
46
|
+
y_train = train["survived"]
|
|
47
|
+
X_test = test.drop("survived", axis=1)
|
|
48
|
+
y_test = test["survived"]
|
|
49
|
+
|
|
50
|
+
model = RandomForestClassifier()
|
|
51
|
+
model.fit(X_train, y_train)
|
|
52
|
+
print(f"Accuracy: {model.score(X_test, y_test):.2f}")
|
|
53
|
+
# Accuracy: 0.76
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## All Commands
|
|
59
|
+
|
|
60
|
+
### Loading
|
|
61
|
+
```python
|
|
62
|
+
data = Data("file.csv") # auto mode
|
|
63
|
+
data = Data("bigfile.csv", stream=True) # force stream mode
|
|
64
|
+
data = Data("file.csv", auto_mode=False) # force pandas mode
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Properties
|
|
68
|
+
```python
|
|
69
|
+
print(data.columns) # list of column names
|
|
70
|
+
print(data.shape) # (rows, columns)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Understanding Data
|
|
74
|
+
```python
|
|
75
|
+
data.summary() # rows, cols, types, missing values
|
|
76
|
+
data.report() # duplicates, missing %, basic stats
|
|
77
|
+
data.explain() # trends, issues, suggestions
|
|
78
|
+
data.magic() # ⭐ one click full analysis
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Cleaning
|
|
82
|
+
```python
|
|
83
|
+
data.clean() # auto clean (safe mode)
|
|
84
|
+
data.clean(strategy="aggressive") # drop rows with missing values
|
|
85
|
+
data.drop("column") # drop one column
|
|
86
|
+
data.drop(["col1", "col2"]) # drop multiple columns
|
|
87
|
+
data.rename("old_name", "new_name") # rename column
|
|
88
|
+
data.fill("column", value) # fill missing values manually
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
> ⚠️ After `clean()`, column names are trimmed and lowercased, spaces become underscores, and other special characters are removed.
|
|
92
|
+
> Always check: `print(data.columns)`
|
|
93
|
+
|
|
94
|
+
### ML Preparation
|
|
95
|
+
```python
|
|
96
|
+
data.encode("gender") # text → numbers (label encoding)
|
|
97
|
+
data.normalize("age") # scale to [0, 1]
|
|
98
|
+
data.standardize("salary") # scale to mean=0, std=1
|
|
99
|
+
data.remove_outliers("price") # remove extreme values (IQR method)
|
|
100
|
+
train, test = data.split(test_size=0.2) # train/test split
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Analytics
|
|
104
|
+
```python
|
|
105
|
+
data.mean("salary")
|
|
106
|
+
data.median("age")
|
|
107
|
+
data.max("salary")
|
|
108
|
+
data.min("salary")
|
|
109
|
+
data.count("city")
|
|
110
|
+
data.correlation("age", "salary")
|
|
111
|
+
data.correlation_matrix()
|
|
112
|
+
data.distribution("salary")
|
|
113
|
+
data.outliers("salary")
|
|
114
|
+
data.describe()
|
|
115
|
+
data.value_counts("city")
|
|
116
|
+
data.group_mean("city", "salary")
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Querying
|
|
120
|
+
```python
|
|
121
|
+
data.first() # first 5 rows
|
|
122
|
+
data.first(10) # first 10 rows
|
|
123
|
+
data.last() # last 5 rows
|
|
124
|
+
data.sort("age") # sort ascending
|
|
125
|
+
data.sort("age", asc=False) # sort descending
|
|
126
|
+
data.find("city", "Bangalore") # find rows by value
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Intelligence
|
|
130
|
+
```python
|
|
131
|
+
data.ask("average salary")
|
|
132
|
+
data.ask("top 10 salary")
|
|
133
|
+
data.ask("highest salary by city")
|
|
134
|
+
data.ask("lowest age")
|
|
135
|
+
data.ask("count city")
|
|
136
|
+
data.explain()
|
|
137
|
+
data.magic()
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### Export
|
|
141
|
+
```python
|
|
142
|
+
data.export("output.csv") # CSV
|
|
143
|
+
data.export("output.json") # JSON
|
|
144
|
+
data.export("output.xlsx") # Excel (requires openpyxl)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Stream Mode (Big Files)
|
|
150
|
+
|
|
151
|
+
DipenCSV automatically detects the file size and switches to stream mode for files larger than 500 MB:
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
data = Data("hugefile.csv") # auto detects
|
|
155
|
+
data = Data("hugefile.csv", stream=True) # force stream
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Supported in stream mode: `mean()`, `max()`, `min()`, `count()`, `group_mean()`, `filter()`, `top_n()`
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## Smart Error Handling
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
data.mean("salry")
|
|
166
|
+
# ❌ Column 'salry' not found.
|
|
167
|
+
# 💡 Did you mean:
|
|
168
|
+
# - salary
|
|
169
|
+
# - salary_usd
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## Common Issues
|
|
175
|
+
|
|
176
|
+
**KeyError after clean()**
|
|
177
|
+
```python
|
|
178
|
+
data.clean()
|
|
179
|
+
print(data.columns) # check actual column names
|
|
180
|
+
data.mean("salary") # use lowercase
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
**Excel export failing**
|
|
184
|
+
```bash
|
|
185
|
+
pip install openpyxl
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
**Big file crashes**
|
|
189
|
+
```python
|
|
190
|
+
data = Data("bigfile.csv", stream=True)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## Dependencies
|
|
196
|
+
- `pandas >= 1.5.0`
|
|
197
|
+
- `scikit-learn >= 1.0.0`
|
|
198
|
+
- `openpyxl >= 3.0.0` (installed with the package; used for Excel export)
|
|
199
|
+
|
|
200
|
+
## Documentation
|
|
201
|
+
Full docs available in the `docs/` folder.
|
|
202
|
+
### Documentation Index
|
|
203
|
+
|
|
204
|
+
| Doc | What it covers |
|
|
205
|
+
|-----|----------------|
|
|
206
|
+
| [Getting Started](docs/getting_started.md) | installation, quick start |
|
|
207
|
+
| [Loading Data](docs/loading_data.md) | Data(), auto mode, stream mode |
|
|
208
|
+
| [Properties](docs/properties.md) | columns, shape |
|
|
209
|
+
| [Understanding Data](docs/understanding_data.md) | summary, report, explain, magic |
|
|
210
|
+
| [Cleaning Data](docs/cleaning_data.md) | clean, drop, rename, fill |
|
|
211
|
+
| [ML Preparation](docs/ml_prep.md) | encode, normalize, split etc |
|
|
212
|
+
| [Analytics](docs/analytics.md) | mean, correlation, outliers etc |
|
|
213
|
+
| [Query](docs/query.md) | sort, find, first, last |
|
|
214
|
+
| [Intelligence](docs/intelligence.md) | ask, explain, magic |
|
|
215
|
+
| [Stream Mode](docs/stream_mode.md) | big file handling |
|
|
216
|
+
| [Export](docs/export.md) | csv, json, xlsx |
|
|
217
|
+
| [Examples](docs/examples.md) | real world examples |
|
|
218
|
+
| [FAQ](docs/faq.md) | common student questions |
|
|
219
|
+
| [ML Workflow](docs/ml_workflow.md) | full end-to-end ML example |
|
|
220
|
+
|
|
221
|
+
## License
|
|
222
|
+
MIT License
|
|
223
|
+
|
|
224
|
+
## Author
|
|
225
|
+
Built by Dipendra — a CS student who wanted pandas to be less painful.
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# dipencsv/analytics.py
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from dipencsv.errors import format_error, format_success
|
|
5
|
+
|
|
6
|
+
def describe(df: pd.DataFrame) -> pd.DataFrame:
    """Return summary statistics for the numeric columns of ``df``.

    The statistics come from ``DataFrame.describe()`` and are rounded to
    two decimals. When the numeric selection is empty, a warning is
    printed and an empty DataFrame is returned instead.
    """
    numbers_only = df.select_dtypes(include="number")
    if not numbers_only.empty:
        return numbers_only.describe().round(2)
    print("⚠️ No numeric columns found")
    return pd.DataFrame()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def value_counts(df: pd.DataFrame, col: str) -> pd.Series:
    """Count how often each distinct value occurs in column ``col``."""
    column = df[col]
    return column.value_counts()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise correlation of the numeric columns of ``df``.

    Values are rounded to two decimals. When the numeric selection is
    empty, a warning is printed and an empty DataFrame is returned.
    """
    numbers_only = df.select_dtypes(include="number")
    if not numbers_only.empty:
        return numbers_only.corr().round(2)
    print("⚠️ No numeric columns found")
    return pd.DataFrame()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def outliers(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return the rows whose ``col`` value lies outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. The number
    of matching rows is printed before the rows are returned.
    """
    values = df[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread
    flagged = df[(values < low_fence) | (values > high_fence)]
    print(f"📊 Outliers in '{col}': {len(flagged)} rows")
    return flagged
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def distribution(df: pd.DataFrame, col: str) -> dict:
    """Return basic distribution statistics for column ``col``.

    The result maps ``mean``, ``median``, ``std``, ``min``, ``max`` and
    ``skew`` (in that order) to floats rounded to two decimals.
    """
    series = df[col]
    stat_names = ("mean", "median", "std", "min", "max", "skew")
    # Each statistic is a no-argument Series method with the same name.
    return {name: round(float(getattr(series, name)()), 2) for name in stat_names}
|
|
46
|
+
def sort(df: pd.DataFrame, col: str, asc: bool = True) -> pd.DataFrame:
    """Return ``df`` sorted by ``col`` with a fresh 0-based index.

    If ``col`` is not a column of ``df``, an error message is printed
    and ``df`` is returned unchanged.
    """
    if col in df.columns:
        ordered = df.sort_values(by=col, ascending=asc)
        ordered = ordered.reset_index(drop=True)
        label = "descending" if not asc else "ascending"
        print(format_success(f"Sorted by '{col}' ({label})"))
        return ordered
    print(format_error(f"Column '{col}' not found"))
    return df
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def first(df: pd.DataFrame, n: int = 5) -> pd.DataFrame:
    """Return the first ``n`` rows of ``df`` (5 by default)."""
    leading_rows = df.head(n)
    return leading_rows
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def last(df: pd.DataFrame, n: int = 5) -> pd.DataFrame:
    """Return the last ``n`` rows of ``df`` (5 by default)."""
    trailing_rows = df.tail(n)
    return trailing_rows
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def find(df: pd.DataFrame, col: str, value) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``col`` equals ``value``.

    If ``col`` is not a column of ``df``, an error message is printed
    and an empty DataFrame is returned.
    """
    if col in df.columns:
        matches = df[df[col] == value]
        print(format_success(f"Found {len(matches)} rows where '{col}' = '{value}'"))
        return matches
    print(format_error(f"Column '{col}' not found"))
    return pd.DataFrame()
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# dipencsv/cleaner.py
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from dipencsv.errors import format_error, format_warning, format_success
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def clean(df: pd.DataFrame, strategy: str = "safe") -> pd.DataFrame:
    """Run the full cleaning pipeline on a copy of ``df``.

    Steps, in order: normalize column names, drop duplicate rows,
    handle missing values according to ``strategy``, then convert
    text columns that hold mostly numbers to a numeric dtype.
    The caller's DataFrame is not modified.
    """
    # Work on a copy so the helpers below may modify it freely.
    working = _fix_columns(df.copy())

    deduplicated = working.drop_duplicates()
    duplicate_count = len(working) - len(deduplicated)
    if duplicate_count > 0:
        print(format_warning(f"Removed {duplicate_count} duplicate rows"))

    # Missing values are handled before the text-to-number conversion.
    cleaned = _fix_numeric(_handle_missing(deduplicated, strategy))

    print(format_success("Cleaning done"))
    return cleaned
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _fix_columns(df: pd.DataFrame) -> pd.DataFrame:
|
|
32
|
+
df.columns = (
|
|
33
|
+
df.columns
|
|
34
|
+
.str.strip()
|
|
35
|
+
.str.lower()
|
|
36
|
+
.str.replace(" ", "_")
|
|
37
|
+
.str.replace(r"[^\w]", "", regex=True)
|
|
38
|
+
)
|
|
39
|
+
return df
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _handle_missing(df: pd.DataFrame, strategy: str) -> pd.DataFrame:
|
|
43
|
+
if strategy == "safe":
|
|
44
|
+
for col in df.columns:
|
|
45
|
+
if df[col].isnull().sum() == 0:
|
|
46
|
+
continue
|
|
47
|
+
if df[col].dtype in ["float64", "int64"]:
|
|
48
|
+
df[col] = df[col].fillna(df[col].median())
|
|
49
|
+
else:
|
|
50
|
+
mode = df[col].mode()
|
|
51
|
+
if not mode.empty:
|
|
52
|
+
df[col] = df[col].fillna(mode[0])
|
|
53
|
+
elif strategy == "aggressive":
|
|
54
|
+
before = len(df)
|
|
55
|
+
df = df.dropna()
|
|
56
|
+
print(format_warning(f"Dropped {before - len(df)} rows with missing values"))
|
|
57
|
+
return df
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _fix_numeric(df: pd.DataFrame) -> pd.DataFrame:
|
|
61
|
+
for col in df.select_dtypes(include=["object", "str"]).columns:
|
|
62
|
+
converted = pd.to_numeric(df[col], errors="coerce")
|
|
63
|
+
if converted.notna().sum() > len(df) * 0.7:
|
|
64
|
+
df[col] = converted
|
|
65
|
+
return df
|
|
66
|
+
|
|
67
|
+
def drop(df: pd.DataFrame, cols) -> pd.DataFrame:
    """Return ``df`` without the given column(s).

    ``cols`` may be a single column name or a list of names. If any
    requested column is missing, an error is printed and ``df`` is
    returned unchanged (nothing is dropped).
    """
    targets = [cols] if isinstance(cols, str) else cols
    unknown = [name for name in targets if name not in df.columns]
    if unknown:
        print(format_error(f"Columns not found: {unknown}"))
        return df
    remaining = df.drop(columns=targets)
    print(format_success(f"Dropped columns: {targets}"))
    return remaining
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def rename(df: pd.DataFrame, old: str, new: str) -> pd.DataFrame:
    """Return ``df`` with column ``old`` renamed to ``new``.

    If ``old`` is not a column of ``df``, an error is printed and ``df``
    is returned unchanged.
    """
    if old in df.columns:
        renamed = df.rename(columns={old: new})
        print(format_success(f"Renamed '{old}' → '{new}'"))
        return renamed
    print(format_error(f"Column '{old}' not found"))
    return df
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def fill(df: pd.DataFrame, col: str, value) -> pd.DataFrame:
    """Replace the missing values of column ``col`` with ``value``.

    The column is reassigned on ``df`` itself, which is also returned.
    If ``col`` is not a column of ``df``, an error is printed and ``df``
    is returned unchanged.
    """
    if col in df.columns:
        filled = df[col].fillna(value)
        df[col] = filled
        print(format_success(f"Filled missing values in '{col}' with {value}"))
        return df
    print(format_error(f"Column '{col}' not found"))
    return df
|
|
95
|
+
|
|
96
|
+
def encode(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Label-encode column ``col`` with integer codes.

    Each distinct value gets a code equal to its position in
    ``df[col].unique()``. The column is reassigned on ``df`` itself and
    the mapping is printed. If ``col`` is not a column of ``df``, an
    error is printed and ``df`` is returned unchanged.
    """
    if col not in df.columns:
        print(format_error(f"Column '{col}' not found"))
        return df
    codes = {}
    for position, label in enumerate(df[col].unique()):
        codes[label] = position
    df[col] = df[col].map(codes)
    print(format_success(f"Encoded '{col}': {codes}"))
    return df
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def normalize(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Min-max scale column ``col`` as ``(x - min) / (max - min)``.

    The column is reassigned on ``df`` itself, which is also returned.
    An error is printed and ``df`` is returned unchanged when ``col`` is
    missing or when the column's minimum equals its maximum.
    """
    if col not in df.columns:
        print(format_error(f"Column '{col}' not found"))
        return df
    column = df[col]
    low, high = column.min(), column.max()
    if high == low:
        # A zero-width range would mean dividing by zero.
        print(format_error(f"Cannot normalize '{col}' — all values are the same"))
        return df
    df[col] = (column - low) / (high - low)
    print(format_success(f"Normalized '{col}' to range [0, 1]"))
    return df
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def standardize(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Standardize column ``col`` as ``(x - mean) / std``.

    The column is reassigned on ``df`` itself, which is also returned.
    An error is printed and ``df`` is returned unchanged when ``col`` is
    missing, when the standard deviation is 0, or when it cannot be
    computed.
    """
    if col not in df.columns:
        print(format_error(f"Column '{col}' not found"))
        return df
    mean = df[col].mean()
    std = df[col].std()
    # std() is NaN when the column has fewer than two non-missing values.
    # NaN == 0 is False, so without this guard the whole column would be
    # silently overwritten with NaN.
    if pd.isna(std):
        print(format_error(f"Cannot standardize '{col}' — need at least two values"))
        return df
    if std == 0:
        print(format_error(f"Cannot standardize '{col}' — std is 0"))
        return df
    df[col] = (df[col] - mean) / std
    print(format_success(f"Standardized '{col}' — mean=0, std=1"))
    return df
|
|
133
|
+
|
|
134
|
+
def remove_outliers(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return ``df`` without the rows whose ``col`` value is an outlier.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. Rows whose ``col`` value is missing are kept,
    because a missing value is neither below nor above the fences. The
    number of removed rows is printed.

    If ``col`` is not a column of ``df``, an error is printed and ``df``
    is returned unchanged.
    """
    if col not in df.columns:
        print(format_error(f"Column '{col}' not found"))
        return df
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    before = len(df)
    # Flag only values strictly outside the fences. NaN compares False on
    # both sides, so rows with missing values are not counted as outliers.
    is_outlier = (values < lower) | (values > upper)
    # Return an independent copy so later column assignments on the result
    # do not act on a slice of the original frame.
    df = df[~is_outlier].copy()
    removed = before - len(df)
    print(format_success(f"Removed {removed} outliers from '{col}'"))
    return df
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def split(df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42):
    """Split ``df`` into a train and a test DataFrame.

    Delegates to scikit-learn's ``train_test_split`` with the given
    ``test_size`` and ``random_state``, prints the size of both parts and
    returns them as ``(train, test)``.
    """
    # Imported here so scikit-learn is only loaded when a split is requested.
    from sklearn.model_selection import train_test_split

    parts = train_test_split(df, test_size=test_size, random_state=random_state)
    train, test = parts
    print(format_success(f"Split done — train: {len(train)} rows, test: {len(test)} rows"))
    return train, test
|